#!/usr/bin/env python3 """Standalone scraper: collect magnet links from a rargb.to search. Given a search query it walks every results page (``https://rargb.to/search/?search=`` and ``/search//?search=``), opens each torrent's detail page and saves its magnet link. This is a self-contained tool — it only needs ``requests`` and ``beautifulsoup4`` and does not import anything from the Curator project. Examples: python scripts/rargb_magnets.py "ubuntu 24.04" python scripts/rargb_magnets.py test --output test_magnets.txt --max-pages 3 python scripts/rargb_magnets.py test --tsv # also write namemagnet Be considerate: a polite delay is inserted between requests by default. Use the results responsibly and respect the target site's terms and your local law. """ from __future__ import annotations import re import sys import time import argparse from pathlib import Path from urllib.parse import quote, urljoin import requests from bs4 import BeautifulSoup BASE_URL = "https://rargb.to" HEADERS = { "User-Agent": ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ), "Accept-Language": "en-US,en;q=0.9", } MAGNET_RE = re.compile(r"magnet:\?[^\"'\s<>]+") def search_page_url(query: str, page: int) -> str: """URL of the N-th results page for a query (page 1 has no number).""" q = quote(query) if page <= 1: return f"{BASE_URL}/search/?search={q}" return f"{BASE_URL}/search/{page}/?search={q}" def fetch(session: requests.Session, url: str, timeout: float, retries: int) -> str | None: """GET ``url`` and return the HTML, or None after exhausting retries.""" for attempt in range(1, retries + 1): try: resp = session.get(url, headers=HEADERS, timeout=timeout) resp.raise_for_status() return resp.text except requests.RequestException as exc: wait = attempt * 2 print(f" ! chyba ({attempt}/{retries}) u {url}: {exc} — čekám {wait}s", file=sys.stderr) time.sleep(wait) return None def parse_result_links(html: str) -> list[tuple[str, str]]: """Return (name, detail_url) for each result row on a search page.""" soup = BeautifulSoup(html, "html.parser") results: list[tuple[str, str]] = [] seen: set[str] = set() for row in soup.select("tr.lista2"): link = row.find("a", href=re.compile(r"^/torrent/")) if not link: continue href = link.get("href") if not href or href in seen: continue seen.add(href) name = link.get("title") or link.get_text(strip=True) or href results.append((name.strip(), urljoin(BASE_URL, href))) return results def parse_last_page(html: str) -> int: """Best-effort highest page number from the pager (1 if none found).""" pages = [int(n) for n in re.findall(r"/search/(\d+)/\?search=", html)] return max(pages) if pages else 1 def extract_magnet(html: str) -> str | None: """First magnet link found on a torrent detail page, or None.""" match = MAGNET_RE.search(html) return match.group(0) if match else None def scrape(query: str, max_pages: int | None, delay: float, timeout: float, retries: int) -> list[tuple[str, str]]: """Walk all result pages and return a de-duplicated [(name, magnet)] list.""" session = requests.Session() collected: list[tuple[str, str]] = [] seen_magnets: set[str] = set() seen_details: set[str] = set() first_html = fetch(session, search_page_url(query, 1), timeout, retries) if first_html is None: print("Nepodařilo se načíst první stránku výsledků.", file=sys.stderr) return collected last_page = parse_last_page(first_html) if max_pages is not None: last_page = min(last_page, max_pages) print(f"Dotaz: {query!r} — stránek k projití: ~{last_page}") page = 1 while True: html = first_html if page == 1 else fetch( session, search_page_url(query, page), timeout, retries) if html is None: break rows = parse_result_links(html) new_rows = [(n, u) for n, u in rows if u not in seen_details] if not new_rows: # No fresh results → past the last real page; stop. break print(f"[strana {page}] nalezeno položek: {len(new_rows)}") for name, detail_url in new_rows: seen_details.add(detail_url) time.sleep(delay) detail_html = fetch(session, detail_url, timeout, retries) if detail_html is None: print(f" - {name}: detail se nenačetl", file=sys.stderr) continue magnet = extract_magnet(detail_html) if not magnet: print(f" - {name}: magnet nenalezen", file=sys.stderr) continue if magnet in seen_magnets: continue seen_magnets.add(magnet) collected.append((name, magnet)) print(f" + {name}") if max_pages is not None and page >= max_pages: break page += 1 if page > last_page: # Probe one page past the detected last page in case the pager was # windowed; the empty-results check above will stop us if it's truly # the end. last_page = page time.sleep(delay) return collected def main() -> None: parser = argparse.ArgumentParser( description="Vyparsuje magnet odkazy z vyhledávání na rargb.to.") parser.add_argument("query", help="Vyhledávací dotaz (např. \"ubuntu 24.04\")") parser.add_argument("-o", "--output", type=Path, help="Výstupní soubor (výchozí: magnets_.txt)") parser.add_argument("--max-pages", type=int, default=None, help="Maximální počet stránek (výchozí: všechny)") parser.add_argument("--delay", type=float, default=1.0, help="Prodleva mezi requesty v sekundách (výchozí: 1.0)") parser.add_argument("--timeout", type=float, default=20.0, help="Timeout requestu v sekundách (výchozí: 20)") parser.add_argument("--retries", type=int, default=3, help="Počet pokusů při chybě (výchozí: 3)") parser.add_argument("--tsv", action="store_true", help="Uložit i \\t vedle čistých magnetů") args = parser.parse_args() output = args.output or Path( f"magnets_{re.sub(r'[^A-Za-z0-9._-]+', '_', args.query).strip('_')}.txt") results = scrape(args.query, args.max_pages, args.delay, args.timeout, args.retries) if not results: print("Nenalezeny žádné magnet odkazy.") sys.exit(1) output.write_text("".join(f"{magnet}\n" for _, magnet in results), encoding="utf-8") print(f"\nUloženo {len(results)} magnet odkazů do: {output}") if args.tsv: tsv_path = output.with_suffix(".tsv") tsv_path.write_text( "".join(f"{name}\t{magnet}\n" for name, magnet in results), encoding="utf-8") print(f"Uloženo také název+magnet do: {tsv_path}") if __name__ == "__main__": main()