# Curator/scripts/rargb_magnets.py

#!/usr/bin/env python3
"""Standalone scraper: collect magnet links from a rargb.to search.

Given a search query it walks every results page
(``https://rargb.to/search/?search=<query>`` and ``/search/<N>/?search=<query>``),
opens each torrent's detail page and saves its magnet link.

This is a self-contained tool — it only needs ``requests`` and
``beautifulsoup4`` and does not import anything from the Curator project.

Examples:
    python scripts/rargb_magnets.py "ubuntu 24.04"
    python scripts/rargb_magnets.py test --output test_magnets.txt --max-pages 3
    python scripts/rargb_magnets.py test --tsv          # also write name<TAB>magnet

Be considerate: a polite delay is inserted between requests by default. Use the
results responsibly and respect the target site's terms and your local law.
"""

from __future__ import annotations

import re
import sys
import time
import argparse
from pathlib import Path
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup

# Site root; search URLs are built on it and relative result links are
# resolved against it.
BASE_URL = "https://rargb.to"

# Desktop-browser User-Agent string used for the outgoing requests.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# Request headers passed to every GET issued by ``fetch``.
HEADERS = {"User-Agent": _USER_AGENT, "Accept-Language": "en-US,en;q=0.9"}

# A magnet URI: everything from ``magnet:?`` up to the first quote,
# whitespace character or angle bracket.
MAGNET_RE = re.compile(r"magnet:\?[^\"'\s<>]+")


def search_page_url(query: str, page: int) -> str:
    """Build the URL of one results page for ``query``.

    Page 1 (or any lower number) maps to the unnumbered ``/search/`` path;
    later pages carry their number as an extra path segment. The query is
    percent-encoded with ``urllib.parse.quote``.
    """
    encoded_query = quote(query)
    page_segment = f"{page}/" if page > 1 else ""
    return f"{BASE_URL}/search/{page_segment}?search={encoded_query}"


def fetch(session: requests.Session, url: str, timeout: float, retries: int) -> str | None:
    """GET ``url`` and return the response body as text, or None on failure.

    At most ``retries`` attempts are made, and always at least one, even if
    ``retries`` is zero or negative. After a failed attempt the function
    backs off for ``attempt * 2`` seconds before trying again. No back-off
    follows the final attempt, because there is nothing left to wait for.

    Args:
        session: Session used to issue the request.
        url: URL to download.
        timeout: Timeout in seconds, passed straight to ``session.get``.
        retries: Maximum number of attempts.

    Returns:
        The response text, or None when every attempt raised a
        ``requests.RequestException`` (HTTP error statuses are turned into
        such exceptions by ``raise_for_status``).
    """
    # A non-positive ``retries`` would otherwise skip the request entirely.
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            resp = session.get(url, headers=HEADERS, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as exc:
            if attempt == attempts:
                # Last attempt: report the error but do not announce or
                # perform a wait that would only delay the caller.
                print(f"  ! chyba ({attempt}/{attempts}) u {url}: {exc}",
                      file=sys.stderr)
                break
            wait = attempt * 2  # linear back-off: 2s, 4s, 6s, ...
            print(f"  ! chyba ({attempt}/{attempts}) u {url}: {exc} — čekám {wait}s",
                  file=sys.stderr)
            time.sleep(wait)
    return None


def parse_result_links(html: str) -> list[tuple[str, str]]:
    """Extract ``(name, detail_url)`` pairs from one search-results page.

    Every ``tr.lista2`` row contributes its first anchor whose ``href``
    starts with ``/torrent/``. Rows without such an anchor and hrefs already
    seen on this page are skipped. The name is the anchor's ``title``,
    falling back to its text and finally to the href itself; the returned
    URL is the href joined onto ``BASE_URL``.
    """
    torrent_href = re.compile(r"^/torrent/")
    document = BeautifulSoup(html, "html.parser")

    # href -> display name; a dict keeps first-seen order and de-duplicates.
    names_by_href: dict[str, str] = {}
    for row in document.select("tr.lista2"):
        anchor = row.find("a", href=torrent_href)
        if not anchor:
            continue
        href = anchor.get("href")
        if not href or href in names_by_href:
            continue
        label = anchor.get("title") or anchor.get_text(strip=True) or href
        names_by_href[href] = label.strip()

    return [(label, urljoin(BASE_URL, href)) for href, label in names_by_href.items()]


def parse_last_page(html: str) -> int:
    """Return the largest page number linked from the pager, or 1 if none.

    Page numbers are read from every ``/search/<N>/?search=`` occurrence in
    the raw HTML, so the result is only as complete as the links the page
    happens to show.
    """
    page_numbers = (
        int(match.group(1))
        for match in re.finditer(r"/search/(\d+)/\?search=", html)
    )
    return max(page_numbers, default=1)


def extract_magnet(html: str) -> str | None:
    """Return the first magnet link found on a torrent detail page, or None.

    The link is matched in the raw markup, where attribute values may carry
    HTML character references (typically ``&amp;`` between the magnet
    parameters). Those references are decoded so the returned string is the
    actual URI rather than its HTML-escaped form.
    """
    # Imported by name: the ``html`` parameter shadows the ``html`` module.
    from html import unescape

    match = MAGNET_RE.search(html)
    if match is None:
        return None
    return unescape(match.group(0))


def _fetch_magnet(session: requests.Session, name: str, detail_url: str,
                  timeout: float, retries: int) -> str | None:
    """Load one torrent detail page and return its magnet link, or None after logging why."""
    detail_html = fetch(session, detail_url, timeout, retries)
    if detail_html is None:
        print(f"  - {name}: detail se nenačetl", file=sys.stderr)
        return None
    magnet = extract_magnet(detail_html)
    if not magnet:
        print(f"  - {name}: magnet nenalezen", file=sys.stderr)
        return None
    return magnet


def scrape(query: str, max_pages: int | None, delay: float,
           timeout: float, retries: int) -> list[tuple[str, str]]:
    """Walk the result pages for ``query`` and collect ``(name, magnet)`` pairs.

    Pages are visited in order starting at 1. The walk stops at the first
    page that fails to load, the first page with no previously unseen
    results, or once ``max_pages`` pages have been processed. The page count
    read from the pager is only an estimate shown to the user; it does not
    bound the loop, so pages beyond it are still probed.

    Args:
        query: Search text.
        max_pages: Upper bound on the number of pages to process, or None
            for no bound.
        delay: Seconds to sleep before each detail request and between pages.
        timeout: Per-request timeout in seconds.
        retries: Maximum attempts per request.

    Returns:
        Pairs in discovery order, de-duplicated by magnet link. Empty when
        the first results page could not be loaded.
    """
    collected: list[tuple[str, str]] = []
    seen_magnets: set[str] = set()
    seen_details: set[str] = set()

    # The context manager closes the session's pooled connections on every
    # exit path, including the early return below.
    with requests.Session() as session:
        first_html = fetch(session, search_page_url(query, 1), timeout, retries)
        if first_html is None:
            print("Nepodařilo se načíst první stránku výsledků.", file=sys.stderr)
            return collected

        last_page = parse_last_page(first_html)
        if max_pages is not None:
            last_page = min(last_page, max_pages)
        print(f"Dotaz: {query!r} — stránek k projití: ~{last_page}")

        page = 1
        while True:
            # Page 1 was already downloaded to read the pager; reuse it.
            html = first_html if page == 1 else fetch(
                session, search_page_url(query, page), timeout, retries)
            if html is None:
                break

            new_rows = [(name, url) for name, url in parse_result_links(html)
                        if url not in seen_details]
            if not new_rows:
                # No fresh results → past the last real page; stop.
                break

            print(f"[strana {page}] nalezeno položek: {len(new_rows)}")
            for name, detail_url in new_rows:
                # Mark as visited up front so a failed detail is not retried
                # if the same link shows up again on a later page.
                seen_details.add(detail_url)
                time.sleep(delay)
                magnet = _fetch_magnet(session, name, detail_url, timeout, retries)
                if magnet is None or magnet in seen_magnets:
                    continue
                seen_magnets.add(magnet)
                collected.append((name, magnet))
                print(f"  + {name}")

            if max_pages is not None and page >= max_pages:
                break
            page += 1
            time.sleep(delay)

    return collected


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Vyparsuje magnet odkazy z vyhledávání na rargb.to.")
    parser.add_argument("query", help="Vyhledávací dotaz (např. \"ubuntu 24.04\")")
    parser.add_argument("-o", "--output", type=Path,
                        help="Výstupní soubor (výchozí: magnets_<dotaz>.txt)")
    parser.add_argument("--max-pages", type=int, default=None,
                        help="Maximální počet stránek (výchozí: všechny)")
    parser.add_argument("--delay", type=float, default=1.0,
                        help="Prodleva mezi requesty v sekundách (výchozí: 1.0)")
    parser.add_argument("--timeout", type=float, default=20.0,
                        help="Timeout requestu v sekundách (výchozí: 20)")
    parser.add_argument("--retries", type=int, default=3,
                        help="Počet pokusů při chybě (výchozí: 3)")
    parser.add_argument("--tsv", action="store_true",
                        help="Uložit i <název>\\t<magnet> vedle čistých magnetů")
    args = parser.parse_args()

    output = args.output or Path(
        f"magnets_{re.sub(r'[^A-Za-z0-9._-]+', '_', args.query).strip('_')}.txt")

    results = scrape(args.query, args.max_pages, args.delay, args.timeout, args.retries)

    if not results:
        print("Nenalezeny žádné magnet odkazy.")
        sys.exit(1)

    output.write_text("".join(f"{magnet}\n" for _, magnet in results), encoding="utf-8")
    print(f"\nUloženo {len(results)} magnet odkazů do: {output}")

    if args.tsv:
        tsv_path = output.with_suffix(".tsv")
        tsv_path.write_text(
            "".join(f"{name}\t{magnet}\n" for name, magnet in results), encoding="utf-8")
        print(f"Uloženo také název+magnet do: {tsv_path}")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()