Files
Curator/scripts/rargb_magnets.py
T

197 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""Standalone scraper: collect magnet links from a rargb.to search.
Given a search query it walks every results page
(``https://rargb.to/search/?search=<query>`` and ``/search/<N>/?search=<query>``),
opens each torrent's detail page and saves its magnet link.
This is a self-contained tool — it only needs ``requests`` and
``beautifulsoup4`` and does not import anything from the Curator project.
Examples:
python scripts/rargb_magnets.py "ubuntu 24.04"
python scripts/rargb_magnets.py test --output test_magnets.txt --max-pages 3
python scripts/rargb_magnets.py test --tsv # also write name<TAB>magnet
Be considerate: a polite delay is inserted between requests by default. Use the
results responsibly and respect the target site's terms and your local law.
"""
from __future__ import annotations
import re
import sys
import time
import argparse
from pathlib import Path
from urllib.parse import quote, urljoin
import requests
from bs4 import BeautifulSoup
# Root of the site; relative links found in result rows are resolved against it.
BASE_URL = "https://rargb.to"

# Headers sent with every request: a desktop-browser User-Agent plus an
# English language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# A magnet URI: everything from "magnet:?" up to the first quote, whitespace
# or angle bracket, i.e. the end of the attribute value or text run it sits in.
MAGNET_RE = re.compile(r"magnet:\?[^\"'\s<>]+")
def search_page_url(query: str, page: int) -> str:
    """Build the URL of results page ``page`` for ``query``.

    Page 1 (and any lower number) uses the bare ``/search/`` path; later
    pages carry their number as an extra path segment. The query is
    percent-encoded before being placed in the ``search`` parameter.
    """
    page_segment = f"{page}/" if page > 1 else ""
    return f"{BASE_URL}/search/{page_segment}?search={quote(query)}"
def fetch(session: requests.Session, url: str, timeout: float, retries: int) -> str | None:
    """GET ``url`` and return the response body as text.

    Args:
        session: Session used to issue the request.
        url: URL to download.
        timeout: Per-request timeout in seconds, passed to ``session.get``.
        retries: Total number of attempts. Values below 1 are treated as 1 so
            the URL is always tried at least once.

    Returns:
        The response text, or ``None`` once every attempt has failed with a
        ``requests.RequestException`` (which includes the HTTP error raised
        by ``raise_for_status``).
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            resp = session.get(url, headers=HEADERS, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as exc:
            if attempt == attempts:
                # Nothing left to retry: report the failure and return right
                # away instead of sleeping for no reason.
                print(f" ! chyba ({attempt}/{attempts}) u {url}: {exc}",
                      file=sys.stderr)
                break
            # Linear back-off: 2 s, 4 s, 6 s, ...
            wait = attempt * 2
            print(f" ! chyba ({attempt}/{attempts}) u {url}: {exc} — čekám {wait}s",
                  file=sys.stderr)
            time.sleep(wait)
    return None
def parse_result_links(html: str) -> list[tuple[str, str]]:
    """Extract ``(name, detail_url)`` pairs from one search-results page.

    Each ``tr.lista2`` row contributes its first ``/torrent/...`` link.
    Rows without such a link, and repeated hrefs, are skipped. The name is
    the link's ``title`` attribute, falling back to its text and finally to
    the href itself; the URL is made absolute against ``BASE_URL``.
    """
    torrent_href = re.compile(r"^/torrent/")
    document = BeautifulSoup(html, "html.parser")

    # href -> display name; a dict keeps first-seen order and de-duplicates.
    names_by_href: dict[str, str] = {}
    for row in document.select("tr.lista2"):
        anchor = row.find("a", href=torrent_href)
        href = anchor.get("href") if anchor else None
        if not href or href in names_by_href:
            continue
        label = anchor.get("title") or anchor.get_text(strip=True) or href
        names_by_href[href] = label.strip()

    return [(label, urljoin(BASE_URL, href)) for href, label in names_by_href.items()]
def parse_last_page(html: str) -> int:
    """Return the largest page number linked from the pager, or 1 if none.

    This is a best-effort estimate: it looks only at ``/search/<N>/?search=``
    links present in the markup.
    """
    page_numbers = map(int, re.findall(r"/search/(\d+)/\?search=", html))
    return max(page_numbers, default=1)
def extract_magnet(html: str) -> str | None:
    """Return the first magnet URI found on a torrent detail page.

    Args:
        html: Raw markup of the detail page.

    Returns:
        The magnet URI with HTML character references decoded, or ``None``
        when the page contains no magnet link.
    """
    # Imported here under its own name because the ``html`` parameter shadows
    # the standard-library module of the same name inside this function.
    from html import unescape

    match = MAGNET_RE.search(html)
    if match is None:
        return None
    # The pattern is applied to raw markup, so the ``&`` separators between
    # magnet parameters may still be written as ``&amp;``; decode them so the
    # saved link is a usable URI.
    return unescape(match.group(0))
def scrape(query: str, max_pages: int | None, delay: float,
           timeout: float, retries: int) -> list[tuple[str, str]]:
    """Walk the result pages for ``query`` and collect magnet links.

    Pages are fetched in order starting at 1. For every result not seen
    before, the detail page is downloaded and its magnet link extracted.
    The walk stops when a results page fails to load, when a page yields no
    new results, or when ``max_pages`` pages have been processed.

    Args:
        query: Search text.
        max_pages: Upper bound on the number of result pages, or ``None``
            for no limit.
        delay: Seconds to sleep before each detail request and between
            result pages.
        timeout: Per-request timeout in seconds.
        retries: Number of attempts per request.

    Returns:
        ``(name, magnet)`` tuples in discovery order, de-duplicated by
        magnet link. Empty if the first results page cannot be loaded.
    """
    collected: list[tuple[str, str]] = []
    seen_magnets: set[str] = set()
    seen_details: set[str] = set()

    # The context manager closes the session's pooled connections on every
    # exit path, including the early return and exceptions.
    with requests.Session() as session:
        first_html = fetch(session, search_page_url(query, 1), timeout, retries)
        if first_html is None:
            print("Nepodařilo se načíst první stránku výsledků.", file=sys.stderr)
            return collected

        # The pager-derived page count is only an estimate for the progress
        # message; the loop below never relies on it and instead stops at the
        # first page without fresh results.
        last_page = parse_last_page(first_html)
        if max_pages is not None:
            last_page = min(last_page, max_pages)
        print(f"Dotaz: {query!r} — stránek k projití: ~{last_page}")

        page = 1
        while True:
            # Page 1 was already downloaded above; reuse it.
            html = first_html if page == 1 else fetch(
                session, search_page_url(query, page), timeout, retries)
            if html is None:
                break

            rows = parse_result_links(html)
            new_rows = [(n, u) for n, u in rows if u not in seen_details]
            if not new_rows:
                # No fresh results → past the last real page; stop.
                break
            print(f"[strana {page}] nalezeno položek: {len(new_rows)}")

            for name, detail_url in new_rows:
                seen_details.add(detail_url)
                time.sleep(delay)
                detail_html = fetch(session, detail_url, timeout, retries)
                if detail_html is None:
                    print(f" - {name}: detail se nenačetl", file=sys.stderr)
                    continue
                magnet = extract_magnet(detail_html)
                if not magnet:
                    print(f" - {name}: magnet nenalezen", file=sys.stderr)
                    continue
                if magnet in seen_magnets:
                    continue
                seen_magnets.add(magnet)
                collected.append((name, magnet))
                print(f" + {name}")

            if max_pages is not None and page >= max_pages:
                break
            page += 1
            time.sleep(delay)

    return collected
def main() -> None:
    """Command-line entry point: parse arguments, scrape, write the results.

    Writes one magnet link per line to the output file and, with ``--tsv``,
    a second file of ``name<TAB>magnet`` rows. Exits with status 1 when no
    magnet link was collected and with an argparse error when an argument is
    out of range.
    """
    parser = argparse.ArgumentParser(
        description="Vyparsuje magnet odkazy z vyhledávání na rargb.to.")
    parser.add_argument("query", help="Vyhledávací dotaz (např. \"ubuntu 24.04\")")
    parser.add_argument("-o", "--output", type=Path,
                        help="Výstupní soubor (výchozí: magnets_<dotaz>.txt)")
    parser.add_argument("--max-pages", type=int, default=None,
                        help="Maximální počet stránek (výchozí: všechny)")
    parser.add_argument("--delay", type=float, default=1.0,
                        help="Prodleva mezi requesty v sekundách (výchozí: 1.0)")
    parser.add_argument("--timeout", type=float, default=20.0,
                        help="Timeout requestu v sekundách (výchozí: 20)")
    parser.add_argument("--retries", type=int, default=3,
                        help="Počet pokusů při chybě (výchozí: 3)")
    parser.add_argument("--tsv", action="store_true",
                        help="Uložit i <název>\\t<magnet> vedle čistých magnetů")
    args = parser.parse_args()

    # Reject values up front that would otherwise fail mid-scrape: a negative
    # delay makes time.sleep raise, and fewer than one attempt fetches nothing.
    if args.delay < 0:
        parser.error("--delay nesmí být záporné")
    if args.retries < 1:
        parser.error("--retries musí být alespoň 1")

    if args.output is not None:
        output = args.output
    else:
        # Derive a filesystem-safe default name from the query.
        slug = re.sub(r"[^A-Za-z0-9._-]+", "_", args.query).strip("_")
        output = Path(f"magnets_{slug}.txt")

    results = scrape(args.query, args.max_pages, args.delay, args.timeout, args.retries)
    if not results:
        print("Nenalezeny žádné magnet odkazy.")
        sys.exit(1)

    # Create the target directory so a long scrape is not lost to a missing path.
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text("".join(f"{magnet}\n" for _, magnet in results), encoding="utf-8")
    print(f"\nUloženo {len(results)} magnet odkazů do: {output}")

    if args.tsv:
        tsv_path = output.with_suffix(".tsv")
        if tsv_path == output:
            # The output file already ends in .tsv; append a second suffix so
            # the TSV does not overwrite the magnet list just written.
            tsv_path = output.with_name(output.name + ".tsv")
        # Tabs and line breaks inside a name would split the row, so they are
        # collapsed to a single space.
        tsv_path.write_text(
            "".join(
                "{}\t{}\n".format(re.sub(r"[\t\r\n]+", " ", name), magnet)
                for name, magnet in results
            ),
            encoding="utf-8",
        )
        print(f"Uloženo také název+magnet do: {tsv_path}")


if __name__ == "__main__":
    main()