197 lines
7.2 KiB
Python
197 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Standalone scraper: collect magnet links from a rargb.to search.

Given a search query it walks every results page
(``https://rargb.to/search/?search=<query>`` and ``/search/<N>/?search=<query>``),
opens each torrent's detail page and saves its magnet link.

This is a self-contained tool — it only needs ``requests`` and
``beautifulsoup4`` and does not import anything from the Curator project.

Examples:
    python scripts/rargb_magnets.py "ubuntu 24.04"
    python scripts/rargb_magnets.py test --output test_magnets.txt --max-pages 3
    python scripts/rargb_magnets.py test --tsv   # also write name<TAB>magnet

Be considerate: a polite delay is inserted between requests by default. Use the
results responsibly and respect the target site's terms and your local law.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import sys
|
|
import time
|
|
import argparse
|
|
from pathlib import Path
|
|
from urllib.parse import quote, urljoin
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Site root; search and detail URLs are built relative to it.
BASE_URL = "https://rargb.to"

# Headers sent with every request made through ``fetch``.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# Matches a magnet URI up to the first quote, whitespace or angle bracket.
MAGNET_RE = re.compile(r"magnet:\?[^\"'\s<>]+")
|
|
|
|
|
|
def search_page_url(query: str, page: int) -> str:
    """Build the URL of results page ``page`` for ``query``.

    The query is percent-encoded with ``urllib.parse.quote``. Page 1 (and any
    smaller number) maps to the un-numbered ``/search/`` path; later pages
    carry their number in the path.
    """
    encoded = quote(query)
    path = "/search/" if page <= 1 else f"/search/{page}/"
    return f"{BASE_URL}{path}?search={encoded}"
|
|
|
|
|
|
def fetch(session: requests.Session, url: str, timeout: float, retries: int) -> str | None:
    """GET ``url`` and return the response body, or None once retries run out.

    Args:
        session: Session used to issue the request.
        url: URL to download.
        timeout: Per-request timeout in seconds, passed to ``session.get``.
        retries: Total number of attempts; a value below 1 makes no request.

    Returns:
        The response text of the first attempt that completes without a
        ``requests.RequestException`` (which includes HTTP error statuses
        raised by ``raise_for_status``), or None if every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = session.get(url, headers=HEADERS, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as exc:
            if attempt >= retries:
                # No attempt left: report the failure but do not back off,
                # since sleeping here would only delay the caller.
                print(f" ! chyba ({attempt}/{retries}) u {url}: {exc}",
                      file=sys.stderr)
                break
            wait = attempt * 2  # linear back-off: 2s, 4s, 6s, ...
            print(f" ! chyba ({attempt}/{retries}) u {url}: {exc} — čekám {wait}s",
                  file=sys.stderr)
            time.sleep(wait)
    return None
|
|
|
|
|
|
def parse_result_links(html: str) -> list[tuple[str, str]]:
    """Extract ``(name, detail_url)`` pairs from one search-results page.

    Only ``tr.lista2`` rows are considered, and within each row the first
    anchor whose ``href`` starts with ``/torrent/``. Each distinct ``href`` is
    reported once. The name is the anchor's ``title``, falling back to its
    text and then to the ``href`` itself; the URL is made absolute against
    ``BASE_URL``.
    """
    torrent_href = re.compile(r"^/torrent/")
    document = BeautifulSoup(html, "html.parser")
    visited: set[str] = set()
    found: list[tuple[str, str]] = []

    for row in document.select("tr.lista2"):
        anchor = row.find("a", href=torrent_href)
        href = anchor.get("href") if anchor else None
        if not href or href in visited:
            continue
        visited.add(href)
        label = anchor.get("title") or anchor.get_text(strip=True) or href
        found.append((label.strip(), urljoin(BASE_URL, href)))

    return found
|
|
|
|
|
|
def parse_last_page(html: str) -> int:
    """Return the largest page number linked in ``html``, or 1 if none.

    Page numbers are read from every ``/search/<N>/?search=`` occurrence in
    the markup, so the result is only as complete as the pager that was
    rendered on this page.
    """
    page_numbers = (
        int(found.group(1))
        for found in re.finditer(r"/search/(\d+)/\?search=", html)
    )
    return max(page_numbers, default=1)
|
|
|
|
|
|
def extract_magnet(html: str) -> str | None:
    """Return the first magnet link on a torrent detail page, or None.

    The link is located by running ``MAGNET_RE`` over the raw markup, so any
    character references inside it (typically ``&amp;`` between parameters)
    are still encoded at that point. They are decoded before returning so the
    caller receives a usable URI rather than an HTML fragment.

    Args:
        html: Markup of the detail page.

    Returns:
        The decoded magnet URI, or None when the page contains no match.
    """
    # Imported locally: the ``html`` parameter shadows the stdlib module name.
    from html import unescape

    match = MAGNET_RE.search(html)
    return unescape(match.group(0)) if match else None
|
|
|
|
|
|
def scrape(query: str, max_pages: int | None, delay: float,
           timeout: float, retries: int) -> list[tuple[str, str]]:
    """Walk the result pages for ``query`` and collect unique magnet links.

    Args:
        query: Search text.
        max_pages: Upper bound on result pages to process, or None for no
            bound. Page 1 is always processed if it loads.
        delay: Seconds slept before each detail-page request and before each
            further results page is fetched.
        timeout: Per-request timeout in seconds, forwarded to ``fetch``.
        retries: Attempts per URL, forwarded to ``fetch``.

    Returns:
        ``(name, magnet)`` tuples in discovery order, de-duplicated by magnet.
        Empty when the first results page cannot be loaded.
    """
    collected: list[tuple[str, str]] = []
    seen_magnets: set[str] = set()
    seen_details: set[str] = set()

    # The context manager closes the session on every exit path.
    with requests.Session() as session:
        first_html = fetch(session, search_page_url(query, 1), timeout, retries)
        if first_html is None:
            print("Nepodařilo se načíst první stránku výsledků.", file=sys.stderr)
            return collected

        # The pager value is only used for the progress message below.
        last_page = parse_last_page(first_html)
        if max_pages is not None:
            last_page = min(last_page, max_pages)
        print(f"Dotaz: {query!r} — stránek k projití: ~{last_page}")

        page = 1
        while True:
            html = first_html if page == 1 else fetch(
                session, search_page_url(query, page), timeout, retries)
            if html is None:
                break

            rows = parse_result_links(html)
            new_rows = [(n, u) for n, u in rows if u not in seen_details]
            if not new_rows:
                # No fresh results → past the last real page; stop.
                break

            print(f"[strana {page}] nalezeno položek: {len(new_rows)}")
            for name, detail_url in new_rows:
                seen_details.add(detail_url)
                time.sleep(delay)
                detail_html = fetch(session, detail_url, timeout, retries)
                if detail_html is None:
                    print(f" - {name}: detail se nenačetl", file=sys.stderr)
                    continue
                magnet = extract_magnet(detail_html)
                if not magnet:
                    print(f" - {name}: magnet nenalezen", file=sys.stderr)
                    continue
                if magnet in seen_magnets:
                    continue
                seen_magnets.add(magnet)
                collected.append((name, magnet))
                print(f" + {name}")

            if max_pages is not None and page >= max_pages:
                break

            # Keep requesting pages beyond the detected last page in case the
            # pager was windowed; the empty-results check above ends the walk
            # once a page yields nothing new.
            page += 1
            time.sleep(delay)

    return collected
|
|
|
|
|
|
def main() -> None:
    """Parse the command line, run the scrape and write the result files.

    Writes one magnet per line to the output file (default
    ``magnets_<query>.txt`` with the query reduced to ``[A-Za-z0-9._-]``).
    With ``--tsv`` a second file with ``name<TAB>magnet`` lines is written
    next to it. Exits with status 1 when no magnet links were collected.
    """
    parser = argparse.ArgumentParser(
        description="Vyparsuje magnet odkazy z vyhledávání na rargb.to.")
    parser.add_argument("query", help="Vyhledávací dotaz (např. \"ubuntu 24.04\")")
    parser.add_argument("-o", "--output", type=Path,
                        help="Výstupní soubor (výchozí: magnets_<dotaz>.txt)")
    parser.add_argument("--max-pages", type=int, default=None,
                        help="Maximální počet stránek (výchozí: všechny)")
    parser.add_argument("--delay", type=float, default=1.0,
                        help="Prodleva mezi requesty v sekundách (výchozí: 1.0)")
    parser.add_argument("--timeout", type=float, default=20.0,
                        help="Timeout requestu v sekundách (výchozí: 20)")
    parser.add_argument("--retries", type=int, default=3,
                        help="Počet pokusů při chybě (výchozí: 3)")
    parser.add_argument("--tsv", action="store_true",
                        help="Uložit i <název>\\t<magnet> vedle čistých magnetů")
    args = parser.parse_args()

    output = args.output
    if not output:
        # Collapse every run of characters unsafe for a file name into "_".
        slug = re.sub(r"[^A-Za-z0-9._-]+", "_", args.query).strip("_")
        output = Path(f"magnets_{slug}.txt")

    results = scrape(args.query, args.max_pages, args.delay, args.timeout, args.retries)

    if not results:
        print("Nenalezeny žádné magnet odkazy.")
        sys.exit(1)

    output.write_text("".join(f"{magnet}\n" for _, magnet in results), encoding="utf-8")
    print(f"\nUloženo {len(results)} magnet odkazů do: {output}")

    if args.tsv:
        tsv_path = output.with_suffix(".tsv")
        if tsv_path == output:
            # --output already ends in ".tsv": writing the companion file to
            # the same path would overwrite the magnet list saved just above.
            tsv_path = output.with_name(f"{output.name}.tsv")
        tsv_path.write_text(
            "".join(f"{name}\t{magnet}\n" for name, magnet in results), encoding="utf-8")
        print(f"Uloženo také název+magnet do: {tsv_path}")
|
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|