"""Tests for CSFD.cz scraper module."""

import pytest
|
||
from unittest.mock import patch, MagicMock
|
||
from src.core.csfd import (
|
||
CSFDMovie,
|
||
fetch_movie,
|
||
search_movies,
|
||
fetch_movie_by_id,
|
||
_extract_csfd_id,
|
||
_parse_duration,
|
||
_extract_json_ld,
|
||
_extract_rating,
|
||
_extract_poster,
|
||
_extract_plot,
|
||
_extract_genres,
|
||
_extract_origin_info,
|
||
_check_dependencies,
|
||
_solve_anubis_pow,
|
||
_split_countries,
|
||
rating_band,
|
||
clean_filename_to_query,
|
||
find_csfd_url,
|
||
)
|
||
|
||
|
||
def _mock_session(mock_requests):
    """Install one shared session mock on ``mock_requests`` and return it.

    After this call ``mock_requests.Session()`` returns the session mock, and
    entering that mock as a context manager yields the very same object, so a
    test configures a single mock regardless of how the session is obtained.
    """
    fake_session = MagicMock()
    mock_requests.Session.return_value = fake_session
    # ``with requests.Session() as s`` must hand back the same mock.
    fake_session.__enter__.return_value = fake_session
    return fake_session
|
||
|
||
|
||
# Fixture markup shared by the extraction and fetch tests: a JSON-LD movie
# record and a minimal film page that embeds it.
SAMPLE_JSON_LD = """
{
    "@type": "Movie",
    "name": "Test Movie",
    "director": [{"@type": "Person", "name": "Test Director"}],
    "actor": [{"@type": "Person", "name": "Actor 1"}, {"@type": "Person", "name": "Actor 2"}],
    "aggregateRating": {"ratingValue": 85.5, "ratingCount": 1000},
    "duration": "PT120M",
    "description": "A test movie description."
}
"""

SAMPLE_HTML = f"""
<html>
<head>
    <script type="application/ld+json">{SAMPLE_JSON_LD}</script>
</head>
<body>
    <div class="film-rating-average">85%</div>
    <div class="genres">
        <a href="/zanry/1/">Drama</a> /
        <a href="/zanry/2/">Thriller</a>
    </div>
    <div class="origin">Česko, 2020, 120 min</div>
    <div class="film-poster">
        <img src="//image.example.com/poster.jpg">
    </div>
    <div class="plot-full"><p>Full plot description.</p></div>
</body>
</html>
"""
|
||
|
||
|
||
class TestCSFDMovie:
    """Construction and ``str()`` rendering of the ``CSFDMovie`` dataclass."""

    def test_csfd_movie_basic(self):
        """Only title and URL supplied: the other checked fields stay at defaults."""
        film = CSFDMovie(title="Test", url="https://csfd.cz/film/123/")
        assert (film.title, film.url) == ("Test", "https://csfd.cz/film/123/")
        assert film.year is None
        assert film.rating is None
        assert film.genres == []

    def test_csfd_movie_full(self):
        """Every field supplied: the values are readable back unchanged."""
        film = CSFDMovie(
            title="Test Movie",
            url="https://csfd.cz/film/123/",
            year=2020,
            genres=["Drama", "Thriller"],
            directors=["Director 1"],
            actors=["Actor 1", "Actor 2"],
            rating=85,
            rating_count=1000,
            duration=120,
            countries=["Česko"],
            poster_url="https://image.example.com/poster.jpg",
            plot="A test movie.",
            csfd_id=123,
        )
        assert film.csfd_id == 123
        assert film.year == 2020
        assert film.duration == 120
        assert film.rating == 85
        assert film.genres == ["Drama", "Thriller"]
        assert film.countries == ["Česko"]

    def test_csfd_movie_str(self):
        """``str()`` mentions title with year, rating percentage, genre and director."""
        rendered = str(
            CSFDMovie(
                title="Test Movie",
                url="https://csfd.cz/film/123/",
                year=2020,
                genres=["Drama"],
                directors=["Director 1"],
                rating=85,
            )
        )
        for fragment in ("Test Movie (2020)", "85%", "Drama", "Director 1"):
            assert fragment in rendered

    def test_csfd_movie_str_minimal(self):
        """``str()`` still contains the title when nothing else is known."""
        rendered = str(CSFDMovie(title="Test", url="https://csfd.cz/film/123/"))
        assert "Test" in rendered
|
||
|
||
|
||
class TestHelperFunctions:
    """Tests for the small parsing and formatting helpers of the CSFD module."""

    def test_extract_csfd_id_valid(self):
        """The numeric ID is read from absolute and relative film URLs."""
        assert _extract_csfd_id("https://www.csfd.cz/film/9423-pane-vy-jste-vdova/") == 9423
        assert _extract_csfd_id("https://www.csfd.cz/film/123456/") == 123456
        assert _extract_csfd_id("/film/999/prehled/") == 999

    def test_extract_csfd_id_invalid(self):
        """URLs without a film ID yield None."""
        assert _extract_csfd_id("https://www.csfd.cz/") is None
        assert _extract_csfd_id("not-a-url") is None

    def test_parse_duration_valid(self):
        """ISO 8601 minute durations are parsed to an integer minute count."""
        assert _parse_duration("PT97M") == 97
        assert _parse_duration("PT120M") == 120
        assert _parse_duration("PT60M") == 60

    def test_parse_duration_invalid(self):
        """Empty, malformed and number-less durations yield None."""
        assert _parse_duration("") is None
        assert _parse_duration("invalid") is None
        # A bare "PT" prefix carries no minute count at all.
        assert _parse_duration("PT") is None

    def test_split_countries_single(self):
        """A single country yields a one-item list."""
        assert _split_countries("USA") == ["USA"]

    def test_split_countries_multiple(self):
        """Slash-separated co-production countries are split and trimmed."""
        assert _split_countries("USA / Velká Británie") == ["USA", "Velká Británie"]
        assert _split_countries("Japonsko/USA") == ["Japonsko", "USA"]

    def test_split_countries_empty(self):
        """None/empty yields an empty list."""
        assert _split_countries(None) == []
        assert _split_countries("") == []

    def test_from_dict_migrates_legacy_country(self):
        """Legacy cache with a single 'country' string maps to countries list."""
        movie = CSFDMovie.from_dict({"title": "X", "country": "USA / Kanada"})
        assert movie.countries == ["USA", "Kanada"]

    def test_from_dict_uses_countries_when_present(self):
        """New cache with 'countries' is used verbatim."""
        movie = CSFDMovie.from_dict({"title": "X", "countries": ["Japonsko", "USA"]})
        assert movie.countries == ["Japonsko", "USA"]

    def test_rating_band_buckets(self):
        """Rating is bucketed into ten-point bands, top band spans 90–100 %."""
        assert rating_band(0) == "0–9 %"
        assert rating_band(86) == "80–89 %"
        assert rating_band(90) == "90–100 %"
        assert rating_band(95) == "90–100 %"
        assert rating_band(100) == "90–100 %"

    def test_csfd_field_values_are_exact_no_transform(self):
        """Field values come back as untransformed strings, one per value."""
        from src.core.csfd import csfd_field_values

        movie = CSFDMovie(title="X", url="u", year=1999, rating=86,
                          genres=["Akční", "Sci-Fi"], countries=["USA", "Kanada"])
        assert csfd_field_values(movie, "genres") == ["Akční", "Sci-Fi"]
        assert csfd_field_values(movie, "countries") == ["USA", "Kanada"]
        assert csfd_field_values(movie, "year") == ["1999"]
        # rating tag carries the EXACT value (transform happens only for folders)
        assert csfd_field_values(movie, "rating") == ["86"]
        # missing field / value → empty
        assert csfd_field_values(CSFDMovie(title="X", url="u"), "rating") == []
        # NOTE(review): this compares a call against itself, so it only guards
        # against results changing between two consecutive calls.
        assert csfd_field_values(movie, "genres") == csfd_field_values(movie, "genres")

    def test_apply_transform_decade_band(self):
        """'decade_band' buckets a rating string; other transforms pass values through."""
        from src.core.csfd import apply_transform

        assert apply_transform("86", "decade_band") == "80–89 %"
        assert apply_transform("90", "decade_band") == "90–100 %"
        assert apply_transform("Akční", None) == "Akční"  # identity for non-rating
        assert apply_transform("USA", "identity") == "USA"
|
||
|
||
|
||
class TestHTMLExtraction:
    """Run each ``_extract_*`` helper against small HTML fixtures."""

    @staticmethod
    def _parse(markup):
        """Parse ``markup`` with BeautifulSoup using the ``html.parser`` backend."""
        from bs4 import BeautifulSoup

        return BeautifulSoup(markup, "html.parser")

    @pytest.fixture
    def soup(self):
        """Parsed ``SAMPLE_HTML`` document."""
        return self._parse(SAMPLE_HTML)

    def test_extract_json_ld(self, soup):
        """The JSON-LD block provides title, people, rating and duration."""
        parsed = _extract_json_ld(soup)
        assert parsed["title"] == "Test Movie"
        assert parsed["directors"] == ["Test Director"]
        assert parsed["actors"] == ["Actor 1", "Actor 2"]
        # The fixture's ratingValue of 85.5 is expected back as 86.
        assert parsed["rating"] == 86
        assert parsed["rating_count"] == 1000
        assert parsed["duration"] == 120

    def test_extract_rating(self, soup):
        """The percentage in the rating element is returned as an integer."""
        assert _extract_rating(soup) == 85

    def test_extract_genres(self, soup):
        """Both genre links of the fixture are reported."""
        found = _extract_genres(soup)
        for genre in ("Drama", "Thriller"):
            assert genre in found

    def test_extract_poster(self, soup):
        """A protocol-relative poster ``src`` comes back as an https URL."""
        assert _extract_poster(soup) == "https://image.example.com/poster.jpg"

    def test_extract_plot(self, soup):
        """The text of the full-plot element is returned."""
        assert _extract_plot(soup) == "Full plot description."

    def test_extract_origin_info(self, soup):
        """Comma-separated (legacy) origin line: country, year and duration."""
        origin = _extract_origin_info(soup)
        assert origin["countries"] == ["Česko"]
        assert origin["year"] == 2020
        assert origin["duration"] == 120

    def test_extract_origin_info_bullet_format(self):
        """Current CSFD layout: inline bullet spans instead of commas."""
        markup = (
            '<div class="origin">USA <span class="bullet"></span>'
            '<span>1999 <span class="bullet"></span> </span>'
            '136 min (Alternativní 131 min)</div>'
        )
        origin = _extract_origin_info(self._parse(markup))
        assert origin["countries"] == ["USA"]
        assert origin["year"] == 1999
        assert origin["duration"] == 136

    def test_extract_origin_info_multiple_countries(self):
        """A co-production lists several slash-separated countries."""
        markup = (
            '<div class="origin">USA / Velká Británie '
            '<span class="bullet"></span><span>2009 </span>'
            '<span class="bullet"></span> 166 min</div>'
        )
        origin = _extract_origin_info(self._parse(markup))
        assert origin["countries"] == ["USA", "Velká Británie"]
        assert origin["year"] == 2009
        assert origin["duration"] == 166

    def test_extract_json_ld_year_from_date_created(self):
        """Year is taken from JSON-LD ``dateCreated`` when present."""
        markup = (
            '<script type="application/ld+json">'
            '{"@type": "Movie", "name": "Matrix", "dateCreated": 1999}'
            '</script>'
        )
        parsed = _extract_json_ld(self._parse(markup))
        assert parsed["year"] == 1999
|
||
|
||
|
||
class TestCleanFilenameToQuery:
    """Turning a media filename into a ČSFD search query."""

    def test_strips_release_tags_and_keeps_year(self):
        """Dot-separated release tags are dropped; title and year remain."""
        query = clean_filename_to_query("Matrix.1999.1080p.BluRay.x264-GROUP.mkv")
        assert query == "Matrix 1999"

    def test_handles_spaces_and_parens_year(self):
        """A parenthesised year is kept without the parentheses."""
        query = clean_filename_to_query("Forrest Gump (1994) 2160p HDR.mkv")
        assert query == "Forrest Gump 1994"

    def test_no_year_no_markers(self):
        """A plain name only loses its extension."""
        query = clean_filename_to_query("Amelie.mkv")
        assert query == "Amelie"

    def test_underscores_and_resolution(self):
        """Underscores become spaces and the resolution tag is dropped."""
        query = clean_filename_to_query("Sam_doma_720p.mkv")
        assert query == "Sam doma"

    def test_falls_back_to_stem_when_starting_with_marker(self):
        """With no title words before the marker, the cleaned stem is used."""
        query = clean_filename_to_query("1080p.mkv")
        assert query == "1080p"
|
||
|
||
|
||
class TestFindCsfdUrl:
    """Tests for find_csfd_url (search is mocked)."""

    def test_returns_first_result_url(self):
        """With several search hits, the URL of the first one is returned."""
        movies = [
            CSFDMovie(title="Matrix", url="https://www.csfd.cz/film/9499-matrix/"),
            CSFDMovie(title="Matrix Reloaded", url="https://www.csfd.cz/film/9497-x/"),
        ]
        with patch("src.core.csfd.search_movies", return_value=movies):
            assert find_csfd_url("Matrix 1999") == "https://www.csfd.cz/film/9499-matrix/"

    def test_returns_none_for_empty_query(self):
        """A whitespace-only query yields None."""
        assert find_csfd_url(" ") is None

    def test_returns_none_when_no_results(self):
        """An empty search result list yields None."""
        with patch("src.core.csfd.search_movies", return_value=[]):
            assert find_csfd_url("nonexistent film") is None
|
||
|
||
|
||
class TestFetchMovie:
    """``fetch_movie`` against a patched ``requests`` module."""

    @staticmethod
    def _serve(mock_requests, html):
        """Make the mocked session answer ``get`` with ``html``; return the session."""
        session = _mock_session(mock_requests)
        response = MagicMock()
        response.text = html
        response.raise_for_status = MagicMock()
        session.get.return_value = response
        return session

    @patch("src.core.csfd.requests")
    def test_fetch_movie_success(self, mock_requests):
        """A film page is fetched once and parsed into a movie."""
        session = self._serve(mock_requests, SAMPLE_HTML)

        fetched = fetch_movie("https://www.csfd.cz/film/123-test/")

        assert fetched.title == "Test Movie"
        assert fetched.csfd_id == 123
        assert fetched.rating == 86
        assert "Drama" in fetched.genres
        session.get.assert_called_once()

    @patch("src.core.csfd.requests")
    def test_fetch_movie_caps_actors_at_ten(self, mock_requests):
        """Of a 25-person cast only the first ten actors are kept."""
        import json

        cast = [{"@type": "Person", "name": f"Actor {i}"} for i in range(25)]
        payload = {
            "@type": "Movie", "name": "Crowded", "actor": cast,
            "director": [{"@type": "Person", "name": "Dir"}],
            "aggregateRating": {"ratingValue": 70, "ratingCount": 5},
        }
        page = (
            '<html><head><script type="application/ld+json">'
            + json.dumps(payload)
            + '</script></head></html>'
        )
        self._serve(mock_requests, page)

        fetched = fetch_movie("https://www.csfd.cz/film/1-crowded/")

        assert fetched.directors == ["Dir"]
        assert fetched.rating == 70
        assert len(fetched.actors) == 10
        assert fetched.actors[0] == "Actor 0"
        assert fetched.actors[-1] == "Actor 9"

    @patch("src.core.csfd.requests")
    def test_fetch_movie_network_error(self, mock_requests):
        """A request failure raised by the session reaches the caller."""
        import requests as real_requests

        failing = _mock_session(mock_requests)
        failing.get.side_effect = real_requests.RequestException("Network error")

        with pytest.raises(real_requests.RequestException):
            fetch_movie("https://www.csfd.cz/film/123/")
|
||
|
||
|
||
class TestSearchMovies:
    """``search_movies`` against a patched ``requests`` module."""

    @patch("src.core.csfd.requests")
    def test_search_movies(self, mock_requests):
        """Film links on the results page are returned as movies with their IDs."""
        results_page = """
        <html><body>
        <a href="/film/123-test/" class="film-title-name">Test Movie</a>
        <a href="/film/456-another/" class="film-title-name">Another Movie</a>
        </body></html>
        """
        # Leave the query unescaped instead of returning a mock from quote().
        mock_requests.utils.quote = lambda x: x
        session = _mock_session(mock_requests)
        page_response = MagicMock()
        page_response.text = results_page
        page_response.raise_for_status = MagicMock()
        session.get.return_value = page_response

        found = search_movies("test", limit=10)

        assert len(found) >= 1
        assert any(movie.csfd_id == 123 for movie in found)
|
||
|
||
|
||
class TestFetchMovieById:
    """``fetch_movie_by_id`` with ``fetch_movie`` patched out."""

    @patch("src.core.csfd.fetch_movie")
    def test_fetch_by_id(self, mock_fetch):
        """The ID is turned into a film URL and passed to ``fetch_movie``."""
        stub = CSFDMovie(title="Test", url="https://csfd.cz/film/9423/")
        mock_fetch.return_value = stub

        fetched = fetch_movie_by_id(9423)

        assert fetched.title == "Test"
        mock_fetch.assert_called_once_with("https://www.csfd.cz/film/9423/")
|
||
|
||
|
||
class TestAnubisPoW:
    """The Anubis proof-of-work solver."""

    def test_solve_pow_difficulty_one(self):
        """Difficulty 1: the hash starts with one zero nibble and matches the nonce."""
        import hashlib

        seed = "abc123"
        digest, nonce, _ = _solve_anubis_pow(seed, difficulty=1)

        assert digest[0] == "0"
        # The returned hash must be SHA-256 of the seed followed by the nonce.
        recomputed = hashlib.sha256(f"{seed}{nonce}".encode()).hexdigest()
        assert recomputed == digest

    def test_solve_pow_difficulty_two(self):
        """Difficulty 2: the hash starts with two zero nibbles (one zero byte)."""
        digest = _solve_anubis_pow("seed", difficulty=2)[0]
        assert digest[:2] == "00"
|
||
|
||
|
||
class TestDependencyCheck:
    """The scraper's dependency check."""

    def test_dependencies_available(self):
        """In the test environment the check is expected to complete without raising."""
        # The call itself is the assertion: any exception fails the test.
        _check_dependencies()
|