Files
Curator/tests/test_csfd.py
T

456 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for CSFD.cz scraper module."""
import pytest
from unittest.mock import patch, MagicMock
from src.core.csfd import (
CSFDMovie,
fetch_movie,
search_movies,
fetch_movie_by_id,
_extract_csfd_id,
_parse_duration,
_extract_json_ld,
_extract_rating,
_extract_poster,
_extract_plot,
_extract_genres,
_extract_origin_info,
_check_dependencies,
_solve_anubis_pow,
_split_countries,
rating_band,
clean_filename_to_query,
find_csfd_url,
)
def _mock_session(mock_requests):
"""Wire ``mock_requests`` so ``requests.Session()`` (also as a context
manager) yields a single configurable session mock and return it."""
session = MagicMock()
session.__enter__.return_value = session
mock_requests.Session.return_value = session
return session
# Shared fixtures: a JSON-LD movie record and an HTML page that embeds it.
# SAMPLE_JSON_LD carries title, director, two actors, an aggregate rating of
# 85.5 from 1000 votes, a duration string and a description.
SAMPLE_JSON_LD = """
{
"@type": "Movie",
"name": "Test Movie",
"director": [{"@type": "Person", "name": "Test Director"}],
"actor": [{"@type": "Person", "name": "Actor 1"}, {"@type": "Person", "name": "Actor 2"}],
"aggregateRating": {"ratingValue": 85.5, "ratingCount": 1000},
"duration": "PT120M",
"description": "A test movie description."
}
"""
# SAMPLE_HTML is a %-format template: the single ``%s`` inside the
# ld+json <script> tag is replaced by SAMPLE_JSON_LD, and ``%%`` is the
# escape for a literal percent sign in the rating element.
# The poster ``src`` is protocol-relative (starts with ``//``).
SAMPLE_HTML = """
<html>
<head>
<script type="application/ld+json">%s</script>
</head>
<body>
<div class="film-rating-average">85%%</div>
<div class="genres">
<a href="/zanry/1/">Drama</a> /
<a href="/zanry/2/">Thriller</a>
</div>
<div class="origin">Česko, 2020, 120 min</div>
<div class="film-poster">
<img src="//image.example.com/poster.jpg">
</div>
<div class="plot-full"><p>Full plot description.</p></div>
</body>
</html>
""" % SAMPLE_JSON_LD
class TestCSFDMovie:
    """Exercise construction and string rendering of the CSFDMovie record."""

    def test_csfd_movie_basic(self):
        """With only title and url given, the optional fields are unset/empty."""
        film = CSFDMovie(title="Test", url="https://csfd.cz/film/123/")

        assert film.title == "Test"
        assert film.url == "https://csfd.cz/film/123/"
        # Fields that were not supplied.
        assert film.year is None
        assert film.genres == []
        assert film.rating is None

    def test_csfd_movie_full(self):
        """Every field passed to the constructor is stored on the instance."""
        fields = {
            "title": "Test Movie",
            "url": "https://csfd.cz/film/123/",
            "year": 2020,
            "genres": ["Drama", "Thriller"],
            "directors": ["Director 1"],
            "actors": ["Actor 1", "Actor 2"],
            "rating": 85,
            "rating_count": 1000,
            "duration": 120,
            "countries": ["Česko"],
            "poster_url": "https://image.example.com/poster.jpg",
            "plot": "A test movie.",
            "csfd_id": 123,
        }
        film = CSFDMovie(**fields)

        assert film.year == 2020
        assert film.genres == ["Drama", "Thriller"]
        assert film.rating == 85
        assert film.duration == 120
        assert film.countries == ["Česko"]
        assert film.csfd_id == 123

    def test_csfd_movie_str(self):
        """str() mentions title with year, the rating percentage, genre and director."""
        film = CSFDMovie(
            title="Test Movie",
            url="https://csfd.cz/film/123/",
            year=2020,
            genres=["Drama"],
            directors=["Director 1"],
            rating=85,
        )
        rendered = str(film)

        for fragment in ("Test Movie (2020)", "85%", "Drama", "Director 1"):
            assert fragment in rendered

    def test_csfd_movie_str_minimal(self):
        """str() still contains the title when nothing else is known."""
        rendered = str(CSFDMovie(title="Test", url="https://csfd.cz/film/123/"))
        assert "Test" in rendered
class TestHelperFunctions:
    """Tests for the small parsing and formatting helpers of the CSFD module."""

    def test_extract_csfd_id_valid(self):
        """The numeric film ID is read from slugged, bare and relative film URLs."""
        assert _extract_csfd_id("https://www.csfd.cz/film/9423-pane-vy-jste-vdova/") == 9423
        assert _extract_csfd_id("https://www.csfd.cz/film/123456/") == 123456
        assert _extract_csfd_id("/film/999/prehled/") == 999

    def test_extract_csfd_id_invalid(self):
        """URLs without a film segment yield None."""
        assert _extract_csfd_id("https://www.csfd.cz/") is None
        assert _extract_csfd_id("not-a-url") is None

    def test_parse_duration_valid(self):
        """ISO 8601 minute durations are converted to an integer minute count."""
        assert _parse_duration("PT97M") == 97
        assert _parse_duration("PT120M") == 120
        assert _parse_duration("PT60M") == 60

    def test_parse_duration_invalid(self):
        """Empty, malformed and amount-less duration strings yield None."""
        assert _parse_duration("") is None
        assert _parse_duration("invalid") is None
        # A bare "PT" prefix with no amount is not a duration either. This
        # check belongs with the other invalid-duration cases so it runs and
        # reports independently of the transform tests.
        assert _parse_duration("PT") is None

    def test_split_countries_single(self):
        """A single country yields a one-item list."""
        assert _split_countries("USA") == ["USA"]

    def test_split_countries_multiple(self):
        """Slash-separated co-production countries are split and trimmed."""
        assert _split_countries("USA / Velká Británie") == ["USA", "Velká Británie"]
        assert _split_countries("Japonsko/USA") == ["Japonsko", "USA"]

    def test_split_countries_empty(self):
        """None/empty yields an empty list."""
        assert _split_countries(None) == []
        assert _split_countries("") == []

    def test_from_dict_migrates_legacy_country(self):
        """Legacy cache with a single 'country' string maps to countries list."""
        movie = CSFDMovie.from_dict({"title": "X", "country": "USA / Kanada"})
        assert movie.countries == ["USA", "Kanada"]

    def test_from_dict_uses_countries_when_present(self):
        """New cache with 'countries' is used verbatim."""
        movie = CSFDMovie.from_dict({"title": "X", "countries": ["Japonsko", "USA"]})
        assert movie.countries == ["Japonsko", "USA"]

    def test_rating_band_buckets(self):
        """Ratings are bucketed into band labels; 90, 95 and 100 share the top band."""
        # NOTE(review): these labels read as if a range separator between the
        # two bounds was lost when this file was copied (the viewer warned
        # about ambiguous Unicode characters). Confirm the exact expected
        # strings against rating_band() before relying on them.
        assert rating_band(0) == "09 %"
        assert rating_band(86) == "8089 %"
        assert rating_band(90) == "90100 %"
        assert rating_band(95) == "90100 %"
        assert rating_band(100) == "90100 %"

    def test_csfd_field_values_are_exact_no_transform(self):
        """csfd_field_values returns the raw field values as a list of strings."""
        from src.core.csfd import csfd_field_values

        movie = CSFDMovie(title="X", url="u", year=1999, rating=86,
                          genres=["Akční", "Sci-Fi"], countries=["USA", "Kanada"])
        assert csfd_field_values(movie, "genres") == ["Akční", "Sci-Fi"]
        assert csfd_field_values(movie, "countries") == ["USA", "Kanada"]
        assert csfd_field_values(movie, "year") == ["1999"]
        # rating tag carries the EXACT value (transform happens only for folders)
        assert csfd_field_values(movie, "rating") == ["86"]
        # missing field / value → empty
        assert csfd_field_values(CSFDMovie(title="X", url="u"), "rating") == []
        # Two consecutive calls on the same movie give equal results.
        assert csfd_field_values(movie, "genres") == csfd_field_values(movie, "genres")

    def test_apply_transform_decade_band(self):
        """'decade_band' maps a rating string to its band label; other transforms pass through."""
        from src.core.csfd import apply_transform

        # NOTE(review): same possibly-garbled band labels as in
        # test_rating_band_buckets; kept exactly as found.
        assert apply_transform("86", "decade_band") == "8089 %"
        assert apply_transform("90", "decade_band") == "90100 %"
        assert apply_transform("Akční", None) == "Akční"  # identity for non-rating
        assert apply_transform("USA", "identity") == "USA"
class TestHTMLExtraction:
    """Run each HTML/JSON-LD extractor against small parsed documents."""

    @staticmethod
    def _parse(markup):
        """Parse ``markup`` with BeautifulSoup's built-in ``html.parser``."""
        from bs4 import BeautifulSoup

        return BeautifulSoup(markup, "html.parser")

    @pytest.fixture
    def soup(self):
        """Parsed SAMPLE_HTML, shared by the tests that take a ``soup`` argument."""
        return self._parse(SAMPLE_HTML)

    def test_extract_json_ld(self, soup):
        """The embedded JSON-LD record is flattened into plain fields."""
        parsed = _extract_json_ld(soup)

        expected = {
            "title": "Test Movie",
            "directors": ["Test Director"],
            "actors": ["Actor 1", "Actor 2"],
            "rating": 86,  # Rounded from 85.5
            "rating_count": 1000,
            "duration": 120,
        }
        for key, value in expected.items():
            assert parsed[key] == value

    def test_extract_rating(self, soup):
        """The rating element's "85%" text is read as the integer 85."""
        assert _extract_rating(soup) == 85

    def test_extract_genres(self, soup):
        """Both linked genres are found."""
        found = _extract_genres(soup)
        for genre in ("Drama", "Thriller"):
            assert genre in found

    def test_extract_poster(self, soup):
        """The protocol-relative poster src comes back as an https URL."""
        assert _extract_poster(soup) == "https://image.example.com/poster.jpg"

    def test_extract_plot(self, soup):
        """The text of the full-plot paragraph is returned."""
        assert _extract_plot(soup) == "Full plot description."

    def test_extract_origin_info(self, soup):
        """Origin line in the comma-separated legacy format."""
        origin = _extract_origin_info(soup)

        assert origin["countries"] == ["Česko"]
        assert origin["year"] == 2020
        assert origin["duration"] == 120

    def test_extract_origin_info_bullet_format(self):
        """Origin line in the current format: bullet spans instead of commas."""
        markup = (
            '<div class="origin">USA <span class="bullet"></span>'
            '<span>1999 <span class="bullet"></span> </span>'
            '136 min (Alternativní 131 min)</div>'
        )
        origin = _extract_origin_info(self._parse(markup))

        assert origin["countries"] == ["USA"]
        assert origin["year"] == 1999
        # The first runtime is expected, not the alternative one in parentheses.
        assert origin["duration"] == 136

    def test_extract_origin_info_multiple_countries(self):
        """A co-production lists several slash-separated countries."""
        markup = (
            '<div class="origin">USA / Velká Británie '
            '<span class="bullet"></span><span>2009 </span>'
            '<span class="bullet"></span> 166 min</div>'
        )
        origin = _extract_origin_info(self._parse(markup))

        assert origin["countries"] == ["USA", "Velká Británie"]
        assert origin["year"] == 2009
        assert origin["duration"] == 166

    def test_extract_json_ld_year_from_date_created(self):
        """A JSON-LD ``dateCreated`` value is exposed as ``year``."""
        markup = (
            '<script type="application/ld+json">'
            '{"@type": "Movie", "name": "Matrix", "dateCreated": 1999}'
            '</script>'
        )
        parsed = _extract_json_ld(self._parse(markup))
        assert parsed["year"] == 1999
class TestCleanFilenameToQuery:
    """Check how media filenames are reduced to a ČSFD search query."""

    def test_strips_release_tags_and_keeps_year(self):
        """Dotted release name: only the title and the year remain."""
        query = clean_filename_to_query("Matrix.1999.1080p.BluRay.x264-GROUP.mkv")
        assert query == "Matrix 1999"

    def test_handles_spaces_and_parens_year(self):
        """Spaced name with the year in parentheses: the parentheses are dropped."""
        query = clean_filename_to_query("Forrest Gump (1994) 2160p HDR.mkv")
        assert query == "Forrest Gump 1994"

    def test_no_year_no_markers(self):
        """A plain title loses only its extension."""
        query = clean_filename_to_query("Amelie.mkv")
        assert query == "Amelie"

    def test_underscores_and_resolution(self):
        """Underscores become spaces and the resolution tag is removed."""
        query = clean_filename_to_query("Sam_doma_720p.mkv")
        assert query == "Sam doma"

    def test_falls_back_to_stem_when_starting_with_marker(self):
        """With no title words ahead of the marker, the cleaned stem is returned."""
        query = clean_filename_to_query("1080p.mkv")
        assert query == "1080p"
class TestFindCsfdUrl:
    """Tests for find_csfd_url (search is mocked)."""

    def test_returns_first_result_url(self):
        """With several search hits, the URL of the first one is returned."""
        # ``patch`` comes from the module-level import; search_movies is
        # replaced so that no real search request is made.
        movies = [
            CSFDMovie(title="Matrix", url="https://www.csfd.cz/film/9499-matrix/"),
            CSFDMovie(title="Matrix Reloaded", url="https://www.csfd.cz/film/9497-x/"),
        ]
        with patch("src.core.csfd.search_movies", return_value=movies):
            assert find_csfd_url("Matrix 1999") == "https://www.csfd.cz/film/9499-matrix/"

    def test_returns_none_for_empty_query(self):
        """A whitespace-only query yields None."""
        # NOTE(review): search_movies is not patched here, so this relies on
        # find_csfd_url returning before it searches — confirm.
        assert find_csfd_url(" ") is None

    def test_returns_none_when_no_results(self):
        """An empty search result list yields None."""
        with patch("src.core.csfd.search_movies", return_value=[]):
            assert find_csfd_url("nonexistent film") is None
class TestFetchMovie:
    """Tests for fetch_movie with the HTTP layer replaced by mocks."""

    @staticmethod
    def _serve(mock_requests, html):
        """Make the mocked session's ``get`` return a response whose text is ``html``.

        Returns the session mock so callers can inspect the calls made on it.
        """
        reply = MagicMock()
        reply.text = html
        reply.raise_for_status = MagicMock()
        session = _mock_session(mock_requests)
        session.get.return_value = reply
        return session

    @patch("src.core.csfd.requests")
    def test_fetch_movie_success(self, mock_requests):
        """A page is fetched once and parsed into a populated movie."""
        session = self._serve(mock_requests, SAMPLE_HTML)

        film = fetch_movie("https://www.csfd.cz/film/123-test/")

        assert film.title == "Test Movie"
        assert film.csfd_id == 123
        assert film.rating == 86
        assert "Drama" in film.genres
        session.get.assert_called_once()

    @patch("src.core.csfd.requests")
    def test_fetch_movie_caps_actors_at_ten(self, mock_requests):
        """Of a 25-person cast, only the first ten actors are kept."""
        import json as _json

        cast = [{"@type": "Person", "name": f"Actor {i}"} for i in range(25)]
        json_ld = _json.dumps({
            "@type": "Movie", "name": "Crowded", "actor": cast,
            "director": [{"@type": "Person", "name": "Dir"}],
            "aggregateRating": {"ratingValue": 70, "ratingCount": 5},
        })
        page = f'<html><head><script type="application/ld+json">{json_ld}</script></head></html>'
        self._serve(mock_requests, page)

        film = fetch_movie("https://www.csfd.cz/film/1-crowded/")

        assert film.directors == ["Dir"]
        assert film.rating == 70
        assert len(film.actors) == 10
        assert film.actors[0] == "Actor 0"
        assert film.actors[-1] == "Actor 9"

    @patch("src.core.csfd.requests")
    def test_fetch_movie_network_error(self, mock_requests):
        """A RequestException raised by ``session.get`` reaches the caller."""
        # The module's ``requests`` is mocked, so the genuine exception class
        # has to come from the real package.
        import requests as real_requests

        session = _mock_session(mock_requests)
        session.get.side_effect = real_requests.RequestException("Network error")

        with pytest.raises(real_requests.RequestException):
            fetch_movie("https://www.csfd.cz/film/123/")
class TestSearchMovies:
    """Tests for search_movies with the HTTP layer replaced by mocks."""

    @patch("src.core.csfd.requests")
    def test_search_movies(self, mock_requests):
        """Film links on a result page are turned into movies carrying their IDs."""
        # Built from pieces; the resulting text is a two-hit result page.
        search_html = (
            "\n"
            "<html><body>\n"
            '<a href="/film/123-test/" class="film-title-name">Test Movie</a>\n'
            '<a href="/film/456-another/" class="film-title-name">Another Movie</a>\n'
            "</body></html>\n"
        )
        reply = MagicMock()
        reply.text = search_html
        reply.raise_for_status = MagicMock()
        session = _mock_session(mock_requests)
        session.get.return_value = reply
        # ``requests`` is a mock here, so give URL quoting a pass-through
        # function that returns its argument unchanged.
        mock_requests.utils.quote = lambda text: text

        found = search_movies("test", limit=10)

        assert len(found) >= 1
        assert any(hit.csfd_id == 123 for hit in found)
class TestFetchMovieById:
    """Tests for fetch_movie_by_id with fetch_movie mocked out."""

    @patch("src.core.csfd.fetch_movie")
    def test_fetch_by_id(self, mock_fetch):
        """The ID is turned into a film URL and fetch_movie's result is returned."""
        canned = CSFDMovie(title="Test", url="https://csfd.cz/film/9423/")
        mock_fetch.return_value = canned

        result = fetch_movie_by_id(9423)

        # Exactly one delegation, with the canonical www URL for that ID.
        mock_fetch.assert_called_once_with("https://www.csfd.cz/film/9423/")
        assert result.title == "Test"
class TestAnubisPoW:
    """Tests for the Anubis proof-of-work solver."""

    def test_solve_pow_difficulty_one(self):
        """At difficulty 1 the hash starts with one zero hex digit and matches the nonce."""
        import hashlib

        random_data = "abc123"
        digest, nonce, _ = _solve_anubis_pow(random_data, difficulty=1)

        assert digest[0] == "0"
        # The reported hash must be SHA-256 of the challenge followed by the nonce.
        recomputed = hashlib.sha256(f"{random_data}{nonce}".encode()).hexdigest()
        assert recomputed == digest

    def test_solve_pow_difficulty_two(self):
        """At difficulty 2 the hash starts with two zero hex digits."""
        digest, _, _ = _solve_anubis_pow("seed", difficulty=2)
        assert digest[:2] == "00"
class TestDependencyCheck:
    """Tests for the runtime dependency check."""

    def test_dependencies_available(self):
        """_check_dependencies completes without raising in the test environment."""
        # No assertion needed: an exception here fails the test.
        _check_dependencies()