# Curator/tests/test_csfd.py

"""Tests for CSFD.cz scraper module."""

import pytest
from unittest.mock import patch, MagicMock
from src.core.csfd import (
    CSFDMovie,
    fetch_movie,
    search_movies,
    fetch_movie_by_id,
    _extract_csfd_id,
    _parse_duration,
    _extract_json_ld,
    _extract_rating,
    _extract_poster,
    _extract_plot,
    _extract_genres,
    _extract_origin_info,
    _check_dependencies,
    _solve_anubis_pow,
    _split_countries,
    rating_band,
    clean_filename_to_query,
    find_csfd_url,
)


def _mock_session(mock_requests):
    """Install one shared session mock on ``mock_requests`` and return it.

    After the call, ``mock_requests.Session()`` produces the returned mock,
    and entering that mock in a ``with`` statement yields the same object,
    so a test can configure a single session regardless of how the code
    under test obtains it.
    """
    fake_session = MagicMock()
    mock_requests.Session.return_value = fake_session
    # ``with requests.Session() as s`` must hand back the very same mock.
    fake_session.__enter__.return_value = fake_session
    return fake_session


# Fixture data shared by the tests below.
#
# SAMPLE_JSON_LD is a JSON-LD object with "@type": "Movie" carrying the name,
# director, actors, aggregate rating, duration and description of a made-up
# film. SAMPLE_HTML is a minimal page that embeds that object in a
# <script type="application/ld+json"> tag next to the rating, genres, origin,
# poster and plot elements.
SAMPLE_JSON_LD = """
{
    "@type": "Movie",
    "name": "Test Movie",
    "director": [{"@type": "Person", "name": "Test Director"}],
    "actor": [{"@type": "Person", "name": "Actor 1"}, {"@type": "Person", "name": "Actor 2"}],
    "aggregateRating": {"ratingValue": 85.5, "ratingCount": 1000},
    "duration": "PT120M",
    "description": "A test movie description."
}
"""

# NOTE: this literal is a %-format template (SAMPLE_JSON_LD fills the "%s"
# slot), which is why the literal percent sign of the rating is written "%%".
SAMPLE_HTML = """
<html>
<head>
<script type="application/ld+json">%s</script>
</head>
<body>
<div class="film-rating-average">85%%</div>
<div class="genres">
    <a href="/zanry/1/">Drama</a> /
    <a href="/zanry/2/">Thriller</a>
</div>
<div class="origin">Česko, 2020, 120 min</div>
<div class="film-poster">
    <img src="//image.example.com/poster.jpg">
</div>
<div class="plot-full"><p>Full plot description.</p></div>
</body>
</html>
""" % SAMPLE_JSON_LD


class TestCSFDMovie:
    """Construction and string rendering of the ``CSFDMovie`` dataclass."""

    def test_csfd_movie_basic(self):
        """With only title and URL given, the other fields keep their defaults."""
        film_url = "https://csfd.cz/film/123/"
        film = CSFDMovie(title="Test", url=film_url)

        assert (film.title, film.url) == ("Test", film_url)
        assert film.year is None
        assert film.rating is None
        assert film.genres == []

    def test_csfd_movie_full(self):
        """Values passed for every field are readable back from the instance."""
        film = CSFDMovie(
            title="Test Movie",
            url="https://csfd.cz/film/123/",
            year=2020,
            genres=["Drama", "Thriller"],
            directors=["Director 1"],
            actors=["Actor 1", "Actor 2"],
            rating=85,
            rating_count=1000,
            duration=120,
            countries=["Česko"],
            poster_url="https://image.example.com/poster.jpg",
            plot="A test movie.",
            csfd_id=123
        )

        # Fresh literals on purpose: compare stored values, not object identity.
        expected_fields = {
            "year": 2020,
            "genres": ["Drama", "Thriller"],
            "rating": 85,
            "duration": 120,
            "countries": ["Česko"],
            "csfd_id": 123,
        }
        for field_name, expected_value in expected_fields.items():
            assert getattr(film, field_name) == expected_value

    def test_csfd_movie_str(self):
        """``str()`` mentions title with year, rating percentage, genre and director."""
        rendered = str(
            CSFDMovie(
                title="Test Movie",
                url="https://csfd.cz/film/123/",
                year=2020,
                genres=["Drama"],
                directors=["Director 1"],
                rating=85
            )
        )
        for fragment in ("Test Movie (2020)", "85%", "Drama", "Director 1"):
            assert fragment in rendered

    def test_csfd_movie_str_minimal(self):
        """``str()`` of a movie with only title and URL still contains the title."""
        rendered = str(CSFDMovie(title="Test", url="https://csfd.cz/film/123/"))
        assert "Test" in rendered


class TestHelperFunctions:
    """Tests for the pure helper functions of the CSFD module."""

    def test_extract_csfd_id_valid(self):
        """Test extracting CSFD ID from valid absolute and relative film URLs."""
        assert _extract_csfd_id("https://www.csfd.cz/film/9423-pane-vy-jste-vdova/") == 9423
        assert _extract_csfd_id("https://www.csfd.cz/film/123456/") == 123456
        assert _extract_csfd_id("/film/999/prehled/") == 999

    def test_extract_csfd_id_invalid(self):
        """Test extracting CSFD ID from a URL without a film ID."""
        assert _extract_csfd_id("https://www.csfd.cz/") is None
        assert _extract_csfd_id("not-a-url") is None

    def test_parse_duration_valid(self):
        """Test parsing ISO 8601 minute durations into an integer."""
        assert _parse_duration("PT97M") == 97
        assert _parse_duration("PT120M") == 120
        assert _parse_duration("PT60M") == 60

    def test_parse_duration_invalid(self):
        """Test that unparsable duration strings yield ``None``."""
        assert _parse_duration("") is None
        assert _parse_duration("invalid") is None
        # Designator prefix alone, without any minute count.
        assert _parse_duration("PT") is None

    def test_split_countries_single(self):
        """A single country yields a one-item list."""
        assert _split_countries("USA") == ["USA"]

    def test_split_countries_multiple(self):
        """Slash-separated co-production countries are split and trimmed."""
        assert _split_countries("USA / Velká Británie") == ["USA", "Velká Británie"]
        assert _split_countries("Japonsko/USA") == ["Japonsko", "USA"]

    def test_split_countries_empty(self):
        """None/empty yields an empty list."""
        assert _split_countries(None) == []
        assert _split_countries("") == []

    def test_from_dict_migrates_legacy_country(self):
        """Legacy cache with a single 'country' string maps to countries list."""
        movie = CSFDMovie.from_dict({"title": "X", "country": "USA / Kanada"})
        assert movie.countries == ["USA", "Kanada"]

    def test_from_dict_uses_countries_when_present(self):
        """New cache with 'countries' is used verbatim."""
        movie = CSFDMovie.from_dict({"title": "X", "countries": ["Japonsko", "USA"]})
        assert movie.countries == ["Japonsko", "USA"]

    def test_rating_band_buckets(self):
        """Rating is bucketed into ten-point bands, top band spans 90–100 %."""
        assert rating_band(0) == "0–9 %"
        assert rating_band(86) == "80–89 %"
        assert rating_band(90) == "90–100 %"
        assert rating_band(95) == "90–100 %"
        assert rating_band(100) == "90–100 %"

    def test_csfd_field_values_are_exact_no_transform(self):
        """Field values come back untransformed, as lists of the raw values."""
        from src.core.csfd import csfd_field_values
        movie = CSFDMovie(title="X", url="u", year=1999, rating=86,
                          genres=["Akční", "Sci-Fi"], countries=["USA", "Kanada"])
        assert csfd_field_values(movie, "genres") == ["Akční", "Sci-Fi"]
        assert csfd_field_values(movie, "countries") == ["USA", "Kanada"]
        assert csfd_field_values(movie, "year") == ["1999"]
        # rating tag carries the EXACT value (transform happens only for folders)
        assert csfd_field_values(movie, "rating") == ["86"]
        # a field that has no value on the movie → empty list
        assert csfd_field_values(CSFDMovie(title="X", url="u"), "rating") == []

    def test_apply_transform_decade_band(self):
        """``decade_band`` maps a rating to its band; other transforms pass through."""
        from src.core.csfd import apply_transform
        assert apply_transform("86", "decade_band") == "80–89 %"
        assert apply_transform("90", "decade_band") == "90–100 %"
        assert apply_transform("Akční", None) == "Akční"      # identity for non-rating
        assert apply_transform("USA", "identity") == "USA"


class TestHTMLExtraction:
    """Checks for the individual HTML / JSON-LD extraction helpers."""

    @staticmethod
    def _parse(markup):
        """Parse ``markup`` with BeautifulSoup using the ``html.parser`` backend."""
        from bs4 import BeautifulSoup
        return BeautifulSoup(markup, "html.parser")

    @pytest.fixture
    def soup(self):
        """Parsed ``SAMPLE_HTML`` document handed to the tests that request it."""
        return self._parse(SAMPLE_HTML)

    def test_extract_json_ld(self, soup):
        """The embedded JSON-LD object is mapped to the expected dict entries."""
        extracted = _extract_json_ld(soup)
        expected = {
            "title": "Test Movie",
            "directors": ["Test Director"],
            "actors": ["Actor 1", "Actor 2"],
            "rating": 86,  # Rounded from 85.5
            "rating_count": 1000,
            "duration": 120,
        }
        for key, value in expected.items():
            assert extracted[key] == value

    def test_extract_rating(self, soup):
        """The rating element of the sample page is read as 85."""
        assert _extract_rating(soup) == 85

    def test_extract_genres(self, soup):
        """Both genre links of the sample page are reported."""
        found = _extract_genres(soup)
        for genre in ("Drama", "Thriller"):
            assert genre in found

    def test_extract_poster(self, soup):
        """The protocol-relative poster ``src`` comes back as an https URL."""
        assert _extract_poster(soup) == "https://image.example.com/poster.jpg"

    def test_extract_plot(self, soup):
        """The text of the full-plot element is returned."""
        assert _extract_plot(soup) == "Full plot description."

    def test_extract_origin_info(self, soup):
        """Origin line in the comma-separated legacy format is decomposed."""
        origin = _extract_origin_info(soup)
        assert origin["countries"] == ["Česko"]
        assert (origin["year"], origin["duration"]) == (2020, 120)

    def test_extract_origin_info_bullet_format(self):
        """Origin line using inline bullet spans (no commas) is decomposed."""
        origin = _extract_origin_info(self._parse(
            '<div class="origin">USA <span class="bullet"></span>'
            '<span>1999 <span class="bullet"></span> </span>'
            '136 min (Alternativní 131 min)</div>'
        ))
        assert origin["countries"] == ["USA"]
        assert (origin["year"], origin["duration"]) == (1999, 136)

    def test_extract_origin_info_multiple_countries(self):
        """Slash-separated co-production countries come back as a list."""
        origin = _extract_origin_info(self._parse(
            '<div class="origin">USA / Velká Británie '
            '<span class="bullet"></span><span>2009 </span>'
            '<span class="bullet"></span> 166 min</div>'
        ))
        assert origin["countries"] == ["USA", "Velká Británie"]
        assert (origin["year"], origin["duration"]) == (2009, 166)

    def test_extract_json_ld_year_from_date_created(self):
        """A JSON-LD ``dateCreated`` value is exposed as the ``year`` entry."""
        document = self._parse(
            '<script type="application/ld+json">'
            '{"@type": "Movie", "name": "Matrix", "dateCreated": 1999}'
            '</script>'
        )
        assert _extract_json_ld(document)["year"] == 1999


class TestCleanFilenameToQuery:
    """Filename → ČSFD search query conversion."""

    def test_strips_release_tags_and_keeps_year(self):
        """Dotted release name: tags after the year are dropped, the year stays."""
        query = clean_filename_to_query("Matrix.1999.1080p.BluRay.x264-GROUP.mkv")
        assert query == "Matrix 1999"

    def test_handles_spaces_and_parens_year(self):
        """Space-separated name with a parenthesised year loses the parentheses."""
        query = clean_filename_to_query("Forrest Gump (1994) 2160p HDR.mkv")
        assert query == "Forrest Gump 1994"

    def test_no_year_no_markers(self):
        """A bare title only loses its extension."""
        query = clean_filename_to_query("Amelie.mkv")
        assert query == "Amelie"

    def test_underscores_and_resolution(self):
        """Underscores become spaces and the resolution marker is removed."""
        query = clean_filename_to_query("Sam_doma_720p.mkv")
        assert query == "Sam doma"

    def test_falls_back_to_stem_when_starting_with_marker(self):
        """With no title words before the marker, the cleaned stem is used."""
        query = clean_filename_to_query("1080p.mkv")
        assert query == "1080p"


class TestFindCsfdUrl:
    """``find_csfd_url`` behaviour with ``search_movies`` patched out."""

    def test_returns_first_result_url(self):
        """The URL of the first search hit is returned."""
        top_url = "https://www.csfd.cz/film/9499-matrix/"
        hits = [
            CSFDMovie(title="Matrix", url=top_url),
            CSFDMovie(title="Matrix Reloaded", url="https://www.csfd.cz/film/9497-x/"),
        ]
        with patch("src.core.csfd.search_movies", return_value=hits):
            found = find_csfd_url("Matrix 1999")
            assert found == top_url

    def test_returns_none_for_empty_query(self):
        """A whitespace-only query gives ``None``."""
        found = find_csfd_url("   ")
        assert found is None

    def test_returns_none_when_no_results(self):
        """An empty search result list gives ``None``."""
        with patch("src.core.csfd.search_movies", return_value=[]):
            found = find_csfd_url("nonexistent film")
            assert found is None


class TestFetchMovie:
    """``fetch_movie`` exercised against a patched ``requests`` module."""

    @staticmethod
    def _serve(mock_requests, html):
        """Make the mocked session answer every ``get`` with ``html``.

        Returns the session mock so tests can inspect the calls made on it.
        """
        response = MagicMock()
        response.text = html
        response.raise_for_status = MagicMock()
        session = _mock_session(mock_requests)
        session.get.return_value = response
        return session

    @patch("src.core.csfd.requests")
    def test_fetch_movie_success(self, mock_requests):
        """A page fetched once is turned into a populated movie."""
        session = self._serve(mock_requests, SAMPLE_HTML)

        fetched = fetch_movie("https://www.csfd.cz/film/123-test/")

        assert fetched.title == "Test Movie"
        assert fetched.csfd_id == 123
        assert fetched.rating == 86
        assert "Drama" in fetched.genres
        session.get.assert_called_once()

    @patch("src.core.csfd.requests")
    def test_fetch_movie_caps_actors_at_ten(self, mock_requests):
        """Of a 25-person cast only the first ten actors are kept."""
        import json as _json
        cast = [{"@type": "Person", "name": f"Actor {i}"} for i in range(25)]
        payload = _json.dumps({
            "@type": "Movie", "name": "Crowded", "actor": cast,
            "director": [{"@type": "Person", "name": "Dir"}],
            "aggregateRating": {"ratingValue": 70, "ratingCount": 5},
        })
        page = (
            '<html><head><script type="application/ld+json">'
            + payload
            + '</script></head></html>'
        )
        self._serve(mock_requests, page)

        fetched = fetch_movie("https://www.csfd.cz/film/1-crowded/")

        assert fetched.directors == ["Dir"]
        assert fetched.rating == 70
        assert len(fetched.actors) == 10
        assert fetched.actors[0] == "Actor 0"
        assert fetched.actors[-1] == "Actor 9"

    @patch("src.core.csfd.requests")
    def test_fetch_movie_network_error(self, mock_requests):
        """A ``RequestException`` raised by ``session.get`` reaches the caller."""
        import requests as real_requests
        failing_session = _mock_session(mock_requests)
        failing_session.get.side_effect = real_requests.RequestException("Network error")

        with pytest.raises(real_requests.RequestException):
            fetch_movie("https://www.csfd.cz/film/123/")


class TestSearchMovies:
    """``search_movies`` exercised against a patched ``requests`` module."""

    @patch("src.core.csfd.requests")
    def test_search_movies(self, mock_requests):
        """Film links on a result page are returned, including film 123."""
        listing_html = """
        <html><body>
        <a href="/film/123-test/" class="film-title-name">Test Movie</a>
        <a href="/film/456-another/" class="film-title-name">Another Movie</a>
        </body></html>
        """
        # Pass-through quoting keeps the mocked ``requests.utils.quote`` a str.
        mock_requests.utils.quote = lambda x: x
        session = _mock_session(mock_requests)
        page = MagicMock()
        page.text = listing_html
        page.raise_for_status = MagicMock()
        session.get.return_value = page

        found = search_movies("test", limit=10)

        assert len(found) > 0
        assert 123 in [movie.csfd_id for movie in found]


class TestFetchMovieById:
    """``fetch_movie_by_id`` with ``fetch_movie`` patched out."""

    @patch("src.core.csfd.fetch_movie")
    def test_fetch_by_id(self, mock_fetch):
        """The ID is turned into a film URL and the fetched movie is returned."""
        stub_movie = CSFDMovie(title="Test", url="https://csfd.cz/film/9423/")
        mock_fetch.return_value = stub_movie

        result = fetch_movie_by_id(9423)

        assert result.title == "Test"
        mock_fetch.assert_called_once_with("https://www.csfd.cz/film/9423/")


class TestAnubisPoW:
    """Anubis proof-of-work solver checks."""

    def test_solve_pow_difficulty_one(self):
        """Difficulty 1: hash starts with one zero nibble and matches the nonce."""
        import hashlib

        challenge = "abc123"
        digest, nonce, _unused = _solve_anubis_pow(challenge, difficulty=1)

        # Recompute independently: SHA-256 over challenge followed by nonce.
        recomputed = hashlib.sha256(f"{challenge}{nonce}".encode()).hexdigest()
        assert digest.startswith("0")
        assert recomputed == digest

    def test_solve_pow_difficulty_two(self):
        """Difficulty 2: hash starts with two zero nibbles (one zero byte)."""
        digest, _nonce, _unused = _solve_anubis_pow("seed", difficulty=2)
        assert digest.startswith("00")


class TestDependencyCheck:
    """Sanity check of the scraper's dependency guard."""

    def test_dependencies_available(self):
        """Calling the guard in the test environment completes without raising."""
        # The test passes simply by this call returning normally.
        _check_dependencies()