Files
Curator/scripts/strip_rating_bands.py

110 lines
3.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""One-off migration: drop the old decade-band rating tags from a pool index.
Earlier the ČSFD rating was stored bucketed (e.g. ``Hodnocení/90–100 %``). Now
the tag carries the exact value (``Hodnocení/90``) and the band is only a folder.
This removes the legacy band tags (``Hodnocení/<x>–<y> %``) so re-fetching from
ČSFD leaves only the exact ratings. Exact rating tags are kept. A timestamped
backup of the index is written first.
Usage:
poetry run python scripts/strip_rating_bands.py [<pool_dir>] [--category "Hodnocení"]
"""
from __future__ import annotations
import re
import sys
import json
import shutil
import argparse
from pathlib import Path
from datetime import datetime
from loguru import logger
# Allow running as a plain script (``python scripts/...``) by exposing the repo root.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.core.config import load_global_config # noqa: E402
from src.core.pool_index import INDEX_FILENAME # noqa: E402
# A band tag looks like "Hodnocení/90–100 %": two numbers joined by a dash and
# followed by a percent sign. Besides the ASCII hyphen-minus, the Unicode dashes
# U+2010..U+2015 (which include the en dash) and the minus sign U+2212 are
# accepted, because typographic ranges are normally written with one of those.
# They are spelled as escapes so the pattern stays unambiguous in editors/diffs.
_BAND_RE = re.compile(r"\d+\s*[-\u2010-\u2015\u2212]\s*\d+\s*%")


def _strip_bands(tags: list[str], category: str) -> tuple[list[str], int]:
    """Return ``(kept tags, removed count)``, dropping ``category`` band tags.

    A tag is dropped when it is a string, starts with ``"<category>/"`` and the
    part after that prefix contains a band such as ``90-100 %``. Every other
    entry, including non-string ones, is kept in its original order.

    Args:
        tags: Tags of one record; the list itself is not modified.
        category: Tag category (the part before ``/``) to clean.

    Returns:
        A new list with the surviving tags and the number of dropped entries.
    """
    prefix = f"{category}/"
    # Search only the value after the prefix, so a category name that itself
    # looks like a band cannot cause exact tags to be stripped.
    value_start = len(prefix)
    kept = [
        t for t in tags
        if not (
            isinstance(t, str)
            and t.startswith(prefix)
            and _BAND_RE.search(t, value_start)
        )
    ]
    return kept, len(tags) - len(kept)
def migrate(index_path: Path, category: str) -> int:
    """Remove band rating tags from the index in place.

    Loads the JSON index, drops the legacy band tags of ``category`` from every
    record under ``"movies"`` and, when anything was removed, copies the
    original file to a timestamped ``.bak-YYYYmmdd-HHMMSS`` sibling before the
    cleaned index is written back.

    Args:
        index_path: Path of the JSON index file to rewrite.
        category: Tag category (the part before ``/``) whose band tags go.

    Returns:
        Number of band tags removed from the records' ``"tags"`` lists. ``0``
        means the file was left untouched and no backup was made.
    """
    with open(index_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    movies: dict[str, dict] = data.get("movies", {})
    total_removed = 0
    affected = 0
    for key, record in movies.items():
        # A missing or null "tags" entry is treated as "no tags" instead of
        # failing while iterating over None.
        tags = record.get("tags") or []
        kept, removed = _strip_bands(tags, category)
        if not removed:
            continue

        record["tags"] = kept
        # Also drop them from the ČSFD provenance list, if present. Going
        # through the same helper keeps both filters in agreement and leaves
        # non-string entries alone rather than raising on them.
        csfd_tags = record.get("csfd_tags")
        if isinstance(csfd_tags, list):
            record["csfd_tags"] = _strip_bands(csfd_tags, category)[0]

        total_removed += removed
        affected += 1
        logger.debug(f"{key}: removed {removed} band tag(s)")

    if total_removed == 0:
        logger.info(f"No '{category}/…–… %' band tags found — nothing to migrate")
        return 0

    # Copy the untouched file aside before overwriting it.
    backup = index_path.with_suffix(
        index_path.suffix + f".bak-{datetime.now():%Y%m%d-%H%M%S}"
    )
    shutil.copy2(index_path, backup)
    logger.info(f"Backup written: {backup}")

    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    logger.info(
        f"Removed {total_removed} band '{category}' tag(s) across {affected} record(s)"
    )
    return total_removed
def main() -> None:
    """Command-line entry point: locate the pool index and migrate it.

    The pool directory comes from the optional positional argument and falls
    back to ``pool_dir`` in the global config. A missing pool directory or a
    missing index file is reported through ``ArgumentParser.error``, which
    prints the usage message and exits.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "pool_dir",
        nargs="?",
        help="Pool root (default: pool_dir from the global config)",
    )
    cli.add_argument("--category", default="Hodnocení", help="Rating category")
    options = cli.parse_args()

    # The explicit argument wins; the config is consulted only without one.
    pool_root = options.pool_dir
    if not pool_root:
        pool_root = load_global_config().get("pool_dir")
    if not pool_root:
        cli.error("No pool_dir given and none configured in the global config")

    index_file = Path(pool_root) / INDEX_FILENAME
    if not index_file.exists():
        cli.error(f"No index found at {index_file}")

    migrate(index_file, options.category)
# Run the migration only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()