"""One-off migration: drop the old decade-band rating tags from a pool index.

Earlier the ČSFD rating was stored bucketed (e.g. ``Hodnocení/90–100 %``). Now
the tag carries the exact value (``Hodnocení/90``) and the band is only a folder.
This removes the legacy band tags (``Hodnocení/<x>–<y> %``) so re-fetching from
ČSFD leaves only the exact ratings. Exact rating tags are kept. A timestamped
backup of the index is written first.

Usage:
    poetry run python scripts/strip_rating_bands.py [<pool_dir>] [--category "Hodnocení"]
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import sys
|
||
import json
|
||
import shutil
|
||
import argparse
|
||
from pathlib import Path
|
||
from datetime import datetime
|
||
|
||
from loguru import logger
|
||
|
||
# Allow running as a plain script (``python scripts/...``) by exposing the repo root.
|
||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||
|
||
from src.core.config import load_global_config # noqa: E402
|
||
from src.core.pool_index import INDEX_FILENAME # noqa: E402
|
||
|
||
# A band tag looks like "Hodnocení/90–100 %" — value has a dash range and a "%".
|
||
_BAND_RE = re.compile(r"\d+\s*[–-]\s*\d+\s*%")
|
||
|
||
|
||
def _strip_bands(tags: list[str], category: str) -> tuple[list[str], int]:
|
||
"""Return (kept tags, removed count), dropping ``category`` band tags."""
|
||
prefix = f"{category}/"
|
||
kept = [
|
||
t for t in tags
|
||
if not (isinstance(t, str) and t.startswith(prefix) and _BAND_RE.search(t))
|
||
]
|
||
return kept, len(tags) - len(kept)
|
||
|
||
|
||
def migrate(index_path: Path, category: str) -> int:
    """Remove band rating tags from the index file in place.

    The JSON index is loaded, and for every entry under its ``"movies"`` key
    the band tags of ``category`` are dropped from ``"tags"`` and, when that
    key holds a list, from ``"csfd_tags"`` as well. The two lists are cleaned
    independently, so a band tag that only survives in ``"csfd_tags"`` is
    still removed. If nothing was removed the file is left untouched;
    otherwise a timestamped copy is written next to the index before the
    index is overwritten.

    Args:
        index_path: Path of the JSON index file.
        category: Rating category name without the trailing ``/``.

    Returns:
        The number of tags removed from the records' ``"tags"`` lists.
        Entries removed from ``"csfd_tags"`` are logged but not included in
        this count.
    """
    with open(index_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    movies: dict[str, dict] = data.get("movies", {})
    total_removed = 0
    provenance_removed = 0
    affected = 0
    for key, record in movies.items():
        if not isinstance(record, dict):
            # Malformed entry: nothing to clean, and not worth aborting for.
            continue

        removed = 0
        tags = record.get("tags")
        if isinstance(tags, list):
            kept, removed = _strip_bands(tags, category)
            if removed:
                record["tags"] = kept

        # Also drop them from the ČSFD provenance list, if present. The shared
        # helper is reused so both lists follow exactly the same rule and
        # non-string entries are tolerated.
        csfd_removed = 0
        csfd_tags = record.get("csfd_tags")
        if isinstance(csfd_tags, list):
            csfd_kept, csfd_removed = _strip_bands(csfd_tags, category)
            if csfd_removed:
                record["csfd_tags"] = csfd_kept

        if removed or csfd_removed:
            total_removed += removed
            provenance_removed += csfd_removed
            affected += 1
            logger.debug(
                f"{key}: removed {removed} band tag(s), "
                f"{csfd_removed} from csfd_tags"
            )

    if total_removed == 0 and provenance_removed == 0:
        logger.info(f"No '{category}/…–… %' band tags found — nothing to migrate")
        return 0

    # Back up before the index is opened for writing, so the previous state
    # survives even if the dump below fails part-way.
    backup = index_path.with_name(
        f"{index_path.name}.bak-{datetime.now():%Y%m%d-%H%M%S}"
    )
    shutil.copy2(index_path, backup)
    logger.info(f"Backup written: {backup}")

    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    logger.info(
        f"Removed {total_removed} band '{category}' tag(s) across {affected} record(s)"
    )
    if provenance_removed:
        logger.info(f"Removed {provenance_removed} band entry(ies) from csfd_tags")
    return total_removed
|
||
|
||
|
||
def main() -> None:
    """Parse the command line, locate the pool index and run the migration.

    The pool directory comes from the optional positional argument, falling
    back to the ``pool_dir`` entry of the global config. The process exits
    through ``parser.error`` (usage message, non-zero status) when no pool
    directory can be determined or no index file exists there.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is already laid out (paragraphs plus an
        # indented usage line); the default formatter would re-wrap it.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "pool_dir",
        nargs="?",
        help="Pool root (default: pool_dir from the global config)",
    )
    parser.add_argument("--category", default="Hodnocení", help="Rating category")
    args = parser.parse_args()

    # NOTE(review): assumes load_global_config() returns a mapping — confirm.
    pool_dir = args.pool_dir or load_global_config().get("pool_dir")
    if not pool_dir:
        parser.error("No pool_dir given and none configured in the global config")

    index_path = Path(pool_dir) / INDEX_FILENAME
    # is_file() rather than exists(): a directory with the index name would
    # otherwise pass this check and fail later when migrate() opens it.
    if not index_path.is_file():
        parser.error(f"No index found at {index_path}")

    migrate(index_path, args.category)


if __name__ == "__main__":
    main()
|