# SQLmem/src/sqlmem/executor.py

from collections.abc import Callable
from typing import Any

from loguru import logger

from .cache import CacheManager
from .delta import ResolvedDelta
from .parser import ParsedQuery
from .registry import ColumnRegistry
from .stats import StatsCollector


class QueryExecutor:
    """Make sure the tables a query needs are cached, then run the query in memory.

    For every table referenced by a parsed query the executor decides whether
    the cached copy can be used as-is (hit), must be reloaded because it is
    TTL-expired or lacks requested columns (refetch), or has to be loaded for
    the first time (miss). Each outcome is reported to the stats collector.
    """

    def __init__(
        self,
        cache: CacheManager,
        registry: ColumnRegistry,
        source_conn: Any,  # raw DBAPI connection (pyodbc/sqlite3/…) — only .execute() is used
        stats: StatsCollector,
        delta: dict[str, ResolvedDelta] | None = None,
        ttl: dict[str, int] | None = None,
        index_columns: dict[str, list[str]] | None = None,
    ) -> None:
        """Store the collaborators and the optional per-table configuration.

        *cache* loads tables and runs queries; *registry* tracks which columns
        are known per table; *source_conn* is handed to the cache for column
        discovery and loading; *stats* receives hit/miss/refetch events.

        *delta* maps a table to its delta config, whose key and change columns
        are always included in a load. *ttl* maps a table to the maximum age
        that is compared against ``cache.seconds_since_refresh``.
        *index_columns* maps a table to extra columns always included in a
        load. Each mapping defaults to empty when omitted.
        """
        self._cache = cache
        self._registry = registry
        self._source_conn = source_conn
        self._stats = stats
        self._delta = delta or {}
        self._ttl = ttl or {}
        self._index_columns = index_columns or {}

    def _ttl_expired(self, table: str) -> bool:
        """True if *table* has a TTL and its cached copy is older than that TTL."""
        ttl = self._ttl.get(table)
        if ttl is None:
            # No TTL configured for this table: it never expires.
            return False
        age = self._cache.seconds_since_refresh(table)
        # An unknown age (None) is treated as "not expired".
        return age is not None and age > ttl

    def execute(self, parsed: ParsedQuery) -> list[dict]:
        """Ensure every table of *parsed* is loaded, then run the query in memory.

        Returns one dict per result row, keyed by the result column names.
        """
        for table in parsed.tables:
            self._ensure_table(table, parsed)
        return self._run_in_memory(parsed)

    def ensure_loaded(self, table: str, columns: list[str] | None) -> None:
        """Preload *table* into the cache without running a query.

        ``columns=None`` loads the whole table (``SELECT *`` semantics); otherwise
        only the listed columns. Reuses the same load path as a real query — delta
        key/change + index columns are augmented, the registry and watermark are
        updated, and double-checked locking skips a copy already fresh in the
        cache — but never materializes any rows (unlike :meth:`execute`).
        """
        if columns is None:
            self._ensure_full(table)
        else:
            self._ensure_columns(table, columns)

    def _ensure_table(self, table: str, parsed: ParsedQuery) -> None:
        """Load *table* in full if *parsed* wildcards it, else only its referenced columns."""
        if table in parsed.wildcard_tables:
            self._ensure_full(table)
        else:
            self._ensure_columns(table, parsed.columns_by_table[table])

    def _full_satisfied(self, table: str) -> bool:
        """True if *table* is cached in full and not TTL-expired (a SELECT * hit)."""
        return (
            self._cache.is_table_cached(table)
            and self._cache.is_table_full(table)
            and not self._ttl_expired(table)
        )

    def _columns_satisfied(self, table: str, columns: list[str]) -> bool:
        """True if *table* is cached with all *columns* present and not TTL-expired."""
        if not self._cache.is_table_cached(table) or self._ttl_expired(table):
            return False
        return set(columns).issubset(self._cache.get_table_columns(table))

    def _ensure_full(self, table: str) -> None:
        """Load every column of *table* (SELECT * / t.*), refetching unless already full."""
        cached = self._cache.is_table_cached(table)
        # Staleness is only meaningful (and only evaluated) for a cached table.
        stale = cached and self._ttl_expired(table)

        if cached and self._cache.is_table_full(table) and not stale:
            logger.debug(f"Cache hit (full): {table!r}")
            self._stats.record_hit()
            return

        if stale:
            logger.info(f"Cache expired (ttl) — reloading {table!r} in full.")
            self._stats.record_refetch()
        elif cached:
            # Cached and fresh, but only partially: widen it to the full table.
            logger.warning(f"Re-fetching {table!r} in full — SELECT * requested.")
            self._stats.record_refetch()
        else:
            self._stats.record_miss()

        columns = self._cache.discover_columns(table, self._source_conn)
        self._load(table, columns, full=True, satisfied=lambda cols: self._full_satisfied(table))

    def _ensure_columns(self, table: str, columns: list[str]) -> None:
        """Load *table* with at least *columns*, refetching on new columns or TTL expiry."""
        missing = self._registry.needs_refetch(table, columns)
        table_cached = self._cache.is_table_cached(table)
        # Staleness is only meaningful (and only evaluated) for a cached table.
        stale = table_cached and self._ttl_expired(table)

        if table_cached and not missing and not stale:
            logger.debug(f"Cache hit: {table!r} columns={columns}")
            self._stats.record_hit()
            return

        if stale:
            logger.info(f"Cache expired (ttl) — reloading {table!r}.")
            self._stats.record_refetch()
        elif table_cached and missing:
            logger.warning(
                f"Re-fetching {table!r} — new columns requested: {missing}. "
                f"Expanding cache from {self._registry.get_columns(table)} + {missing}"
            )
            self._stats.record_refetch()
        else:
            self._stats.record_miss()

        # Reload everything the registry already knows about plus the new columns,
        # so the reloaded copy never ends up narrower than the previous one.
        all_columns = list(self._registry.get_columns(table)) + missing
        # Preserve a fully-cached table's status across a TTL reload.
        full = table_cached and self._cache.is_table_full(table)
        self._load(
            table,
            all_columns,
            full=full,
            satisfied=lambda cols: self._columns_satisfied(table, cols),
        )

    def _load(
        self,
        table: str,
        columns: list[str],
        full: bool,
        satisfied: Callable[[list[str]], bool] | None = None,
    ) -> None:
        """Fetch *table* into cache, adding delta key/timestamp and index columns.

        The column list handed to the cache and the registry is always
        de-duplicated (first occurrence wins, order preserved), so a caller
        passing a repeated column name never produces a duplicated column in
        the load.

        *satisfied* is the double-checked-locking predicate evaluated under the
        load lock (see :meth:`CacheManager.load_table`); it is given the final,
        augmented column list so a concurrent loader that already produced an
        equivalent (or wider) cache is detected and the redundant reload skipped.
        """
        cfg = self._delta.get(table)
        extra = list(self._index_columns.get(table, []))
        if cfg:
            # The cache must always hold the key (to upsert) and the change column
            # (to compute the watermark), even if no query referenced them.
            extra += [*cfg.key_columns, cfg.change_column]
        # De-duplicate unconditionally — not only when extras exist — and build a
        # fresh list so the caller's list is never shared with cache/registry.
        columns = list(dict.fromkeys([*columns, *extra]))

        recheck: Callable[[], bool] | None = None
        if satisfied is not None:
            final_columns = columns
            recheck = lambda: satisfied(final_columns)  # noqa: E731
        self._cache.load_table(table, columns, self._source_conn, full=full, recheck=recheck)
        self._registry.update(table, columns)

        if cfg:
            self._cache.create_unique_index(table, cfg.key_columns)
            # Record the highest change-column value now in the cache as the watermark.
            watermark = self._cache.max_value(table, cfg.change_column)
            self._cache.set_last_synced_at(table, watermark)

    def _run_in_memory(self, parsed: ParsedQuery) -> list[dict]:
        """Run the parsed query against the cache and return its rows as dicts.

        Each row is zipped with the result column names, so if two result
        columns share a name the later one overwrites the earlier in that dict.
        """
        logger.debug(f"Executing in SQLite RAM: {parsed.sqlite_sql!r} params={parsed.params!r}")
        col_names, rows = self._cache.execute_in_memory(
            parsed.sqlite_sql, parsed.params, parsed.tables
        )
        return [dict(zip(col_names, row)) for row in rows]