Fix frozen delta watermark and add error stats, lazy source, concurrent disk reads, and per-engine config

2026-06-08 19:35:33 +02:00
parent 209ae667ab
commit 6dc85e4f3c
17 changed files with 668 additions and 71 deletions
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Any

 from loguru import logger
@@ -15,13 +16,25 @@ _DEFAULT_FORMAT = (
    "<level>{message}</level>"
 )

+# Sinks already registered, keyed by a stable identity, so a repeated call (e.g.
+# a double import) doesn't add a second handler and duplicate every log line.
+_added_sinks: dict[object, int] = {}
+
+
+def _sink_key(sink: Any) -> object:
+    """Return a stable identity for *sink* so the same destination isn't added twice.
+
+    File sinks are keyed by their resolved absolute path, so ``"x.log"``,
+    ``Path("x.log")`` and any other ``os.PathLike`` naming the same file all map
+    to one key. Every other sink (stream, callable, handler, …) is keyed by
+    object identity.
+    """
+    import os  # local import: keeps this helper self-contained
+
+    # loguru treats ``str`` and path-like sinks as file paths, so key every
+    # path-like object by the file it names rather than by ``id()``.
+    if isinstance(sink, (str, os.PathLike)):
+        return ("path", str(Path(os.fsdecode(sink)).resolve()))
+    return ("obj", id(sink))
+

 def add_sink(sink: Any, *, level: str | None = None, **kwargs: Any) -> None:
-    """Route sqlmem log records to *sink*.
+    """Route sqlmem log records to *sink* (idempotent).

    Accepts any sink supported by loguru (file path, stream, callable, …).
    *level* defaults to ``DEBUG`` when ``SQLMEM_DEBUG=true``, otherwise ``INFO``.
-    Extra keyword arguments are forwarded to :func:`loguru.logger.add`.
+    Extra keyword arguments are forwarded to :func:`loguru.logger.add`. Calling it
+    again for the same sink is a no-op, so a double import won't duplicate logs.

    Example::

@@ -31,9 +44,15 @@ def add_sink(sink: Any, *, level: str | None = None, **kwargs: Any) -> None:
        add_sink("sqlmem.log", rotation="10 MB")
    """
    logger.enable("sqlmem")
+    key = _sink_key(sink)
+    if key in _added_sinks:
+        return
    kwargs.setdefault("format", _DEFAULT_FORMAT)
    kwargs.setdefault("colorize", True)
-    logger.add(sink, level=level or ("DEBUG" if DEBUG else "INFO"), filter="sqlmem", **kwargs)
+    handler_id = logger.add(
+        sink, level=level or ("DEBUG" if DEBUG else "INFO"), filter="sqlmem", **kwargs
+    )
+    _added_sinks[key] = handler_id


 __all__ = [
@@ -0,0 +1,27 @@
+"""SQL identifier quoting.
+
+Table and column names are interpolated into statements as raw strings, so a
+name with a space, a reserved word, or an embedded quote would break the query
+(and is a latent injection vector). These helpers quote identifiers safely. The
+in-memory cache is SQLite, so it uses double-quote style; the source DB is quoted
+in its configured dialect (e.g. T-SQL ``[brackets]``).
+"""
+
+from collections.abc import Iterable
+
+from sqlglot import exp
+
+
+def quote(name: str) -> str:
+    """Return *name* as a double-quoted identifier for the in-memory SQLite cache.
+
+    Embedded double quotes are escaped by doubling them.
+    """
+    escaped = name.replace('"', '""')
+    return f'"{escaped}"'
+
+
+def quote_list(names: Iterable[str]) -> str:
+    """Return *names* as SQLite-quoted identifiers joined by ``", "``."""
+    quoted = [quote(identifier) for identifier in names]
+    return ", ".join(quoted)
+
+
+def quote_source(name: str, dialect: str) -> str:
+    """Return *name* quoted for the source DB in *dialect* (e.g. T-SQL ``[x]``)."""
+    identifier = exp.to_identifier(name, quoted=True)
+    return identifier.sql(dialect=dialect)
@@ -10,7 +10,8 @@ from loguru import logger

 import sqlmem._meta as _meta
 from ._coerce import coerce_params, coerce_row
-from .config import FETCH_BATCH_SIZE
+from ._sql import quote, quote_list, quote_source
+from .config import FETCH_BATCH_SIZE, SQL_DIALECT
 from .stats import TableState

 SCHEMA_VERSION = 3
@@ -22,17 +23,37 @@ class _Index:
    columns: tuple[str, ...]


+@dataclass(frozen=True)
+class TableError:
+    """Most recent load/refresh failure for a table (see ``CacheManager.get_errors``).
+
+    Instances are immutable; ``CacheManager.record_error`` and
+    ``CacheManager.record_success`` store a new instance rather than mutating
+    the existing one.
+    """
+
+    # Failure description; callers in this change pass "<ExceptionType>: <text>".
+    message: str
+    # When the failure was recorded, as returned by ``_now()``
+    # (presumably an ISO timestamp string — confirm against ``_now``).
+    at: str
+    # Length of the current failure streak; ``record_success`` resets it to 0.
+    consecutive: int
+
+
 class CacheManager:
    def __init__(
-        self, db_path: Path, backup_interval: int, in_memory: bool = True
+        self,
+        db_path: Path,
+        backup_interval: int,
+        in_memory: bool = True,
+        dialect: str = SQL_DIALECT,
+        fetch_batch: int = FETCH_BATCH_SIZE,
    ) -> None:
        self._db_path = db_path
        self._backup_interval = backup_interval
        self._in_memory = in_memory
+        self._dialect = dialect              # source-DB dialect, for identifier quoting
+        self._fetch_batch = fetch_batch      # rows fetched per source batch
        self._lock = threading.Lock()       # serializes connection access
        self._load_lock = threading.Lock()  # serializes full table loads
        self._states: dict[str, str] = {}   # table → live processing state
+        self._errors: dict[str, TableError] = {}  # table → last load/refresh failure
+        self._error_total = 0                # process-wide failure counter
        self._index_defs: dict[str, list[_Index]] = {}  # table → secondary indexes
+        self._read_local = threading.local()  # per-thread read conn (disk mode)
+        self._read_conns: list[sqlite3.Connection] = []  # read conns, for cleanup
        self._closed = False

        if in_memory:
@@ -124,7 +145,7 @@ class CacheManager:
            ).fetchall()
        ]
        for name in names:
-            self._conn.execute(f"DROP TABLE IF EXISTS {name}")
+            self._conn.execute(f"DROP TABLE IF EXISTS {quote(name)}")
        self._conn.commit()

    def _load_from_disk(self) -> None:
@@ -161,7 +182,7 @@ class CacheManager:
        ]
        for name in orphans:
            logger.warning(f"Dropping orphan staging table {name!r} from a previous interrupted load.")
-            self._conn.execute(f"DROP TABLE IF EXISTS {name}")
+            self._conn.execute(f"DROP TABLE IF EXISTS {quote(name)}")
        if orphans:
            self._conn.commit()

@@ -238,7 +259,9 @@ class CacheManager:
    def discover_columns(self, table: str, source_conn: sqlite3.Connection) -> list[str]:
        """Return all column names of *table* from the source DB without fetching rows."""
        logger.debug(f"Discovering columns of {table!r} from source DB")
-        cursor = source_conn.execute(f"SELECT * FROM {table} WHERE 1 = 0")
+        cursor = source_conn.execute(
+            f"SELECT * FROM {quote_source(table, self._dialect)} WHERE 1 = 0"
+        )
        columns = [desc[0] for desc in cursor.description]
        logger.debug(f"{table!r} has columns: {columns}")
        return columns
@@ -251,6 +274,28 @@ class CacheManager:

    def clear_state(self, table: str) -> None:
        self._states.pop(table, None)
+        self._errors.pop(table, None)
+
+    def record_error(self, table: str, message: str) -> None:
+        """Store *message* as *table*'s latest failure and extend its failure streak.
+
+        The streak is one more than the previously recorded streak for the table
+        (or 1 when nothing was recorded). The manager-wide failure counter is
+        incremented as well.
+        """
+        previous = self._errors.get(table)
+        streak = 1 if not previous else previous.consecutive + 1
+        entry = TableError(message=message, at=_now(), consecutive=streak)
+        self._errors[table] = entry
+        self._error_total += 1
+        logger.debug(f"Recorded error for {table!r} (streak {streak}): {message}")
+
+    def record_success(self, table: str) -> None:
+        """Zero *table*'s failure streak after a successful load/refresh.
+
+        The last error's message and timestamp are kept; only the streak is
+        reset. Nothing happens when no error is recorded or the streak is
+        already 0.
+        """
+        last = self._errors.get(table)
+        if not last or not last.consecutive:
+            return
+        self._errors[table] = TableError(last.message, last.at, 0)
+
+    def get_errors(self) -> dict[str, TableError]:
+        """Return a shallow copy of the table → last-error mapping."""
+        return self._errors.copy()
+
+    @property
+    def error_total(self) -> int:
+        """Number of failures passed to ``record_error`` on this manager.
+
+        Unlike the per-table streaks, this counter is not reset by
+        ``record_success`` or ``clear_state``.
+        """
+        return self._error_total

    def add_index(self, table: str, columns: list[str]) -> None:
        """Register a secondary index to (re)create on *columns* after each load."""
@@ -268,10 +313,10 @@ class CacheManager:
                    f"Skipping index {idx.name!r}: columns {idx.columns} not all cached."
                )
                continue
-            cols = ", ".join(idx.columns)
+            cols = quote_list(idx.columns)
            with self._lock:
                self._conn.execute(
-                    f"CREATE INDEX IF NOT EXISTS {idx.name} ON {table} ({cols})"
+                    f"CREATE INDEX IF NOT EXISTS {quote(idx.name)} ON {quote(table)} ({cols})"
                )
                self._conn.commit()
            logger.debug(f"Index {idx.name!r} ready on {table} ({cols})")
@@ -291,25 +336,29 @@ class CacheManager:
        until the swap. Concurrent loads are serialized by ``_load_lock``; the
        connection lock is only held for the brief per-batch inserts and the swap.
        """
-        cols = ", ".join(columns)
-        col_defs = ", ".join(f"{c} TEXT" for c in columns)
+        src_cols = ", ".join(quote_source(c, self._dialect) for c in columns)
+        col_defs = ", ".join(f"{quote(c)} TEXT" for c in columns)
        placeholders = ", ".join("?" * len(columns))
        staging = f"{table}__sqlmem_load"
+        q_staging = quote(staging)
+        q_table = quote(table)

        with self._load_lock:
            self.set_state(table, TableState.LOADING)
-            logger.info(f"Fetching {table!r} columns [{cols}] from source DB (batch={FETCH_BATCH_SIZE})")
+            logger.info(f"Fetching {table!r} columns {columns} from source DB (batch={self._fetch_batch})")
            try:
-                cursor = source_conn.execute(f"SELECT {cols} FROM {table}")
+                cursor = source_conn.execute(
+                    f"SELECT {src_cols} FROM {quote_source(table, self._dialect)}"
+                )
                with self._lock:
-                    self._conn.execute(f"DROP TABLE IF EXISTS {staging}")
-                    self._conn.execute(f"CREATE TABLE {staging} ({col_defs})")
+                    self._conn.execute(f"DROP TABLE IF EXISTS {q_staging}")
+                    self._conn.execute(f"CREATE TABLE {q_staging} ({col_defs})")
                    self._conn.commit()

                total = 0
-                insert_sql = f"INSERT INTO {staging} VALUES ({placeholders})"
+                insert_sql = f"INSERT INTO {q_staging} VALUES ({placeholders})"
                while True:
-                    batch = cursor.fetchmany(FETCH_BATCH_SIZE)  # network outside _lock
+                    batch = cursor.fetchmany(self._fetch_batch)  # network outside _lock
                    if not batch:
                        break
                    clean = [coerce_row(row) for row in batch]
@@ -319,46 +368,83 @@ class CacheManager:
                    total += len(batch)

                with self._lock:  # atomic swap — readers see old or new, never partial
-                    self._conn.execute(f"DROP TABLE IF EXISTS {table}")
-                    self._conn.execute(f"ALTER TABLE {staging} RENAME TO {table}")
+                    self._conn.execute(f"DROP TABLE IF EXISTS {q_table}")
+                    self._conn.execute(f"ALTER TABLE {q_staging} RENAME TO {q_table}")
                    self._conn.commit()
-            except BaseException:
+            except BaseException as exc:
                with self._lock:
-                    self._conn.execute(f"DROP TABLE IF EXISTS {staging}")
+                    self._conn.execute(f"DROP TABLE IF EXISTS {q_staging}")
                    self._conn.commit()
                self.set_state(table, TableState.ERROR)
+                self.record_error(table, f"{type(exc).__name__}: {exc}")
                raise

            self._create_indexes(table, columns)
            self.mark_table_refreshed(table, total, full)
            self.set_state(table, TableState.READY)
+            self.record_success(table)
            logger.info(f"Table {table!r} cached ({total} rows, columns: {columns})")

+    def _read_conn(self) -> sqlite3.Connection:
+        """Return this thread's read-only connection for disk-mode cache reads.
+
+        Disk mode runs in WAL, which allows many concurrent readers alongside one
+        writer. A dedicated connection per thread (instead of the single write
+        connection guarded by ``_lock``) keeps a slow ``SELECT`` from blocking
+        writers (loads/upserts) or other readers. In-memory mode cannot do this —
+        every ``:memory:`` connection is a separate database — so it stays on the
+        single locked connection.
+
+        The connection is created on the thread's first call, cached in
+        thread-local storage, and also tracked in ``_read_conns`` so ``close``
+        can release it.
+        """
+        existing = getattr(self._read_local, "conn", None)
+        if existing is not None:
+            return existing
+
+        # check_same_thread=False: close() may run on a different thread than
+        # the one that opened this connection.
+        fresh = sqlite3.connect(str(self._db_path), check_same_thread=False)
+        fresh.execute("PRAGMA query_only=ON")  # read-only guard
+        self._read_local.conn = fresh
+        with self._lock:
+            self._read_conns.append(fresh)
+        return fresh
+
    def execute_in_memory(
        self, sql: str, params: tuple | list | dict | None = None
    ) -> tuple[list[str], list[tuple]]:
-        """Run a read query against the in-memory cache, serialized with writers."""
+        """Run a read query against the cache.
+
+        In-memory mode serializes with writers on the single connection. Disk mode
+        reads from a per-thread WAL connection, so reads run concurrently with
+        writers and each other (see :meth:`_read_conn`).
+        """
        bound = coerce_params(params)
-        with self._lock:
-            cursor = self._conn.execute(sql) if bound is None else self._conn.execute(sql, bound)
-            col_names = [desc[0] for desc in cursor.description]
-            rows = cursor.fetchall()
+        if self._in_memory:
+            with self._lock:
+                cursor = (
+                    self._conn.execute(sql)
+                    if bound is None
+                    else self._conn.execute(sql, bound)
+                )
+                col_names = [desc[0] for desc in cursor.description]
+                rows = cursor.fetchall()
+            return col_names, rows
+
+        conn = self._read_conn()
+        cursor = conn.execute(sql) if bound is None else conn.execute(sql, bound)
+        col_names = [desc[0] for desc in cursor.description]
+        rows = cursor.fetchall()
        return col_names, rows

    # --- delta refresh support ---------------------------------------------

    def get_table_columns(self, table: str) -> list[str]:
        """Authoritative ordered column list of a cached table (via PRAGMA)."""
-        rows = self._conn.execute(f"PRAGMA table_info({table})").fetchall()
+        rows = self._conn.execute(f"PRAGMA table_info({quote(table)})").fetchall()
        return [r[1] for r in rows]

    def create_unique_index(self, table: str, key_columns: list[str]) -> None:
        """Create the unique index on *key_columns* that makes upsert-by-key work."""
-        cols = ", ".join(key_columns)
-        index = f"idx_{table}_pk"
+        cols = quote_list(key_columns)
+        index = quote(f"idx_{table}_pk")
        with self._lock:
            self._conn.execute(
-                f"CREATE UNIQUE INDEX IF NOT EXISTS {index} ON {table} ({cols})"
+                f"CREATE UNIQUE INDEX IF NOT EXISTS {index} ON {quote(table)} ({cols})"
            )
            self._conn.commit()

@@ -378,23 +464,25 @@ class CacheManager:

    def max_value(self, table: str, column: str) -> str | None:
        """Maximum value of *column* across cached rows (the delta watermark)."""
-        row = self._conn.execute(f"SELECT MAX({column}) FROM {table}").fetchone()
+        row = self._conn.execute(
+            f"SELECT MAX({quote(column)}) FROM {quote(table)}"
+        ).fetchone()
        return row[0] if row else None

    def upsert_rows(self, table: str, columns: list[str], rows: list[tuple]) -> None:
        """Insert-or-replace one batch of *rows* by the table's unique key."""
-        col_list = ", ".join(columns)
+        col_list = quote_list(columns)
        placeholders = ", ".join("?" * len(columns))
        clean_rows = [coerce_row(row) for row in rows]
        with self._lock:
            self._conn.executemany(
-                f"INSERT OR REPLACE INTO {table} ({col_list}) VALUES ({placeholders})",
+                f"INSERT OR REPLACE INTO {quote(table)} ({col_list}) VALUES ({placeholders})",
                clean_rows,
            )
            self._conn.commit()

    def count_rows(self, table: str) -> int:
-        row = self._conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
+        row = self._conn.execute(f"SELECT COUNT(*) FROM {quote(table)}").fetchone()
        return int(row[0]) if row else 0

    def reset(self) -> None:
@@ -411,7 +499,7 @@ class CacheManager:
                ).fetchall()
            ]
            for name in user_tables:
-                self._conn.execute(f"DROP TABLE IF EXISTS {name}")
+                self._conn.execute(f"DROP TABLE IF EXISTS {quote(name)}")
            self._conn.execute("DELETE FROM _sqlmem_tables")
            self._conn.execute("DELETE FROM _sqlmem_columns")
            self._conn.commit()
@@ -434,6 +522,13 @@ class CacheManager:
    def close(self) -> None:
        self._backup_to_disk()
        self._closed = True
+        with self._lock:
+            for conn in self._read_conns:
+                try:
+                    conn.close()
+                except sqlite3.Error:
+                    pass
+            self._read_conns.clear()
        self._conn.close()


@@ -1,13 +1,34 @@
-import sqlite3
 from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any

 from loguru import logger

+from ._sql import quote_source
 from .cache import CacheManager
-from .config import FETCH_BATCH_SIZE
 from .stats import TableState


+def _bind_watermark(watermark: str) -> datetime | str:
+    """Bind the delta watermark back to the source in its native type.
+
+    The cache stores the change column as an ISO ``TEXT`` string (see
+    ``_coerce.to_sqlite``), so ``max(change_column)`` comes back as a string such
+    as ``'2026-06-05T14:54:24.823000'``. Sending that straight back to the source
+    as an ``nvarchar`` makes SQL Server do an implicit ``varchar -> datetime``
+    conversion, which **fails** on the ``T``-separated, 6-digit-microsecond ISO
+    form (error 241 / SQLSTATE 22007 — ``datetime`` accepts at most 3 fractional
+    digits). Parsing it back to a real :class:`~datetime.datetime` makes the
+    driver send a typed timestamp, so the comparison happens natively with no
+    string conversion.
+
+    Non-datetime change columns (e.g. an integer rowversion) are passed through
+    unchanged. Plain integer strings are detected explicitly rather than left
+    to the parser: since Python 3.11 ``datetime.fromisoformat`` also accepts the
+    basic ``YYYYMMDD`` form, so an 8-digit integer watermark such as
+    ``'20240101'`` would otherwise be bound as a date. Non-string watermarks
+    and strings that are not valid ISO timestamps are returned as given.
+    """
+    if not isinstance(watermark, str):
+        return watermark
+    # Optionally signed run of digits: an integer watermark, never a timestamp.
+    if watermark.strip().lstrip("+-").isdigit():
+        return watermark
+    try:
+        return datetime.fromisoformat(watermark)
+    except ValueError:
+        return watermark
+
+
@dataclass(frozen=True)
 class DeltaConfig:
    """Per-table configuration for incremental (delta) refresh.
@@ -43,28 +64,37 @@ class DeltaRefresher:
        self._cache = cache
        self._delta = delta

-    def refresh(self, source_conn: sqlite3.Connection) -> None:
+    def refresh(self, source_conn: Any) -> None:
        for table, cfg in self._delta.items():
            if not self._cache.is_table_cached(table):
                continue
            try:
                self._refresh_table(table, cfg, source_conn)
+                self._cache.record_success(table)
            except Exception as e:  # one bad table must not stop the others
                logger.error(f"Delta refresh failed for {table!r}: {e}")
+                # A delta can fail before streaming starts (e.g. a watermark the
+                # source rejects), leaving state misleadingly READY — mark it and
+                # record the error so stats reveal the stuck table.
+                self._cache.set_state(table, TableState.ERROR)
+                self._cache.record_error(table, f"{type(e).__name__}: {e}")

    def _refresh_table(
-        self, table: str, cfg: ResolvedDelta, source_conn: sqlite3.Connection
+        self, table: str, cfg: ResolvedDelta, source_conn: Any
    ) -> None:
        columns = self._cache.get_table_columns(table)
        watermark = self._cache.get_last_synced_at(table)
-        col_list = ", ".join(columns)
+        dialect = self._cache._dialect
+        col_list = ", ".join(quote_source(c, dialect) for c in columns)
+        q_table = quote_source(table, dialect)

        if watermark is None:
-            cursor = source_conn.execute(f"SELECT {col_list} FROM {table}")
+            cursor = source_conn.execute(f"SELECT {col_list} FROM {q_table}")
        else:
+            change_col = quote_source(cfg.change_column, dialect)
            cursor = source_conn.execute(
-                f"SELECT {col_list} FROM {table} WHERE {cfg.change_column} >= ?",
-                (watermark,),
+                f"SELECT {col_list} FROM {q_table} WHERE {change_col} >= ?",
+                (_bind_watermark(watermark),),
            )

        # Stream the delta in batches so a large catch-up never materializes at once.
@@ -72,7 +102,7 @@ class DeltaRefresher:
        self._cache.set_state(table, TableState.REFRESHING)
        try:
            while True:
-                batch = cursor.fetchmany(FETCH_BATCH_SIZE)
+                batch = cursor.fetchmany(self._cache._fetch_batch)
                if not batch:
                    break
                self._cache.upsert_rows(table, columns, batch)
@@ -1,18 +1,21 @@
-import sqlite3
 import threading
 from dataclasses import replace
-from typing import cast
+from pathlib import Path
+from typing import Any

 from loguru import logger
 from sqlalchemy import inspect
-from sqlalchemy.engine import Engine
+from sqlalchemy.engine import Connection, Engine

-from .cache import CacheManager
+from ._sql import quote
+from .cache import CacheManager, TableError
 from .config import (
    BACKUP_INTERVAL_SECONDS,
    CACHE_DB_PATH,
+    FETCH_BATCH_SIZE,
    IN_MEMORY,
    REFRESH_INTERVAL_SECONDS,
+    SQL_DIALECT,
 )
 from .delta import DeltaConfig, DeltaRefresher, ResolvedDelta
 from .executor import QueryExecutor
@@ -21,6 +24,32 @@ from .registry import ColumnRegistry
 from .stats import Stats, StatsCollector, TableState, TableStats


+class _LazySource:
+    """Source-connection proxy that connects on the first ``execute`` call.
+
+    Most queries are cache hits that never touch the source, so opening a
+    connection (and occupying a connection-pool slot) eagerly is wasteful. The
+    proxy opens one on demand, reuses it for every later ``execute`` of the same
+    query, and releases it in ``close``.
+    """
+
+    def __init__(self, source_engine: Engine) -> None:
+        self._source_engine = source_engine
+        self._sa_conn: Connection | None = None  # SQLAlchemy connection, once opened
+        self._raw: Any = None  # DBAPI connection underneath ``_sa_conn``
+
+    def execute(self, *args: Any, **kwargs: Any) -> Any:
+        """Forward to the raw DBAPI connection's ``execute``, connecting if needed."""
+        raw = self._raw
+        if raw is None:
+            self._sa_conn = self._source_engine.connect()
+            raw = self._raw = self._sa_conn.connection.dbapi_connection
+        return raw.execute(*args, **kwargs)
+
+    def close(self) -> None:
+        """Release the connection if one was opened; otherwise do nothing."""
+        if self._sa_conn is None:
+            return
+        self._sa_conn.close()
+        self._sa_conn = None
+        self._raw = None
+
+
 class CachingEngine:
    """Transparent SQLAlchemy-compatible cache layer."""

@@ -31,15 +60,28 @@ class CachingEngine:
        ttl: dict[str, int] | None = None,
        indexes: dict[str, list[str | list[str]]] | None = None,
        in_memory: bool | None = None,
+        cache_db_path: str | Path | None = None,
+        backup_interval: int | None = None,
+        refresh_interval: int | None = None,
+        fetch_batch: int | None = None,
+        dialect: str | None = None,
+        blocking_startup_refresh: bool = False,
    ) -> None:
        self._source_engine = source_engine
        use_memory = IN_MEMORY if in_memory is None else in_memory
+        self._dialect = dialect if dialect is not None else SQL_DIALECT
+        self._refresh_interval = (
+            refresh_interval if refresh_interval is not None else REFRESH_INTERVAL_SECONDS
+        )
        self._cache = CacheManager(
-            CACHE_DB_PATH, BACKUP_INTERVAL_SECONDS, in_memory=use_memory
+            Path(cache_db_path) if cache_db_path is not None else CACHE_DB_PATH,
+            backup_interval if backup_interval is not None else BACKUP_INTERVAL_SECONDS,
+            in_memory=use_memory,
+            dialect=self._dialect,
+            fetch_batch=fetch_batch if fetch_batch is not None else FETCH_BATCH_SIZE,
        )
        self._registry = ColumnRegistry(self._cache.connection)
        self._stats = StatsCollector()
-        self._refresh_interval = REFRESH_INTERVAL_SECONDS
        self._delta = self._resolve_delta(delta or {})
        self._ttl = dict(ttl or {})
        self._index_columns = self._register_indexes(indexes or {})
@@ -54,8 +96,13 @@ class CachingEngine:
            )

        if self._delta or self._ttl:
-            self._run_refresh()  # catch up tables restored from disk
-            self._start_refresh_thread()
+            # The startup catch-up (deltas/TTL reloads for tables restored from
+            # disk) can take a while on a cold start. By default it runs on the
+            # background thread so it never blocks application startup; callers
+            # who need the cache fully fresh before serving can opt back in.
+            if blocking_startup_refresh:
+                self._run_refresh()
+            self._start_refresh_thread(initial_catch_up=not blocking_startup_refresh)

        logger.info("CachingEngine initialized.")

@@ -97,12 +144,18 @@ class CachingEngine:
    @property
    def stats(self) -> Stats:
        states = self._cache.get_states()
+        errors = self._cache.get_errors()
        with self._cache._lock:
            base = self._stats.snapshot(self._cache.connection, states)
-        return replace(base, tables={n: self._enrich(n, t) for n, t in base.tables.items()})
+        base = replace(base, errors=self._cache.error_total)
+        return replace(
+            base, tables={n: self._enrich(n, t, errors) for n, t in base.tables.items()}
+        )

-    def _enrich(self, name: str, table_stats: TableStats) -> TableStats:
-        """Annotate a TableStats with how it is refreshed and TTL staleness."""
+    def _enrich(
+        self, name: str, table_stats: TableStats, errors: dict[str, TableError]
+    ) -> TableStats:
+        """Annotate a TableStats with refresh tracking, TTL staleness and errors."""
        if name in self._delta:
            tracking = "delta"
        elif name in self._ttl:
@@ -115,22 +168,37 @@ class CachingEngine:
            age = self._cache.seconds_since_refresh(name)
            if age is not None and age > self._ttl[name]:
                state = TableState.STALE
+
+        err = errors.get(name)
+        if err is not None:
+            return replace(
+                table_stats,
+                tracking=tracking,
+                state=state,
+                last_error=err.message,
+                last_error_at=err.at,
+                consecutive_failures=err.consecutive,
+            )
        return replace(table_stats, tracking=tracking, state=state)

    def execute(self, sql: str, params: Params = None) -> list[dict]:
-        parsed = parse(sql, params)
-        with self._source_engine.connect() as sa_conn:
-            raw_conn = cast(sqlite3.Connection, sa_conn.connection.dbapi_connection)
+        parsed = parse(sql, params, dialect=self._dialect)
+        # The source connection is opened lazily — a pure cache hit never touches
+        # the source and never occupies a pool slot.
+        source = _LazySource(self._source_engine)
+        try:
            executor = QueryExecutor(
                self._cache,
                self._registry,
-                raw_conn,
+                source,
                self._stats,
                self._delta,
                self._ttl,
                self._index_columns,
            )
            return executor.execute(parsed)
+        finally:
+            source.close()

    def refresh(self) -> None:
        """Pull deltas for all delta-tracked tables now (also runs on a timer)."""
@@ -139,13 +207,13 @@ class CachingEngine:
    def _run_refresh(self) -> None:
        try:
            with self._source_engine.connect() as sa_conn:
-                raw_conn = cast(sqlite3.Connection, sa_conn.connection.dbapi_connection)
+                raw_conn = sa_conn.connection.dbapi_connection
                self._refresher.refresh(raw_conn)
                self._refresh_ttl(raw_conn)
        except Exception as e:
            logger.error(f"Refresh cycle failed: {e}")

-    def _refresh_ttl(self, source_conn: sqlite3.Connection) -> None:
+    def _refresh_ttl(self, source_conn: Any) -> None:
        """Proactively full-reload TTL-tracked tables whose cache has expired."""
        for table, ttl in self._ttl.items():
            if not self._cache.is_table_cached(table):
@@ -161,8 +229,10 @@ class CachingEngine:
            except Exception as e:
                logger.error(f"TTL refresh failed for {table!r}: {e}")

-    def _start_refresh_thread(self) -> None:
+    def _start_refresh_thread(self, initial_catch_up: bool = True) -> None:
        def loop() -> None:
+            if initial_catch_up:
+                self._run_refresh()  # off-main-thread startup catch-up
            event = threading.Event()
            while not event.wait(self._refresh_interval):
                self._run_refresh()
@@ -174,7 +244,7 @@ class CachingEngine:
    def invalidate(self, table: str) -> None:
        logger.info(f"Manually invalidating cache for table {table!r}")
        with self._cache._lock:
-            self._cache.connection.execute(f"DROP TABLE IF EXISTS {table}")
+            self._cache.connection.execute(f"DROP TABLE IF EXISTS {quote(table)}")
            self._cache.connection.execute(
                "DELETE FROM _sqlmem_tables WHERE table_name = ?", (table,)
            )
@@ -1,4 +1,4 @@
-import sqlite3
+from typing import Any

 from loguru import logger

@@ -14,7 +14,7 @@ class QueryExecutor:
        self,
        cache: CacheManager,
        registry: ColumnRegistry,
-        source_conn: sqlite3.Connection,
+        source_conn: Any,  # raw DBAPI connection (pyodbc/sqlite3/…) — only .execute() is used
        stats: StatsCollector,
        delta: dict[str, ResolvedDelta] | None = None,
        ttl: dict[str, int] | None = None,
@@ -25,10 +25,10 @@ class ParsedQuery:
    wildcard_tables: set[str] = field(default_factory=set)


-def parse(sql: str, params: Params = None) -> ParsedQuery:
+def parse(sql: str, params: Params = None, dialect: str = SQL_DIALECT) -> ParsedQuery:
    logger.debug(f"Parsing SQL: {sql!r}")

-    statement = sqlglot.parse_one(sql, dialect=SQL_DIALECT)
+    statement = sqlglot.parse_one(sql, dialect=dialect)

    if isinstance(statement, WRITE_TYPES):
        raise ReadOnlyError(
@@ -20,6 +20,11 @@ class TableStats:
    last_refresh: str
    state: str = TableState.READY
    tracking: str = "static"  # "delta" | "ttl" | "static"
+    # Most recent load/refresh failure for this table, if any. ``consecutive_failures``
+    # resets to 0 on the next success, so > 0 means the table is currently failing.
+    last_error: str | None = None
+    last_error_at: str | None = None
+    consecutive_failures: int = 0


@dataclass(frozen=True)
@@ -28,6 +33,7 @@ class Stats:
    misses: int
    refetches: int
    tables: dict[str, TableStats]
+    errors: int = 0  # total load/refresh failures since start


 class StatsCollector: