Add declarative TableSpec API with preload and fail-fast; fix shared-connection race

This commit is contained in:
Jan Doubravský
2026-06-11 13:39:56 +02:00
parent 46370fe651
commit 4a86b2282f
11 changed files with 500 additions and 37 deletions
+6 -1
View File
@@ -7,7 +7,8 @@ from ._coerce import to_sqlite_datetime as datetime_to_epoch_us
from .config import DEBUG
from .delta import DeltaConfig
from .engine import CachingEngine
from .exceptions import ReadOnlyError, UnsupportedQueryError
from .exceptions import ReadOnlyError, UndeclaredError, UnsupportedQueryError
from .spec import TTL, Delta, TableSpec
from .stats import Stats, TableStats
_DEFAULT_FORMAT = (
@@ -59,8 +60,12 @@ def add_sink(sink: Any, *, level: str | None = None, **kwargs: Any) -> None:
__all__ = [
"CachingEngine",
"DeltaConfig",
"Delta",
"TTL",
"TableSpec",
"ReadOnlyError",
"UnsupportedQueryError",
"UndeclaredError",
"Stats",
"TableStats",
"add_sink",
+24 -17
View File
@@ -311,23 +311,26 @@ class CacheManager:
return dict(self._last_run)
def is_table_cached(self, table: str) -> bool:
    """Return True if *table* has a row in the ``_sqlmem_tables`` metadata table.

    The lookup is issued exactly once and only while ``self._lock`` is held:
    the shared ``_conn`` must not be read while a writer uses it.
    """
    with self._lock:
        row = self._conn.execute(
            "SELECT 1 FROM _sqlmem_tables WHERE table_name = ?", (table,)
        ).fetchone()
    return row is not None
def is_table_full(self, table: str) -> bool:
    """True if the whole table (all columns) is cached — a SELECT * cache hit.

    Reads the ``is_full`` flag of *table* from ``_sqlmem_tables``. The query
    is issued once and only while ``self._lock`` is held, so the shared
    connection is never read outside the lock. A table with no metadata row
    counts as not full.
    """
    with self._lock:
        row = self._conn.execute(
            "SELECT is_full FROM _sqlmem_tables WHERE table_name = ?", (table,)
        ).fetchone()
    return bool(row and row[0])
def seconds_since_refresh(self, table: str) -> float | None:
"""Age of a cached table in seconds, or None if it is not cached."""
row = self._conn.execute(
"SELECT last_refresh_at FROM _sqlmem_tables WHERE table_name = ?", (table,)
).fetchone()
with self._lock:
row = self._conn.execute(
"SELECT last_refresh_at FROM _sqlmem_tables WHERE table_name = ?", (table,)
).fetchone()
if not row or not row[0]:
return None
last = datetime.fromisoformat(row[0])
@@ -576,7 +579,8 @@ class CacheManager:
def get_table_columns(self, table: str) -> list[str]:
    """Authoritative ordered column list of a cached table (via PRAGMA).

    ``PRAGMA table_info`` is run once, while ``self._lock`` is held, so the
    shared connection is never read outside the lock. The column name is the
    second field (index 1) of each returned row.
    """
    with self._lock:
        rows = self._conn.execute(f"PRAGMA table_info({quote(table)})").fetchall()
    return [r[1] for r in rows]
def create_unique_index(self, table: str, key_columns: list[str]) -> None:
@@ -590,9 +594,10 @@ class CacheManager:
self._conn.commit()
def get_last_synced_at(self, table: str) -> str | None:
    """Return the stored ``last_synced_at`` watermark of *table*, or None.

    The metadata row is read once, while ``self._lock`` is held, so the shared
    connection is never read outside the lock. ``None`` is returned when the
    table has no metadata row (or the stored value is NULL).
    """
    with self._lock:
        row = self._conn.execute(
            "SELECT last_synced_at FROM _sqlmem_tables WHERE table_name = ?", (table,)
        ).fetchone()
    # Stored in a TEXT column: an INTEGER-µs watermark (datetime_columns) comes
    # back as its digit string; delta._bind_watermark reconstructs the datetime.
    return row[0] if row else None
@@ -610,9 +615,10 @@ class CacheManager:
Returns an ``int`` for a datetime column stored as INTEGER µs, else the
ISO ``TEXT`` string."""
row = self._conn.execute(
f"SELECT MAX({quote(column)}) FROM {quote(table)}"
).fetchone()
with self._lock:
row = self._conn.execute(
f"SELECT MAX({quote(column)}) FROM {quote(table)}"
).fetchone()
return row[0] if row else None
def upsert_rows(self, table: str, columns: list[str], rows: list[tuple]) -> None:
@@ -629,7 +635,8 @@ class CacheManager:
self._conn.commit()
def count_rows(self, table: str) -> int:
    """Return the number of rows currently stored in the cached *table*.

    ``COUNT(*)`` is run once, while ``self._lock`` is held, so the shared
    connection is never read outside the lock.
    """
    with self._lock:
        row = self._conn.execute(f"SELECT COUNT(*) FROM {quote(table)}").fetchone()
    return int(row[0]) if row else 0
def db_size_bytes(self) -> int:
+118 -16
View File
@@ -18,12 +18,50 @@ from .config import (
SQL_DIALECT,
)
from .delta import DeltaConfig, DeltaRefresher, ResolvedDelta
from .exceptions import UndeclaredError
from .executor import QueryExecutor
from .parser import Params, parse
from .parser import Params, ParsedQuery, parse
from .registry import ColumnRegistry
from .spec import TTL, TableSpec
from .stats import Stats, StatsCollector, TableState, TableStats
def _specs_to_config(
tables: list[TableSpec],
) -> tuple[
dict[str, DeltaConfig],
dict[str, int],
dict[str, list[str | list[str]]],
dict[str, list[str]],
dict[str, list[str] | None],
]:
"""Convert declarative ``TableSpec``s into the engine's internal config dicts.
Returns ``(delta, ttl, indexes, datetime_columns, declared)`` — the first four
mirror the legacy kwargs; ``declared`` maps each table to its allowed columns
(``None`` = whole table / any column) for fail-fast query checking.
"""
delta: dict[str, DeltaConfig] = {}
ttl: dict[str, int] = {}
indexes: dict[str, list[str | list[str]]] = {}
datetime_columns: dict[str, list[str]] = {}
declared: dict[str, list[str] | None] = {}
for spec in tables:
if spec.name in declared:
raise ValueError(f"Duplicate TableSpec for table {spec.name!r}.")
declared[spec.name] = list(spec.columns) if spec.columns is not None else None
if spec.indexes:
indexes[spec.name] = list(spec.indexes)
if spec.datetime_columns:
datetime_columns[spec.name] = list(spec.datetime_columns)
refresh = spec.refresh
if isinstance(refresh, TTL):
ttl[spec.name] = refresh.seconds
elif isinstance(refresh, DeltaConfig):
delta[spec.name] = refresh
return delta, ttl, indexes, datetime_columns, declared
class _LazySource:
"""A source connection opened on first ``execute`` and shared across one query.
@@ -68,9 +106,25 @@ class CachingEngine:
pragmas: dict[str, str | int] | None = None,
datetime_columns: dict[str, list[str]] | None = None,
return_datetime: bool = True,
tables: list[TableSpec] | None = None,
blocking_startup_refresh: bool = False,
) -> None:
self._source_engine = source_engine
# Declarative mode: a list of TableSpecs is converted to the same internal
# config the legacy delta=/ttl=/indexes=/datetime_columns= kwargs produce,
# plus a declared-columns allowlist (for fail-fast) and preload set.
self._declared: dict[str, list[str] | None] | None = None
self._preload_specs: list[TableSpec] = []
if tables is not None:
if any(x is not None for x in (delta, ttl, indexes, datetime_columns)):
raise ValueError(
"Pass either tables=[TableSpec(...)] or the legacy "
"delta=/ttl=/indexes=/datetime_columns= kwargs, not both."
)
delta, ttl, indexes, datetime_columns, self._declared = _specs_to_config(tables)
self._preload_specs = [s for s in tables if s.preload]
use_memory = IN_MEMORY if in_memory is None else in_memory
self._dialect = dialect if dialect is not None else SQL_DIALECT
self._refresh_interval = (
@@ -101,12 +155,14 @@ class CachingEngine:
"reload), not both."
)
if self._delta or self._ttl:
# The startup catch-up (deltas/TTL reloads for tables restored from
# disk) can take a while on a cold start. By default it runs on the
# background thread so it never blocks application startup; callers
# who need the cache fully fresh before serving can opt back in.
if self._delta or self._ttl or self._preload_specs:
# Startup work (preload of declared tables + delta/TTL catch-up for
# tables restored from disk) can take a while on a cold start. By
# default it runs on the background thread so it never blocks
# application startup; callers who need the cache fully warm before
# serving can opt back in.
if blocking_startup_refresh:
self._preload()
self._run_refresh()
self._start_refresh_thread(initial_catch_up=not blocking_startup_refresh)
@@ -199,22 +255,67 @@ class CachingEngine:
)
return replace(table_stats, tracking=tracking, state=state, last_refresh=last_refresh)
def _make_executor(self, source: Any) -> QueryExecutor:
    """Build a ``QueryExecutor`` that reads from *source* and shares this
    engine's cache, registry, stats and refresh/index configuration."""
    # Positional order matters: cache, registry, source, stats, delta, ttl, indexes.
    executor_args = (
        self._cache,
        self._registry,
        source,
        self._stats,
        self._delta,
        self._ttl,
        self._index_columns,
    )
    return QueryExecutor(*executor_args)
def _check_declared(self, parsed: ParsedQuery) -> None:
"""In declarative mode, reject any table/column not declared up front."""
if self._declared is None:
return
for table in parsed.tables:
if table not in self._declared:
raise UndeclaredError(
f"Table {table!r} is not declared in tables=[TableSpec(...)]. "
"Add a TableSpec for it (declarative mode is a strict allowlist)."
)
allowed = self._declared[table]
if allowed is None:
continue # whole table declared — any column is fine
if table in parsed.wildcard_tables:
raise UndeclaredError(
f"SELECT * on {table!r} is not allowed: only columns {allowed} "
"are declared. List the columns explicitly or declare "
"columns=None for the whole table."
)
unknown = [c for c in parsed.columns_by_table.get(table, []) if c not in allowed]
if unknown:
raise UndeclaredError(
f"Column(s) {unknown} of {table!r} are not declared "
f"(declared: {allowed})."
)
def execute(self, sql: str, params: Params = None) -> list[dict]:
    """Parse *sql*, enforce the declarative allowlist, and run the query.

    The executor is built through ``_make_executor`` only; the previous inline
    ``QueryExecutor(...)`` construction duplicated that helper and made the
    helper-based return unreachable.

    Raises whatever ``parse``, ``_check_declared`` or the executor raise; the
    source handle is closed in every case.
    """
    parsed = parse(sql, params, dialect=self._dialect)
    self._check_declared(parsed)
    # The source connection is opened lazily — a pure cache hit never touches
    # the source and never occupies a pool slot.
    source = _LazySource(self._source_engine)
    try:
        return self._make_executor(source).execute(parsed)
    finally:
        source.close()
def _preload(self) -> None:
"""Load declared ``preload=True`` tables into the cache (skipping fresh copies)."""
if not self._preload_specs:
return
source = _LazySource(self._source_engine)
try:
executor = self._make_executor(source)
for spec in self._preload_specs:
try:
logger.info(f"Preloading {spec.name!r}")
executor.ensure_loaded(spec.name, spec.columns)
except Exception as e:
logger.error(f"Preload failed for {spec.name!r}: {e}")
finally:
source.close()
@@ -250,6 +351,7 @@ class CachingEngine:
def _start_refresh_thread(self, initial_catch_up: bool = True) -> None:
def loop() -> None:
if initial_catch_up:
self._preload() # off-main-thread declared-table preload
self._run_refresh() # off-main-thread startup catch-up
event = threading.Event()
while not event.wait(self._refresh_interval):
+10
View File
@@ -4,3 +4,13 @@ class ReadOnlyError(Exception):
class UnsupportedQueryError(Exception):
    """Raised when a query uses unsupported features (JOIN, SELECT *)."""
    # NOTE(review): the cache layer appears to handle whole-table (SELECT *)
    # caching — confirm the feature list in this docstring is still accurate.
class UndeclaredError(Exception):
    """Raised when a query references a table or column that was not declared.

    Only applies in declarative mode (``tables=[TableSpec(...)]``), where every
    table and column must be declared up front.

    Fail-fast by design: an undeclared table/column would otherwise trigger a
    silent (potentially multi-hour) lazy load/column-expansion, so it is surfaced
    immediately instead.
    """
+14
View File
@@ -42,6 +42,20 @@ class QueryExecutor:
self._ensure_table(table, parsed)
return self._run_in_memory(parsed)
def ensure_loaded(self, table: str, columns: list[str] | None) -> None:
    """Make sure *table* is present in the cache without executing a query.

    With an explicit *columns* list only those columns are requested through
    ``_ensure_columns``; with ``columns=None`` the whole table is requested
    through ``_ensure_full`` (``SELECT *`` semantics). All loading work is
    delegated to those two helpers and nothing is returned.
    """
    if columns is not None:
        self._ensure_columns(table, columns)
        return
    self._ensure_full(table)
def _ensure_table(self, table: str, parsed: ParsedQuery) -> None:
if table in parsed.wildcard_tables:
self._ensure_full(table)
+49
View File
@@ -0,0 +1,49 @@
"""Declarative table specs for ``CachingEngine(tables=[...])``.
Instead of the lazy "learn columns from queries" mode, an application can declare
each table up front — its columns, indexes, refresh strategy and datetime columns —
so the engine preloads them and rejects anything undeclared (fail-fast) rather than
silently triggering an expensive lazy load. The legacy ``delta=/ttl=/indexes=``
kwargs keep working; ``tables=`` is converted to the same internal config.
"""
from dataclasses import dataclass, field
from .delta import DeltaConfig
# Friendly alias for the declarative API: ``Delta`` is bound to the very same
# class object as ``DeltaConfig`` (``change_column`` + ``key_columns``), so the
# two names are interchangeable as a ``TableSpec.refresh`` strategy and an
# ``isinstance`` check against either name accepts both.
Delta = DeltaConfig
@dataclass(frozen=True)
class TTL:
    """Time-based refresh strategy: full-reload the table when older than *seconds*.

    Raises:
        TypeError: *seconds* is not a number (``bool`` is rejected too, since
            ``TTL(True)`` is almost certainly a mistake).
        ValueError: *seconds* is negative.
    """

    seconds: int

    def __post_init__(self) -> None:
        # Validate at declaration time so a bad TTL is reported where the spec
        # is written rather than later, when the value is compared to an age.
        value = self.seconds
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            raise TypeError(
                f"TTL seconds must be a number, got {type(value).__name__}."
            )
        if value < 0:
            raise ValueError(f"TTL seconds must be >= 0, got {value!r}.")
@dataclass(frozen=True)
class TableSpec:
    """Declarative specification of one cached table.

    *columns* lists the columns to cache; leave it ``None`` to cache the whole
    table (``SELECT *`` semantics) and allow any column. When columns are listed,
    a query asking for a column outside the list raises
    :class:`~sqlmem.exceptions.UndeclaredError`.

    *refresh* is a :class:`Delta` (change-column incremental sync) or :class:`TTL`
    (time-based full reload), or ``None`` for a static table loaded once.

    *preload=True* loads the table at startup (in the background by default) so the
    first query is a cache hit instead of paying a cold load; a copy already fresh
    in the persistent cache is skipped.

    Raises:
        TypeError: *columns*, *indexes* or *datetime_columns* is a bare string.
            A string is itself iterable, so ``columns="id"`` would otherwise be
            split into the characters ``["i", "d"]`` when the spec is converted
            with ``list(...)``.
    """

    name: str
    columns: list[str] | None = None
    indexes: list[str | list[str]] = field(default_factory=list)
    refresh: DeltaConfig | TTL | None = None
    datetime_columns: list[str] = field(default_factory=list)
    preload: bool = False

    def __post_init__(self) -> None:
        # Reading fields is allowed on a frozen dataclass; only assignment is not.
        for attr in ("columns", "indexes", "datetime_columns"):
            if isinstance(getattr(self, attr), str):
                raise TypeError(
                    f"TableSpec {self.name!r}: {attr} must be a list of names, "
                    f"not a bare string; use [{getattr(self, attr)!r}]."
                )