Add declarative TableSpec API with preload and fail-fast; fix shared-connection race
This commit is contained in:
@@ -7,7 +7,8 @@ from ._coerce import to_sqlite_datetime as datetime_to_epoch_us
|
||||
from .config import DEBUG
|
||||
from .delta import DeltaConfig
|
||||
from .engine import CachingEngine
|
||||
from .exceptions import ReadOnlyError, UnsupportedQueryError
|
||||
from .exceptions import ReadOnlyError, UndeclaredError, UnsupportedQueryError
|
||||
from .spec import TTL, Delta, TableSpec
|
||||
from .stats import Stats, TableStats
|
||||
|
||||
_DEFAULT_FORMAT = (
|
||||
@@ -59,8 +60,12 @@ def add_sink(sink: Any, *, level: str | None = None, **kwargs: Any) -> None:
|
||||
__all__ = [
|
||||
"CachingEngine",
|
||||
"DeltaConfig",
|
||||
"Delta",
|
||||
"TTL",
|
||||
"TableSpec",
|
||||
"ReadOnlyError",
|
||||
"UnsupportedQueryError",
|
||||
"UndeclaredError",
|
||||
"Stats",
|
||||
"TableStats",
|
||||
"add_sink",
|
||||
|
||||
+24
-17
@@ -311,23 +311,26 @@ class CacheManager:
|
||||
return dict(self._last_run)
|
||||
|
||||
def is_table_cached(self, table: str) -> bool:
|
||||
row = self._conn.execute(
|
||||
"SELECT 1 FROM _sqlmem_tables WHERE table_name = ?", (table,)
|
||||
).fetchone()
|
||||
with self._lock: # the shared _conn must not be read while a writer uses it
|
||||
row = self._conn.execute(
|
||||
"SELECT 1 FROM _sqlmem_tables WHERE table_name = ?", (table,)
|
||||
).fetchone()
|
||||
return row is not None
|
||||
|
||||
def is_table_full(self, table: str) -> bool:
|
||||
"""True if the whole table (all columns) is cached — a SELECT * cache hit."""
|
||||
row = self._conn.execute(
|
||||
"SELECT is_full FROM _sqlmem_tables WHERE table_name = ?", (table,)
|
||||
).fetchone()
|
||||
with self._lock:
|
||||
row = self._conn.execute(
|
||||
"SELECT is_full FROM _sqlmem_tables WHERE table_name = ?", (table,)
|
||||
).fetchone()
|
||||
return bool(row and row[0])
|
||||
|
||||
def seconds_since_refresh(self, table: str) -> float | None:
|
||||
"""Age of a cached table in seconds, or None if it is not cached."""
|
||||
row = self._conn.execute(
|
||||
"SELECT last_refresh_at FROM _sqlmem_tables WHERE table_name = ?", (table,)
|
||||
).fetchone()
|
||||
with self._lock:
|
||||
row = self._conn.execute(
|
||||
"SELECT last_refresh_at FROM _sqlmem_tables WHERE table_name = ?", (table,)
|
||||
).fetchone()
|
||||
if not row or not row[0]:
|
||||
return None
|
||||
last = datetime.fromisoformat(row[0])
|
||||
@@ -576,7 +579,8 @@ class CacheManager:
|
||||
|
||||
def get_table_columns(self, table: str) -> list[str]:
    """Authoritative ordered column list of a cached table (via PRAGMA).

    ``PRAGMA table_info`` is run on the shared connection while holding
    ``self._lock``; the second field of each returned row is the column name.
    An unknown table produces no rows and therefore an empty list.
    """
    with self._lock:
        rows = self._conn.execute(f"PRAGMA table_info({quote(table)})").fetchall()
    return [r[1] for r in rows]
|
||||
|
||||
def create_unique_index(self, table: str, key_columns: list[str]) -> None:
|
||||
@@ -590,9 +594,10 @@ class CacheManager:
|
||||
self._conn.commit()
|
||||
|
||||
def get_last_synced_at(self, table: str) -> str | None:
|
||||
row = self._conn.execute(
|
||||
"SELECT last_synced_at FROM _sqlmem_tables WHERE table_name = ?", (table,)
|
||||
).fetchone()
|
||||
with self._lock:
|
||||
row = self._conn.execute(
|
||||
"SELECT last_synced_at FROM _sqlmem_tables WHERE table_name = ?", (table,)
|
||||
).fetchone()
|
||||
# Stored in a TEXT column: an INTEGER-µs watermark (datetime_columns) comes
|
||||
# back as its digit string; delta._bind_watermark reconstructs the datetime.
|
||||
return row[0] if row else None
|
||||
@@ -610,9 +615,10 @@ class CacheManager:
|
||||
|
||||
Returns an ``int`` for a datetime column stored as INTEGER µs, else the
|
||||
ISO ``TEXT`` string."""
|
||||
row = self._conn.execute(
|
||||
f"SELECT MAX({quote(column)}) FROM {quote(table)}"
|
||||
).fetchone()
|
||||
with self._lock:
|
||||
row = self._conn.execute(
|
||||
f"SELECT MAX({quote(column)}) FROM {quote(table)}"
|
||||
).fetchone()
|
||||
return row[0] if row else None
|
||||
|
||||
def upsert_rows(self, table: str, columns: list[str], rows: list[tuple]) -> None:
|
||||
@@ -629,7 +635,8 @@ class CacheManager:
|
||||
self._conn.commit()
|
||||
|
||||
def count_rows(self, table: str) -> int:
    """Return the number of rows currently stored in the cached *table*.

    The ``COUNT(*)`` runs on the shared connection while holding ``self._lock``.
    """
    with self._lock:
        row = self._conn.execute(f"SELECT COUNT(*) FROM {quote(table)}").fetchone()
    return int(row[0]) if row else 0
|
||||
|
||||
def db_size_bytes(self) -> int:
|
||||
|
||||
+118
-16
@@ -18,12 +18,50 @@ from .config import (
|
||||
SQL_DIALECT,
|
||||
)
|
||||
from .delta import DeltaConfig, DeltaRefresher, ResolvedDelta
|
||||
from .exceptions import UndeclaredError
|
||||
from .executor import QueryExecutor
|
||||
from .parser import Params, parse
|
||||
from .parser import Params, ParsedQuery, parse
|
||||
from .registry import ColumnRegistry
|
||||
from .spec import TTL, TableSpec
|
||||
from .stats import Stats, StatsCollector, TableState, TableStats
|
||||
|
||||
|
||||
def _specs_to_config(
    tables: list[TableSpec],
) -> tuple[
    dict[str, DeltaConfig],
    dict[str, int],
    dict[str, list[str | list[str]]],
    dict[str, list[str]],
    dict[str, list[str] | None],
]:
    """Convert declarative ``TableSpec``s into the engine's internal config dicts.

    Returns ``(delta, ttl, indexes, datetime_columns, declared)`` — the first four
    mirror the legacy kwargs; ``declared`` maps each table to its allowed columns
    (``None`` = whole table / any column) for fail-fast query checking.

    Raises:
        ValueError: if two specs share the same table name.
        TypeError: if a spec's ``refresh`` is neither ``TTL``, ``DeltaConfig``
            nor ``None`` (previously such a value was silently ignored and the
            table was treated as static).
    """
    delta: dict[str, DeltaConfig] = {}
    ttl: dict[str, int] = {}
    indexes: dict[str, list[str | list[str]]] = {}
    datetime_columns: dict[str, list[str]] = {}
    declared: dict[str, list[str] | None] = {}
    for spec in tables:
        if spec.name in declared:
            raise ValueError(f"Duplicate TableSpec for table {spec.name!r}.")
        # Copy the lists so later mutation of the spec's lists cannot alter the
        # engine's configuration.
        declared[spec.name] = list(spec.columns) if spec.columns is not None else None
        if spec.indexes:
            indexes[spec.name] = list(spec.indexes)
        if spec.datetime_columns:
            datetime_columns[spec.name] = list(spec.datetime_columns)
        refresh = spec.refresh
        if isinstance(refresh, TTL):
            ttl[spec.name] = refresh.seconds
        elif isinstance(refresh, DeltaConfig):
            delta[spec.name] = refresh
        elif refresh is not None:
            # Fail fast on a mistyped strategy (e.g. a bare int) instead of
            # quietly never refreshing the table.
            raise TypeError(
                f"TableSpec {spec.name!r}: refresh must be a TTL, a Delta/DeltaConfig "
                f"or None, not {type(refresh).__name__}."
            )
    return delta, ttl, indexes, datetime_columns, declared
|
||||
|
||||
|
||||
class _LazySource:
|
||||
"""A source connection opened on first ``execute`` and shared across one query.
|
||||
|
||||
@@ -68,9 +106,25 @@ class CachingEngine:
|
||||
pragmas: dict[str, str | int] | None = None,
|
||||
datetime_columns: dict[str, list[str]] | None = None,
|
||||
return_datetime: bool = True,
|
||||
tables: list[TableSpec] | None = None,
|
||||
blocking_startup_refresh: bool = False,
|
||||
) -> None:
|
||||
self._source_engine = source_engine
|
||||
|
||||
# Declarative mode: a list of TableSpecs is converted to the same internal
|
||||
# config the legacy delta=/ttl=/indexes=/datetime_columns= kwargs produce,
|
||||
# plus a declared-columns allowlist (for fail-fast) and preload set.
|
||||
self._declared: dict[str, list[str] | None] | None = None
|
||||
self._preload_specs: list[TableSpec] = []
|
||||
if tables is not None:
|
||||
if any(x is not None for x in (delta, ttl, indexes, datetime_columns)):
|
||||
raise ValueError(
|
||||
"Pass either tables=[TableSpec(...)] or the legacy "
|
||||
"delta=/ttl=/indexes=/datetime_columns= kwargs, not both."
|
||||
)
|
||||
delta, ttl, indexes, datetime_columns, self._declared = _specs_to_config(tables)
|
||||
self._preload_specs = [s for s in tables if s.preload]
|
||||
|
||||
use_memory = IN_MEMORY if in_memory is None else in_memory
|
||||
self._dialect = dialect if dialect is not None else SQL_DIALECT
|
||||
self._refresh_interval = (
|
||||
@@ -101,12 +155,14 @@ class CachingEngine:
|
||||
"reload), not both."
|
||||
)
|
||||
|
||||
if self._delta or self._ttl:
|
||||
# The startup catch-up (deltas/TTL reloads for tables restored from
|
||||
# disk) can take a while on a cold start. By default it runs on the
|
||||
# background thread so it never blocks application startup; callers
|
||||
# who need the cache fully fresh before serving can opt back in.
|
||||
if self._delta or self._ttl or self._preload_specs:
|
||||
# Startup work (preload of declared tables + delta/TTL catch-up for
|
||||
# tables restored from disk) can take a while on a cold start. By
|
||||
# default it runs on the background thread so it never blocks
|
||||
# application startup; callers who need the cache fully warm before
|
||||
# serving can opt back in.
|
||||
if blocking_startup_refresh:
|
||||
self._preload()
|
||||
self._run_refresh()
|
||||
self._start_refresh_thread(initial_catch_up=not blocking_startup_refresh)
|
||||
|
||||
@@ -199,22 +255,67 @@ class CachingEngine:
|
||||
)
|
||||
return replace(table_stats, tracking=tracking, state=state, last_refresh=last_refresh)
|
||||
|
||||
def _make_executor(self, source: Any) -> QueryExecutor:
    """Build a ``QueryExecutor`` that reads from *source*.

    The executor receives this engine's cache, registry, stats collector and
    the delta / TTL / index-column configuration, in that positional order.
    """
    executor_args = (
        self._cache,
        self._registry,
        source,
        self._stats,
        self._delta,
        self._ttl,
        self._index_columns,
    )
    return QueryExecutor(*executor_args)
|
||||
|
||||
def _check_declared(self, parsed: ParsedQuery) -> None:
|
||||
"""In declarative mode, reject any table/column not declared up front."""
|
||||
if self._declared is None:
|
||||
return
|
||||
for table in parsed.tables:
|
||||
if table not in self._declared:
|
||||
raise UndeclaredError(
|
||||
f"Table {table!r} is not declared in tables=[TableSpec(...)]. "
|
||||
"Add a TableSpec for it (declarative mode is a strict allowlist)."
|
||||
)
|
||||
allowed = self._declared[table]
|
||||
if allowed is None:
|
||||
continue # whole table declared — any column is fine
|
||||
if table in parsed.wildcard_tables:
|
||||
raise UndeclaredError(
|
||||
f"SELECT * on {table!r} is not allowed: only columns {allowed} "
|
||||
"are declared. List the columns explicitly or declare "
|
||||
"columns=None for the whole table."
|
||||
)
|
||||
unknown = [c for c in parsed.columns_by_table.get(table, []) if c not in allowed]
|
||||
if unknown:
|
||||
raise UndeclaredError(
|
||||
f"Column(s) {unknown} of {table!r} are not declared "
|
||||
f"(declared: {allowed})."
|
||||
)
|
||||
|
||||
def execute(self, sql: str, params: Params = None) -> list[dict]:
    """Parse *sql*, enforce the declarative allowlist, and run the query.

    Args:
        sql: The query text, parsed with this engine's configured dialect.
        params: Optional bind parameters passed through to the parser.

    Returns:
        The rows produced by the executor, as a list of dicts.

    Raises:
        UndeclaredError: in declarative mode, when the query references a
            table or column that was not declared.
    """
    parsed = parse(sql, params, dialect=self._dialect)
    self._check_declared(parsed)
    # The source connection is opened lazily — a pure cache hit never touches
    # the source and never occupies a pool slot.
    source = _LazySource(self._source_engine)
    try:
        return self._make_executor(source).execute(parsed)
    finally:
        source.close()
|
||||
|
||||
def _preload(self) -> None:
|
||||
"""Load declared ``preload=True`` tables into the cache (skipping fresh copies)."""
|
||||
if not self._preload_specs:
|
||||
return
|
||||
source = _LazySource(self._source_engine)
|
||||
try:
|
||||
executor = self._make_executor(source)
|
||||
for spec in self._preload_specs:
|
||||
try:
|
||||
logger.info(f"Preloading {spec.name!r}…")
|
||||
executor.ensure_loaded(spec.name, spec.columns)
|
||||
except Exception as e:
|
||||
logger.error(f"Preload failed for {spec.name!r}: {e}")
|
||||
finally:
|
||||
source.close()
|
||||
|
||||
@@ -250,6 +351,7 @@ class CachingEngine:
|
||||
def _start_refresh_thread(self, initial_catch_up: bool = True) -> None:
|
||||
def loop() -> None:
|
||||
if initial_catch_up:
|
||||
self._preload() # off-main-thread declared-table preload
|
||||
self._run_refresh() # off-main-thread startup catch-up
|
||||
event = threading.Event()
|
||||
while not event.wait(self._refresh_interval):
|
||||
|
||||
@@ -4,3 +4,13 @@ class ReadOnlyError(Exception):
|
||||
|
||||
class UnsupportedQueryError(Exception):
    """Signals that a query relies on a feature that is not supported (JOIN, SELECT *)."""
|
||||
|
||||
|
||||
class UndeclaredError(Exception):
    """A query touched a table or column that was never declared.

    Only raised in declarative mode (``tables=[TableSpec(...)]``). The error is
    deliberate fail-fast behaviour: without it, an undeclared table/column
    would trigger a silent (potentially multi-hour) lazy load or
    column-expansion, so the problem is reported immediately instead.
    """
|
||||
|
||||
@@ -42,6 +42,20 @@ class QueryExecutor:
|
||||
self._ensure_table(table, parsed)
|
||||
return self._run_in_memory(parsed)
|
||||
|
||||
def ensure_loaded(self, table: str, columns: list[str] | None) -> None:
|
||||
"""Preload *table* into the cache without running a query.
|
||||
|
||||
``columns=None`` loads the whole table (``SELECT *`` semantics); otherwise
|
||||
only the listed columns. Reuses the same load path as a real query — delta
|
||||
key/change + index columns are augmented, the registry and watermark are
|
||||
updated, and double-checked locking skips a copy already fresh in the
|
||||
cache — but never materializes any rows (unlike :meth:`execute`).
|
||||
"""
|
||||
if columns is None:
|
||||
self._ensure_full(table)
|
||||
else:
|
||||
self._ensure_columns(table, columns)
|
||||
|
||||
def _ensure_table(self, table: str, parsed: ParsedQuery) -> None:
|
||||
if table in parsed.wildcard_tables:
|
||||
self._ensure_full(table)
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
"""Declarative table specs for ``CachingEngine(tables=[...])``.
|
||||
|
||||
Instead of the lazy "learn columns from queries" mode, an application can declare
|
||||
each table up front — its columns, indexes, refresh strategy and datetime columns —
|
||||
so the engine preloads them and rejects anything undeclared (fail-fast) rather than
|
||||
silently triggering an expensive lazy load. The legacy ``delta=/ttl=/indexes=``
|
||||
kwargs keep working; ``tables=`` is converted to the same internal config.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from .delta import DeltaConfig
|
||||
|
||||
# Friendly alias for the declarative API; ``Delta`` and ``DeltaConfig`` are the
|
||||
# same type (``change_column`` + ``key_columns``), so either may be used as a
|
||||
# ``TableSpec.refresh`` strategy.
|
||||
Delta = DeltaConfig
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TTL:
    """Refresh strategy based on age.

    A table using this strategy is fully reloaded once its cached copy is
    older than *seconds*.
    """

    # Maximum age of the cached copy, in seconds, before a full reload.
    seconds: int
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TableSpec:
    """Up-front description of a single table to cache.

    *columns* — the columns to cache. ``None`` (the default) caches the whole
    table (``SELECT *`` semantics) and permits any column. With an explicit
    list, a query asking for a column outside it raises
    :class:`~sqlmem.exceptions.UndeclaredError`.

    *refresh* — how the table is kept current: a :class:`Delta`
    (change-column incremental sync), a :class:`TTL` (time-based full reload),
    or ``None`` for a static table loaded once.

    *preload=True* — load the table at startup (in the background by default)
    so the first query is a cache hit instead of paying a cold load; a copy
    already fresh in the persistent cache is skipped.
    """

    # Name of the table.
    name: str
    # Allowed/cached columns; None means the whole table.
    columns: list[str] | None = None
    # Index definitions: a column name, or a list of names per entry.
    indexes: list[str | list[str]] = field(default_factory=list)
    # Refresh strategy; None means the table is loaded once and never refreshed.
    refresh: DeltaConfig | TTL | None = None
    # Columns to treat as datetimes.
    datetime_columns: list[str] = field(default_factory=list)
    # Whether to load the table at startup.
    preload: bool = False
|
||||
Reference in New Issue
Block a user