Add secondary indexes to accelerate cache lookups

This commit is contained in:
Jan Doubravský
2026-06-05 18:17:55 +02:00
parent 286a5f207d
commit 757a8f4eba
8 changed files with 213 additions and 4 deletions
+10
View File
@@ -6,6 +6,16 @@ All notable changes to this project will be documented in this file.
--- ---
## [1.6.0] - 2026-06-05
### Added
- **Secondary indexes** — `CachingEngine(engine, indexes={"VW_X": ["col", ["a", "b"]]})` creates indexes on the in-memory cache to accelerate `WHERE`/`JOIN` lookups. Index columns are auto-loaded so the index exists from the first load, and indexes are recreated after every (re)load and persist in `cache.db`. Combines freely with `delta` and `ttl`.
### Changed
- `pyproject.toml` — bumped version to `1.6.0`
---
## [1.5.0] - 2026-06-05 ## [1.5.0] - 2026-06-05
### Added ### Added
+20
View File
@@ -218,6 +218,26 @@ engine = CachingEngine(
engine.refresh() # also reloads any expired TTL tables on demand engine.refresh() # also reloads any expired TTL tables on demand
``` ```
## Secondary indexes
To accelerate lookups, you can declare **secondary indexes** per table — they are created on the in-memory SQLite copy so `WHERE`/`JOIN` filters on those columns run as indexed searches instead of full scans:
```python
engine = CachingEngine(
base_engine,
indexes={
"VW_P_PRATVALUES": ["PRODUCT_PRODUCTNR"], # single-column index
"VW_ELEMENTS": [["ELEMENT_ID", "ELEMENTVARIANT_ID"], "ELEMENTVARIANT_NAME"],
},
)
```
Each value is a list of index definitions: a string is a single-column index, a nested list is a composite index.
- **Index columns are pulled into the cache automatically** (like delta key columns), so the index exists from the first load even if your queries don't select those columns.
- Indexes are **recreated after every (re)load** — full loads, TTL reloads, and `invalidate()` + re-fetch all rebuild them — so they're always present, and they persist in `cache.db` across restarts.
- Delta-tracked tables already get a unique index on their key columns; secondary indexes are independent and can be combined with `delta` or `ttl`.
## Persistence ## Persistence
The in-memory cache is persisted to `cache.db` on disk: The in-memory cache is persisted to `cache.db` on disk:
+1
View File
@@ -193,6 +193,7 @@ SQLMEM_DEBUG=true # DEBUG level — podrobný výpis každého dotazu, cache o
- [x] **Třídílné názvy tabulek**: `[catalog].[schema].[table]` se cachuje pod base name, in-memory dotaz prefix stripuje. - [x] **Třídílné názvy tabulek**: `[catalog].[schema].[table]` se cachuje pod base name, in-memory dotaz prefix stripuje.
- [x] **Inkrementální (delta) refresh**: per-tabulku `DeltaConfig(change_column, key_columns)` — sync jen změněných řádků přes datový watermark `max(change_column)` (`>=` + idempotentní upsert podle klíče), catch-up na startu + background thread (`SQLMEM_REFRESH_INTERVAL`, default 300 s). PK se auto-zjistí ze zdrojové DB, pro views nutno zadat ručně. - [x] **Inkrementální (delta) refresh**: per-tabulku `DeltaConfig(change_column, key_columns)` — sync jen změněných řádků přes datový watermark `max(change_column)` (`>=` + idempotentní upsert podle klíče), catch-up na startu + background thread (`SQLMEM_REFRESH_INTERVAL`, default 300 s). PK se auto-zjistí ze zdrojové DB, pro views nutno zadat ručně.
- [x] **`engine.reset()`**: smaže celou cache (RAM + `cache.db`) pro čistý rebuild po strukturální změně. - [x] **`engine.reset()`**: smaže celou cache (RAM + `cache.db`) pro čistý rebuild po strukturální změně.
- [x] **Sekundární indexy**: `indexes={"VW_X": ["col", ["a","b"]]}` — indexy na in-memory cache pro zrychlení `WHERE`/`JOIN`; index-sloupce se auto-dotáhnou, indexy se obnoví po každém (re)loadu.
- [x] **TTL na úrovni tabulky**: `ttl={"VW_X": 300}` — pro tabulky bez timestamp sloupce. Garantuje, že cache není starší než interval (full reload při čtení po expiraci + proaktivně na pozadí). - [x] **TTL na úrovni tabulky**: `ttl={"VW_X": 300}` — pro tabulky bez timestamp sloupce. Garantuje, že cache není starší než interval (full reload při čtení po expiraci + proaktivně na pozadí).
## TODO — budoucí funkce ## TODO — budoucí funkce
+1 -1
View File
@@ -1,6 +1,6 @@
[project] [project]
name = "sqlmem" name = "sqlmem"
version = "1.5.0" version = "1.6.0"
description = "" description = ""
authors = [ authors = [
{name = "jan.doubravsky@gmail.com"} {name = "jan.doubravsky@gmail.com"}
+33
View File
@@ -2,6 +2,7 @@ import atexit
import signal import signal
import sqlite3 import sqlite3
import threading import threading
from dataclasses import dataclass
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
@@ -15,6 +16,12 @@ from .stats import TableState
SCHEMA_VERSION = 3 SCHEMA_VERSION = 3
@dataclass(frozen=True)
class _Index:
    """Immutable definition of one secondary index registered for a cached table."""

    # Name under which the index is created in the in-memory SQLite cache.
    name: str
    # Column names covered by the index, in index order
    # (one entry for a single-column index, several for a composite one).
    columns: tuple[str, ...]
class CacheManager: class CacheManager:
def __init__(self, db_path: Path, backup_interval: int) -> None: def __init__(self, db_path: Path, backup_interval: int) -> None:
self._db_path = db_path self._db_path = db_path
@@ -23,6 +30,7 @@ class CacheManager:
self._lock = threading.Lock() # serializes connection access self._lock = threading.Lock() # serializes connection access
self._load_lock = threading.Lock() # serializes full table loads self._load_lock = threading.Lock() # serializes full table loads
self._states: dict[str, str] = {} # table → live processing state self._states: dict[str, str] = {} # table → live processing state
self._index_defs: dict[str, list[_Index]] = {} # table → secondary indexes
self._closed = False self._closed = False
self._ensure_meta_tables() self._ensure_meta_tables()
@@ -190,6 +198,30 @@ class CacheManager:
def clear_state(self, table: str) -> None: def clear_state(self, table: str) -> None:
self._states.pop(table, None) self._states.pop(table, None)
def add_index(self, table: str, columns: list[str]) -> None:
    """Register a secondary index to (re)create on *columns* after each load.

    Registering the same column list twice for a table is a no-op.

    Args:
        table: Name of the cached table the index belongs to.
        columns: Indexed column names, in index order.

    Raises:
        ValueError: If *columns* is empty (an index needs at least one column).
    """
    cols = tuple(columns)
    if not cols:
        raise ValueError(f"Index on {table!r} needs at least one column.")

    defs = self._index_defs.setdefault(table, [])
    # Deduplicate on the column tuple, not on the generated name: joining with
    # "_" is ambiguous (["a_b"] and ["a", "b"] yield the same name), which used
    # to drop the second, different index silently.
    if any(d.columns == cols for d in defs):
        return

    # Index names are unique per SQLite database, not per table, so look at the
    # names registered for every table before picking one.
    taken = {d.name for table_defs in self._index_defs.values() for d in table_defs}
    base = "sqlmem_idx_" + "_".join([table, *cols])
    name = base
    suffix = 2
    while name in taken:
        name = f"{base}_{suffix}"
        suffix += 1
    defs.append(_Index(name=name, columns=cols))
def _create_indexes(self, table: str, available: list[str]) -> None:
    """Create the registered secondary indexes whose columns are all cached.

    Indexes that reference a column missing from *available* are skipped with
    a warning instead of failing the load.

    Args:
        table: Name of the cached table that was just (re)loaded.
        available: Column names present in the cached copy of *table*.
    """

    def quote(identifier: str) -> str:
        # Double-quote the identifier (doubling embedded quotes) so names with
        # spaces, punctuation or reserved words still form valid SQL.
        return '"' + identifier.replace('"', '""') + '"'

    available_set = set(available)
    for idx in self._index_defs.get(table, []):
        if not idx.columns:
            # An empty tuple is a subset of anything and would otherwise produce
            # the invalid statement "CREATE INDEX ... ()".
            logger.warning(f"Skipping index {idx.name!r}: no columns defined.")
            continue
        if not set(idx.columns) <= available_set:
            logger.warning(
                f"Skipping index {idx.name!r}: columns {idx.columns} not all cached."
            )
            continue
        cols = ", ".join(idx.columns)
        quoted_cols = ", ".join(quote(col) for col in idx.columns)
        with self._lock:
            self._mem_conn.execute(
                f"CREATE INDEX IF NOT EXISTS {quote(idx.name)} "
                f"ON {quote(table)} ({quoted_cols})"
            )
            self._mem_conn.commit()
        logger.debug(f"Index {idx.name!r} ready on {table} ({cols})")
def load_table( def load_table(
self, self,
table: str, table: str,
@@ -243,6 +275,7 @@ class CacheManager:
self.set_state(table, TableState.ERROR) self.set_state(table, TableState.ERROR)
raise raise
self._create_indexes(table, columns)
self.mark_table_refreshed(table, total, full) self.mark_table_refreshed(table, total, full)
self.set_state(table, TableState.READY) self.set_state(table, TableState.READY)
logger.info(f"Table {table!r} cached ({total} rows, columns: {columns})") logger.info(f"Table {table!r} cached ({total} rows, columns: {columns})")
+25 -1
View File
@@ -24,6 +24,7 @@ class CachingEngine:
source_engine: Engine, source_engine: Engine,
delta: dict[str, DeltaConfig] | None = None, delta: dict[str, DeltaConfig] | None = None,
ttl: dict[str, int] | None = None, ttl: dict[str, int] | None = None,
indexes: dict[str, list[str | list[str]]] | None = None,
) -> None: ) -> None:
self._source_engine = source_engine self._source_engine = source_engine
self._cache = CacheManager(CACHE_DB_PATH, BACKUP_INTERVAL_SECONDS) self._cache = CacheManager(CACHE_DB_PATH, BACKUP_INTERVAL_SECONDS)
@@ -32,6 +33,7 @@ class CachingEngine:
self._refresh_interval = REFRESH_INTERVAL_SECONDS self._refresh_interval = REFRESH_INTERVAL_SECONDS
self._delta = self._resolve_delta(delta or {}) self._delta = self._resolve_delta(delta or {})
self._ttl = dict(ttl or {}) self._ttl = dict(ttl or {})
self._index_columns = self._register_indexes(indexes or {})
self._refresher = DeltaRefresher(self._cache, self._delta) self._refresher = DeltaRefresher(self._cache, self._delta)
overlap = set(self._delta) & set(self._ttl) overlap = set(self._delta) & set(self._ttl)
@@ -48,6 +50,22 @@ class CachingEngine:
logger.info("CachingEngine initialized.") logger.info("CachingEngine initialized.")
def _register_indexes(
self, indexes: dict[str, list[str | list[str]]]
) -> dict[str, list[str]]:
"""Register secondary indexes on the cache; return columns to load per table."""
index_columns: dict[str, list[str]] = {}
for table, specs in indexes.items():
wanted: list[str] = []
for spec in specs:
columns = [spec] if isinstance(spec, str) else list(spec)
self._cache.add_index(table, columns)
for col in columns:
if col not in wanted:
wanted.append(col)
index_columns[table] = wanted
return index_columns
def _resolve_delta(self, delta: dict[str, DeltaConfig]) -> dict[str, ResolvedDelta]: def _resolve_delta(self, delta: dict[str, DeltaConfig]) -> dict[str, ResolvedDelta]:
"""Resolve each DeltaConfig, auto-discovering the primary key when omitted.""" """Resolve each DeltaConfig, auto-discovering the primary key when omitted."""
resolved: dict[str, ResolvedDelta] = {} resolved: dict[str, ResolvedDelta] = {}
@@ -95,7 +113,13 @@ class CachingEngine:
with self._source_engine.connect() as sa_conn: with self._source_engine.connect() as sa_conn:
raw_conn = cast(sqlite3.Connection, sa_conn.connection.dbapi_connection) raw_conn = cast(sqlite3.Connection, sa_conn.connection.dbapi_connection)
executor = QueryExecutor( executor = QueryExecutor(
self._cache, self._registry, raw_conn, self._stats, self._delta, self._ttl self._cache,
self._registry,
raw_conn,
self._stats,
self._delta,
self._ttl,
self._index_columns,
) )
return executor.execute(parsed) return executor.execute(parsed)
+7 -2
View File
@@ -18,6 +18,7 @@ class QueryExecutor:
stats: StatsCollector, stats: StatsCollector,
delta: dict[str, ResolvedDelta] | None = None, delta: dict[str, ResolvedDelta] | None = None,
ttl: dict[str, int] | None = None, ttl: dict[str, int] | None = None,
index_columns: dict[str, list[str]] | None = None,
) -> None: ) -> None:
self._cache = cache self._cache = cache
self._registry = registry self._registry = registry
@@ -25,6 +26,7 @@ class QueryExecutor:
self._stats = stats self._stats = stats
self._delta = delta or {} self._delta = delta or {}
self._ttl = ttl or {} self._ttl = ttl or {}
self._index_columns = index_columns or {}
def _ttl_expired(self, table: str) -> bool: def _ttl_expired(self, table: str) -> bool:
"""True if *table* has a TTL and its cached copy is older than that TTL.""" """True if *table* has a TTL and its cached copy is older than that TTL."""
@@ -96,12 +98,15 @@ class QueryExecutor:
self._load(table, all_columns, full=full) self._load(table, all_columns, full=full)
def _load(self, table: str, columns: list[str], full: bool) -> None: def _load(self, table: str, columns: list[str], full: bool) -> None:
"""Fetch *table* into cache, adding delta key/timestamp columns when tracked.""" """Fetch *table* into cache, adding delta key/timestamp and index columns."""
cfg = self._delta.get(table) cfg = self._delta.get(table)
extra = list(self._index_columns.get(table, []))
if cfg: if cfg:
# The cache must always hold the key (to upsert) and the change column # The cache must always hold the key (to upsert) and the change column
# (to compute the watermark), even if no query referenced them. # (to compute the watermark), even if no query referenced them.
columns = list(dict.fromkeys([*columns, *cfg.key_columns, cfg.change_column])) extra += [*cfg.key_columns, cfg.change_column]
if extra:
columns = list(dict.fromkeys([*columns, *extra]))
self._cache.load_table(table, columns, self._source_conn, full=full) self._cache.load_table(table, columns, self._source_conn, full=full)
self._registry.update(table, columns) self._registry.update(table, columns)
+116
View File
@@ -0,0 +1,116 @@
import sqlite3
import pytest
from sqlalchemy import create_engine
import sqlmem.engine as eng_mod
from sqlmem import CachingEngine
from sqlmem.cache import CacheManager
def index_names(conn, table=None):
    """Return the names of the indexes present in *conn*'s schema.

    Args:
        conn: Open SQLite connection to inspect.
        table: When given, only indexes belonging to this table are returned;
            when ``None`` (default), indexes of every table are returned.
    """
    sql = "SELECT name FROM sqlite_master WHERE type = 'index'"
    params = ()
    if table is not None:
        # The parameter used to be accepted but ignored; honour it.
        sql += " AND tbl_name = ?"
        params = (table,)
    return {row[0] for row in conn.execute(sql, params).fetchall()}
# --- cache level ------------------------------------------------------------
@pytest.fixture
def source_conn():
    """Yield an in-memory source database holding a 100-row ``big`` table."""
    rows = [(str(i), f"v{i}") for i in range(100)]
    source = sqlite3.connect(":memory:")
    source.execute("CREATE TABLE big (id TEXT, val TEXT)")
    source.executemany("INSERT INTO big VALUES (?, ?)", rows)
    source.commit()
    yield source
    source.close()
@pytest.fixture
def cache(tmp_path):
    """Yield a CacheManager backed by a temporary ``cache.db``; close it afterwards."""
    cache_file = tmp_path / "cache.db"
    manager = CacheManager(db_path=cache_file, backup_interval=9999)
    yield manager
    manager.close()
def test_index_created_on_load(cache, source_conn):
    """A registered index exists once the table has been loaded."""
    cache.add_index("big", ["val"])
    cache.load_table("big", ["id", "val"], source_conn)
    created = index_names(cache.connection)
    assert "sqlmem_idx_big_val" in created
def test_index_used_by_query_planner(cache, source_conn):
    """The query plan for a filter on the indexed column mentions the index."""
    cache.add_index("big", ["val"])
    cache.load_table("big", ["id", "val"], source_conn)
    query = "EXPLAIN QUERY PLAN SELECT id FROM big WHERE val = 'v50'"
    plan_lines = [str(row) for row in cache.connection.execute(query).fetchall()]
    assert any("sqlmem_idx_big_val" in line for line in plan_lines)
def test_index_skipped_when_columns_not_cached(cache, source_conn):
    """An index on a column that is not cached is skipped without raising."""
    cache.add_index("big", ["missing_col"])
    # Loading must succeed even though the index cannot be built.
    cache.load_table("big", ["id", "val"], source_conn)
    present = index_names(cache.connection)
    assert "sqlmem_idx_big_missing_col" not in present
def test_index_recreated_on_reload(cache, source_conn):
    """The index is still present after the same table is loaded a second time."""
    cache.add_index("big", ["val"])
    for _ in range(2):  # initial load, then a reload (staging swap)
        cache.load_table("big", ["id", "val"], source_conn)
    present = index_names(cache.connection)
    assert "sqlmem_idx_big_val" in present
# --- engine level -----------------------------------------------------------
@pytest.fixture
def source_engine(tmp_path):
    """Yield a SQLAlchemy engine over a file-backed source DB with 20 products."""
    source_file = tmp_path / "source.db"
    products = [(str(i), f"n{i}", f"{i}.00") for i in range(20)]

    seed = sqlite3.connect(source_file)
    seed.execute("CREATE TABLE products (id TEXT, name TEXT, price TEXT)")
    seed.executemany("INSERT INTO products VALUES (?, ?, ?)", products)
    seed.commit()
    seed.close()

    sa_engine = create_engine(f"sqlite:///{source_file}")
    yield sa_engine
    sa_engine.dispose()
@pytest.fixture
def patched_cache(tmp_path, monkeypatch):
    """Point the engine module at a temporary cache file and a long backup interval."""
    overrides = (
        ("CACHE_DB_PATH", tmp_path / "cache.db"),
        ("BACKUP_INTERVAL_SECONDS", 9999),
    )
    for attribute, value in overrides:
        monkeypatch.setattr(eng_mod, attribute, value)
def test_index_column_auto_loaded_even_if_not_selected(source_engine, patched_cache):
    """Index columns are pulled into the cache even when the query omits them."""
    engine = CachingEngine(source_engine, indexes={"products": ["name"]})
    # Close the engine even when an assertion fails, so a failing test does not
    # leave it open.
    try:
        engine.execute("SELECT id FROM products")  # does not select 'name'
        cols = {
            r[1]
            for r in engine._cache.connection.execute(
                "PRAGMA table_info(products)"
            ).fetchall()
        }
        assert "name" in cols  # pulled in so the index can be built
        assert "sqlmem_idx_products_name" in index_names(engine._cache.connection)
    finally:
        engine.close()
def test_composite_index(source_engine, patched_cache):
    """A nested list in ``indexes`` creates one composite index over all its columns."""
    engine = CachingEngine(source_engine, indexes={"products": [["name", "price"]]})
    # Close the engine even when the assertion fails.
    try:
        engine.execute("SELECT id FROM products")
        assert "sqlmem_idx_products_name_price" in index_names(
            engine._cache.connection
        )
    finally:
        engine.close()
def test_index_survives_invalidate_and_reload(source_engine, patched_cache):
    """The index is present again after ``invalidate()`` followed by a re-fetch."""
    engine = CachingEngine(source_engine, indexes={"products": ["name"]})
    # Close the engine even when the assertion fails.
    try:
        engine.execute("SELECT id, name FROM products")
        engine.invalidate("products")
        engine.execute("SELECT id, name FROM products")
        assert "sqlmem_idx_products_name" in index_names(engine._cache.connection)
    finally:
        engine.close()