Fix frozen delta watermark and add error stats, lazy source, concurrent disk reads, and per-engine config

This commit is contained in:
Jan Doubravský
2026-06-08 19:35:33 +02:00
parent 209ae667ab
commit 6dc85e4f3c
17 changed files with 668 additions and 71 deletions
+63
View File
@@ -1,4 +1,5 @@
import sqlite3
import threading
import pytest
@@ -96,6 +97,68 @@ def test_disk_mode_reload_in_new_instance(tmp_path, source_conn):
c2.close()
def test_quoted_reserved_and_spaced_identifiers(tmp_path):
    """Identifiers that are reserved words or contain spaces must be cacheable.

    A throw-away in-memory SQLite source is created with a table named
    ``weird tbl`` whose columns are ``order`` and ``group by``; the table is
    loaded into the cache and read back through a quoted query.
    """
    source = sqlite3.connect(":memory:")
    source.execute('CREATE TABLE "weird tbl" ("order" TEXT, "group by" TEXT)')
    seed_rows = [("1", "a"), ("2", "b")]
    source.executemany('INSERT INTO "weird tbl" VALUES (?, ?)', seed_rows)
    source.commit()

    manager = CacheManager(db_path=tmp_path / "c.db", backup_interval=9999)
    manager.load_table("weird tbl", ["order", "group by"], source)

    cached = manager.is_table_cached("weird tbl")
    assert cached is True

    _columns, fetched = manager.execute_in_memory(
        'SELECT "order", "group by" FROM "weird tbl"'
    )
    assert ("1", "a") in fetched

    manager.close()
    source.close()
def test_disk_mode_uses_separate_read_connection(tmp_path, source_conn):
    """In disk mode a read is served by a dedicated read connection.

    After one query from this thread, exactly one entry is expected in
    ``_read_conns`` and it must be a different object from the manager's
    ``connection`` attribute.
    """
    manager = CacheManager(
        db_path=tmp_path / "c.db", backup_interval=9999, in_memory=False
    )
    manager.load_table("users", ["name", "email"], source_conn)

    _columns, fetched = manager.execute_in_memory(
        "SELECT name FROM users ORDER BY name"
    )
    names = [record[0] for record in fetched]
    assert names == ["alice", "bob"]

    assert len(manager._read_conns) == 1
    # The single read connection must not be the writer connection.
    assert manager._read_conns[0] is not manager.connection

    manager.close()
def test_disk_mode_concurrent_reads(tmp_path, source_conn):
    """Several reader threads each get their own connection and correct results.

    Five threads run the same query against a disk-mode cache.  Each thread
    records either the number of rows it received or the exception it hit.

    The readers are daemon threads and their liveness is asserted after the
    timed ``join``: a hung reader is then reported as such (instead of as a
    confusing result-count mismatch) and cannot keep the interpreter alive.
    The cache is closed in ``finally`` so a failing assertion does not leave
    the database file open.
    """
    c = CacheManager(db_path=tmp_path / "c.db", backup_interval=9999, in_memory=False)
    try:
        c.load_table("users", ["name"], source_conn)
        results: list[int] = []
        errors: list[Exception] = []

        def reader() -> None:
            # Collect failures instead of letting them die silently in the thread.
            try:
                _, rows = c.execute_in_memory("SELECT name FROM users")
                results.append(len(rows))
            except Exception as e:  # noqa: BLE001
                errors.append(e)

        threads = [threading.Thread(target=reader, daemon=True) for _ in range(5)]
        for t in threads:
            t.start()
        for t in threads:
            t.join(5)

        # join(timeout) returns silently on timeout; check explicitly.
        assert not any(t.is_alive() for t in threads), "reader thread did not finish"
        assert not errors
        assert results == [2] * 5
        assert len(c._read_conns) == 5  # one read connection per reader thread
    finally:
        c.close()
def test_memory_mode_uses_shared_connection(cache, source_conn):
    """In-memory mode creates no per-thread read connections.

    A ``:memory:`` database cannot be shared across connections, so after a
    load and a query the ``_read_conns`` list is expected to stay empty.
    """
    cache.load_table("users", ["name"], source_conn)
    cache.execute_in_memory("SELECT name FROM users")

    read_connections = cache._read_conns
    assert read_connections == []
def test_disk_mode_reset_keeps_file(tmp_path, source_conn):
db_path = tmp_path / "cache.db"
c = CacheManager(db_path=db_path, backup_interval=9999, in_memory=False)
+128 -1
View File
@@ -1,4 +1,6 @@
import sqlite3
import threading
from datetime import datetime
from types import SimpleNamespace
import pytest
@@ -7,7 +9,7 @@ from sqlalchemy import create_engine
import sqlmem.engine as eng_mod
from sqlmem import CachingEngine, DeltaConfig
from sqlmem.cache import CacheManager
from sqlmem.delta import DeltaRefresher, ResolvedDelta
from sqlmem.delta import DeltaRefresher, ResolvedDelta, _bind_watermark
from sqlmem.executor import QueryExecutor
from sqlmem.parser import parse
from sqlmem.registry import ColumnRegistry
@@ -117,6 +119,89 @@ def test_refresh_without_changes_is_noop(env):
assert before == after
# ---------------------------------------------------------------------------
# Watermark binding — regression for the datetime-as-string delta bug
# (SQL Server error 241: 'T'-separated 6-digit-microsecond ISO string can't be
# implicitly converted varchar->datetime, freezing the delta watermark).
# ---------------------------------------------------------------------------
def test_bind_watermark_parses_iso_datetime():
    """A 'T'-separated ISO string with microseconds becomes a ``datetime``."""
    bound = _bind_watermark("2026-06-05T14:54:24.823000")
    expected = datetime(2026, 6, 5, 14, 54, 24, 823000)
    assert bound == expected
def test_bind_watermark_parses_space_separated():
    """A space-separated date/time string is also parsed into a ``datetime``."""
    bound = _bind_watermark("2026-06-01 10:05:00")
    expected = datetime(2026, 6, 1, 10, 5, 0)
    assert bound == expected
def test_bind_watermark_passes_through_non_datetime():
    """A value that is not a date/time string is returned unchanged."""
    # Integer rowversion / non-datetime change column — left untouched.
    raw_value = "12345"
    bound = _bind_watermark(raw_value)
    assert bound == "12345"
class _SpyCursor:
    """Minimal cursor stand-in that hands out pre-seeded rows in batches."""

    def __init__(self, rows):
        # Copy into a private list so the caller's iterable is never mutated.
        self._rows = list(rows)

    def fetchmany(self, n):
        """Return the next ``n`` remaining rows and drop them from the cursor."""
        head = self._rows[:n]
        self._rows = self._rows[n:]
        return head
class _SpySource:
    """Records the parameters bound to each query (stands in for the pyodbc source)."""

    def __init__(self, rows):
        self._rows = rows
        # One ``(sql, params)`` tuple is appended per ``execute`` call.
        self.bound = []

    def execute(self, sql, params=()):
        """Log the call and return a fresh cursor over the seeded rows."""
        call = (sql, params)
        self.bound.append(call)
        cursor = _SpyCursor(self._rows)
        return cursor
def test_refresh_binds_watermark_as_datetime(env):
    """The stored watermark must be bound to the source query as a ``datetime``.

    Passing the raw ISO string instead makes SQL Server raise error 241 and
    the delta watermark stops advancing.
    """
    iso_watermark = "2026-06-05T14:54:24.823000"
    env.cache.set_last_synced_at("products", iso_watermark)

    source = _SpySource(rows=[("1", "Widget", "9.99", iso_watermark)])
    env.refresher.refresh(source)

    assert source.bound, "source query was never issued"
    _sql, bound_params = source.bound[-1]
    expected_params = (datetime(2026, 6, 5, 14, 54, 24, 823000),)
    assert bound_params == expected_params
# ---------------------------------------------------------------------------
# Refresh failures are recorded (4.3) so a stuck delta is visible in stats
# ---------------------------------------------------------------------------
class _RaisingSource:
    """Source stand-in whose every query fails with ``RuntimeError``."""

    def execute(self, sql, params=()):
        """Ignore the arguments and raise the fixed failure."""
        failure = RuntimeError("boom 241")
        raise failure
def test_failed_delta_refresh_records_error(env):
    """A failing delta refresh is recorded in the cache's error bookkeeping."""
    failing_source = _RaisingSource()
    env.refresher.refresh(failing_source)

    recorded = env.cache.get_errors()["products"]
    assert recorded.consecutive == 1
    assert "boom 241" in recorded.message
    assert env.cache.error_total == 1

    # State is marked error even though the cache still holds the last-good data.
    table_states = env.cache.get_states()
    assert table_states["products"] == "error"
def test_delta_success_resets_failure_streak(env):
    """A successful refresh after a failure resets the consecutive-failure count."""
    env.refresher.refresh(_RaisingSource())
    after_failure = env.cache.get_errors()["products"]
    assert after_failure.consecutive == 1

    env.refresher.refresh(env.source)  # real source — succeeds
    after_success = env.cache.get_errors()["products"]
    assert after_success.consecutive == 0
# ---------------------------------------------------------------------------
# Engine-level: PK auto-discovery, reset, end-to-end refresh
# ---------------------------------------------------------------------------
@@ -170,6 +255,48 @@ def test_engine_reset(source_engine, patched_cache):
engine.close()
def test_startup_catch_up_is_non_blocking_by_default(source_engine, patched_cache, monkeypatch):
    """By default the startup catch-up runs on the background thread, not the
    main thread, so it never blocks application startup.

    ``CachingEngine._run_refresh`` is wrapped with a spy that records the name
    of the thread it runs on and signals an event on first use.  The engine is
    closed in ``finally`` so a failing assertion does not leave it open after
    monkeypatch teardown has restored ``_run_refresh``.
    """
    threads: list[str] = []
    started = threading.Event()
    real = eng_mod.CachingEngine._run_refresh

    def spy(self):
        threads.append(threading.current_thread().name)
        started.set()
        return real(self)

    monkeypatch.setattr(eng_mod.CachingEngine, "_run_refresh", spy)
    engine = CachingEngine(
        source_engine, delta={"products": DeltaConfig("changed", ["id"])}
    )
    try:
        # __init__ has returned; the main thread must not have run the catch-up.
        assert "MainThread" not in threads
        assert started.wait(2), "background catch-up never ran"
        assert threads == ["sqlmem-delta"]
    finally:
        engine.close()
def test_blocking_startup_refresh_runs_synchronously(source_engine, patched_cache, monkeypatch):
    """With ``blocking_startup_refresh=True`` the catch-up runs on the main thread.

    ``CachingEngine._run_refresh`` is wrapped with a spy that records the name
    of the thread it runs on.  The engine is closed in ``finally`` so a failing
    assertion does not leave it open for later tests.
    """
    threads: list[str] = []
    real = eng_mod.CachingEngine._run_refresh

    def spy(self):
        threads.append(threading.current_thread().name)
        return real(self)

    monkeypatch.setattr(eng_mod.CachingEngine, "_run_refresh", spy)
    engine = CachingEngine(
        source_engine,
        delta={"products": DeltaConfig("changed", ["id"])},
        blocking_startup_refresh=True,
    )
    try:
        # Opt-in: the catch-up ran on the main thread before __init__ returned.
        assert "MainThread" in threads
    finally:
        engine.close()
def test_engine_delta_refresh_end_to_end(source_engine, source_db, patched_cache):
engine = CachingEngine(
source_engine, delta={"products": DeltaConfig(change_column="changed", key_columns=["id"])}
+54
View File
@@ -124,6 +124,22 @@ def test_second_query_same_columns_is_cache_hit(engine):
assert len(rows) == 3
def test_cache_hit_does_not_open_source(engine, source_engine, monkeypatch):
    """A pure cache hit must not open a source connection (lazy source).

    The first query populates the cache; ``source_engine.connect`` is then
    replaced by a counting wrapper and the same query is issued again.
    """
    engine.execute("SELECT id, name FROM products")  # miss → caches

    real_connect = source_engine.connect
    connect_count = 0

    def counting_connect(*args, **kwargs):
        nonlocal connect_count
        connect_count += 1
        return real_connect(*args, **kwargs)

    monkeypatch.setattr(source_engine, "connect", counting_connect)

    engine.execute("SELECT id, name FROM products")  # hit → no source access
    assert connect_count == 0
# ---------------------------------------------------------------------------
# SQL file creation — backup to disk
# ---------------------------------------------------------------------------
@@ -331,3 +347,41 @@ def test_in_memory_override_respects_config(source_engine, cache_path, monkeypat
ce = CachingEngine(source_engine) # no explicit in_memory
assert ce._cache._in_memory is False
ce.close()
# ---------------------------------------------------------------------------
# Per-engine configuration (constructor overrides env defaults)
# ---------------------------------------------------------------------------
def test_constructor_config_overrides(source_engine, tmp_path):
    """Explicit constructor arguments end up on the engine and its cache."""
    cache_file = tmp_path / "explicit_cache.db"
    overrides = {
        "cache_db_path": cache_file,
        "fetch_batch": 3,
        "dialect": "sqlite",
        "backup_interval": 12345,
        "refresh_interval": 42,
        "in_memory": False,
    }
    caching_engine = CachingEngine(source_engine, **overrides)

    caching_engine.execute("SELECT id, name FROM products")
    assert cache_file.exists()

    cache = caching_engine._cache
    assert cache._fetch_batch == 3
    assert cache._dialect == "sqlite"
    assert caching_engine._dialect == "sqlite"
    assert cache._backup_interval == 12345
    assert caching_engine._refresh_interval == 42

    caching_engine.close()
def test_two_engines_separate_cache_files(source_engine, tmp_path):
    """Two engines in one process can target different cache files."""
    first_path = tmp_path / "a.db"
    second_path = tmp_path / "b.db"
    first = CachingEngine(source_engine, cache_db_path=first_path, in_memory=False)
    second = CachingEngine(source_engine, cache_db_path=second_path, in_memory=False)

    # Only the first engine runs a query, so only its cache should fill.
    first.execute("SELECT id FROM products")

    assert (tmp_path / "a.db").exists()
    assert first._cache.is_table_cached("products") is True
    assert second._cache.is_table_cached("products") is False  # independent cache

    first.close()
    second.close()
+24
View File
@@ -0,0 +1,24 @@
from loguru import logger
import sqlmem
def test_add_sink_idempotent_no_duplicate_lines():
    """Calling add_sink twice for the same sink must not duplicate log lines."""
    sqlmem._added_sinks.clear()
    captured: list[str] = []

    def sink(message):
        captured.append(str(message))

    try:
        sqlmem.add_sink(sink, level="DEBUG", colorize=False)
        sqlmem.add_sink(sink, level="DEBUG", colorize=False)  # second call: no-op
        assert len(sqlmem._added_sinks) == 1

        # Emit one record that passes the "sqlmem" name filter.
        renamed_logger = logger.patch(lambda r: r.update(name="sqlmem"))
        renamed_logger.info("hello sqlmem")

        hits = [line for line in captured if "hello sqlmem" in line]
        assert len(hits) == 1
    finally:
        # Remove every handler this test registered, then reset the registry.
        for handler_id in sqlmem._added_sinks.values():
            logger.remove(handler_id)
        sqlmem._added_sinks.clear()
        logger.disable("sqlmem")  # restore the default-silent state for other tests
+23
View File
@@ -73,6 +73,29 @@ def test_counters_still_reported(source_engine, patched_cache):
engine.close()
def test_stats_exposes_table_error(source_engine, patched_cache):
    """An error recorded on the cache shows up in ``engine.stats``."""
    engine = CachingEngine(source_engine)
    engine.execute("SELECT id, name FROM products")
    engine._cache.record_error("products", "ValueError: boom")

    snapshot = engine.stats
    assert snapshot.errors == 1

    products = snapshot.tables["products"]
    assert products.consecutive_failures == 1
    assert products.last_error == "ValueError: boom"
    assert products.last_error_at is not None

    engine.close()
def test_stats_no_error_by_default(source_engine, patched_cache):
    """Without any recorded failure the stats report zero errors."""
    engine = CachingEngine(source_engine)
    engine.execute("SELECT id, name FROM products")

    snapshot = engine.stats
    assert snapshot.errors == 0

    products = snapshot.tables["products"]
    assert products.consecutive_failures == 0
    assert products.last_error is None

    engine.close()
# --- a table being loaded for the first time shows up as "loading" ----------