Fix frozen delta watermark and add error stats, lazy source, concurrent disk reads, and per-engine config

2026-06-08 19:35:33 +02:00
parent 209ae667ab
commit 6dc85e4f3c
17 changed files with 668 additions and 71 deletions
@@ -1,4 +1,5 @@
 import sqlite3
+import threading

 import pytest

@@ -96,6 +97,68 @@ def test_disk_mode_reload_in_new_instance(tmp_path, source_conn):
    c2.close()


+def test_quoted_reserved_and_spaced_identifiers(tmp_path):
+    """Table/column names that are reserved words or contain spaces must work."""
+    src = sqlite3.connect(":memory:")  # throwaway in-memory SQLite database used as the source
+    src.execute('CREATE TABLE "weird tbl" ("order" TEXT, "group by" TEXT)')  # "order" is a reserved word; the other two names contain a space
+    src.executemany('INSERT INTO "weird tbl" VALUES (?, ?)', [("1", "a"), ("2", "b")])
+    src.commit()
+
+    c = CacheManager(db_path=tmp_path / "c.db", backup_interval=9999)
+    c.load_table("weird tbl", ["order", "group by"], src)  # identifiers are passed here without surrounding quotes
+    assert c.is_table_cached("weird tbl") is True
+    _, rows = c.execute_in_memory('SELECT "order", "group by" FROM "weird tbl"')  # the query text quotes every identifier itself
+    assert ("1", "a") in rows  # first inserted row must be among the rows returned
+    c.close()
+    src.close()
+
+
+def test_disk_mode_uses_separate_read_connection(tmp_path, source_conn):
+    """Disk-mode reads go through a per-thread read connection, not the writer."""
+    c = CacheManager(db_path=tmp_path / "c.db", backup_interval=9999, in_memory=False)  # in_memory=False selects the disk-backed mode under test
+    c.load_table("users", ["name", "email"], source_conn)
+
+    _, rows = c.execute_in_memory("SELECT name FROM users ORDER BY name")
+    assert [r[0] for r in rows] == ["alice", "bob"]  # expects the source_conn fixture (not visible here) to hold exactly these two users
+    assert len(c._read_conns) == 1  # one read from this thread should leave exactly one read connection registered
+    assert c._read_conns[0] is not c.connection  # dedicated read conn
+    c.close()
+
+
+def test_disk_mode_concurrent_reads(tmp_path, source_conn):
+    """Several reader threads each get their own connection and correct results."""
+    c = CacheManager(db_path=tmp_path / "c.db", backup_interval=9999, in_memory=False)
+    c.load_table("users", ["name"], source_conn)
+
+    results: list[int] = []  # one row count appended per reader thread that succeeded
+    errors: list[Exception] = []  # exceptions raised inside reader threads, kept for the main thread to assert on
+
+    def reader() -> None:  # body executed by every worker thread
+        try:
+            _, rows = c.execute_in_memory("SELECT name FROM users")
+            results.append(len(rows))
+        except Exception as e:  # noqa: BLE001
+            errors.append(e)
+
+    threads = [threading.Thread(target=reader) for _ in range(5)]  # five readers sharing the one CacheManager
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join(5)  # 5-second timeout so a hung reader cannot block the test forever
+
+    assert not errors  # checked first so a reader failure is reported before any count mismatch
+    assert results == [2] * 5  # every reader is expected to see two rows
+    assert len(c._read_conns) == 5  # one read connection per reader thread
+    c.close()
+
+
+def test_memory_mode_uses_shared_connection(cache, source_conn):
+    """In-memory mode can't share :memory: across connections — no read conns."""
+    cache.load_table("users", ["name"], source_conn)  # assumes the cache fixture is an in-memory CacheManager (fixture not visible here)
+    cache.execute_in_memory("SELECT name FROM users")  # result is ignored; only the connection bookkeeping is checked
+    assert cache._read_conns == []  # no read connection may have been registered by the query above
+
+
 def test_disk_mode_reset_keeps_file(tmp_path, source_conn):
    db_path = tmp_path / "cache.db"
    c = CacheManager(db_path=db_path, backup_interval=9999, in_memory=False)
@@ -1,4 +1,6 @@
 import sqlite3
+import threading
+from datetime import datetime
 from types import SimpleNamespace

 import pytest
@@ -7,7 +9,7 @@ from sqlalchemy import create_engine
 import sqlmem.engine as eng_mod
 from sqlmem import CachingEngine, DeltaConfig
 from sqlmem.cache import CacheManager
-from sqlmem.delta import DeltaRefresher, ResolvedDelta
+from sqlmem.delta import DeltaRefresher, ResolvedDelta, _bind_watermark
 from sqlmem.executor import QueryExecutor
 from sqlmem.parser import parse
 from sqlmem.registry import ColumnRegistry
@@ -117,6 +119,89 @@ def test_refresh_without_changes_is_noop(env):
    assert before == after


+# ---------------------------------------------------------------------------
+# Watermark binding — regression for the datetime-as-string delta bug
+# (SQL Server error 241: 'T'-separated 6-digit-microsecond ISO string can't be
+#  implicitly converted varchar->datetime, freezing the delta watermark).
+# ---------------------------------------------------------------------------
+
+def test_bind_watermark_parses_iso_datetime():  # 'T'-separated ISO string with microseconds must come back as a datetime
+    assert _bind_watermark("2026-06-05T14:54:24.823000") == datetime(
+        2026, 6, 5, 14, 54, 24, 823000  # last field is microseconds
+    )
+
+
+def test_bind_watermark_parses_space_separated():  # space-separated timestamp without fractional seconds must also become a datetime
+    assert _bind_watermark("2026-06-01 10:05:00") == datetime(2026, 6, 1, 10, 5, 0)
+
+
+def test_bind_watermark_passes_through_non_datetime():  # a value that is not a timestamp must be returned unchanged
+    # Integer rowversion / non-datetime change column — left untouched.
+    assert _bind_watermark("12345") == "12345"  # still the same string, not converted to an int or a datetime
+
+
+class _SpyCursor:  # minimal cursor stand-in that hands out preset rows in batches via fetchmany
+    def __init__(self, rows):
+        self._rows = list(rows)  # copy, so the caller's sequence is never mutated
+
+    def fetchmany(self, n):  # return up to n rows and drop them from the remaining ones
+        batch, self._rows = self._rows[:n], self._rows[n:]
+        return batch  # empty list once every row has been handed out
+
+
+class _SpySource:
+    """Records the parameters bound to each query (stands in for the pyodbc source)."""
+
+    def __init__(self, rows):
+        self._rows = rows  # the same rows are replayed for every execute() call
+        self.bound = []  # (sql, params) tuples in call order
+
+    def execute(self, sql, params=()):  # params defaults to an empty tuple when the caller binds nothing
+        self.bound.append((sql, params))
+        return _SpyCursor(self._rows)  # fresh cursor per call
+
+
+def test_refresh_binds_watermark_as_datetime(env):
+    """The watermark must reach the source as a real datetime, not a raw ISO
+    string — otherwise SQL Server raises error 241 and the delta freezes."""
+    env.cache.set_last_synced_at("products", "2026-06-05T14:54:24.823000")  # seed the stored watermark as a raw ISO string
+    spy = _SpySource(rows=[("1", "Widget", "9.99", "2026-06-05T14:54:24.823000")])  # single row the spy hands back to the refresher
+
+    env.refresher.refresh(spy)
+
+    assert spy.bound, "source query was never issued"
+    _, params = spy.bound[-1]  # inspect only the most recent query
+    assert params == (datetime(2026, 6, 5, 14, 54, 24, 823000),)  # exactly one bound parameter: the watermark parsed into a datetime
+
+
+# ---------------------------------------------------------------------------
+# Refresh failures are recorded (4.3) so a stuck delta is visible in stats
+# ---------------------------------------------------------------------------
+
+class _RaisingSource:  # source stand-in whose every query fails
+    def execute(self, sql, params=()):  # same call shape as _SpySource.execute
+        raise RuntimeError("boom 241")  # unconditional failure, simulating a broken source
+
+
+def test_failed_delta_refresh_records_error(env):  # a failing refresh must be recorded in the cache's error bookkeeping
+    env.refresher.refresh(_RaisingSource())  # must not propagate the RuntimeError, or the test would abort here
+
+    err = env.cache.get_errors()["products"]  # error record looked up by table name
+    assert err.consecutive == 1  # first failure in the streak
+    assert "boom 241" in err.message  # original exception text must be preserved in the record
+    assert env.cache.error_total == 1  # error_total is read as an attribute, not called
+    # State is marked error even though the cache still holds the last-good data.
+    assert env.cache.get_states()["products"] == "error"
+
+
+def test_delta_success_resets_failure_streak(env):  # a successful refresh must clear the consecutive-failure counter
+    env.refresher.refresh(_RaisingSource())  # first refresh fails on purpose
+    assert env.cache.get_errors()["products"].consecutive == 1
+
+    env.refresher.refresh(env.source)  # real source — succeeds
+    assert env.cache.get_errors()["products"].consecutive == 0  # the "products" entry is still present, with its streak back at zero
+
+
 # ---------------------------------------------------------------------------
 # Engine-level: PK auto-discovery, reset, end-to-end refresh
 # ---------------------------------------------------------------------------
@@ -170,6 +255,48 @@ def test_engine_reset(source_engine, patched_cache):
    engine.close()


+def test_startup_catch_up_is_non_blocking_by_default(source_engine, patched_cache, monkeypatch):
+    """By default the startup catch-up runs on the background thread, not the
+    main thread, so it never blocks application startup."""
+    threads: list[str] = []  # names of the threads that executed _run_refresh
+    started = threading.Event()  # set as soon as _run_refresh is entered on any thread
+    real = eng_mod.CachingEngine._run_refresh  # keep the unpatched method so the spy can delegate to it
+
+    def spy(self):  # wrapper that records the calling thread before delegating
+        threads.append(threading.current_thread().name)
+        started.set()
+        return real(self)
+
+    monkeypatch.setattr(eng_mod.CachingEngine, "_run_refresh", spy)  # patched on the class, so it is in place before the engine is constructed
+    engine = CachingEngine(
+        source_engine, delta={"products": DeltaConfig("changed", ["id"])}
+    )
+    # __init__ has returned; the main thread must not have run the catch-up.
+    assert "MainThread" not in threads
+    assert started.wait(2), "background catch-up never ran"  # wait up to 2 seconds for the first _run_refresh call
+    assert threads == ["sqlmem-delta"]  # exactly one call so far, made from the thread with this name
+    engine.close()
+
+
+def test_blocking_startup_refresh_runs_synchronously(source_engine, patched_cache, monkeypatch):  # opt-in flag must run the catch-up inside __init__
+    threads: list[str] = []  # names of the threads that executed _run_refresh
+    real = eng_mod.CachingEngine._run_refresh  # keep the unpatched method so the spy can delegate to it
+
+    def spy(self):  # wrapper that records the calling thread before delegating
+        threads.append(threading.current_thread().name)
+        return real(self)
+
+    monkeypatch.setattr(eng_mod.CachingEngine, "_run_refresh", spy)
+    engine = CachingEngine(
+        source_engine,
+        delta={"products": DeltaConfig("changed", ["id"])},
+        blocking_startup_refresh=True,  # the option under test
+    )
+    # Opt-in: the catch-up ran on the main thread before __init__ returned.
+    assert "MainThread" in threads  # membership check only, so additional calls from other threads are tolerated
+    engine.close()
+
+
 def test_engine_delta_refresh_end_to_end(source_engine, source_db, patched_cache):
    engine = CachingEngine(
        source_engine, delta={"products": DeltaConfig(change_column="changed", key_columns=["id"])}
@@ -124,6 +124,22 @@ def test_second_query_same_columns_is_cache_hit(engine):
    assert len(rows) == 3


+def test_cache_hit_does_not_open_source(engine, source_engine, monkeypatch):
+    """A pure cache hit must not open a source connection (lazy source)."""
+    engine.execute("SELECT id, name FROM products")  # miss → caches
+
+    calls = {"n": 0}  # dict so the nested function below can mutate the counter
+    original_connect = source_engine.connect  # saved before patching so the wrapper can still open real connections
+
+    def counting_connect(*args, **kwargs):  # counts every connect() call, then delegates to the real one
+        calls["n"] += 1
+        return original_connect(*args, **kwargs)
+
+    monkeypatch.setattr(source_engine, "connect", counting_connect)  # installed only after the first query, so the initial miss is not counted
+    engine.execute("SELECT id, name FROM products")  # hit → no source access
+    assert calls["n"] == 0
+
+
 # ---------------------------------------------------------------------------
 # SQL file creation — backup to disk
 # ---------------------------------------------------------------------------
@@ -331,3 +347,41 @@ def test_in_memory_override_respects_config(source_engine, cache_path, monkeypat
    ce = CachingEngine(source_engine)  # no explicit in_memory
    assert ce._cache._in_memory is False
    ce.close()
+
+
+# ---------------------------------------------------------------------------
+# Per-engine configuration (constructor overrides env defaults)
+# ---------------------------------------------------------------------------
+
+def test_constructor_config_overrides(source_engine, tmp_path):  # every constructor keyword must end up on the engine or its cache
+    p = tmp_path / "explicit_cache.db"  # explicit cache file location inside pytest's temp directory
+    ce = CachingEngine(
+        source_engine,
+        cache_db_path=p,
+        fetch_batch=3,
+        dialect="sqlite",
+        backup_interval=12345,
+        refresh_interval=42,
+        in_memory=False,
+    )
+    ce.execute("SELECT id, name FROM products")  # run one query before checking that the cache file exists
+    assert p.exists()
+    assert ce._cache._fetch_batch == 3
+    assert ce._cache._dialect == "sqlite"  # dialect is asserted on the cache ...
+    assert ce._dialect == "sqlite"  # ... and on the engine itself
+    assert ce._cache._backup_interval == 12345
+    assert ce._refresh_interval == 42  # refresh interval is read from the engine, not from the cache
+    ce.close()
+
+
+def test_two_engines_separate_cache_files(source_engine, tmp_path):
+    """Two engines in one process can target different cache files."""
+    a = CachingEngine(source_engine, cache_db_path=tmp_path / "a.db", in_memory=False)
+    b = CachingEngine(source_engine, cache_db_path=tmp_path / "b.db", in_memory=False)  # same source engine, different cache file
+    a.execute("SELECT id FROM products")  # only engine a runs a query
+
+    assert (tmp_path / "a.db").exists()
+    assert a._cache.is_table_cached("products") is True
+    assert b._cache.is_table_cached("products") is False  # independent cache
+    a.close()
+    b.close()
@@ -0,0 +1,24 @@
+from loguru import logger
+
+import sqlmem
+
+
+def test_add_sink_idempotent_no_duplicate_lines():
+    """Calling add_sink twice for the same sink must not duplicate log lines."""
+    sqlmem._added_sinks.clear()  # start from an empty registry so the length check below is meaningful
+    msgs: list[str] = []  # every message the sink receives, as text
+    sink = lambda message: msgs.append(str(message))  # noqa: E731
+
+    try:
+        sqlmem.add_sink(sink, level="DEBUG", colorize=False)
+        sqlmem.add_sink(sink, level="DEBUG", colorize=False)  # second call: no-op
+        assert len(sqlmem._added_sinks) == 1  # registry must hold a single entry for the sink
+
+        # Emit one record that passes the "sqlmem" name filter.
+        logger.patch(lambda r: r.update(name="sqlmem")).info("hello sqlmem")
+        assert sum("hello sqlmem" in m for m in msgs) == 1  # exactly one captured line contains the message
+    finally:
+        for handler_id in sqlmem._added_sinks.values():  # the registry's values are passed to logger.remove as handler ids
+            logger.remove(handler_id)
+        sqlmem._added_sinks.clear()  # leave the registry empty for later tests
+        logger.disable("sqlmem")  # restore the default-silent state for other tests
@@ -73,6 +73,29 @@ def test_counters_still_reported(source_engine, patched_cache):
    engine.close()


+def test_stats_exposes_table_error(source_engine, patched_cache):  # an error recorded on the cache must be visible through engine.stats
+    engine = CachingEngine(source_engine)
+    engine.execute("SELECT id, name FROM products")  # run a query on "products" first; the stats entry for it is read below
+    engine._cache.record_error("products", "ValueError: boom")  # inject the error directly instead of provoking a real failure
+
+    s = engine.stats  # stats is read as an attribute, not called
+    assert s.errors == 1
+    assert s.tables["products"].consecutive_failures == 1
+    assert s.tables["products"].last_error == "ValueError: boom"  # message is reported verbatim
+    assert s.tables["products"].last_error_at is not None  # only presence is checked, not the value
+    engine.close()
+
+
+def test_stats_no_error_by_default(source_engine, patched_cache):  # without any recorded error, all error fields report their empty values
+    engine = CachingEngine(source_engine)
+    engine.execute("SELECT id, name FROM products")  # run a query on "products" first; the stats entry for it is read below
+    s = engine.stats  # stats is read as an attribute, not called
+    assert s.errors == 0
+    assert s.tables["products"].consecutive_failures == 0
+    assert s.tables["products"].last_error is None
+    engine.close()
+
+
 # --- a table being loaded for the first time shows up as "loading" ----------