Store named datetime columns as INTEGER microseconds (datetime_columns)

2026-06-09 18:18:38 +02:00
parent a21b5a2a04
commit 8e46ee3547
11 changed files with 255 additions and 22 deletions
@@ -15,6 +15,8 @@ from typing import Any

 Params = tuple | list | dict | None

+_EPOCH = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
+

 def to_sqlite(value: Any) -> Any:
    if isinstance(value, decimal.Decimal):
@@ -28,6 +30,29 @@ def to_sqlite(value: Any) -> Any:
    return value


+def to_sqlite_datetime(value: Any) -> int | None:
+    """Store a datetime as INTEGER microseconds since the Unix epoch (UTC).
+
+    Used for columns the caller marks via ``datetime_columns``: 8 bytes as an
+    INTEGER instead of a ~28-byte ISO ``TEXT`` string, and integer comparison on
+    the change column instead of string collation. ``None`` passes through; a
+    naive datetime is treated as UTC. A non-datetime value is parsed from its ISO
+    string form (so ``date``/ISO-``str`` inputs work too); anything unparseable
+    becomes ``None``.
+    """
+    if value is None:
+        return None
+    if isinstance(value, datetime.datetime):
+        if value.tzinfo is None:
+            value = value.replace(tzinfo=datetime.timezone.utc)
+        delta = value - _EPOCH  # exact integer arithmetic (no float rounding)
+        return delta.days * 86_400_000_000 + delta.seconds * 1_000_000 + delta.microseconds
+    try:
+        return to_sqlite_datetime(datetime.datetime.fromisoformat(str(value)))
+    except (TypeError, ValueError):
+        return None
+
+
 def coerce_row(row: tuple) -> tuple:
    return tuple(to_sqlite(v) for v in row)

@@ -9,12 +9,12 @@ from pathlib import Path
 from loguru import logger

 import sqlmem._meta as _meta
-from ._coerce import coerce_params, coerce_row
+from ._coerce import coerce_params, coerce_row, to_sqlite, to_sqlite_datetime
 from ._sql import quote, quote_list, quote_source
 from .config import FETCH_BATCH_SIZE, SQL_DIALECT
 from .stats import TableState

-SCHEMA_VERSION = 3
+SCHEMA_VERSION = 4


@dataclass(frozen=True)
@@ -41,6 +41,7 @@ class CacheManager:
        dialect: str = SQL_DIALECT,
        fetch_batch: int = FETCH_BATCH_SIZE,
        pragmas: dict[str, str | int] | None = None,
+        datetime_columns: dict[str, list[str]] | None = None,
    ) -> None:
        self._db_path = db_path
        self._backup_interval = backup_interval
@@ -48,6 +49,8 @@ class CacheManager:
        self._dialect = dialect              # source-DB dialect, for identifier quoting
        self._fetch_batch = fetch_batch      # rows fetched per source batch
        self._pragmas = dict(pragmas or {})  # extra read/layout PRAGMAs (disk mode)
+        # table → columns stored as INTEGER µs-since-epoch instead of ISO TEXT
+        self._datetime_columns = {t: list(c) for t, c in (datetime_columns or {}).items()}
        self._lock = threading.Lock()       # serializes connection access
        self._load_lock = threading.Lock()  # serializes full table loads
        self._states: dict[str, str] = {}   # table → live processing state
@@ -387,6 +390,27 @@ class CacheManager:
                self._conn.commit()
            logger.debug(f"Index {idx.name!r} ready on {table} ({cols})")

+    def _row_coercer(self, table: str, columns: list[str]):
+        """Return a per-row coercer for *columns* in source order.
+
+        Columns registered in ``datetime_columns`` for *table* are coerced to
+        INTEGER µs-since-epoch (``to_sqlite_datetime``); everything else keeps the
+        default stringifying coercion (``to_sqlite``). With no datetime columns it
+        is the plain :func:`coerce_row`, so the common path is unchanged.
+        """
+        dt_cols = set(self._datetime_columns.get(table, ()))
+        dt_idx = {i for i, c in enumerate(columns) if c in dt_cols}
+        if not dt_idx:
+            return coerce_row
+
+        def coerce(row: tuple) -> tuple:
+            return tuple(
+                to_sqlite_datetime(v) if i in dt_idx else to_sqlite(v)
+                for i, v in enumerate(row)
+            )
+
+        return coerce
+
    def load_table(
        self,
        table: str,
@@ -403,7 +427,11 @@ class CacheManager:
        connection lock is only held for the brief per-batch inserts and the swap.
        """
        src_cols = ", ".join(quote_source(c, self._dialect) for c in columns)
-        col_defs = ", ".join(f"{quote(c)} TEXT" for c in columns)
+        dt_cols = set(self._datetime_columns.get(table, ()))
+        col_defs = ", ".join(
+            f"{quote(c)} {'INTEGER' if c in dt_cols else 'TEXT'}" for c in columns
+        )
+        coerce = self._row_coercer(table, columns)
        placeholders = ", ".join("?" * len(columns))
        staging = f"{table}__sqlmem_load"
        q_staging = quote(staging)
@@ -427,7 +455,7 @@ class CacheManager:
                    batch = cursor.fetchmany(self._fetch_batch)  # network outside _lock
                    if not batch:
                        break
-                    clean = [coerce_row(row) for row in batch]
+                    clean = [coerce(row) for row in batch]
                    with self._lock:
                        self._conn.executemany(insert_sql, clean)
                        self._conn.commit()
@@ -518,9 +546,11 @@ class CacheManager:
        row = self._conn.execute(
            "SELECT last_synced_at FROM _sqlmem_tables WHERE table_name = ?", (table,)
        ).fetchone()
+        # Stored in a TEXT column: an INTEGER-µs watermark (datetime_columns) comes
+        # back as its digit string; delta._bind_watermark reconstructs the datetime.
        return row[0] if row else None

-    def set_last_synced_at(self, table: str, value: str | None) -> None:
+    def set_last_synced_at(self, table: str, value: str | int | None) -> None:
        with self._lock:
            self._conn.execute(
                "UPDATE _sqlmem_tables SET last_synced_at = ? WHERE table_name = ?",
@@ -528,8 +558,11 @@ class CacheManager:
            )
            self._conn.commit()

-    def max_value(self, table: str, column: str) -> str | None:
-        """Maximum value of *column* across cached rows (the delta watermark)."""
+    def max_value(self, table: str, column: str) -> str | int | None:
+        """Maximum value of *column* across cached rows (the delta watermark).
+
+        Returns an ``int`` for a datetime column stored as INTEGER µs, else the
+        ISO ``TEXT`` string."""
        row = self._conn.execute(
            f"SELECT MAX({quote(column)}) FROM {quote(table)}"
        ).fetchone()
@@ -539,7 +572,8 @@ class CacheManager:
        """Insert-or-replace one batch of *rows* by the table's unique key."""
        col_list = quote_list(columns)
        placeholders = ", ".join("?" * len(columns))
-        clean_rows = [coerce_row(row) for row in rows]
+        coerce = self._row_coercer(table, columns)
+        clean_rows = [coerce(row) for row in rows]
        with self._lock:
            self._conn.executemany(
                f"INSERT OR REPLACE INTO {quote(table)} ({col_list}) VALUES ({placeholders})",
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
 from typing import Any

 from loguru import logger
@@ -8,8 +8,10 @@ from ._sql import quote_source
 from .cache import CacheManager
 from .stats import TableState

+_EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)

-def _bind_watermark(watermark: str) -> datetime | str:
+
+def _bind_watermark(watermark: str | int, epoch_us: bool = False) -> datetime | str:
    """Bind the delta watermark back to the source in its native type.

    The cache stores the change column as an ISO ``TEXT`` string (see
@@ -22,11 +24,21 @@ def _bind_watermark(watermark: str) -> datetime | str:
    driver send a typed timestamp, so the comparison happens natively with no
    string conversion. Non-datetime change columns (e.g. an integer rowversion)
    don't parse and are passed through unchanged.
+
+    When the change column is stored as INTEGER µs-since-epoch (``datetime_columns``)
+    *epoch_us* is set: the watermark is a microsecond count (an ``int`` or its digit
+    string, since it round-trips through a TEXT column) and is reconstructed into a
+    UTC :class:`~datetime.datetime` so the source still receives a typed timestamp.
    """
+    if epoch_us:
+        try:
+            return _EPOCH + timedelta(microseconds=int(watermark))
+        except (TypeError, ValueError):
+            return watermark if isinstance(watermark, str) else str(watermark)
    try:
-        return datetime.fromisoformat(watermark)
+        return datetime.fromisoformat(watermark)  # type: ignore[arg-type]
    except (TypeError, ValueError):
-        return watermark
+        return watermark if isinstance(watermark, str) else str(watermark)


@dataclass(frozen=True)
@@ -92,9 +104,10 @@ class DeltaRefresher:
            cursor = source_conn.execute(f"SELECT {col_list} FROM {q_table}")
        else:
            change_col = quote_source(cfg.change_column, dialect)
+            epoch_us = cfg.change_column in self._cache._datetime_columns.get(table, ())
            cursor = source_conn.execute(
                f"SELECT {col_list} FROM {q_table} WHERE {change_col} >= ?",
-                (_bind_watermark(watermark),),
+                (_bind_watermark(watermark, epoch_us),),
            )

        # Stream the delta in batches so a large catch-up never materializes at once.
@@ -66,6 +66,7 @@ class CachingEngine:
        fetch_batch: int | None = None,
        dialect: str | None = None,
        pragmas: dict[str, str | int] | None = None,
+        datetime_columns: dict[str, list[str]] | None = None,
        blocking_startup_refresh: bool = False,
    ) -> None:
        self._source_engine = source_engine
@@ -81,6 +82,7 @@ class CachingEngine:
            dialect=self._dialect,
            fetch_batch=fetch_batch if fetch_batch is not None else FETCH_BATCH_SIZE,
            pragmas=pragmas,
+            datetime_columns=datetime_columns,
        )
        self._registry = ColumnRegistry(self._cache.connection)
        self._stats = StatsCollector()