Store named datetime columns as INTEGER microseconds (datetime_columns)

This commit is contained in:
Jan Doubravský
2026-06-09 18:18:38 +02:00
parent a21b5a2a04
commit 8e46ee3547
11 changed files with 255 additions and 22 deletions
+25
View File
@@ -15,6 +15,8 @@ from typing import Any
Params = tuple | list | dict | None
_EPOCH = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
def to_sqlite(value: Any) -> Any:
if isinstance(value, decimal.Decimal):
@@ -28,6 +30,29 @@ def to_sqlite(value: Any) -> Any:
return value
def to_sqlite_datetime(value: Any) -> int | None:
"""Store a datetime as INTEGER microseconds since the Unix epoch (UTC).
Used for columns the caller marks via ``datetime_columns``: 8 bytes as an
INTEGER instead of a ~28-byte ISO ``TEXT`` string, and integer comparison on
the change column instead of string collation. ``None`` passes through; a
naive datetime is treated as UTC. A non-datetime value is parsed from its ISO
string form (so ``date``/ISO-``str`` inputs work too); anything unparseable
becomes ``None``.
"""
if value is None:
return None
if isinstance(value, datetime.datetime):
if value.tzinfo is None:
value = value.replace(tzinfo=datetime.timezone.utc)
delta = value - _EPOCH # exact integer arithmetic (no float rounding)
return delta.days * 86_400_000_000 + delta.seconds * 1_000_000 + delta.microseconds
try:
return to_sqlite_datetime(datetime.datetime.fromisoformat(str(value)))
except (TypeError, ValueError):
return None
def coerce_row(row: tuple) -> tuple:
return tuple(to_sqlite(v) for v in row)
+42 -8
View File
@@ -9,12 +9,12 @@ from pathlib import Path
from loguru import logger
import sqlmem._meta as _meta
from ._coerce import coerce_params, coerce_row
from ._coerce import coerce_params, coerce_row, to_sqlite, to_sqlite_datetime
from ._sql import quote, quote_list, quote_source
from .config import FETCH_BATCH_SIZE, SQL_DIALECT
from .stats import TableState
SCHEMA_VERSION = 3
SCHEMA_VERSION = 4
@dataclass(frozen=True)
@@ -41,6 +41,7 @@ class CacheManager:
dialect: str = SQL_DIALECT,
fetch_batch: int = FETCH_BATCH_SIZE,
pragmas: dict[str, str | int] | None = None,
datetime_columns: dict[str, list[str]] | None = None,
) -> None:
self._db_path = db_path
self._backup_interval = backup_interval
@@ -48,6 +49,8 @@ class CacheManager:
self._dialect = dialect # source-DB dialect, for identifier quoting
self._fetch_batch = fetch_batch # rows fetched per source batch
self._pragmas = dict(pragmas or {}) # extra read/layout PRAGMAs (disk mode)
# table → columns stored as INTEGER µs-since-epoch instead of ISO TEXT
self._datetime_columns = {t: list(c) for t, c in (datetime_columns or {}).items()}
self._lock = threading.Lock() # serializes connection access
self._load_lock = threading.Lock() # serializes full table loads
self._states: dict[str, str] = {} # table → live processing state
@@ -387,6 +390,27 @@ class CacheManager:
self._conn.commit()
logger.debug(f"Index {idx.name!r} ready on {table} ({cols})")
def _row_coercer(self, table: str, columns: list[str]):
"""Return a per-row coercer for *columns* in source order.
Columns registered in ``datetime_columns`` for *table* are coerced to
INTEGER µs-since-epoch (``to_sqlite_datetime``); everything else keeps the
default stringifying coercion (``to_sqlite``). With no datetime columns it
is the plain :func:`coerce_row`, so the common path is unchanged.
"""
dt_cols = set(self._datetime_columns.get(table, ()))
dt_idx = {i for i, c in enumerate(columns) if c in dt_cols}
if not dt_idx:
return coerce_row
def coerce(row: tuple) -> tuple:
return tuple(
to_sqlite_datetime(v) if i in dt_idx else to_sqlite(v)
for i, v in enumerate(row)
)
return coerce
def load_table(
self,
table: str,
@@ -403,7 +427,11 @@ class CacheManager:
connection lock is only held for the brief per-batch inserts and the swap.
"""
src_cols = ", ".join(quote_source(c, self._dialect) for c in columns)
col_defs = ", ".join(f"{quote(c)} TEXT" for c in columns)
dt_cols = set(self._datetime_columns.get(table, ()))
col_defs = ", ".join(
f"{quote(c)} {'INTEGER' if c in dt_cols else 'TEXT'}" for c in columns
)
coerce = self._row_coercer(table, columns)
placeholders = ", ".join("?" * len(columns))
staging = f"{table}__sqlmem_load"
q_staging = quote(staging)
@@ -427,7 +455,7 @@ class CacheManager:
batch = cursor.fetchmany(self._fetch_batch) # network outside _lock
if not batch:
break
clean = [coerce_row(row) for row in batch]
clean = [coerce(row) for row in batch]
with self._lock:
self._conn.executemany(insert_sql, clean)
self._conn.commit()
@@ -518,9 +546,11 @@ class CacheManager:
row = self._conn.execute(
"SELECT last_synced_at FROM _sqlmem_tables WHERE table_name = ?", (table,)
).fetchone()
# Stored in a TEXT column: an INTEGER-µs watermark (datetime_columns) comes
# back as its digit string; delta._bind_watermark reconstructs the datetime.
return row[0] if row else None
def set_last_synced_at(self, table: str, value: str | None) -> None:
def set_last_synced_at(self, table: str, value: str | int | None) -> None:
with self._lock:
self._conn.execute(
"UPDATE _sqlmem_tables SET last_synced_at = ? WHERE table_name = ?",
@@ -528,8 +558,11 @@ class CacheManager:
)
self._conn.commit()
def max_value(self, table: str, column: str) -> str | None:
"""Maximum value of *column* across cached rows (the delta watermark)."""
def max_value(self, table: str, column: str) -> str | int | None:
"""Maximum value of *column* across cached rows (the delta watermark).
Returns an ``int`` for a datetime column stored as INTEGER µs, else the
ISO ``TEXT`` string."""
row = self._conn.execute(
f"SELECT MAX({quote(column)}) FROM {quote(table)}"
).fetchone()
@@ -539,7 +572,8 @@ class CacheManager:
"""Insert-or-replace one batch of *rows* by the table's unique key."""
col_list = quote_list(columns)
placeholders = ", ".join("?" * len(columns))
clean_rows = [coerce_row(row) for row in rows]
coerce = self._row_coercer(table, columns)
clean_rows = [coerce(row) for row in rows]
with self._lock:
self._conn.executemany(
f"INSERT OR REPLACE INTO {quote(table)} ({col_list}) VALUES ({placeholders})",
+18 -5
View File
@@ -1,5 +1,5 @@
from dataclasses import dataclass, field
from datetime import datetime
from datetime import datetime, timedelta, timezone
from typing import Any
from loguru import logger
@@ -8,8 +8,10 @@ from ._sql import quote_source
from .cache import CacheManager
from .stats import TableState
_EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)
def _bind_watermark(watermark: str) -> datetime | str:
def _bind_watermark(watermark: str | int, epoch_us: bool = False) -> datetime | str:
"""Bind the delta watermark back to the source in its native type.
The cache stores the change column as an ISO ``TEXT`` string (see
@@ -22,11 +24,21 @@ def _bind_watermark(watermark: str) -> datetime | str:
driver send a typed timestamp, so the comparison happens natively with no
string conversion. Non-datetime change columns (e.g. an integer rowversion)
don't parse and are passed through unchanged.
When the change column is stored as INTEGER µs-since-epoch (``datetime_columns``)
*epoch_us* is set: the watermark is a microsecond count (an ``int`` or its digit
string, since it round-trips through a TEXT column) and is reconstructed into a
UTC :class:`~datetime.datetime` so the source still receives a typed timestamp.
"""
if epoch_us:
try:
return _EPOCH + timedelta(microseconds=int(watermark))
except (TypeError, ValueError):
return watermark if isinstance(watermark, str) else str(watermark)
try:
return datetime.fromisoformat(watermark)
return datetime.fromisoformat(watermark) # type: ignore[arg-type]
except (TypeError, ValueError):
return watermark
return watermark if isinstance(watermark, str) else str(watermark)
@dataclass(frozen=True)
@@ -92,9 +104,10 @@ class DeltaRefresher:
cursor = source_conn.execute(f"SELECT {col_list} FROM {q_table}")
else:
change_col = quote_source(cfg.change_column, dialect)
epoch_us = cfg.change_column in self._cache._datetime_columns.get(table, ())
cursor = source_conn.execute(
f"SELECT {col_list} FROM {q_table} WHERE {change_col} >= ?",
(_bind_watermark(watermark),),
(_bind_watermark(watermark, epoch_us),),
)
# Stream the delta in batches so a large catch-up never materializes at once.
+2
View File
@@ -66,6 +66,7 @@ class CachingEngine:
fetch_batch: int | None = None,
dialect: str | None = None,
pragmas: dict[str, str | int] | None = None,
datetime_columns: dict[str, list[str]] | None = None,
blocking_startup_refresh: bool = False,
) -> None:
self._source_engine = source_engine
@@ -81,6 +82,7 @@ class CachingEngine:
dialect=self._dialect,
fetch_batch=fetch_batch if fetch_batch is not None else FETCH_BATCH_SIZE,
pragmas=pragmas,
datetime_columns=datetime_columns,
)
self._registry = ColumnRegistry(self._cache.connection)
self._stats = StatsCollector()