Store named datetime columns as INTEGER microseconds (datetime_columns)
This commit is contained in:
@@ -15,6 +15,8 @@ from typing import Any
|
||||
|
||||
Params = tuple | list | dict | None
|
||||
|
||||
_EPOCH = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
|
||||
|
||||
|
||||
def to_sqlite(value: Any) -> Any:
|
||||
if isinstance(value, decimal.Decimal):
|
||||
@@ -28,6 +30,29 @@ def to_sqlite(value: Any) -> Any:
|
||||
return value
|
||||
|
||||
|
||||
def to_sqlite_datetime(value: Any) -> int | None:
|
||||
"""Store a datetime as INTEGER microseconds since the Unix epoch (UTC).
|
||||
|
||||
Used for columns the caller marks via ``datetime_columns``: 8 bytes as an
|
||||
INTEGER instead of a ~28-byte ISO ``TEXT`` string, and integer comparison on
|
||||
the change column instead of string collation. ``None`` passes through; a
|
||||
naive datetime is treated as UTC. A non-datetime value is parsed from its ISO
|
||||
string form (so ``date``/ISO-``str`` inputs work too); anything unparseable
|
||||
becomes ``None``.
|
||||
"""
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, datetime.datetime):
|
||||
if value.tzinfo is None:
|
||||
value = value.replace(tzinfo=datetime.timezone.utc)
|
||||
delta = value - _EPOCH # exact integer arithmetic (no float rounding)
|
||||
return delta.days * 86_400_000_000 + delta.seconds * 1_000_000 + delta.microseconds
|
||||
try:
|
||||
return to_sqlite_datetime(datetime.datetime.fromisoformat(str(value)))
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def coerce_row(row: tuple) -> tuple:
|
||||
return tuple(to_sqlite(v) for v in row)
|
||||
|
||||
|
||||
+42
-8
@@ -9,12 +9,12 @@ from pathlib import Path
|
||||
from loguru import logger
|
||||
|
||||
import sqlmem._meta as _meta
|
||||
from ._coerce import coerce_params, coerce_row
|
||||
from ._coerce import coerce_params, coerce_row, to_sqlite, to_sqlite_datetime
|
||||
from ._sql import quote, quote_list, quote_source
|
||||
from .config import FETCH_BATCH_SIZE, SQL_DIALECT
|
||||
from .stats import TableState
|
||||
|
||||
SCHEMA_VERSION = 3
|
||||
SCHEMA_VERSION = 4
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -41,6 +41,7 @@ class CacheManager:
|
||||
dialect: str = SQL_DIALECT,
|
||||
fetch_batch: int = FETCH_BATCH_SIZE,
|
||||
pragmas: dict[str, str | int] | None = None,
|
||||
datetime_columns: dict[str, list[str]] | None = None,
|
||||
) -> None:
|
||||
self._db_path = db_path
|
||||
self._backup_interval = backup_interval
|
||||
@@ -48,6 +49,8 @@ class CacheManager:
|
||||
self._dialect = dialect # source-DB dialect, for identifier quoting
|
||||
self._fetch_batch = fetch_batch # rows fetched per source batch
|
||||
self._pragmas = dict(pragmas or {}) # extra read/layout PRAGMAs (disk mode)
|
||||
# table → columns stored as INTEGER µs-since-epoch instead of ISO TEXT
|
||||
self._datetime_columns = {t: list(c) for t, c in (datetime_columns or {}).items()}
|
||||
self._lock = threading.Lock() # serializes connection access
|
||||
self._load_lock = threading.Lock() # serializes full table loads
|
||||
self._states: dict[str, str] = {} # table → live processing state
|
||||
@@ -387,6 +390,27 @@ class CacheManager:
|
||||
self._conn.commit()
|
||||
logger.debug(f"Index {idx.name!r} ready on {table} ({cols})")
|
||||
|
||||
def _row_coercer(self, table: str, columns: list[str]):
|
||||
"""Return a per-row coercer for *columns* in source order.
|
||||
|
||||
Columns registered in ``datetime_columns`` for *table* are coerced to
|
||||
INTEGER µs-since-epoch (``to_sqlite_datetime``); everything else keeps the
|
||||
default stringifying coercion (``to_sqlite``). With no datetime columns it
|
||||
is the plain :func:`coerce_row`, so the common path is unchanged.
|
||||
"""
|
||||
dt_cols = set(self._datetime_columns.get(table, ()))
|
||||
dt_idx = {i for i, c in enumerate(columns) if c in dt_cols}
|
||||
if not dt_idx:
|
||||
return coerce_row
|
||||
|
||||
def coerce(row: tuple) -> tuple:
|
||||
return tuple(
|
||||
to_sqlite_datetime(v) if i in dt_idx else to_sqlite(v)
|
||||
for i, v in enumerate(row)
|
||||
)
|
||||
|
||||
return coerce
|
||||
|
||||
def load_table(
|
||||
self,
|
||||
table: str,
|
||||
@@ -403,7 +427,11 @@ class CacheManager:
|
||||
connection lock is only held for the brief per-batch inserts and the swap.
|
||||
"""
|
||||
src_cols = ", ".join(quote_source(c, self._dialect) for c in columns)
|
||||
col_defs = ", ".join(f"{quote(c)} TEXT" for c in columns)
|
||||
dt_cols = set(self._datetime_columns.get(table, ()))
|
||||
col_defs = ", ".join(
|
||||
f"{quote(c)} {'INTEGER' if c in dt_cols else 'TEXT'}" for c in columns
|
||||
)
|
||||
coerce = self._row_coercer(table, columns)
|
||||
placeholders = ", ".join("?" * len(columns))
|
||||
staging = f"{table}__sqlmem_load"
|
||||
q_staging = quote(staging)
|
||||
@@ -427,7 +455,7 @@ class CacheManager:
|
||||
batch = cursor.fetchmany(self._fetch_batch) # network outside _lock
|
||||
if not batch:
|
||||
break
|
||||
clean = [coerce_row(row) for row in batch]
|
||||
clean = [coerce(row) for row in batch]
|
||||
with self._lock:
|
||||
self._conn.executemany(insert_sql, clean)
|
||||
self._conn.commit()
|
||||
@@ -518,9 +546,11 @@ class CacheManager:
|
||||
row = self._conn.execute(
|
||||
"SELECT last_synced_at FROM _sqlmem_tables WHERE table_name = ?", (table,)
|
||||
).fetchone()
|
||||
# Stored in a TEXT column: an INTEGER-µs watermark (datetime_columns) comes
|
||||
# back as its digit string; delta._bind_watermark reconstructs the datetime.
|
||||
return row[0] if row else None
|
||||
|
||||
def set_last_synced_at(self, table: str, value: str | None) -> None:
|
||||
def set_last_synced_at(self, table: str, value: str | int | None) -> None:
|
||||
with self._lock:
|
||||
self._conn.execute(
|
||||
"UPDATE _sqlmem_tables SET last_synced_at = ? WHERE table_name = ?",
|
||||
@@ -528,8 +558,11 @@ class CacheManager:
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
def max_value(self, table: str, column: str) -> str | None:
|
||||
"""Maximum value of *column* across cached rows (the delta watermark)."""
|
||||
def max_value(self, table: str, column: str) -> str | int | None:
|
||||
"""Maximum value of *column* across cached rows (the delta watermark).
|
||||
|
||||
Returns an ``int`` for a datetime column stored as INTEGER µs, else the
|
||||
ISO ``TEXT`` string."""
|
||||
row = self._conn.execute(
|
||||
f"SELECT MAX({quote(column)}) FROM {quote(table)}"
|
||||
).fetchone()
|
||||
@@ -539,7 +572,8 @@ class CacheManager:
|
||||
"""Insert-or-replace one batch of *rows* by the table's unique key."""
|
||||
col_list = quote_list(columns)
|
||||
placeholders = ", ".join("?" * len(columns))
|
||||
clean_rows = [coerce_row(row) for row in rows]
|
||||
coerce = self._row_coercer(table, columns)
|
||||
clean_rows = [coerce(row) for row in rows]
|
||||
with self._lock:
|
||||
self._conn.executemany(
|
||||
f"INSERT OR REPLACE INTO {quote(table)} ({col_list}) VALUES ({placeholders})",
|
||||
|
||||
+18
-5
@@ -1,5 +1,5 @@
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any
|
||||
|
||||
from loguru import logger
|
||||
@@ -8,8 +8,10 @@ from ._sql import quote_source
|
||||
from .cache import CacheManager
|
||||
from .stats import TableState
|
||||
|
||||
_EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)
|
||||
|
||||
def _bind_watermark(watermark: str) -> datetime | str:
|
||||
|
||||
def _bind_watermark(watermark: str | int, epoch_us: bool = False) -> datetime | str:
|
||||
"""Bind the delta watermark back to the source in its native type.
|
||||
|
||||
The cache stores the change column as an ISO ``TEXT`` string (see
|
||||
@@ -22,11 +24,21 @@ def _bind_watermark(watermark: str) -> datetime | str:
|
||||
driver send a typed timestamp, so the comparison happens natively with no
|
||||
string conversion. Non-datetime change columns (e.g. an integer rowversion)
|
||||
don't parse and are passed through unchanged.
|
||||
|
||||
When the change column is stored as INTEGER µs-since-epoch (``datetime_columns``)
|
||||
*epoch_us* is set: the watermark is a microsecond count (an ``int`` or its digit
|
||||
string, since it round-trips through a TEXT column) and is reconstructed into a
|
||||
UTC :class:`~datetime.datetime` so the source still receives a typed timestamp.
|
||||
"""
|
||||
if epoch_us:
|
||||
try:
|
||||
return _EPOCH + timedelta(microseconds=int(watermark))
|
||||
except (TypeError, ValueError):
|
||||
return watermark if isinstance(watermark, str) else str(watermark)
|
||||
try:
|
||||
return datetime.fromisoformat(watermark)
|
||||
return datetime.fromisoformat(watermark) # type: ignore[arg-type]
|
||||
except (TypeError, ValueError):
|
||||
return watermark
|
||||
return watermark if isinstance(watermark, str) else str(watermark)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -92,9 +104,10 @@ class DeltaRefresher:
|
||||
cursor = source_conn.execute(f"SELECT {col_list} FROM {q_table}")
|
||||
else:
|
||||
change_col = quote_source(cfg.change_column, dialect)
|
||||
epoch_us = cfg.change_column in self._cache._datetime_columns.get(table, ())
|
||||
cursor = source_conn.execute(
|
||||
f"SELECT {col_list} FROM {q_table} WHERE {change_col} >= ?",
|
||||
(_bind_watermark(watermark),),
|
||||
(_bind_watermark(watermark, epoch_us),),
|
||||
)
|
||||
|
||||
# Stream the delta in batches so a large catch-up never materializes at once.
|
||||
|
||||
@@ -66,6 +66,7 @@ class CachingEngine:
|
||||
fetch_batch: int | None = None,
|
||||
dialect: str | None = None,
|
||||
pragmas: dict[str, str | int] | None = None,
|
||||
datetime_columns: dict[str, list[str]] | None = None,
|
||||
blocking_startup_refresh: bool = False,
|
||||
) -> None:
|
||||
self._source_engine = source_engine
|
||||
@@ -81,6 +82,7 @@ class CachingEngine:
|
||||
dialect=self._dialect,
|
||||
fetch_batch=fetch_batch if fetch_batch is not None else FETCH_BATCH_SIZE,
|
||||
pragmas=pragmas,
|
||||
datetime_columns=datetime_columns,
|
||||
)
|
||||
self._registry = ColumnRegistry(self._cache.connection)
|
||||
self._stats = StatsCollector()
|
||||
|
||||
Reference in New Issue
Block a user