feat: DuckDB two-file architecture — resolve SQLMesh/web-app lock contention
Split the single lakehouse.duckdb into two files to eliminate the exclusive write-lock conflict between SQLMesh (pipeline) and the Quart web app (reader): lakehouse.duckdb — SQLMesh exclusive (all pipeline layers) serving.duckdb — web app reads (serving tables only, atomically swapped) Changes: web/src/beanflows/analytics.py - Replace persistent global _conn with per-thread connections (threading.local) - Add _get_conn(): opens read_only=True on first call per thread, reopens automatically on inode change (~1μs os.stat) to pick up atomic file swaps - Switch env var from DUCKDB_PATH → SERVING_DUCKDB_PATH - Add module docstring documenting architecture + DuckLake migration path web/src/beanflows/app.py - Startup check: use SERVING_DUCKDB_PATH - Health check: use _db_path instead of _conn src/materia/export_serving.py (new) - Reads all serving.* tables from lakehouse.duckdb (read_only) - Writes to serving_new.duckdb, then os.rename → serving.duckdb (atomic) - ~50 lines; runs after each SQLMesh transform src/materia/pipelines.py - Add export_serving pipeline entry (uv run python -c ...) infra/supervisor/supervisor.sh - Add SERVING_DUCKDB_PATH env var comment - Add export step: uv run materia pipeline run export_serving infra/supervisor/materia-supervisor.service - Add Environment=SERVING_DUCKDB_PATH=/data/materia/serving.duckdb infra/bootstrap_supervisor.sh - Add SERVING_DUCKDB_PATH to .env template web/.env.example + web/docker-compose.yml - Document both env vars; switch web service to SERVING_DUCKDB_PATH web/src/beanflows/dashboard/templates/settings.html - Minor settings page fix from prior session Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
59
src/materia/export_serving.py
Normal file
59
src/materia/export_serving.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""
|
||||
Export serving tables from lakehouse.duckdb to serving.duckdb (atomic swap).
|
||||
|
||||
Called by the supervisor after each SQLMesh transform run. Reads all tables in
|
||||
the 'serving' schema from the pipeline DB (DUCKDB_PATH / lakehouse.duckdb),
|
||||
writes them to a temp file, then atomically renames it to the serving DB path
|
||||
(SERVING_DUCKDB_PATH / serving.duckdb).
|
||||
|
||||
The web app's _get_conn() detects the inode change on the next query and
|
||||
reopens the connection automatically — no restart or signal required.
|
||||
|
||||
Usage:
|
||||
DUCKDB_PATH=lakehouse.duckdb SERVING_DUCKDB_PATH=serving.duckdb \
|
||||
uv run materia pipeline run export_serving
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
|
||||
import duckdb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def export_serving() -> None:
    """Copy all serving.* tables from the pipeline DB to the serving DB atomically.

    Reads every table in the ``serving`` schema of the pipeline database
    (``DUCKDB_PATH``), rewrites them into a temporary DuckDB file next to the
    target, then atomically renames the temp file over ``SERVING_DUCKDB_PATH``.
    Readers that still hold the old file open are unaffected; the web app's
    ``_get_conn()`` notices the inode change on its next query and reopens.

    Raises:
        RuntimeError: if a required environment variable is unset, the
            pipeline DB file is missing, or the serving schema is empty.
    """
    pipeline_path = os.getenv("DUCKDB_PATH", "")
    serving_path = os.getenv("SERVING_DUCKDB_PATH", "")
    # Explicit raises, not assert: asserts are stripped under `python -O`,
    # which would silently skip all of this validation.
    if not pipeline_path:
        raise RuntimeError("DUCKDB_PATH must be set")
    if not serving_path:
        raise RuntimeError("SERVING_DUCKDB_PATH must be set")
    if not os.path.exists(pipeline_path):
        raise RuntimeError(f"Pipeline DB not found: {pipeline_path}")

    tmp_path = serving_path + ".tmp"
    # A stale temp file left by a crashed previous run would make connect()
    # open a half-written database; always start from a clean slate.
    try:
        os.remove(tmp_path)
    except OSError:
        pass

    src = duckdb.connect(pipeline_path, read_only=True)
    try:
        tables = src.sql(
            "SELECT table_name FROM information_schema.tables "
            "WHERE table_schema = 'serving' ORDER BY table_name"
        ).fetchall()
        if not tables:
            raise RuntimeError(f"No tables found in serving schema of {pipeline_path}")
        logger.info(
            "Exporting %d serving tables: %s", len(tables), [t[0] for t in tables]
        )

        dst = duckdb.connect(tmp_path)
        try:
            # BUG FIX: the destination connection must ATTACH the pipeline DB
            # under the alias `src` — otherwise `src.serving.<table>` below
            # does not resolve in dst's catalog and every copy fails.
            dst.execute(f"ATTACH '{pipeline_path}' AS src (READ_ONLY)")
            dst.execute("CREATE SCHEMA IF NOT EXISTS serving")
            for (table,) in tables:
                # Table names come from information_schema, not user input,
                # but quote them so unusual identifiers still round-trip.
                dst.execute(
                    f'CREATE OR REPLACE TABLE serving."{table}" AS '
                    f'SELECT * FROM src.serving."{table}"',
                )
                row_count = dst.sql(
                    f'SELECT count(*) FROM serving."{table}"'
                ).fetchone()[0]
                logger.info("  serving.%s: %s rows", table, f"{row_count:,}")
        finally:
            dst.close()
    except BaseException:
        # Don't leave a partially-written temp DB behind on any failure
        # (including KeyboardInterrupt); best-effort cleanup, then re-raise.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    finally:
        src.close()

    # os.replace() is atomic on POSIX when src and dst are on the same
    # filesystem, and — unlike os.rename() — also overwrites an existing
    # destination on Windows, so the swap is portable.
    os.replace(tmp_path, serving_path)
    logger.info("Serving DB atomically updated: %s", serving_path)
|
||||
@@ -48,6 +48,14 @@ PIPELINES = {
|
||||
"command": ["uv", "run", "--package", "sqlmesh_materia", "sqlmesh", "-p", "transform/sqlmesh_materia", "plan", "prod", "--no-prompts", "--auto-apply"],
|
||||
"timeout_seconds": 3600,
|
||||
},
|
||||
# Copies serving.* tables from lakehouse.duckdb → serving.duckdb (atomic swap).
|
||||
# Run after every transform. Requires both DUCKDB_PATH and SERVING_DUCKDB_PATH.
|
||||
"export_serving": {
|
||||
"command": ["uv", "run", "python", "-c",
|
||||
"import logging; logging.basicConfig(level=logging.INFO); "
|
||||
"from materia.export_serving import export_serving; export_serving()"],
|
||||
"timeout_seconds": 300,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user