feat: DuckDB two-file architecture — resolve SQLMesh/web-app lock contention

Split the single lakehouse.duckdb into two files to eliminate the exclusive
write-lock conflict between SQLMesh (pipeline) and the Quart web app (reader):

  lakehouse.duckdb — SQLMesh exclusive (all pipeline layers)
  serving.duckdb   — web app reads (serving tables only, atomically swapped)

Changes:

web/src/beanflows/analytics.py
- Replace persistent global _conn with per-thread connections (threading.local)
- Add _get_conn(): opens read_only=True on first call per thread, reopens
  automatically on inode change (~1μs os.stat) to pick up atomic file swaps
- Switch env var from DUCKDB_PATH → SERVING_DUCKDB_PATH
- Add module docstring documenting architecture + DuckLake migration path

web/src/beanflows/app.py
- Startup check: use SERVING_DUCKDB_PATH
- Health check: use _db_path instead of _conn

src/materia/export_serving.py (new)
- Reads all serving.* tables from lakehouse.duckdb (read_only)
- Writes to a temp file next to the target (serving.duckdb.tmp), then
  atomically renames it over serving.duckdb
- ~50 lines; runs after each SQLMesh transform

src/materia/pipelines.py
- Add export_serving pipeline entry (uv run python -c ...)

infra/supervisor/supervisor.sh
- Add SERVING_DUCKDB_PATH env var comment
- Add export step: uv run materia pipeline run export_serving

infra/supervisor/materia-supervisor.service
- Add Environment=SERVING_DUCKDB_PATH=/data/materia/serving.duckdb

infra/bootstrap_supervisor.sh
- Add SERVING_DUCKDB_PATH to .env template

web/.env.example + web/docker-compose.yml
- Document both env vars; switch web service to SERVING_DUCKDB_PATH

web/src/beanflows/dashboard/templates/settings.html
- Minor settings page fix from prior session

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-22 11:06:55 +01:00
parent ca7b2ab18b
commit b899bcbad4
10 changed files with 159 additions and 26 deletions
--- a/src/materia/export_serving.py
+++ b/src/materia/export_serving.py
@@ -0,0 +1,130 @@
+r"""
+Export serving tables from lakehouse.duckdb to serving.duckdb (atomic swap).
+
+Called by the supervisor after each SQLMesh transform run. Reads everything
+that information_schema.tables lists for the 'serving' schema of the pipeline
+DB (DUCKDB_PATH / lakehouse.duckdb), writes it as tables into a temp file next
+to the target, then atomically renames that file onto the serving DB path
+(SERVING_DUCKDB_PATH / serving.duckdb).
+
+The web app's _get_conn() detects the inode change on the next query and
+reopens the connection automatically — no restart or signal required.
+
+Usage:
+    DUCKDB_PATH=lakehouse.duckdb SERVING_DUCKDB_PATH=serving.duckdb \
+        uv run materia pipeline run export_serving
+"""
+import contextlib
+import logging
+import os
+
+import duckdb
+
+logger = logging.getLogger(__name__)
+
+# Catalog alias under which the temp output file is attached. An explicit alias
+# keeps every target name three-part, so nothing depends on the catalog name
+# DuckDB would derive from the file name (a catalog called "serving" would be
+# ambiguous with the 'serving' schema in two-part names).
+_DST_ALIAS = "export_dst"
+
+
+def _sql_ident(name: str) -> str:
+    """Return *name* as a double-quoted SQL identifier."""
+    return '"' + name.replace('"', '""') + '"'
+
+
+def _sql_literal(value: str) -> str:
+    """Return *value* as a single-quoted SQL string literal."""
+    return "'" + value.replace("'", "''") + "'"
+
+
+def _remove_if_exists(*paths: str) -> None:
+    """Delete each path, silently skipping any that do not exist."""
+    for path in paths:
+        with contextlib.suppress(FileNotFoundError):
+            os.remove(path)
+
+
+def _copy_serving_tables(pipeline_path: str, tmp_path: str) -> None:
+    """Write every serving.* relation of *pipeline_path* as a table in *tmp_path*.
+
+    Raises:
+        RuntimeError: if the pipeline DB has nothing in its 'serving' schema.
+    """
+    # One in-memory connection with both files attached: SQL run on one DuckDB
+    # connection cannot reference a database that only another one has open.
+    con = duckdb.connect()
+    try:
+        # No alias on purpose: the catalog keeps its file-derived name, as it
+        # would with duckdb.connect(pipeline_path), so a view that spells out
+        # its own catalog still resolves.
+        con.execute(f"ATTACH {_sql_literal(pipeline_path)} (READ_ONLY)")
+        # Nothing else with a 'serving' schema is attached yet, so every row
+        # comes from the pipeline DB and table_catalog is the name it was given.
+        found = con.execute(
+            "SELECT table_catalog, table_name FROM information_schema.tables "
+            "WHERE table_schema = 'serving' ORDER BY table_name"
+        ).fetchall()
+        if not found:
+            raise RuntimeError(f"No tables found in serving schema of {pipeline_path}")
+        src = _sql_ident(found[0][0])
+        tables = [name for _, name in found]
+        logger.info(f"Exporting {len(tables)} serving tables: {tables}")
+
+        dst = _sql_ident(_DST_ALIAS)
+        con.execute(f"ATTACH {_sql_literal(tmp_path)} AS {dst}")
+        con.execute(f"CREATE SCHEMA IF NOT EXISTS {dst}.serving")
+        for name in tables:
+            table = _sql_ident(name)
+            con.execute(
+                f"CREATE OR REPLACE TABLE {dst}.serving.{table} AS "
+                f"SELECT * FROM {src}.serving.{table}"
+            )
+            count_sql = f"SELECT count(*) FROM {dst}.serving.{table}"
+            row_count = con.execute(count_sql).fetchone()[0]
+            logger.info(f"  serving.{name}: {row_count:,} rows")
+        # Fold the write-ahead log into the main file before it gets renamed.
+        con.execute(f"CHECKPOINT {dst}")
+    finally:
+        con.close()
+
+
+def export_serving() -> None:
+    """Copy all serving.* tables from the pipeline DB to the serving DB atomically.
+
+    Source and target come from the DUCKDB_PATH and SERVING_DUCKDB_PATH
+    environment variables. The copy is built in "<SERVING_DUCKDB_PATH>.tmp" and
+    moved over the target only after every table has been written.
+
+    Raises:
+        ValueError: if DUCKDB_PATH or SERVING_DUCKDB_PATH is unset or empty.
+        FileNotFoundError: if the pipeline DB file does not exist.
+        RuntimeError: if the pipeline DB has nothing in its 'serving' schema.
+    """
+    pipeline_path = os.getenv("DUCKDB_PATH", "")
+    serving_path = os.getenv("SERVING_DUCKDB_PATH", "")
+    # Explicit raises instead of assert: asserts are stripped under `python -O`.
+    if not pipeline_path:
+        raise ValueError("DUCKDB_PATH must be set")
+    if not serving_path:
+        raise ValueError("SERVING_DUCKDB_PATH must be set")
+    if not os.path.exists(pipeline_path):
+        raise FileNotFoundError(f"Pipeline DB not found: {pipeline_path}")
+
+    # Sibling of the target, so the final rename never crosses filesystems.
+    tmp_path = serving_path + ".tmp"
+    scratch_files = (tmp_path, tmp_path + ".wal")
+    # An interrupted earlier run may have left a partial file behind; start
+    # clean so tables dropped upstream cannot linger in the new export.
+    _remove_if_exists(*scratch_files)
+
+    try:
+        _copy_serving_tables(pipeline_path, tmp_path)
+        # Atomic on POSIX when both paths share a filesystem; unlike os.rename,
+        # os.replace also overwrites an existing target on Windows.
+        os.replace(tmp_path, serving_path)
+    except Exception:
+        # Do not leave a half-written temp DB lying around after a failure.
+        _remove_if_exists(*scratch_files)
+        raise
+    logger.info(f"Serving DB atomically updated: {serving_path}")
--- a/src/materia/pipelines.py
+++ b/src/materia/pipelines.py
@@ -48,6 +48,14 @@ PIPELINES = {
        "command": ["uv", "run", "--package", "sqlmesh_materia", "sqlmesh", "-p", "transform/sqlmesh_materia", "plan", "prod", "--no-prompts", "--auto-apply"],
        "timeout_seconds": 3600,
    },
+    # Copies serving.* tables from lakehouse.duckdb → serving.duckdb (atomic swap).
+    # Run after every transform. Requires both DUCKDB_PATH and SERVING_DUCKDB_PATH.
+    "export_serving": {
+        "command": ["uv", "run", "python", "-c",
+                    "import logging; logging.basicConfig(level=logging.INFO); "
+                    "from materia.export_serving import export_serving; export_serving()"],
+        "timeout_seconds": 300,
+    },
 }