fix(analytics): directory bind mount + inode-based auto-reopen
All checks were successful
CI / test (push) Successful in 51s
CI / tag (push) Successful in 3s

- docker-compose.prod.yml: replace file bind mount for analytics.duckdb
  with directory bind mount (/opt/padelnomics/data:/app/data/pipeline:ro)
  so os.rename() on the host is visible inside the container
- Override SERVING_DUCKDB_PATH to /app/data/pipeline/analytics.duckdb in
  all 6 blue/green services (removes dependency on .env value)
- analytics.py: track file inode; call _check_and_reopen() at start of
  each query — transparently picks up new analytics.duckdb without restart
  when export_serving.py atomically replaces it after each pipeline run

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-28 21:48:20 +01:00
parent d1a10ff243
commit e0c3f38c0a
2 changed files with 78 additions and 16 deletions

View File

@@ -59,10 +59,10 @@ services:
     env_file: ./.env
     environment:
       - DATABASE_PATH=/app/data/app.db
-      - SERVING_DUCKDB_PATH=/app/data/analytics.duckdb
+      - SERVING_DUCKDB_PATH=/app/data/pipeline/analytics.duckdb
     volumes:
       - app-data:/app/data
-      - /data/padelnomics/analytics.duckdb:/app/data/analytics.duckdb:ro
+      - /opt/padelnomics/data:/app/data/pipeline:ro
     networks:
       - net
     healthcheck:
@@ -81,10 +81,10 @@ services:
     env_file: ./.env
     environment:
       - DATABASE_PATH=/app/data/app.db
-      - SERVING_DUCKDB_PATH=/app/data/analytics.duckdb
+      - SERVING_DUCKDB_PATH=/app/data/pipeline/analytics.duckdb
     volumes:
       - app-data:/app/data
-      - /data/padelnomics/analytics.duckdb:/app/data/analytics.duckdb:ro
+      - /opt/padelnomics/data:/app/data/pipeline:ro
     networks:
       - net
@@ -97,10 +97,10 @@ services:
     env_file: ./.env
     environment:
       - DATABASE_PATH=/app/data/app.db
-      - SERVING_DUCKDB_PATH=/app/data/analytics.duckdb
+      - SERVING_DUCKDB_PATH=/app/data/pipeline/analytics.duckdb
     volumes:
       - app-data:/app/data
-      - /data/padelnomics/analytics.duckdb:/app/data/analytics.duckdb:ro
+      - /opt/padelnomics/data:/app/data/pipeline:ro
     networks:
       - net
@@ -114,10 +114,10 @@ services:
     env_file: ./.env
     environment:
       - DATABASE_PATH=/app/data/app.db
-      - SERVING_DUCKDB_PATH=/app/data/analytics.duckdb
+      - SERVING_DUCKDB_PATH=/app/data/pipeline/analytics.duckdb
     volumes:
       - app-data:/app/data
-      - /data/padelnomics/analytics.duckdb:/app/data/analytics.duckdb:ro
+      - /opt/padelnomics/data:/app/data/pipeline:ro
     networks:
       - net
     healthcheck:
@@ -136,10 +136,10 @@ services:
    env_file: ./.env
     environment:
       - DATABASE_PATH=/app/data/app.db
-      - SERVING_DUCKDB_PATH=/app/data/analytics.duckdb
+      - SERVING_DUCKDB_PATH=/app/data/pipeline/analytics.duckdb
     volumes:
       - app-data:/app/data
-      - /data/padelnomics/analytics.duckdb:/app/data/analytics.duckdb:ro
+      - /opt/padelnomics/data:/app/data/pipeline:ro
     networks:
       - net
@@ -152,10 +152,10 @@ services:
     env_file: ./.env
     environment:
       - DATABASE_PATH=/app/data/app.db
-      - SERVING_DUCKDB_PATH=/app/data/analytics.duckdb
+      - SERVING_DUCKDB_PATH=/app/data/pipeline/analytics.duckdb
     volumes:
       - app-data:/app/data
-      - /data/padelnomics/analytics.duckdb:/app/data/analytics.duckdb:ro
+      - /opt/padelnomics/data:/app/data/pipeline:ro
     networks:
       - net

View File

@@ -4,6 +4,10 @@ DuckDB read-only analytics reader.

 Opens a single long-lived DuckDB connection at startup (read_only=True).
 All queries run via asyncio.to_thread() to avoid blocking the event loop.
+When export_serving.py atomically renames a new analytics.duckdb into place,
+_check_and_reopen() detects the inode change and transparently reopens —
+no app restart required.

 Usage:
     from .analytics import fetch_analytics, execute_user_query
@@ -14,6 +18,7 @@ Usage:
 import asyncio
 import logging
 import os
+import threading
 import time
 from pathlib import Path
 from typing import Any
@@ -21,6 +26,8 @@ from typing import Any

 logger = logging.getLogger(__name__)

 _conn = None  # duckdb.DuckDBPyConnection | None — lazy import
+_conn_inode: int | None = None
+_reopen_lock = threading.Lock()
 _DUCKDB_PATH = os.environ.get("SERVING_DUCKDB_PATH", "data/analytics.duckdb")

 # DuckDB queries run in the asyncio thread pool. Cap them so a slow scan
@@ -32,20 +39,67 @@
 def open_analytics_db() -> None:
     """Open the DuckDB connection. Call once at app startup."""
     import duckdb

-    global _conn
+    global _conn, _conn_inode
     path = Path(_DUCKDB_PATH)
     if not path.exists():
         # Database doesn't exist yet — skip silently. Queries will return empty.
         return
     _conn = duckdb.connect(str(path), read_only=True)
+    _conn_inode = path.stat().st_ino


 def close_analytics_db() -> None:
     """Close the DuckDB connection. Call at app shutdown."""
-    global _conn
+    global _conn, _conn_inode
     if _conn is not None:
         _conn.close()
         _conn = None
+        _conn_inode = None
def _check_and_reopen() -> None:
    """Swap in a fresh read-only connection when analytics.duckdb is replaced.

    export_serving.py publishes a new database by atomically renaming it over
    the old file, which gives the path a new inode. Comparing the current
    inode against the one recorded at open time tells us whether a reopen is
    needed. Invoked at the start of every query; requires the host directory
    (not the file itself) to be bind-mounted so os.stat() inside the
    container observes the post-rename inode.
    """
    global _conn, _conn_inode
    import duckdb

    db_path = Path(_DUCKDB_PATH)

    def _inode_now() -> int | None:
        # None means "file missing or unreadable" — keep the current handle.
        try:
            return db_path.stat().st_ino
        except OSError:
            return None

    # Lock-free fast path: the common case is "nothing changed".
    latest = _inode_now()
    if latest is None or latest == _conn_inode:
        return

    with _reopen_lock:
        # Re-check under the lock so only one thread performs the reopen.
        latest = _inode_now()
        if latest is None or latest == _conn_inode:
            return
        stale = _conn
        try:
            fresh = duckdb.connect(str(db_path), read_only=True)
        except Exception:
            # Keep serving from the old connection rather than failing queries.
            logger.exception("Failed to reopen analytics DB after file change")
            return
        _conn = fresh
        _conn_inode = latest
        logger.info("Analytics DB reopened (inode changed to %d)", latest)
        if stale is not None:
            # Best-effort close; a query racing on the old handle may still
            # hold it — presumably DuckDB tolerates close-after-use here.
            try:
                stale.close()
            except Exception:
                pass
 async def fetch_analytics(sql: str, params: list | None = None) -> list[dict[str, Any]]:
@@ -61,7 +115,11 @@ async def fetch_analytics(sql: str, params: list | None = None) -> list[dict[str
         return []

     def _run() -> list[dict]:
-        cur = _conn.cursor()
+        _check_and_reopen()
+        conn = _conn
+        if conn is None:
+            return []
+        cur = conn.cursor()
         try:
             rel = cur.execute(sql, params or [])
             cols = [d[0] for d in rel.description]
@@ -104,8 +162,12 @@ async def execute_user_query(
         return [], [], "Analytics database is not available.", 0.0

     def _run() -> tuple[list[str], list[tuple], str | None, float]:
+        _check_and_reopen()
+        conn = _conn
+        if conn is None:
+            return [], [], "Analytics database is not available.", 0.0
         t0 = time.monotonic()
-        cur = _conn.cursor()
+        cur = conn.cursor()
         try:
             rel = cur.execute(sql)
             cols = [d[0] for d in rel.description]