feat(pipeline): scaffold Pipeline Console blueprint + sidebar + app registration
- New pipeline_routes.py blueprint (url_prefix=/admin/pipeline) with:
  - All 9 routes (dashboard, overview, extractions, catalog, query editor)
  - Data access functions: state DB (sync + to_thread), serving meta, landing FS, workflows.toml
- execute_user_query() added to analytics.py (columns + rows + error + elapsed_ms)
- Query security: blocklist regex, 10k char limit, 1000 row cap, 10s timeout
- Add 'Pipeline' sidebar section to base_admin.html (between Analytics and System)
- Register pipeline_bp in app.py
- Add run_extraction task handler to worker.py

Subtask 1 of 6

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
web/src/padelnomics/admin/pipeline_routes.py (new file, 699 lines)
@@ -0,0 +1,699 @@
"""
Pipeline Console admin blueprint.

Operational visibility for the data extraction and transformation pipeline:

    /admin/pipeline/                            → dashboard (health stats + tab container)
    /admin/pipeline/overview                    → HTMX tab: extraction status, serving freshness, landing stats
    /admin/pipeline/extractions                 → HTMX tab: filterable extraction run history
    /admin/pipeline/extractions/<id>/mark-stale → POST: mark stuck "running" row as failed
    /admin/pipeline/extract/trigger             → POST: enqueue full extraction run
    /admin/pipeline/catalog                     → HTMX tab: data catalog (tables, columns, sample data)
    /admin/pipeline/catalog/<table>             → HTMX partial: table detail (columns + sample)
    /admin/pipeline/query                       → HTMX tab: SQL query editor
    /admin/pipeline/query/execute               → POST: run user SQL, return results table

Data sources:
    - data/landing/.state.sqlite (extraction run history — stdlib sqlite3, sync via to_thread)
    - SERVING_DUCKDB_PATH/../_serving_meta.json (export timestamp + per-table row counts)
    - analytics.duckdb (DuckDB read-only via analytics.execute_user_query)
    - LANDING_DIR/ (filesystem scan for file sizes + dates)
    - infra/supervisor/workflows.toml (schedule definitions — tomllib, stdlib)
"""
import asyncio
import json
import logging
import os
import re
import sqlite3
import sys
import time
from datetime import UTC, datetime, timedelta
from pathlib import Path

from quart import Blueprint, flash, redirect, render_template, request, url_for

from ..auth.routes import role_required
from ..core import csrf_protect

logger = logging.getLogger(__name__)

bp = Blueprint(
    "pipeline",
    __name__,
    template_folder=str(Path(__file__).parent / "templates"),
    url_prefix="/admin/pipeline",
)

# ── Config ────────────────────────────────────────────────────────────────────

_LANDING_DIR = os.environ.get("LANDING_DIR", "data/landing")
_SERVING_DUCKDB_PATH = os.environ.get("SERVING_DUCKDB_PATH", "data/analytics.duckdb")

# Repo root: web/src/padelnomics/admin/ → up 4 levels
_REPO_ROOT = Path(__file__).resolve().parents[4]
_WORKFLOWS_TOML = _REPO_ROOT / "infra" / "supervisor" / "workflows.toml"

# A "running" row older than this is considered stale/crashed.
_STALE_THRESHOLD_HOURS = 2

# Query editor limits
_QUERY_MAX_CHARS = 10_000
_QUERY_MAX_ROWS = 1_000
_QUERY_TIMEOUT_SECONDS = 10

# Blocked SQL keywords (read-only connection enforces engine-level, this adds belt+suspenders)
_BLOCKED_SQL_RE = re.compile(
    r"\b(ATTACH|COPY|EXPORT|INSTALL|LOAD|CREATE|DROP|ALTER|INSERT|UPDATE|DELETE|GRANT|REVOKE|PRAGMA)\b",
    re.IGNORECASE,
)
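# NB: intentionally coarse. The pattern also rejects a SELECT that merely contains
# a blocked keyword inside a string literal; acceptable for an admin-only tool,
# since the read-only DuckDB connection is the real enforcement layer.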


# ── Sidebar data injection (same pattern as pseo_routes.py) ──────────────────


@bp.before_request
async def _inject_sidebar_data():
    """Load unread inbox count for the admin sidebar badge."""
    from quart import g

    from ..core import fetch_one

    try:
        row = await fetch_one("SELECT COUNT(*) as cnt FROM inbound_emails WHERE is_read = 0")
        g.admin_unread_count = row["cnt"] if row else 0
    except Exception:
        g.admin_unread_count = 0


@bp.context_processor
def _admin_context():
    from quart import g

    return {"unread_count": getattr(g, "admin_unread_count", 0)}


# ── Data access: state DB (sync, called via to_thread) ────────────────────────


def _state_db_path() -> Path:
    return Path(_LANDING_DIR) / ".state.sqlite"


def _fetch_extraction_summary_sync() -> dict:
    """Aggregate stats from extraction_runs: total, success, failed, stale counts."""
    db_path = _state_db_path()
    if not db_path.exists():
        return {"total": 0, "success": 0, "failed": 0, "running": 0, "stale": 0}

    cutoff = (datetime.now(UTC) - timedelta(hours=_STALE_THRESHOLD_HOURS)).strftime(
        "%Y-%m-%dT%H:%M:%SZ"
    )
    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row
    try:
        row = conn.execute(
            """
            SELECT
                COUNT(*) AS total,
                SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) AS success,
                SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) AS failed,
                SUM(CASE WHEN status = 'running' THEN 1 ELSE 0 END) AS running,
                SUM(CASE WHEN status = 'running' AND started_at < ? THEN 1 ELSE 0 END) AS stale
            FROM extraction_runs
            """,
            (cutoff,),
        ).fetchone()
        return {
            "total": row["total"] or 0,
            "success": row["success"] or 0,
            "failed": row["failed"] or 0,
            "running": row["running"] or 0,
            "stale": row["stale"] or 0,
        }
    finally:
        conn.close()


def _fetch_latest_per_extractor_sync() -> list[dict]:
    """Return most recent run for each extractor name."""
    db_path = _state_db_path()
    if not db_path.exists():
        return []

    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row
    try:
        rows = conn.execute(
            """
            SELECT r.*
            FROM extraction_runs r
            INNER JOIN (
                SELECT extractor, MAX(run_id) AS max_id
                FROM extraction_runs
                GROUP BY extractor
            ) latest ON r.run_id = latest.max_id
            ORDER BY r.extractor
            """
        ).fetchall()
        return [dict(r) for r in rows]
    finally:
        conn.close()


def _fetch_extraction_runs_sync(
    *,
    extractor: str = "",
    status: str = "",
    limit: int = 50,
    offset: int = 0,
) -> tuple[list[dict], int]:
    """Return (rows, total_count) for the filtered run history."""
    assert 1 <= limit <= 200, f"limit must be 1–200, got {limit}"
    assert offset >= 0, f"offset must be >= 0, got {offset}"

    db_path = _state_db_path()
    if not db_path.exists():
        return [], 0

    where_clauses = []
    params: list = []
    if extractor:
        where_clauses.append("extractor = ?")
        params.append(extractor)
    if status:
        where_clauses.append("status = ?")
        params.append(status)

    where_sql = ("WHERE " + " AND ".join(where_clauses)) if where_clauses else ""

    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row
    try:
        total = conn.execute(
            f"SELECT COUNT(*) FROM extraction_runs {where_sql}", params
        ).fetchone()[0]

        rows = conn.execute(
            f"""
            SELECT run_id, extractor, started_at, finished_at, status,
                   files_written, files_skipped, bytes_written,
                   cursor_value, error_message
            FROM extraction_runs {where_sql}
            ORDER BY run_id DESC
            LIMIT ? OFFSET ?
            """,
            params + [limit, offset],
        ).fetchall()
        return [dict(r) for r in rows], total
    finally:
        conn.close()


def _fetch_distinct_extractors_sync() -> list[str]:
    """Return distinct extractor names for filter dropdowns."""
    db_path = _state_db_path()
    if not db_path.exists():
        return []
    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            "SELECT DISTINCT extractor FROM extraction_runs ORDER BY extractor"
        ).fetchall()
        return [r[0] for r in rows]
    finally:
        conn.close()


def _mark_run_failed_sync(run_id: int) -> bool:
    """Mark a stuck 'running' row as 'failed'. Returns True if row was updated."""
    assert run_id > 0, f"run_id must be positive, got {run_id}"
    db_path = _state_db_path()
    if not db_path.exists():
        return False
    conn = sqlite3.connect(str(db_path))
    try:
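        # The status guard below makes this idempotent: if the run has already
        # finished (or was marked failed by someone else), rowcount stays 0.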
        cur = conn.execute(
            """
            UPDATE extraction_runs
            SET status = 'failed',
                finished_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
                error_message = 'Marked failed manually (admin — process appeared stuck)'
            WHERE run_id = ? AND status = 'running'
            """,
            (run_id,),
        )
        conn.commit()
        return cur.rowcount > 0
    finally:
        conn.close()


# ── Data access: serving meta ─────────────────────────────────────────────────


def _load_serving_meta() -> dict | None:
    """Read _serving_meta.json alongside analytics.duckdb. Returns None if absent."""
    meta_path = Path(_SERVING_DUCKDB_PATH).parent / "_serving_meta.json"
    if not meta_path.exists():
        return None
    try:
        return json.loads(meta_path.read_text())
    except Exception:
        logger.warning("Failed to read _serving_meta.json", exc_info=True)
        return None


# ── Data access: landing zone filesystem ─────────────────────────────────────


def _get_landing_zone_stats_sync() -> list[dict]:
    """Scan LANDING_DIR and return per-source file counts + total bytes."""
    landing = Path(_LANDING_DIR)
    if not landing.exists():
        return []

    sources = []
    for source_dir in sorted(landing.iterdir()):
        if not source_dir.is_dir() or source_dir.name.startswith("."):
            continue
        files = list(source_dir.rglob("*.gz")) + list(source_dir.rglob("*.jsonl"))
        total_bytes = sum(f.stat().st_size for f in files)
        latest_mtime = max((f.stat().st_mtime for f in files), default=None)
        sources.append({
            "name": source_dir.name,
            "file_count": len(files),
            "total_bytes": total_bytes,
            "latest_mtime": (
                datetime.fromtimestamp(latest_mtime, tz=UTC).strftime("%Y-%m-%d %H:%M")
                if latest_mtime
                else None
            ),
        })
    return sources


# ── Data access: workflows.toml ───────────────────────────────────────────────

_SCHEDULE_LABELS = {
    "hourly": "Every hour",
    "daily": "Daily",
    "weekly": "Weekly",
    "monthly": "Monthly",
}


def _load_workflows() -> list[dict]:
    """Parse workflows.toml and return workflow definitions with human schedule labels."""
    if not _WORKFLOWS_TOML.exists():
        return []

    if sys.version_info >= (3, 11):
        import tomllib

        data = tomllib.loads(_WORKFLOWS_TOML.read_text())
    else:
        # Fallback for older Python (shouldn't happen — project requires 3.11+)
        try:
            import tomli as tomllib  # type: ignore[no-redef]

            data = tomllib.loads(_WORKFLOWS_TOML.read_text())
        except ImportError:
            return []

    workflows = []
    for name, config in data.items():
        schedule = config.get("schedule", "")
        schedule_label = _SCHEDULE_LABELS.get(schedule, schedule)
        workflows.append({
            "name": name,
            "module": config.get("module", ""),
            "schedule": schedule,
            "schedule_label": schedule_label,
            "depends_on": config.get("depends_on", []),
        })
    return workflows


# ── Route helpers ─────────────────────────────────────────────────────────────


def _format_bytes(n: int) -> str:
    """Human-readable byte count."""
    if n < 1024:
        return f"{n} B"
    if n < 1024 * 1024:
        return f"{n / 1024:.1f} KB"
    return f"{n / 1024 / 1024:.1f} MB"


def _duration_str(started_at: str | None, finished_at: str | None) -> str:
    """Return human-readable duration, or '' if unavailable."""
    if not started_at or not finished_at:
        return ""
    try:
        fmt = "%Y-%m-%dT%H:%M:%SZ"
        start = datetime.strptime(started_at, fmt)
        end = datetime.strptime(finished_at, fmt)
        delta = int((end - start).total_seconds())
        if delta < 60:
            return f"{delta}s"
        return f"{delta // 60}m {delta % 60}s"
    except ValueError:
        return ""


def _is_stale(run: dict) -> bool:
    """True if a 'running' row has been stuck longer than the stale threshold."""
    if run.get("status") != "running":
        return False
    started = run.get("started_at", "")
    if not started:
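        # No start timestamp at all: treat the row as stale so it can be cleaned up.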
        return True
    try:
        fmt = "%Y-%m-%dT%H:%M:%SZ"
        start = datetime.strptime(started, fmt).replace(tzinfo=UTC)
        return (datetime.now(UTC) - start) > timedelta(hours=_STALE_THRESHOLD_HOURS)
    except ValueError:
        return False


# ── Dashboard ─────────────────────────────────────────────────────────────────


@bp.route("/")
@role_required("admin")
async def pipeline_dashboard():
    """Main page: health stat cards + tab container."""
    summary = await asyncio.to_thread(_fetch_extraction_summary_sync)
    serving_meta = await asyncio.to_thread(_load_serving_meta)

    total_serving_tables = len(serving_meta.get("tables", {})) if serving_meta else 0
    last_export = serving_meta.get("exported_at_utc", "")[:19].replace("T", " ") if serving_meta else "—"

    success_rate = 0
    if summary["total"] > 0:
        success_rate = round(100 * summary["success"] / summary["total"])

    return await render_template(
        "admin/pipeline.html",
        summary=summary,
        success_rate=success_rate,
        total_serving_tables=total_serving_tables,
        last_export=last_export,
        admin_page="pipeline",
    )


# ── Overview tab ─────────────────────────────────────────────────────────────


@bp.route("/overview")
@role_required("admin")
async def pipeline_overview():
    """HTMX tab: extraction status per source, serving freshness, landing zone."""
    latest_runs, landing_stats, workflows, serving_meta = await asyncio.gather(
        asyncio.to_thread(_fetch_latest_per_extractor_sync),
        asyncio.to_thread(_get_landing_zone_stats_sync),
        asyncio.to_thread(_load_workflows),
        asyncio.to_thread(_load_serving_meta),
    )

    # Build a lookup: extractor name → latest run
    latest_by_name = {r["extractor"]: r for r in latest_runs}

    # Enrich each workflow with its latest run data
    workflow_rows = []
    for wf in workflows:
        run = latest_by_name.get(wf["name"])
        workflow_rows.append({
            "workflow": wf,
            "run": run,
            "stale": _is_stale(run) if run else False,
        })

    # Compute landing zone totals
    total_landing_bytes = sum(s["total_bytes"] for s in landing_stats)

    return await render_template(
        "admin/partials/pipeline_overview.html",
        workflow_rows=workflow_rows,
        landing_stats=landing_stats,
        total_landing_bytes=total_landing_bytes,
        serving_meta=serving_meta,
        format_bytes=_format_bytes,
    )


# ── Extractions tab ────────────────────────────────────────────────────────────


@bp.route("/extractions")
@role_required("admin")
async def pipeline_extractions():
    """HTMX tab: paginated + filtered extraction run history."""
    extractor_filter = request.args.get("extractor", "")
    status_filter = request.args.get("status", "")
    try:
        page = max(1, int(request.args.get("page", 1)))
    except ValueError:
        page = 1
    per_page = 30

    (runs, total), extractors = await asyncio.gather(
        asyncio.to_thread(
            _fetch_extraction_runs_sync,
            extractor=extractor_filter,
            status=status_filter,
            limit=per_page,
            offset=(page - 1) * per_page,
        ),
        asyncio.to_thread(_fetch_distinct_extractors_sync),
    )

    # Enrich rows with computed fields
    for run in runs:
        run["duration"] = _duration_str(run.get("started_at"), run.get("finished_at"))
        run["bytes_label"] = _format_bytes(run.get("bytes_written") or 0)
        run["is_stale"] = _is_stale(run)

    total_pages = max(1, (total + per_page - 1) // per_page)

    return await render_template(
        "admin/partials/pipeline_extractions.html",
        runs=runs,
        total=total,
        page=page,
        per_page=per_page,
        total_pages=total_pages,
        extractors=extractors,
        extractor_filter=extractor_filter,
        status_filter=status_filter,
    )


@bp.route("/extractions/<int:run_id>/mark-stale", methods=["POST"])
@role_required("admin")
@csrf_protect
async def pipeline_mark_stale(run_id: int):
    """Mark a stuck 'running' extraction row as 'failed'."""
    updated = await asyncio.to_thread(_mark_run_failed_sync, run_id)
    if updated:
        await flash(f"Run #{run_id} marked as failed.", "success")
    else:
        await flash(f"Run #{run_id} could not be updated (not in 'running' state).", "warning")
    return redirect(url_for("pipeline.pipeline_extractions"))


# ── Trigger extraction ────────────────────────────────────────────────────────


@bp.route("/extract/trigger", methods=["POST"])
@role_required("admin")
@csrf_protect
async def pipeline_trigger_extract():
    """Enqueue a full pipeline extraction run."""
    from ..worker import enqueue

    await enqueue("run_extraction")
    await flash("Extraction run queued. Check the task queue for progress.", "success")
    return redirect(url_for("pipeline.pipeline_dashboard"))


# ── Catalog tab ───────────────────────────────────────────────────────────────


@bp.route("/catalog")
@role_required("admin")
async def pipeline_catalog():
    """HTMX tab: list serving tables with row counts + column counts."""
    from ..analytics import fetch_analytics

    schema_rows = await fetch_analytics(
        """
        SELECT table_name, column_name, data_type, ordinal_position
        FROM information_schema.columns
        WHERE table_schema = 'serving'
        ORDER BY table_name, ordinal_position
        """
    )

    # Group by table
    tables: dict[str, dict] = {}
    for row in schema_rows:
        tname = row["table_name"]
        if tname not in tables:
            tables[tname] = {"name": tname, "columns": [], "column_count": 0}
        tables[tname]["columns"].append({
            "name": row["column_name"],
            "type": row["data_type"],
        })
        tables[tname]["column_count"] += 1

    # Enrich with row counts from serving meta
    serving_meta = await asyncio.to_thread(_load_serving_meta)
    meta_counts = serving_meta.get("tables", {}) if serving_meta else {}
    for tname, tdata in tables.items():
        tdata["row_count"] = meta_counts.get(tname, {}).get("row_count")

    return await render_template(
        "admin/partials/pipeline_catalog.html",
        tables=list(tables.values()),
        serving_meta=serving_meta,
    )


@bp.route("/catalog/<table_name>")
@role_required("admin")
async def pipeline_table_detail(table_name: str):
    """HTMX partial: column list + 10-row sample for a serving table."""
    from ..analytics import fetch_analytics

    # Validate table name before using in SQL
    if not re.match(r"^[a-z_][a-z0-9_]*$", table_name):
        return "Invalid table name", 400

    # Confirm table exists in serving schema
    exists = await fetch_analytics(
        "SELECT 1 FROM information_schema.tables"
        " WHERE table_schema = 'serving' AND table_name = ?",
        [table_name],
    )
    if not exists:
        return f"Table serving.{table_name} not found", 404

    columns, sample = await asyncio.gather(
        fetch_analytics(
            "SELECT column_name, data_type, ordinal_position"
            " FROM information_schema.columns"
            " WHERE table_schema = 'serving' AND table_name = ?"
            " ORDER BY ordinal_position",
            [table_name],
        ),
        fetch_analytics(
            f"SELECT * FROM serving.{table_name} LIMIT 10"  # noqa: S608
        ),
    )

    return await render_template(
        "admin/partials/pipeline_table_detail.html",
        table_name=table_name,
        columns=columns,
        sample=sample,
    )


# ── Query editor ──────────────────────────────────────────────────────────────


@bp.route("/query")
@role_required("admin")
async def pipeline_query_editor():
    """HTMX tab: SQL query editor with schema sidebar."""
    from ..analytics import fetch_analytics

    schema_rows = await fetch_analytics(
        """
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'serving'
        ORDER BY table_name, ordinal_position
        """
    )

    # Group by table for the schema sidebar
    schema: dict[str, list] = {}
    for row in schema_rows:
        tname = row["table_name"]
        if tname not in schema:
            schema[tname] = []
        schema[tname].append({"name": row["column_name"], "type": row["data_type"]})

    return await render_template(
        "admin/partials/pipeline_query.html",
        schema=schema,
        max_rows=_QUERY_MAX_ROWS,
        timeout_seconds=_QUERY_TIMEOUT_SECONDS,
    )


@bp.route("/query/execute", methods=["POST"])
@role_required("admin")
@csrf_protect
async def pipeline_query_execute():
    """Run user-submitted SQL and return a results table partial."""
    from ..analytics import execute_user_query

    form = await request.form
    sql = (form.get("sql") or "").strip()

    # Input validation
    if not sql:
        return await render_template(
            "admin/partials/pipeline_query_results.html",
            error="SQL query is empty.",
            columns=[],
            rows=[],
            row_count=0,
            elapsed_ms=0,
            truncated=False,
        )

    if len(sql) > _QUERY_MAX_CHARS:
        return await render_template(
            "admin/partials/pipeline_query_results.html",
            error=f"Query too long ({len(sql):,} chars). Maximum is {_QUERY_MAX_CHARS:,} characters.",
            columns=[],
            rows=[],
            row_count=0,
            elapsed_ms=0,
            truncated=False,
        )

    if _BLOCKED_SQL_RE.search(sql):
        return await render_template(
            "admin/partials/pipeline_query_results.html",
            error="Query contains a blocked keyword. Only SELECT statements are allowed.",
            columns=[],
            rows=[],
            row_count=0,
            elapsed_ms=0,
            truncated=False,
        )

    columns, rows, error, elapsed_ms = await execute_user_query(
        sql,
        max_rows=_QUERY_MAX_ROWS,
        timeout_seconds=_QUERY_TIMEOUT_SECONDS,
    )

    truncated = len(rows) >= _QUERY_MAX_ROWS
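    # Heuristic: a result with exactly _QUERY_MAX_ROWS rows is flagged as truncated
    # even when nothing was cut off, since fetchmany() cannot see past the cap.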

    return await render_template(
        "admin/partials/pipeline_query_results.html",
        error=error,
        columns=columns,
        rows=rows,
        row_count=len(rows),
        elapsed_ms=elapsed_ms,
        truncated=truncated,
    )

base_admin.html
@@ -133,6 +133,12 @@
             SEO Hub
         </a>
+
+        <div class="admin-sidebar__section">Pipeline</div>
+        <a href="{{ url_for('pipeline.pipeline_dashboard') }}" class="{% if admin_page == 'pipeline' %}active{% endif %}">
+            <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M20.25 6.375c0 2.278-3.694 4.125-8.25 4.125S3.75 8.653 3.75 6.375m16.5 0c0-2.278-3.694-4.125-8.25-4.125S3.75 4.097 3.75 6.375m16.5 0v11.25c0 2.278-3.694 4.125-8.25 4.125s-8.25-1.847-8.25-4.125V6.375m16.5 0v3.75m-16.5-3.75v3.75m16.5 0v3.75C20.25 16.153 16.556 18 12 18s-8.25-1.847-8.25-4.125v-3.75m16.5 0c0 2.278-3.694 4.125-8.25 4.125s-8.25-1.847-8.25-4.125"/></svg>
+            Pipeline
+        </a>
 
         <div class="admin-sidebar__section">System</div>
         <a href="{{ url_for('admin.flags') }}" class="{% if admin_page == 'flags' %}active{% endif %}">
             <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M3 3v1.5M3 21v-6m0 0 2.77-.693a9 9 0 0 1 6.208.682l.108.054a9 9 0 0 0 6.086.71l3.114-.732a48.524 48.524 0 0 1-.005-10.499l-3.11.732a9 9 0 0 1-6.085-.711l-.108-.054a9 9 0 0 0-6.208-.682L3 4.5M3 15V4.5"/></svg>

web/src/padelnomics/analytics.py
@@ -5,13 +5,16 @@ Opens a single long-lived DuckDB connection at startup (read_only=True).
 All queries run via asyncio.to_thread() to avoid blocking the event loop.
 
 Usage:
-    from .analytics import fetch_analytics
+    from .analytics import fetch_analytics, execute_user_query
 
     rows = await fetch_analytics("SELECT * FROM serving.planner_defaults WHERE city_slug = ?", ["berlin"])
+
+    cols, rows, error, elapsed_ms = await execute_user_query("SELECT city_slug FROM serving.city_market_profile LIMIT 5")
 """
 import asyncio
 import logging
 import os
+import time
 from pathlib import Path
 from typing import Any
 
@@ -77,3 +80,48 @@ async def fetch_analytics(sql: str, params: list | None = None) -> list[dict[str
     except Exception:
         logger.exception("DuckDB analytics query failed: %.200s", sql)
         return []
+
+
+async def execute_user_query(
+    sql: str,
+    max_rows: int = 1000,
+    timeout_seconds: int = 10,
+) -> tuple[list[str], list[tuple], str | None, float]:
+    """
+    Run an admin-submitted SQL query.
+
+    Returns (columns, rows, error, elapsed_ms).
+    - columns: list of column name strings (empty on error)
+    - rows: list of value tuples, capped at max_rows
+    - error: error message string, or None on success
+    - elapsed_ms: wall-clock query time in milliseconds
+    """
+    assert sql, "sql must not be empty"
+    assert 1 <= max_rows <= 10_000, f"max_rows must be 1–10000, got {max_rows}"
+    assert 1 <= timeout_seconds <= 60, f"timeout_seconds must be 1–60, got {timeout_seconds}"
+
+    if _conn is None:
+        return [], [], "Analytics database is not available.", 0.0
+
+    def _run() -> tuple[list[str], list[tuple], str | None, float]:
+        t0 = time.monotonic()
+        cur = _conn.cursor()
+        try:
+            rel = cur.execute(sql)
+            cols = [d[0] for d in rel.description]
+            rows = rel.fetchmany(max_rows)
+            elapsed_ms = round((time.monotonic() - t0) * 1000, 1)
+            return cols, rows, None, elapsed_ms
+        except Exception as exc:
+            elapsed_ms = round((time.monotonic() - t0) * 1000, 1)
+            return [], [], str(exc), elapsed_ms
+        finally:
+            cur.close()
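+
+    # NB: asyncio.wait_for abandons the await on timeout but cannot interrupt the
+    # worker thread; a timed-out DuckDB query keeps running until it finishes on its own.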
+    try:
+        return await asyncio.wait_for(
+            asyncio.to_thread(_run),
+            timeout=timeout_seconds,
+        )
+    except asyncio.TimeoutError:
+        return [], [], f"Query timed out after {timeout_seconds}s.", 0.0

web/src/padelnomics/app.py
@@ -313,6 +313,7 @@ def create_app() -> Quart:
     # Blueprint registration
     # -------------------------------------------------------------------------
+    from .admin.pipeline_routes import bp as pipeline_bp
     from .admin.pseo_routes import bp as pseo_bp
     from .admin.routes import bp as admin_bp
     from .auth.routes import bp as auth_bp
@@ -339,6 +340,7 @@ def create_app() -> Quart:
     app.register_blueprint(billing_bp)
     app.register_blueprint(admin_bp)
     app.register_blueprint(pseo_bp)
+    app.register_blueprint(pipeline_bp)
     app.register_blueprint(webhooks_bp)
 
     # Content catch-all LAST — lives under /<lang> too

web/src/padelnomics/worker.py
@@ -698,6 +698,32 @@ async def handle_cleanup_seo_metrics(payload: dict) -> None:
     logger.info("Cleaned up %s old SEO metric rows", deleted)
+
+
+@task("run_extraction")
+async def handle_run_extraction(payload: dict) -> None:
+    """Run the full extraction pipeline (all extractors) in the background.
+
+    Shells out to `uv run extract` in the repo root. The extraction CLI
+    manages its own state in .state.sqlite and writes to the landing zone.
+    """
+    import subprocess
+    from pathlib import Path
+
+    # worker.py sits at web/src/padelnomics/worker.py, so the repo root is parents[3]
+    repo_root = Path(__file__).resolve().parents[3]
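+    # Requires `uv` on the worker process's PATH; subprocess.run raises
+    # TimeoutExpired if the 2-hour cap below is exceeded.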
+    result = await asyncio.to_thread(
+        subprocess.run,
+        ["uv", "run", "--package", "padelnomics_extract", "extract"],
+        capture_output=True,
+        text=True,
+        timeout=7200,  # 2-hour absolute timeout
+        cwd=str(repo_root),
+    )
+    if result.returncode != 0:
+        raise RuntimeError(
+            f"Extraction failed (exit {result.returncode}): {result.stderr[:500]}"
+        )
+    logger.info("Extraction completed: %s", result.stdout[-300:] if result.stdout else "(no output)")
+
+
 @task("generate_articles")
 async def handle_generate_articles(payload: dict) -> None:
     """Generate articles from a template in the background."""