"""
|
||
Pipeline Console admin blueprint.
|
||
|
||
Operational visibility for the data extraction and transformation pipeline:
|
||
/admin/pipeline/ → dashboard (health stats + tab container)
|
||
/admin/pipeline/overview → HTMX tab: extraction status, serving freshness, landing stats
|
||
/admin/pipeline/extractions → HTMX tab: filterable extraction run history
|
||
/admin/pipeline/extractions/<id>/mark-stale → POST: mark stuck "running" row as failed
|
||
/admin/pipeline/extract/trigger → POST: enqueue full extraction run
|
||
/admin/pipeline/catalog → HTMX tab: data catalog (tables, columns, sample data)
|
||
/admin/pipeline/catalog/<table> → HTMX partial: table detail (columns + sample)
|
||
/admin/pipeline/query → HTMX tab: SQL query editor
|
||
/admin/pipeline/query/execute → POST: run user SQL, return results table
|
||
|
||
Data sources:
|
||
- data/landing/.state.sqlite (extraction run history — stdlib sqlite3, sync via to_thread)
|
||
- SERVING_DUCKDB_PATH/../_serving_meta.json (export timestamp + per-table row counts)
|
||
- analytics.duckdb (DuckDB read-only via analytics.execute_user_query)
|
||
- LANDING_DIR/ (filesystem scan for file sizes + dates)
|
||
- infra/supervisor/workflows.toml (schedule definitions — tomllib, stdlib)
|
||
"""
|
||
import asyncio
import json
import logging
import os
import re
import sqlite3
import tomllib
from datetime import UTC, datetime, timedelta
from pathlib import Path

from quart import Blueprint, flash, redirect, render_template, request, url_for

from ..auth.routes import role_required
from ..core import csrf_protect

logger = logging.getLogger(__name__)

bp = Blueprint(
    "pipeline",
    __name__,
    template_folder=str(Path(__file__).parent / "templates"),
    url_prefix="/admin/pipeline",
)

# ── Config ────────────────────────────────────────────────────────────────────

_LANDING_DIR = os.environ.get("LANDING_DIR", "data/landing")
_SERVING_DUCKDB_PATH = os.environ.get("SERVING_DUCKDB_PATH", "data/analytics.duckdb")

# Repo root: web/src/padelnomics/admin/ → up 4 levels
# (parents[0] = admin/, parents[1] = padelnomics/, parents[2] = src/,
#  parents[3] = web/, parents[4] = repo root)
_REPO_ROOT = Path(__file__).resolve().parents[4]
_WORKFLOWS_TOML = _REPO_ROOT / "infra" / "supervisor" / "workflows.toml"

# A "running" row older than this is considered stale/crashed.
|
||
_STALE_THRESHOLD_HOURS = 2
|
||
|
||
# Query editor limits
|
||
_QUERY_MAX_CHARS = 10_000
|
||
_QUERY_MAX_ROWS = 1_000
|
||
_QUERY_TIMEOUT_SECONDS = 10
|
||
|
||
# Blocked SQL keywords (read-only connection enforces engine-level, this adds belt+suspenders)
|
||
_BLOCKED_SQL_RE = re.compile(
|
||
r"\b(ATTACH|COPY|EXPORT|INSTALL|LOAD|CREATE|DROP|ALTER|INSERT|UPDATE|DELETE|GRANT|REVOKE|PRAGMA)\b",
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
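# Illustrative examples of the keyword screen (word-boundary matching means
# column names such as "created_at" pass, while statements do not):
#   _BLOCKED_SQL_RE.search("SELECT created_at FROM serving.dim_venues")  -> None
#   _BLOCKED_SQL_RE.search("DROP TABLE serving.dim_venues")              -> match
#   _BLOCKED_SQL_RE.search("select * from t; delete from t")             -> match
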
# ── Lineage DAG ───────────────────────────────────────────────────────────────
#
# Canonical model dependency map: model_name → [upstream_dependencies].
# Layer is derived from name prefix: stg_* = staging, dim_*/fct_* = foundation,
# everything else = serving.
# Update this dict whenever models are added or removed from transform/.
_DAG: dict[str, list[str]] = {
    # Staging — read landing zone files, no model deps
    "stg_padel_courts": [],
    "stg_playtomic_venues": [],
    "stg_playtomic_resources": [],
    "stg_playtomic_opening_hours": [],
    "stg_playtomic_availability": [],
    "stg_population": [],
    "stg_population_usa": [],
    "stg_population_uk": [],
    "stg_population_geonames": [],
    "stg_income": [],
    "stg_income_usa": [],
    "stg_city_labels": [],
    "stg_nuts2_boundaries": [],
    "stg_regional_income": [],
    "stg_tennis_courts": [],
    # Foundation
    "dim_venues": ["stg_playtomic_venues", "stg_playtomic_resources", "stg_padel_courts"],
    "dim_cities": [
        "dim_venues", "stg_income", "stg_city_labels",
        "stg_population", "stg_population_usa", "stg_population_uk", "stg_population_geonames",
    ],
    "dim_locations": [
        "stg_population_geonames", "stg_income", "stg_nuts2_boundaries",
        "stg_regional_income", "stg_income_usa", "stg_padel_courts", "stg_tennis_courts",
    ],
    "dim_venue_capacity": [
        "dim_venues", "stg_playtomic_resources", "stg_playtomic_opening_hours",
    ],
    "fct_availability_slot": ["stg_playtomic_availability"],
    "fct_daily_availability": ["fct_availability_slot", "dim_venue_capacity"],
    # Serving
    "venue_pricing_benchmarks": ["fct_daily_availability"],
    "city_market_profile": ["dim_cities", "venue_pricing_benchmarks"],
    "planner_defaults": ["venue_pricing_benchmarks", "city_market_profile"],
    "location_opportunity_profile": ["dim_locations"],
    "pseo_city_costs_de": [
        "city_market_profile", "planner_defaults", "location_opportunity_profile",
    ],
    "pseo_city_pricing": ["venue_pricing_benchmarks", "city_market_profile"],
    "pseo_country_overview": ["pseo_city_costs_de"],
}


def _classify_layer(name: str) -> str:
    """Return 'staging', 'foundation', or 'serving' for a model name."""
    if name.startswith("stg_"):
        return "staging"
    if name.startswith(("dim_", "fct_")):
        return "foundation"
    return "serving"


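# Classification follows the naming convention above, e.g.:
#   _classify_layer("stg_income")            -> "staging"
#   _classify_layer("fct_availability_slot") -> "foundation"
#   _classify_layer("pseo_city_pricing")     -> "serving"

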
def _render_lineage_svg(dag: dict[str, list[str]]) -> str:
    """Render the 3-layer model dependency DAG as an SVG string.

    Layout: three vertical swim lanes (staging / foundation / serving) with
    nodes stacked top-to-bottom in each lane. Edges are cubic bezier paths
    flowing left-to-right. No external dependencies — pure Python string
    construction.
    """
    # ── Layout constants ───────────────────────────────────────────────────
    CHAR_WIDTH_PX = 7.4   # approximate monospace char width at 11px
    NODE_PAD_H = 10       # horizontal padding inside node rect
    NODE_H = 26           # node height
    NODE_VGAP = 10        # vertical gap between nodes in same lane
    LANE_PAD_TOP = 52     # space for lane header
    LANE_PAD_BOTTOM = 24
    LANE_INNER_W = 210    # inner usable width per lane
    LANE_GAP = 40         # gap between lanes
    LANE_PAD_LEFT = 16    # left padding inside lane bg

    LANE_COLORS = {
        "staging": {"bg": "#F0FDF4", "border": "#BBF7D0", "accent": "#16A34A",
                    "fill": "#DCFCE7", "text": "#14532D"},
        "foundation": {"bg": "#EFF6FF", "border": "#BFDBFE", "accent": "#1D4ED8",
                       "fill": "#DBEAFE", "text": "#1E3A8A"},
        "serving": {"bg": "#FFFBEB", "border": "#FDE68A", "accent": "#D97706",
                    "fill": "#FEF3C7", "text": "#78350F"},
    }
    LANE_ORDER = ["staging", "foundation", "serving"]
    LANE_LABELS = {"staging": "STAGING", "foundation": "FOUNDATION", "serving": "SERVING"}

    # ── Group and sort nodes per layer ─────────────────────────────────────
    # Count how many models depend on each node (its downstream fan-out).
    downstream: dict[str, int] = {n: 0 for n in dag}
    for deps in dag.values():
        for d in deps:
            downstream[d] = downstream.get(d, 0) + 1

    layers: dict[str, list[str]] = {"staging": [], "foundation": [], "serving": []}
    for name in dag:
        layers[_classify_layer(name)].append(name)
    for nodes in layers.values():
        # Sort: most-connected first (hub nodes at the top of the lane), then alpha
        nodes.sort(key=lambda n: (-downstream.get(n, 0), n))

    # ── Compute node widths ────────────────────────────────────────────────
    def node_w(name: str) -> float:
        return max(len(name) * CHAR_WIDTH_PX + NODE_PAD_H * 2, 80.0)

    # ── Assign positions ───────────────────────────────────────────────────
    # x = left edge of lane background; node rect starts at x + LANE_PAD_LEFT
    lane_x: dict[str, float] = {}
    x_cursor = 0.0
    for lane in LANE_ORDER:
        lane_x[lane] = x_cursor
        x_cursor += LANE_INNER_W + LANE_PAD_LEFT * 2 + LANE_GAP

    positions: dict[str, tuple[float, float]] = {}  # node → (rect_x, rect_y)
    lane_heights: dict[str, float] = {}
    for lane in LANE_ORDER:
        y = LANE_PAD_TOP
        for name in layers[lane]:
            positions[name] = (lane_x[lane] + LANE_PAD_LEFT, y)
            y += NODE_H + NODE_VGAP
        lane_heights[lane] = y + LANE_PAD_BOTTOM - NODE_VGAP

    total_w = x_cursor - LANE_GAP
    total_h = max(lane_heights.values())

    # ── SVG assembly ───────────────────────────────────────────────────────
    parts: list[str] = []

    # Arrowhead markers (default grey + highlighted blue)
    parts.append(
        '<defs>'
        '<marker id="arr" markerWidth="6" markerHeight="6" refX="5" refY="3" orient="auto">'
        '<path d="M0,0 L0,6 L6,3 z" fill="#94A3B8"/>'
        '</marker>'
        '<marker id="arr-hi" markerWidth="6" markerHeight="6" refX="5" refY="3" orient="auto">'
        '<path d="M0,0 L0,6 L6,3 z" fill="#1D4ED8"/>'
        '</marker>'
        '</defs>'
    )

    # Lane backgrounds + headers
    for lane in LANE_ORDER:
        c = LANE_COLORS[lane]
        lx = lane_x[lane]
        lw = LANE_INNER_W + LANE_PAD_LEFT * 2
        lh = lane_heights[lane]
        parts.append(
            f'<rect x="{lx:.1f}" y="0" width="{lw:.1f}" height="{lh:.1f}" '
            f'rx="10" fill="{c["bg"]}" stroke="{c["border"]}" stroke-width="1"/>'
        )
        # Lane header label
        label_x = lx + lw / 2
        parts.append(
            f'<text x="{label_x:.1f}" y="28" text-anchor="middle" '
            f'font-family="\'DM Sans\',ui-sans-serif,system-ui,sans-serif" '
            f'font-size="10" font-weight="700" letter-spacing="1.5" '
            f'fill="{c["accent"]}">{LANE_LABELS[lane]}</text>'
        )
        # Divider line under header
        parts.append(
            f'<line x1="{lx + 12:.1f}" y1="36" x2="{lx + lw - 12:.1f}" y2="36" '
            f'stroke="{c["border"]}" stroke-width="1"/>'
        )

    # Edges (rendered before nodes so nodes appear on top)
    for name, deps in dag.items():
        if not deps:
            continue
        tx, ty = positions[name]
        tgt_x = tx  # left edge of target node
        tgt_cy = ty + NODE_H / 2
        for dep in deps:
            sx, sy = positions[dep]
            src_x = sx + node_w(dep)  # right edge of source node
            src_cy = sy + NODE_H / 2
            cpx1 = src_x + (tgt_x - src_x) * 0.45
            cpx2 = tgt_x - (tgt_x - src_x) * 0.45
            d = f"M{src_x:.1f},{src_cy:.1f} C{cpx1:.1f},{src_cy:.1f} {cpx2:.1f},{tgt_cy:.1f} {tgt_x:.1f},{tgt_cy:.1f}"
            parts.append(
                f'<path class="lineage-edge" data-from="{dep}" data-to="{name}" '
                f'd="{d}" fill="none" stroke="#CBD5E1" stroke-width="1" '
                f'marker-end="url(#arr)"/>'
            )

    # Nodes
    for name in dag:
        c = LANE_COLORS[_classify_layer(name)]
        rx, ry = positions[name]
        rw = node_w(name)
        text_x = rx + NODE_PAD_H
        text_y = ry + NODE_H / 2 + 4  # +4 for baseline alignment
        parts.append(
            f'<g class="lineage-node" data-model="{name}">'
            f'<rect x="{rx:.1f}" y="{ry:.1f}" width="{rw:.1f}" height="{NODE_H}" '
            f'rx="5" fill="{c["fill"]}" stroke="{c["border"]}" stroke-width="1"/>'
            # Left accent bar
            f'<rect x="{rx:.1f}" y="{ry:.1f}" width="3" height="{NODE_H}" '
            f'rx="5" fill="{c["accent"]}"/>'
            f'<text x="{text_x:.1f}" y="{text_y:.1f}" '
            f'font-family="\'Commit Mono\',ui-monospace,\'Cascadia Code\',monospace" '
            f'font-size="11" fill="{c["text"]}">{name}</text>'
            '</g>'
        )

    svg_inner = "\n".join(parts)
    return (
        f'<svg class="lineage-svg" viewBox="0 0 {total_w:.1f} {total_h:.1f}" '
        f'xmlns="http://www.w3.org/2000/svg" '
        f'style="width:100%;height:auto;min-width:{total_w:.0f}px">'
        f'{svg_inner}'
        f'</svg>'
    )


# ── Sidebar data injection (same pattern as pseo_routes.py) ──────────────────


@bp.before_request
async def _inject_sidebar_data():
    """Load unread inbox count for the admin sidebar badge."""
    from quart import g

    from ..core import fetch_one

    try:
        row = await fetch_one("SELECT COUNT(*) AS cnt FROM inbound_emails WHERE is_read = 0")
        g.admin_unread_count = row["cnt"] if row else 0
    except Exception:
        g.admin_unread_count = 0


@bp.context_processor
def _admin_context():
    from quart import g

    return {"unread_count": getattr(g, "admin_unread_count", 0)}


# ── Data access: state DB (sync, called via to_thread) ────────────────────────


def _state_db_path() -> Path:
    return Path(_LANDING_DIR) / ".state.sqlite"


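# For reference, a sketch of the extraction_runs table this module reads. The
# exact DDL lives with the extraction pipeline; the columns and types below are
# inferred from the queries in this file:
#
#   CREATE TABLE extraction_runs (
#       run_id        INTEGER PRIMARY KEY,
#       extractor     TEXT,
#       started_at    TEXT,   -- UTC, '%Y-%m-%dT%H:%M:%SZ'
#       finished_at   TEXT,
#       status        TEXT,   -- 'running' | 'success' | 'failed'
#       files_written INTEGER,
#       files_skipped INTEGER,
#       bytes_written INTEGER,
#       cursor_value  TEXT,
#       error_message TEXT
#   );

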
def _fetch_extraction_summary_sync() -> dict:
    """Aggregate stats from extraction_runs: total, success, failed, running, stale counts."""
    db_path = _state_db_path()
    if not db_path.exists():
        return {"total": 0, "success": 0, "failed": 0, "running": 0, "stale": 0}

    cutoff = (datetime.now(UTC) - timedelta(hours=_STALE_THRESHOLD_HOURS)).strftime(
        "%Y-%m-%dT%H:%M:%SZ"
    )
    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row
    try:
        row = conn.execute(
            """
            SELECT
                COUNT(*) AS total,
                SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) AS success,
                SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) AS failed,
                SUM(CASE WHEN status = 'running' THEN 1 ELSE 0 END) AS running,
                SUM(CASE WHEN status = 'running' AND started_at < ? THEN 1 ELSE 0 END) AS stale
            FROM extraction_runs
            """,
            (cutoff,),
        ).fetchone()
        return {
            "total": row["total"] or 0,
            "success": row["success"] or 0,
            "failed": row["failed"] or 0,
            "running": row["running"] or 0,
            "stale": row["stale"] or 0,
        }
    finally:
        conn.close()


def _fetch_latest_per_extractor_sync() -> list[dict]:
    """Return most recent run for each extractor name."""
    db_path = _state_db_path()
    if not db_path.exists():
        return []

    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row
    try:
        rows = conn.execute(
            """
            SELECT r.*
            FROM extraction_runs r
            INNER JOIN (
                SELECT extractor, MAX(run_id) AS max_id
                FROM extraction_runs
                GROUP BY extractor
            ) latest ON r.run_id = latest.max_id
            ORDER BY r.extractor
            """
        ).fetchall()
        return [dict(r) for r in rows]
    finally:
        conn.close()


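# The INNER JOIN above is the classic greatest-n-per-group pattern. An
# equivalent window-function formulation (illustrative; needs SQLite >= 3.25)
# would be:
#
#   SELECT * FROM (
#       SELECT *, ROW_NUMBER() OVER (
#           PARTITION BY extractor ORDER BY run_id DESC
#       ) AS rn
#       FROM extraction_runs
#   ) WHERE rn = 1
#   ORDER BY extractor

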
def _fetch_extraction_runs_sync(
    *,
    extractor: str = "",
    status: str = "",
    limit: int = 50,
    offset: int = 0,
) -> tuple[list[dict], int]:
    """Return (rows, total_count) for the filtered run history."""
    assert 1 <= limit <= 200, f"limit must be 1–200, got {limit}"
    assert offset >= 0, f"offset must be >= 0, got {offset}"

    db_path = _state_db_path()
    if not db_path.exists():
        return [], 0

    where_clauses = []
    params: list = []
    if extractor:
        where_clauses.append("extractor = ?")
        params.append(extractor)
    if status:
        where_clauses.append("status = ?")
        params.append(status)

    where_sql = ("WHERE " + " AND ".join(where_clauses)) if where_clauses else ""

    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row
    try:
        total = conn.execute(
            f"SELECT COUNT(*) FROM extraction_runs {where_sql}", params
        ).fetchone()[0]

        rows = conn.execute(
            f"""
            SELECT run_id, extractor, started_at, finished_at, status,
                   files_written, files_skipped, bytes_written,
                   cursor_value, error_message
            FROM extraction_runs {where_sql}
            ORDER BY run_id DESC
            LIMIT ? OFFSET ?
            """,
            params + [limit, offset],
        ).fetchall()
        return [dict(r) for r in rows], total
    finally:
        conn.close()


def _fetch_distinct_extractors_sync() -> list[str]:
    """Return distinct extractor names for filter dropdowns."""
    db_path = _state_db_path()
    if not db_path.exists():
        return []
    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            "SELECT DISTINCT extractor FROM extraction_runs ORDER BY extractor"
        ).fetchall()
        return [r[0] for r in rows]
    finally:
        conn.close()


def _mark_run_failed_sync(run_id: int) -> bool:
    """Mark a stuck 'running' row as 'failed'. Returns True if row was updated."""
    assert run_id > 0, f"run_id must be positive, got {run_id}"
    db_path = _state_db_path()
    if not db_path.exists():
        return False
    conn = sqlite3.connect(str(db_path))
    try:
        cur = conn.execute(
            """
            UPDATE extraction_runs
            SET status = 'failed',
                finished_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
                error_message = 'Marked failed manually (admin — process appeared stuck)'
            WHERE run_id = ? AND status = 'running'
            """,
            (run_id,),
        )
        conn.commit()
        return cur.rowcount > 0
    finally:
        conn.close()


# ── Data access: serving meta ─────────────────────────────────────────────────


def _load_serving_meta() -> dict | None:
    """Read _serving_meta.json alongside analytics.duckdb. Returns None if absent."""
    meta_path = Path(_SERVING_DUCKDB_PATH).parent / "_serving_meta.json"
    if not meta_path.exists():
        return None
    try:
        return json.loads(meta_path.read_text())
    except Exception:
        logger.warning("Failed to read _serving_meta.json", exc_info=True)
        return None


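# Expected shape of _serving_meta.json — a sketch inferred from the fields this
# module reads ("exported_at_utc", "tables", "row_count"); the export job may
# write more keys than shown here:
#
#   {
#       "exported_at_utc": "2024-01-01T00:00:00Z",
#       "tables": {
#           "city_market_profile": {"row_count": 1234},
#           ...
#       }
#   }

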
# ── Data access: landing zone filesystem ─────────────────────────────────────


def _get_landing_zone_stats_sync() -> list[dict]:
    """Scan LANDING_DIR and return per-source file counts + total bytes."""
    landing = Path(_LANDING_DIR)
    if not landing.exists():
        return []

    sources = []
    for source_dir in sorted(landing.iterdir()):
        if not source_dir.is_dir() or source_dir.name.startswith("."):
            continue
        files = list(source_dir.rglob("*.gz")) + list(source_dir.rglob("*.jsonl"))
        total_bytes = sum(f.stat().st_size for f in files)
        latest_mtime = max((f.stat().st_mtime for f in files), default=None)
        sources.append({
            "name": source_dir.name,
            "file_count": len(files),
            "total_bytes": total_bytes,
            "latest_mtime": (
                datetime.fromtimestamp(latest_mtime, tz=UTC).strftime("%Y-%m-%d %H:%M")
                if latest_mtime
                else None
            ),
        })
    return sources


# ── Data access: workflows.toml ───────────────────────────────────────────────

_SCHEDULE_LABELS = {
    "hourly": "Every hour",
    "daily": "Daily",
    "weekly": "Weekly",
    "monthly": "Monthly",
}


def _load_workflows() -> list[dict]:
    """Parse workflows.toml and return workflow definitions with human schedule labels."""
    if not _WORKFLOWS_TOML.exists():
        return []

    data = tomllib.loads(_WORKFLOWS_TOML.read_text())

    workflows = []
    for name, config in data.items():
        schedule = config.get("schedule", "")
        schedule_label = _SCHEDULE_LABELS.get(schedule, schedule)
        workflows.append({
            "name": name,
            "module": config.get("module", ""),
            "schedule": schedule,
            "schedule_label": schedule_label,
            "depends_on": config.get("depends_on", []),
        })
    return workflows


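# Sketch of the workflows.toml shape this parser assumes — one top-level table
# per workflow, with "schedule", "module", and "depends_on" keys (the workflow
# names below are illustrative, not real definitions):
#
#   [playtomic_venues]
#   module = "extract.playtomic_venues"
#   schedule = "daily"
#   depends_on = []
#
#   [playtomic_availability]
#   module = "extract.playtomic_availability"
#   schedule = "hourly"
#   depends_on = ["playtomic_venues"]

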
# ── Route helpers ─────────────────────────────────────────────────────────────


def _format_bytes(n: int) -> str:
    """Human-readable byte count."""
    if n < 1024:
        return f"{n} B"
    if n < 1024 * 1024:
        return f"{n / 1024:.1f} KB"
    return f"{n / 1024 / 1024:.1f} MB"


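# For example:
#   _format_bytes(512)        -> "512 B"
#   _format_bytes(2_048)      -> "2.0 KB"
#   _format_bytes(5_242_880)  -> "5.0 MB"

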
def _duration_str(started_at: str | None, finished_at: str | None) -> str:
    """Return human-readable duration, or '' if unavailable."""
    if not started_at or not finished_at:
        return ""
    try:
        fmt = "%Y-%m-%dT%H:%M:%SZ"
        start = datetime.strptime(started_at, fmt)
        end = datetime.strptime(finished_at, fmt)
        delta = int((end - start).total_seconds())
        if delta < 60:
            return f"{delta}s"
        return f"{delta // 60}m {delta % 60}s"
    except ValueError:
        return ""


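# For example:
#   _duration_str("2024-01-01T00:00:05Z", "2024-01-01T00:00:47Z") -> "42s"
#   _duration_str("2024-01-01T00:00:00Z", "2024-01-01T00:03:30Z") -> "3m 30s"
#   _duration_str("2024-01-01T00:00:00Z", None)                   -> ""

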
def _is_stale(run: dict) -> bool:
    """True if a 'running' row has been stuck longer than the stale threshold."""
    if run.get("status") != "running":
        return False
    started = run.get("started_at", "")
    if not started:
        # No start timestamp at all; treat as stale so it can be surfaced for cleanup.
        return True
    try:
        fmt = "%Y-%m-%dT%H:%M:%SZ"
        start = datetime.strptime(started, fmt).replace(tzinfo=UTC)
        return (datetime.now(UTC) - start) > timedelta(hours=_STALE_THRESHOLD_HOURS)
    except ValueError:
        return False


# ── Dashboard ─────────────────────────────────────────────────────────────────


@bp.route("/")
@role_required("admin")
async def pipeline_dashboard():
    """Main page: health stat cards + tab container."""
    from ..analytics import fetch_analytics  # noqa: PLC0415

    summary, serving_meta = await asyncio.gather(
        asyncio.to_thread(_fetch_extraction_summary_sync),
        asyncio.to_thread(_load_serving_meta),
    )

    if serving_meta:
        total_serving_tables = len(serving_meta.get("tables", {}))
        last_export = serving_meta.get("exported_at_utc", "")[:19].replace("T", " ") or "—"
    else:
        schema_rows = await fetch_analytics(
            "SELECT COUNT(*) AS n FROM information_schema.tables WHERE table_schema = 'serving'"
        )
        total_serving_tables = schema_rows[0]["n"] if schema_rows else 0
        last_export = "—"

    success_rate = 0
    if summary["total"] > 0:
        success_rate = round(100 * summary["success"] / summary["total"])

    return await render_template(
        "admin/pipeline.html",
        summary=summary,
        success_rate=success_rate,
        total_serving_tables=total_serving_tables,
        last_export=last_export,
        admin_page="pipeline",
    )


# ── Overview tab ─────────────────────────────────────────────────────────────


@bp.route("/overview")
@role_required("admin")
async def pipeline_overview():
    """HTMX tab: extraction status per source, serving freshness, landing zone."""
    latest_runs, landing_stats, workflows, serving_meta = await asyncio.gather(
        asyncio.to_thread(_fetch_latest_per_extractor_sync),
        asyncio.to_thread(_get_landing_zone_stats_sync),
        asyncio.to_thread(_load_workflows),
        asyncio.to_thread(_load_serving_meta),
    )

    # Build a lookup: extractor name → latest run
    latest_by_name = {r["extractor"]: r for r in latest_runs}

    # Enrich each workflow with its latest run data
    workflow_rows = []
    for wf in workflows:
        run = latest_by_name.get(wf["name"])
        workflow_rows.append({
            "workflow": wf,
            "run": run,
            "stale": _is_stale(run) if run else False,
        })

    # Compute landing zone totals
    total_landing_bytes = sum(s["total_bytes"] for s in landing_stats)

    # Build serving tables list: prefer _serving_meta.json (has counts + timestamp),
    # fall back to information_schema when the file doesn't exist yet.
    if serving_meta:
        serving_tables = [
            {"name": name, "row_count": meta.get("row_count")}
            for name, meta in sorted(serving_meta.get("tables", {}).items())
        ]
        last_export = serving_meta.get("exported_at_utc", "")[:19].replace("T", " ") or None
    else:
        from ..analytics import fetch_analytics  # noqa: PLC0415

        schema_rows = await fetch_analytics(
            "SELECT table_name FROM information_schema.tables "
            "WHERE table_schema = 'serving' ORDER BY table_name"
        )
        serving_tables = [{"name": r["table_name"], "row_count": None} for r in schema_rows]
        last_export = None

    return await render_template(
        "admin/partials/pipeline_overview.html",
        workflow_rows=workflow_rows,
        landing_stats=landing_stats,
        total_landing_bytes=total_landing_bytes,
        serving_tables=serving_tables,
        last_export=last_export,
        format_bytes=_format_bytes,
    )


# ── Extractions tab ────────────────────────────────────────────────────────────


@bp.route("/extractions")
@role_required("admin")
async def pipeline_extractions():
    """HTMX tab: paginated + filtered extraction run history."""
    extractor_filter = request.args.get("extractor", "")
    status_filter = request.args.get("status", "")
    try:
        page = max(1, int(request.args.get("page", 1)))
    except ValueError:
        # A non-numeric ?page= falls back to the first page instead of a 500.
        page = 1
    per_page = 30

    (runs, total), extractors = await asyncio.gather(
        asyncio.to_thread(
            _fetch_extraction_runs_sync,
            extractor=extractor_filter,
            status=status_filter,
            limit=per_page,
            offset=(page - 1) * per_page,
        ),
        asyncio.to_thread(_fetch_distinct_extractors_sync),
    )

    # Enrich rows with computed fields
    for run in runs:
        run["duration"] = _duration_str(run.get("started_at"), run.get("finished_at"))
        run["bytes_label"] = _format_bytes(run.get("bytes_written") or 0)
        run["is_stale"] = _is_stale(run)

    total_pages = max(1, (total + per_page - 1) // per_page)

    return await render_template(
        "admin/partials/pipeline_extractions.html",
        runs=runs,
        total=total,
        page=page,
        per_page=per_page,
        total_pages=total_pages,
        extractors=extractors,
        extractor_filter=extractor_filter,
        status_filter=status_filter,
    )


@bp.route("/extractions/<int:run_id>/mark-stale", methods=["POST"])
|
||
@role_required("admin")
|
||
@csrf_protect
|
||
async def pipeline_mark_stale(run_id: int):
|
||
"""Mark a stuck 'running' extraction row as 'failed'."""
|
||
updated = await asyncio.to_thread(_mark_run_failed_sync, run_id)
|
||
if updated:
|
||
await flash(f"Run #{run_id} marked as failed.", "success")
|
||
else:
|
||
await flash(f"Run #{run_id} could not be updated (not in 'running' state).", "warning")
|
||
return redirect(url_for("pipeline.pipeline_dashboard"))
|
||
|
||
|
||
# ── Trigger extraction ────────────────────────────────────────────────────────


@bp.route("/extract/trigger", methods=["POST"])
@role_required("admin")
@csrf_protect
async def pipeline_trigger_extract():
    """Enqueue an extraction run — all extractors, or a single named one."""
    from ..worker import enqueue

    form = await request.form
    extractor = (form.get("extractor") or "").strip()

    if extractor:
        valid_names = {wf["name"] for wf in await asyncio.to_thread(_load_workflows)}
        if extractor not in valid_names:
            await flash(f"Unknown extractor '{extractor}'.", "warning")
            return redirect(url_for("pipeline.pipeline_dashboard"))
        await enqueue("run_extraction", {"extractor": extractor})
        await flash(f"Extractor '{extractor}' queued. Check the task queue for progress.", "success")
    else:
        await enqueue("run_extraction")
        await flash("Extraction run queued. Check the task queue for progress.", "success")

    return redirect(url_for("pipeline.pipeline_dashboard"))


# ── Lineage tab ───────────────────────────────────────────────────────────────

# Compute downstream map once at import time (DAG is static).
_DOWNSTREAM: dict[str, list[str]] = {n: [] for n in _DAG}
for _name, _deps in _DAG.items():
    for _dep in _deps:
        _DOWNSTREAM.setdefault(_dep, []).append(_name)


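# Inverting the DAG turns "model → upstream deps" into "model → downstream
# consumers". For instance, with the _DAG above:
#   _DOWNSTREAM["dim_venues"]            == ["dim_cities", "dim_venue_capacity"]
#   _DOWNSTREAM["pseo_country_overview"] == []   # nothing consumes it

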
@bp.route("/lineage")
|
||
@role_required("admin")
|
||
async def pipeline_lineage():
|
||
"""HTMX tab: data lineage DAG visualization."""
|
||
svg = await asyncio.to_thread(_render_lineage_svg, _DAG)
|
||
return await render_template(
|
||
"admin/partials/pipeline_lineage.html",
|
||
lineage_svg=svg,
|
||
node_count=len(_DAG),
|
||
)
|
||
|
||
|
||
@bp.route("/lineage/schema/<model>")
|
||
@role_required("admin")
|
||
async def pipeline_lineage_schema(model: str):
|
||
"""JSON: schema details for a lineage node.
|
||
|
||
Returns columns + types from information_schema (serving models only —
|
||
staging/foundation live in lakehouse.duckdb which the web app cannot open).
|
||
Row count is included for serving models when the table exists.
|
||
"""
|
||
from quart import jsonify
|
||
|
||
from ..analytics import fetch_analytics
|
||
|
||
if model not in _DAG:
|
||
return jsonify({"error": "unknown model"}), 404
|
||
|
||
layer = _classify_layer(model)
|
||
upstream = _DAG[model]
|
||
downstream = _DOWNSTREAM.get(model, [])
|
||
|
||
row_count = None
|
||
columns: list[dict] = []
|
||
|
||
if layer == "serving":
|
||
col_rows = await fetch_analytics(
|
||
"""
|
||
SELECT column_name, data_type, is_nullable
|
||
FROM information_schema.columns
|
||
WHERE table_schema = 'serving' AND table_name = ?
|
||
ORDER BY ordinal_position
|
||
""",
|
||
[model],
|
||
)
|
||
columns = [
|
||
{
|
||
"name": r["column_name"],
|
||
"type": r["data_type"],
|
||
"nullable": r["is_nullable"] == "YES",
|
||
}
|
||
for r in col_rows
|
||
]
|
||
if columns:
|
||
# model is validated against _DAG keys — safe to interpolate
|
||
count_rows = await fetch_analytics(
|
||
f"SELECT count(*) AS n FROM serving.{model}"
|
||
)
|
||
if count_rows:
|
||
row_count = count_rows[0]["n"]
|
||
|
||
return jsonify(
|
||
{
|
||
"model": model,
|
||
"layer": layer,
|
||
"upstream": upstream,
|
||
"downstream": downstream,
|
||
"row_count": row_count,
|
||
"columns": columns,
|
||
}
|
||
)
|
||
|
||
|
||
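# Sample response shape (illustrative values; row_count and column entries vary):
#   {
#       "model": "city_market_profile",
#       "layer": "serving",
#       "upstream": ["dim_cities", "venue_pricing_benchmarks"],
#       "downstream": ["planner_defaults", "pseo_city_costs_de", "pseo_city_pricing"],
#       "row_count": 4120,
#       "columns": [{"name": "...", "type": "...", "nullable": true}, ...]
#   }

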
# ── Catalog tab ───────────────────────────────────────────────────────────────


@bp.route("/catalog")
@role_required("admin")
async def pipeline_catalog():
    """HTMX tab: list serving tables with row counts + column counts."""
    from ..analytics import fetch_analytics

    schema_rows = await fetch_analytics(
        """
        SELECT table_name, column_name, data_type, ordinal_position
        FROM information_schema.columns
        WHERE table_schema = 'serving'
        ORDER BY table_name, ordinal_position
        """
    )

    # Group by table
    tables: dict[str, dict] = {}
    for row in schema_rows:
        tname = row["table_name"]
        if tname not in tables:
            tables[tname] = {"name": tname, "columns": [], "column_count": 0}
        tables[tname]["columns"].append({
            "name": row["column_name"],
            "type": row["data_type"],
        })
        tables[tname]["column_count"] += 1

    # Enrich with row counts from serving meta
    serving_meta = await asyncio.to_thread(_load_serving_meta)
    meta_counts = serving_meta.get("tables", {}) if serving_meta else {}
    for tname, tdata in tables.items():
        tdata["row_count"] = meta_counts.get(tname, {}).get("row_count")

    return await render_template(
        "admin/partials/pipeline_catalog.html",
        tables=list(tables.values()),
        serving_meta=serving_meta,
    )


@bp.route("/catalog/<table_name>")
|
||
@role_required("admin")
|
||
async def pipeline_table_detail(table_name: str):
|
||
"""HTMX partial: column list + 10-row sample for a serving table."""
|
||
from ..analytics import fetch_analytics
|
||
|
||
# Validate table name before using in SQL
|
||
if not re.match(r"^[a-z_][a-z0-9_]*$", table_name):
|
||
return "Invalid table name", 400
|
||
|
||
# Confirm table exists in serving schema
|
||
exists = await fetch_analytics(
|
||
"SELECT 1 FROM information_schema.tables"
|
||
" WHERE table_schema = 'serving' AND table_name = ?",
|
||
[table_name],
|
||
)
|
||
if not exists:
|
||
return f"Table serving.{table_name} not found", 404
|
||
|
||
columns, sample = await asyncio.gather(
|
||
fetch_analytics(
|
||
"SELECT column_name, data_type, ordinal_position"
|
||
" FROM information_schema.columns"
|
||
" WHERE table_schema = 'serving' AND table_name = ?"
|
||
" ORDER BY ordinal_position",
|
||
[table_name],
|
||
),
|
||
fetch_analytics(
|
||
f"SELECT * FROM serving.{table_name} LIMIT 10" # noqa: S608
|
||
),
|
||
)
|
||
|
||
return await render_template(
|
||
"admin/partials/pipeline_table_detail.html",
|
||
table_name=table_name,
|
||
columns=columns,
|
||
sample=sample,
|
||
)
|
||
|
||
|
||
# ── Query editor ──────────────────────────────────────────────────────────────


@bp.route("/query")
@role_required("admin")
async def pipeline_query_editor():
    """HTMX tab: SQL query editor with schema sidebar."""
    from ..analytics import fetch_analytics

    schema_rows = await fetch_analytics(
        """
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'serving'
        ORDER BY table_name, ordinal_position
        """
    )

    # Group by table for the schema sidebar
    schema: dict[str, list] = {}
    for row in schema_rows:
        tname = row["table_name"]
        if tname not in schema:
            schema[tname] = []
        schema[tname].append({"name": row["column_name"], "type": row["data_type"]})

    return await render_template(
        "admin/partials/pipeline_query.html",
        schema=schema,
        max_rows=_QUERY_MAX_ROWS,
        timeout_seconds=_QUERY_TIMEOUT_SECONDS,
    )


@bp.route("/query/execute", methods=["POST"])
|
||
@role_required("admin")
|
||
@csrf_protect
|
||
async def pipeline_query_execute():
|
||
"""Run user-submitted SQL and return a results table partial."""
|
||
from ..analytics import execute_user_query
|
||
|
||
form = await request.form
|
||
sql = (form.get("sql") or "").strip()
|
||
|
||
# Input validation
|
||
if not sql:
|
||
return await render_template(
|
||
"admin/partials/pipeline_query_results.html",
|
||
error="SQL query is empty.",
|
||
columns=[],
|
||
rows=[],
|
||
row_count=0,
|
||
elapsed_ms=0,
|
||
truncated=False,
|
||
)
|
||
|
||
if len(sql) > _QUERY_MAX_CHARS:
|
||
return await render_template(
|
||
"admin/partials/pipeline_query_results.html",
|
||
error=f"Query too long ({len(sql):,} chars). Maximum is {_QUERY_MAX_CHARS:,} characters.",
|
||
columns=[],
|
||
rows=[],
|
||
row_count=0,
|
||
elapsed_ms=0,
|
||
truncated=False,
|
||
)
|
||
|
||
if _BLOCKED_SQL_RE.search(sql):
|
||
return await render_template(
|
||
"admin/partials/pipeline_query_results.html",
|
||
error="Query contains a blocked keyword. Only SELECT statements are allowed.",
|
||
columns=[],
|
||
rows=[],
|
||
row_count=0,
|
||
elapsed_ms=0,
|
||
truncated=False,
|
||
)
|
||
|
||
columns, rows, error, elapsed_ms = await execute_user_query(
|
||
sql,
|
||
max_rows=_QUERY_MAX_ROWS,
|
||
timeout_seconds=_QUERY_TIMEOUT_SECONDS,
|
||
)
|
||
|
||
truncated = len(rows) >= _QUERY_MAX_ROWS
|
||
|
||
return await render_template(
|
||
"admin/partials/pipeline_query_results.html",
|
||
error=error,
|
||
columns=columns,
|
||
rows=rows,
|
||
row_count=len(rows),
|
||
elapsed_ms=elapsed_ms,
|
||
truncated=truncated,
|
||
)
|