feat(supervisor): port Python supervisor from padelnomics + workflows.toml

Port padelnomics' schedule-aware Python supervisor to materia:
- src/materia/supervisor.py — croniter scheduling, topological wave
  execution (independent workflows run in parallel; sketched below),
  tag-based git pull + deploy, status CLI subcommand
- infra/supervisor/workflows.toml — workflow registry (psd daily, cot
  weekly, prices daily, ice daily, weather daily)
- infra/supervisor/materia-supervisor.service — updated ExecStart to Python
  supervisor, added SUPERVISOR_GIT_PULL=1

Adaptations from padelnomics:
- Uses extract_core.state.open_state_db (not padelnomics_extract.utils)
- Transform step runs: uv run sqlmesh -p transform/sqlmesh_materia run
- Serving export runs: uv run materia pipeline run export_serving
- Deploy step calls web/deploy.sh (materia's deploy.sh lives under web/)
- Removed proxy_mode (not used in materia)

Also: add croniter dependency to src/materia, delete old supervisor.sh.
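
For orientation, a minimal sketch of the wave executor, using stdlib
graphlib and a thread pool. The dict shape mirrors workflows.toml; the
function names and structure are illustrative assumptions, not the actual
supervisor.py code:

    import importlib
    from concurrent.futures import ThreadPoolExecutor
    from graphlib import TopologicalSorter

    def run_workflow(name: str, cfg: dict) -> None:
        # Import the workflow module and call its entry function.
        module = importlib.import_module(cfg["module"])
        getattr(module, cfg.get("entry", "main"))()

    def run_waves(workflows: dict[str, dict]) -> None:
        # Map each workflow to the set of workflows it depends on.
        graph = {name: set(cfg.get("depends_on", []))
                 for name, cfg in workflows.items()}
        sorter = TopologicalSorter(graph)
        sorter.prepare()
        with ThreadPoolExecutor() as pool:
            while sorter.is_active():
                # One "wave": everything whose dependencies have finished.
                wave = sorter.get_ready()
                futures = {pool.submit(run_workflow, n, workflows[n]): n
                           for n in wave}
                for fut, name in futures.items():
                    fut.result()       # re-raise workflow failures
                    sorter.done(name)  # unblock dependents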

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Deeman
Date:   2026-02-26 11:59:55 +01:00
Commit: 5d7d53a260 (parent 520da2c920)
6 changed files with 503 additions and 86 deletions

infra/supervisor/materia-supervisor.service

@@ -7,13 +7,14 @@ Wants=network-online.target
Type=simple
User=root
WorkingDirectory=/opt/materia
-ExecStart=/opt/materia/infra/supervisor/supervisor.sh
+ExecStart=/bin/sh -c 'exec uv run python src/materia/supervisor.py'
Restart=always
RestartSec=10
EnvironmentFile=/opt/materia/.env
Environment=LANDING_DIR=/data/materia/landing
Environment=DUCKDB_PATH=/data/materia/lakehouse.duckdb
Environment=SERVING_DUCKDB_PATH=/data/materia/analytics.duckdb
+Environment=SUPERVISOR_GIT_PULL=1
# Resource limits
LimitNOFILE=65536
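
A plausible way supervisor.py could consume the new SUPERVISOR_GIT_PULL
flag, mirroring the fetch/switch/sync sequence of the old shell supervisor
(assumed; only the flag name comes from this unit file):

    import os
    import subprocess

    def maybe_self_update(repo_dir: str = "/opt/materia") -> None:
        if os.environ.get("SUPERVISOR_GIT_PULL") != "1":
            return  # pulls disabled, e.g. when testing locally
        subprocess.run(["git", "fetch", "--tags", "origin", "master"],
                       cwd=repo_dir, check=True)
        subprocess.run(
            ["git", "switch", "--discard-changes", "--detach", "origin/master"],
            cwd=repo_dir, check=True,
        )
        subprocess.run(["uv", "sync"], cwd=repo_dir, check=True)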

infra/supervisor/supervisor.sh (deleted)

@@ -1,69 +0,0 @@
#!/bin/sh
# Materia Supervisor - Continuous pipeline orchestration
# Inspired by TigerBeetle's CFO supervisor: simple, resilient, easy to understand
# https://github.com/tigerbeetle/tigerbeetle/blob/main/src/scripts/cfo_supervisor.sh
#
# Environment variables (set in systemd EnvironmentFile):
# LANDING_DIR — local path for extracted landing data
# DUCKDB_PATH — path to DuckDB lakehouse file (SQLMesh pipeline DB)
# SERVING_DUCKDB_PATH — path to serving-only DuckDB (web app reads from here)
# ALERT_WEBHOOK_URL — optional ntfy.sh / Slack / Telegram webhook for failure alerts
set -eu
readonly REPO_DIR="/opt/materia"
while true
do
    (
        # Bail out if the repo was never bootstrapped (this script only
        # updates an existing checkout, it does not clone)
        if ! [ -d "$REPO_DIR/.git" ]
        then
            echo "Repository not found, bootstrap required!"
            exit 1
        fi
        cd "$REPO_DIR"

        # Update code from git
        git fetch origin master
        git switch --discard-changes --detach origin/master
        uv sync

        # Extract all data sources
        LANDING_DIR="${LANDING_DIR:-/data/materia/landing}" \
        DUCKDB_PATH="${DUCKDB_PATH:-/data/materia/lakehouse.duckdb}" \
        uv run materia pipeline run extract
        LANDING_DIR="${LANDING_DIR:-/data/materia/landing}" \
        DUCKDB_PATH="${DUCKDB_PATH:-/data/materia/lakehouse.duckdb}" \
        uv run materia pipeline run extract_cot
        LANDING_DIR="${LANDING_DIR:-/data/materia/landing}" \
        DUCKDB_PATH="${DUCKDB_PATH:-/data/materia/lakehouse.duckdb}" \
        uv run materia pipeline run extract_prices
        LANDING_DIR="${LANDING_DIR:-/data/materia/landing}" \
        DUCKDB_PATH="${DUCKDB_PATH:-/data/materia/lakehouse.duckdb}" \
        uv run materia pipeline run extract_ice

        # Transform all data sources
        LANDING_DIR="${LANDING_DIR:-/data/materia/landing}" \
        DUCKDB_PATH="${DUCKDB_PATH:-/data/materia/lakehouse.duckdb}" \
        uv run materia pipeline run transform

        # Export serving tables to analytics.duckdb (atomic swap).
        # The web app reads from SERVING_DUCKDB_PATH and picks up the new file
        # automatically via inode-based connection reopen — no restart needed.
        DUCKDB_PATH="${DUCKDB_PATH:-/data/materia/lakehouse.duckdb}" \
        SERVING_DUCKDB_PATH="${SERVING_DUCKDB_PATH:-/data/materia/analytics.duckdb}" \
        uv run materia pipeline run export_serving
    ) || {
        # Notify on failure if webhook is configured, then sleep to avoid busy-loop
        if [ -n "${ALERT_WEBHOOK_URL:-}" ]; then
            curl -s -d "Materia pipeline failed at $(date)" "$ALERT_WEBHOOK_URL" 2>/dev/null || true
        fi
        sleep 600 # Sleep 10 min on failure
    }
done
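
The "atomic swap" mentioned in the export step is worth spelling out. A
sketch of the pattern (assumed; export_serving's actual implementation may
differ in the details):

    import os

    def publish_serving_db(tmp_path: str, serving_path: str) -> None:
        # The export writes the new serving database to a temp path first,
        # then renames it over the live file. os.replace is atomic on POSIX,
        # so the web app sees either the complete old file or the complete
        # new one, never a half-written database; it reopens its connection
        # when it notices the inode change.
        os.replace(tmp_path, serving_path)

    # e.g. publish_serving_db("/data/materia/analytics.duckdb.tmp",
    #                         "/data/materia/analytics.duckdb")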

infra/supervisor/workflows.toml (new file)

@@ -0,0 +1,34 @@
# Workflow registry — the supervisor reads this file on every tick.
# To add a new extractor: add a [section] here and create the Python module.
#
# Fields:
# module — Python module path (must expose an entry function)
# entry — function name in the module (default: "main")
# schedule — named preset ("hourly", "daily", "weekly", "monthly")
# or raw cron expression (e.g. "0 6 * * 1-5")
# depends_on — optional: list of workflow names that must complete first
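#
# Example with a raw cron schedule and a dependency (hypothetical entry,
# shown for illustration only; not an active workflow):
# [transform_all]
# module = "materia.transform"
# entry = "main"
# schedule = "0 6 * * 1-5"    # weekdays at 06:00
# depends_on = ["extract_psd", "extract_cot"]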
[extract_psd]
module = "psdonline.execute"
entry = "extract_psd_dataset"
schedule = "daily"

[extract_cot]
module = "cftc_cot.execute"
entry = "extract_cot_dataset"
schedule = "weekly"

[extract_prices]
module = "coffee_prices.execute"
entry = "extract_coffee_prices"
schedule = "daily"

[extract_ice]
module = "ice_stocks.execute"
entry = "extract_ice_all"
schedule = "daily"

[extract_weather]
module = "openmeteo.execute"
entry = "extract_weather"
schedule = "daily"
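
A sketch of how the registry might be read and a workflow judged "due",
using the croniter dependency added by this commit. The preset cron
strings, default times, and last-run bookkeeping are assumptions:

    import tomllib
    from datetime import datetime, timezone

    from croniter import croniter

    # Assumed expansions for the named presets in the header comment.
    PRESETS = {
        "hourly": "0 * * * *",
        "daily": "0 3 * * *",
        "weekly": "0 3 * * 1",
        "monthly": "0 3 1 * *",
    }

    def is_due(schedule: str, last_run: datetime) -> bool:
        # Resolve a named preset to its cron string; anything else is
        # treated as a raw cron expression.
        expr = PRESETS.get(schedule, schedule)
        now = datetime.now(timezone.utc)
        # Due if the most recent scheduled fire time falls after the last
        # successful run (last_run is assumed to be timezone-aware).
        return croniter(expr, now).get_prev(datetime) > last_run

    with open("infra/supervisor/workflows.toml", "rb") as f:
        workflows = tomllib.load(f)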