feat(supervisor): port Python supervisor from padelnomics + workflows.toml
Port padelnomics' schedule-aware Python supervisor to materia:
- src/materia/supervisor.py — croniter scheduling, topological wave execution (parallel independent workflows), tag-based git pull + deploy, status CLI subcommand
- infra/supervisor/workflows.toml — workflow registry (psd daily, cot weekly, prices daily, ice daily, weather daily)
- infra/supervisor/materia-supervisor.service — updated ExecStart to Python supervisor, added SUPERVISOR_GIT_PULL=1

Adaptations from padelnomics:
- Uses extract_core.state.open_state_db (not padelnomics_extract.utils)
- uv run sqlmesh -p transform/sqlmesh_materia run
- uv run materia pipeline run export_serving
- web/deploy.sh path (materia's deploy.sh is under web/)
- Removed proxy_mode (not used in materia)

Also: add croniter dependency to src/materia, delete old supervisor.sh.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,13 +7,14 @@ Wants=network-online.target
|
||||
Type=simple
|
||||
User=root
|
||||
WorkingDirectory=/opt/materia
|
||||
ExecStart=/opt/materia/infra/supervisor/supervisor.sh
|
||||
ExecStart=/bin/sh -c 'exec uv run python src/materia/supervisor.py'
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
EnvironmentFile=/opt/materia/.env
|
||||
Environment=LANDING_DIR=/data/materia/landing
|
||||
Environment=DUCKDB_PATH=/data/materia/lakehouse.duckdb
|
||||
Environment=SERVING_DUCKDB_PATH=/data/materia/analytics.duckdb
|
||||
Environment=SUPERVISOR_GIT_PULL=1
|
||||
|
||||
# Resource limits
|
||||
LimitNOFILE=65536
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
#!/bin/sh
# Materia Supervisor - Continuous pipeline orchestration
# Inspired by TigerBeetle's CFO supervisor: simple, resilient, easy to understand
# https://github.com/tigerbeetle/tigerbeetle/blob/main/src/scripts/cfo_supervisor.sh
#
# Environment variables (set in systemd EnvironmentFile):
#   LANDING_DIR          — local path for extracted landing data
#   DUCKDB_PATH          — path to DuckDB lakehouse file (SQLMesh pipeline DB)
#   SERVING_DUCKDB_PATH  — path to serving-only DuckDB (web app reads from here)
#   ALERT_WEBHOOK_URL    — optional ntfy.sh / Slack / Telegram webhook for failure alerts

set -eu

readonly REPO_DIR="/opt/materia"

# Resolve defaults once and export, instead of repeating the same
# LANDING_DIR=/DUCKDB_PATH= prefixes before every pipeline invocation.
# `: "${VAR:=default}"` assigns only when the variable is unset or empty,
# so values from the systemd EnvironmentFile still win.
: "${LANDING_DIR:=/data/materia/landing}"
: "${DUCKDB_PATH:=/data/materia/lakehouse.duckdb}"
: "${SERVING_DUCKDB_PATH:=/data/materia/analytics.duckdb}"
export LANDING_DIR DUCKDB_PATH SERVING_DUCKDB_PATH

while true
do
    (
        # Clone repo if missing — we deliberately do NOT auto-clone; a missing
        # repo means the host was never bootstrapped and needs human attention.
        if ! [ -d "$REPO_DIR/.git" ]
        then
            echo "Repository not found, bootstrap required!" >&2
            exit 1
        fi

        cd "$REPO_DIR"

        # Update code from git: discard any local changes so the checkout
        # always matches origin/master exactly, then sync dependencies.
        git fetch origin master
        git switch --discard-changes --detach origin/master
        uv sync

        # Extract all data sources (env defaults exported above)
        uv run materia pipeline run extract
        uv run materia pipeline run extract_cot
        uv run materia pipeline run extract_prices
        uv run materia pipeline run extract_ice

        # Transform all data sources
        uv run materia pipeline run transform

        # Export serving tables to analytics.duckdb (atomic swap).
        # The web app reads from SERVING_DUCKDB_PATH and picks up the new file
        # automatically via inode-based connection reopen — no restart needed.
        uv run materia pipeline run export_serving

    ) || {
        # Notify on failure if webhook is configured, then sleep to avoid busy-loop.
        # --max-time bounds the webhook call so a hung endpoint cannot stall
        # the supervisor; notification failure itself is best-effort (|| true).
        if [ -n "${ALERT_WEBHOOK_URL:-}" ]; then
            curl -s --max-time 30 -d "Materia pipeline failed at $(date)" "$ALERT_WEBHOOK_URL" 2>/dev/null || true
        fi
        sleep 600 # Sleep 10 min on failure
    }
done
|
||||
34
infra/supervisor/workflows.toml
Normal file
34
infra/supervisor/workflows.toml
Normal file
@@ -0,0 +1,34 @@
|
||||
# Workflow registry — the supervisor reads this file on every tick.
# To add a new extractor: add a [section] here and create the Python module.
#
# Fields:
#   module     — Python module path (must expose an entry function)
#   entry      — function name in the module (default: "main")
#   schedule   — named preset ("hourly", "daily", "weekly", "monthly")
#                or raw cron expression (e.g. "0 6 * * 1-5")
#   depends_on — optional: list of workflow names that must complete first

# PSD dataset extract (module: psdonline)
[extract_psd]
module = "psdonline.execute"
entry = "extract_psd_dataset"
schedule = "daily"

# CFTC Commitments of Traders extract — weekly cadence matches the
# weekly COT release schedule (NOTE: presumed; confirm against source).
[extract_cot]
module = "cftc_cot.execute"
entry = "extract_cot_dataset"
schedule = "weekly"

# Coffee price series extract
[extract_prices]
module = "coffee_prices.execute"
entry = "extract_coffee_prices"
schedule = "daily"

# ICE stocks extract
[extract_ice]
module = "ice_stocks.execute"
entry = "extract_ice_all"
schedule = "daily"

# Weather extract (module: openmeteo)
[extract_weather]
module = "openmeteo.execute"
entry = "extract_weather"
schedule = "daily"
|
||||
Reference in New Issue
Block a user