Simplify supervisor architecture and automate bootstrap

- Simplify supervisor.sh following TigerBeetle pattern
  - Remove complex functions, use simple while loop
  - Add || sleep 600 for resilience against crashes
  - Use git switch --discard-changes for clean updates
  - Run pipelines every hour (SQLMesh handles scheduling)
  - Use POSIX sh instead of bash

- Remove /repo subdirectory nesting
  - Repository clones directly to /opt/materia
  - Simpler paths throughout

- Move systemd service to repo
  - Bootstrap copies from repo instead of hardcoding
  - Service can be updated via git pull

- Automate bootstrap in CI/CD
  - deploy:supervisor now auto-bootstraps on first deploy
  - Waits for SSH to be ready (retry loop)
  - Injects secrets via SSH environment
  - Idempotent: detects if already bootstrapped

Result: Push to master and supervisor "just works"

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Deeman
2025-10-13 21:17:12 +02:00
parent 21f99767bf
commit 2fff895a73
5 changed files with 53 additions and 191 deletions

View File

@@ -94,7 +94,7 @@ deploy:supervisor:
exit 1 exit 1
fi fi
echo "Deploying to supervisor at ${SUPERVISOR_IP}..." echo "Connecting to supervisor at ${SUPERVISOR_IP}..."
# Setup SSH # Setup SSH
mkdir -p ~/.ssh mkdir -p ~/.ssh
@@ -102,15 +102,25 @@ deploy:supervisor:
chmod 600 ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa
ssh-keyscan -H $SUPERVISOR_IP >> ~/.ssh/known_hosts ssh-keyscan -H $SUPERVISOR_IP >> ~/.ssh/known_hosts
# Wait for SSH to be ready (new instance may take a moment)
echo "Waiting for SSH to be ready..."
for i in $(seq 1 30); do
if ssh -o ConnectTimeout=5 root@${SUPERVISOR_IP} "echo 'SSH ready'"; then
break
fi
echo "Attempt $i/30 failed, retrying..."
sleep 10
done
# Check if supervisor is bootstrapped # Check if supervisor is bootstrapped
if ssh -o ConnectTimeout=10 root@${SUPERVISOR_IP} "test -d /opt/materia/repo/.git"; then if ssh root@${SUPERVISOR_IP} "test -d /opt/materia/.git"; then
echo "Supervisor already bootstrapped, triggering update..." echo "Supervisor already bootstrapped and will auto-update"
# Just signal supervisor to pull latest - it will do so on next check cycle ssh root@${SUPERVISOR_IP} "systemctl status materia-supervisor --no-pager"
ssh root@${SUPERVISOR_IP} "systemctl is-active materia-supervisor || echo 'Service not running, may need bootstrap'"
else else
echo "Supervisor not bootstrapped yet. Run bootstrap script:" echo "Bootstrapping supervisor for the first time..."
echo " export PULUMI_ACCESS_TOKEN=\${PULUMI_ACCESS_TOKEN}" # Export secrets and run bootstrap
echo " ssh root@${SUPERVISOR_IP} 'bash -s' < infra/bootstrap_supervisor.sh" ssh root@${SUPERVISOR_IP} "export PULUMI_ACCESS_TOKEN='${PULUMI_ACCESS_TOKEN}' GITLAB_READ_TOKEN='${GITLAB_READ_TOKEN}' && bash -s" < infra/bootstrap_supervisor.sh
echo "Bootstrap complete!"
fi fi
dependencies: dependencies:
- deploy:infra - deploy:infra

View File

@@ -307,3 +307,6 @@ Supervisor: uv run materia pipeline run <pipeline>
Note: The dev database is large and should not be committed to git (.gitignore already configured). Note: The dev database is large and should not be committed to git (.gitignore already configured).
- We use a monorepo with uv workspaces - We use a monorepo with uv workspaces
- The pulumi env is called beanflows/prod - The pulumi env is called beanflows/prod
- NEVER hardcode secrets in plaintext
- Never add SSH keys to the git repo!
- If there is a simpler, more direct solution and there is no other tradeoff, always choose the simpler solution

View File

@@ -26,8 +26,7 @@ if [ "$EUID" -ne 0 ]; then
fi fi
# Configuration # Configuration
MATERIA_DIR="/opt/materia" REPO_DIR="/opt/materia"
REPO_DIR="$MATERIA_DIR/repo"
GITLAB_PROJECT="deemanone/materia" GITLAB_PROJECT="deemanone/materia"
# GITLAB_READ_TOKEN should be set in Pulumi ESC (beanflows/prod) # GITLAB_READ_TOKEN should be set in Pulumi ESC (beanflows/prod)
@@ -71,22 +70,20 @@ echo "--- Loading secrets from Pulumi ESC ---"
eval $(esc env open beanflows/prod --format shell) eval $(esc env open beanflows/prod --format shell)
echo "--- Cloning repository ---" echo "--- Cloning repository ---"
mkdir -p "$MATERIA_DIR"
if [ -d "$REPO_DIR" ]; then if [ -d "$REPO_DIR" ]; then
echo "Repository already exists, pulling latest..." echo "Repository already exists, pulling latest..."
cd "$REPO_DIR" cd "$REPO_DIR"
git pull origin master git pull origin master
else else
cd "$MATERIA_DIR" git clone "$REPO_URL" "$REPO_DIR"
git clone "$REPO_URL" repo cd "$REPO_DIR"
cd repo
fi fi
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
uv sync uv sync
echo "--- Creating environment file ---" echo "--- Creating environment file ---"
cat > "$MATERIA_DIR/.env" <<EOF cat > "$REPO_DIR/.env" <<EOF
# Environment variables for supervisor # Environment variables for supervisor
# Loaded from Pulumi ESC: beanflows/prod # Loaded from Pulumi ESC: beanflows/prod
PULUMI_ACCESS_TOKEN=${PULUMI_ACCESS_TOKEN} PULUMI_ACCESS_TOKEN=${PULUMI_ACCESS_TOKEN}
@@ -94,32 +91,7 @@ PATH=/root/.cargo/bin:/root/.pulumi/bin:/usr/local/bin:/usr/bin:/bin
EOF EOF
echo "--- Setting up systemd service ---" echo "--- Setting up systemd service ---"
cat > /etc/systemd/system/materia-supervisor.service <<'EOF' cp "$REPO_DIR/infra/supervisor/materia-supervisor.service" /etc/systemd/system/materia-supervisor.service
[Unit]
Description=Materia Supervisor - Pipeline Orchestration
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=root
WorkingDirectory=/opt/materia/repo
ExecStart=/opt/materia/repo/infra/supervisor/supervisor.sh
Restart=always
RestartSec=10
EnvironmentFile=/opt/materia/.env
# Resource limits
LimitNOFILE=65536
# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=materia-supervisor
[Install]
WantedBy=multi-user.target
EOF
echo "--- Enabling and starting service ---" echo "--- Enabling and starting service ---"
systemctl daemon-reload systemctl daemon-reload

View File

@@ -7,12 +7,10 @@ Wants=network-online.target
Type=simple Type=simple
User=root User=root
WorkingDirectory=/opt/materia WorkingDirectory=/opt/materia
Environment="PATH=/usr/local/bin:/usr/bin:/bin:/root/.pulumi/bin" ExecStart=/opt/materia/infra/supervisor/supervisor.sh
EnvironmentFile=/opt/materia/.env
# Restart policy
Restart=always Restart=always
RestartSec=10 RestartSec=10
EnvironmentFile=/opt/materia/.env
# Resource limits # Resource limits
LimitNOFILE=65536 LimitNOFILE=65536
@@ -22,8 +20,5 @@ StandardOutput=journal
StandardError=journal StandardError=journal
SyslogIdentifier=materia-supervisor SyslogIdentifier=materia-supervisor
# Execute supervisor script
ExecStart=/opt/materia/supervisor.sh
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View File

@@ -1,152 +1,34 @@
#!/bin/bash #!/bin/sh
# Materia Supervisor - Continuous pipeline orchestration # Materia Supervisor - Continuous pipeline orchestration
# Inspired by TigerBeetle's CFO supervisor pattern # Inspired by TigerBeetle's CFO supervisor: simple, resilient, easy to understand
# https://github.com/tigerbeetle/tigerbeetle/blob/main/src/scripts/cfo_supervisor.sh # https://github.com/tigerbeetle/tigerbeetle/blob/main/src/scripts/cfo_supervisor.sh
#
# Git-based deployment: pulls latest code from master and runs pipelines via uv
set -euo pipefail set -eu
# Configuration readonly REPO_DIR="/opt/materia"
readonly CHECK_INTERVAL=900 # 15 minutes
readonly MATERIA_REPO="/opt/materia/repo"
readonly STATE_DIR="/var/lib/materia"
# Schedules (cron-style times in UTC) while true
readonly EXTRACT_SCHEDULE_HOUR=2 # 02:00 UTC do
readonly TRANSFORM_SCHEDULE_HOUR=3 # 03:00 UTC (
# Clone repo if missing
# Ensure state directory exists if ! [ -d "$REPO_DIR/.git" ]
mkdir -p "$STATE_DIR" then
echo "Repository not found, bootstrap required!"
log() { exit 1
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}
log_error() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $*" >&2
}
# Update code from git
update_code() {
log "Checking for code updates..."
cd "$MATERIA_REPO"
# Fetch latest from master
if ! git fetch origin master 2>&1 | grep -v "^From"; then
log_error "Failed to fetch from git"
return 1
fi
# Check if update available
LOCAL=$(git rev-parse HEAD)
REMOTE=$(git rev-parse origin/master)
if [ "$LOCAL" != "$REMOTE" ]; then
log "New version detected: $LOCAL -> $REMOTE"
# Pull latest code
if git pull origin master; then
log "Code updated successfully"
# Update dependencies
log "Updating dependencies with uv sync..."
if uv sync; then
log "Dependencies updated"
return 0
else
log_error "Failed to update dependencies"
return 1
fi
else
log_error "Failed to pull code"
return 1
fi
fi
log "Already up to date at $(git rev-parse --short HEAD)"
return 1 # Return 1 to indicate no update (not an error)
}
# Run pipeline using materia CLI via uv
run_pipeline() {
local pipeline=$1
local date=$(date -u +%Y-%m-%d)
local state_file="$STATE_DIR/${pipeline}_last_run"
log "Running $pipeline pipeline..."
cd "$MATERIA_REPO"
if uv run materia pipeline run "$pipeline"; then
log "$pipeline completed successfully"
echo "$date" > "$state_file"
return 0
else
log_error "$pipeline failed"
return 1
fi
}
# Check if pipeline should run today
should_run_pipeline() {
local pipeline=$1
local schedule_hour=$2
local current_hour=$(date -u +%H)
local current_date=$(date -u +%Y-%m-%d)
local state_file="$STATE_DIR/${pipeline}_last_run"
# Only run at the scheduled hour
if [ "$current_hour" -ne "$schedule_hour" ]; then
return 1
fi
# Check if already ran today
if [ -f "$state_file" ]; then
local last_run=$(cat "$state_file")
if [ "$last_run" = "$current_date" ]; then
return 1 # Already ran today
fi
fi
return 0 # Should run
}
# Main supervisor loop
main() {
log "Materia supervisor starting..."
log "Repository: $MATERIA_REPO"
log "Extract schedule: daily at ${EXTRACT_SCHEDULE_HOUR}:00 UTC"
log "Transform schedule: daily at ${TRANSFORM_SCHEDULE_HOUR}:00 UTC"
log "Check interval: ${CHECK_INTERVAL}s"
# Ensure repo exists
if [ ! -d "$MATERIA_REPO/.git" ]; then
log_error "Repository not found at $MATERIA_REPO"
log_error "Run bootstrap script first!"
exit 1
fi
# Show initial version
cd "$MATERIA_REPO"
log "Starting at commit: $(git rev-parse --short HEAD)"
while true; do
# Check for code updates every loop
update_code || true
# Check extract schedule
if should_run_pipeline "extract" "$EXTRACT_SCHEDULE_HOUR"; then
run_pipeline extract || true
fi fi
# Check transform schedule cd "$REPO_DIR"
if should_run_pipeline "transform" "$TRANSFORM_SCHEDULE_HOUR"; then
run_pipeline transform || true
fi
sleep "$CHECK_INTERVAL" # Update code from git
done git fetch origin master
} git switch --discard-changes --detach origin/master
uv sync
# Run main loop # Run pipelines (SQLMesh handles scheduling)
main uv run materia pipeline run extract
uv run materia pipeline run transform
) || sleep 600 # Sleep 10 min on failure to avoid busy-loop retries
sleep 3600 # Run pipelines every hour
done