Simplify supervisor architecture and automate bootstrap
- Simplify supervisor.sh following TigerBeetle pattern
  - Remove complex functions, use simple while loop
  - Add || sleep 600 for resilience against crashes
  - Use git switch --discard-changes for clean updates
  - Run pipelines every hour (SQLMesh handles scheduling)
  - Use POSIX sh instead of bash
- Remove /repo subdirectory nesting
  - Repository clones directly to /opt/materia
  - Simpler paths throughout
- Move systemd service to repo
  - Bootstrap copies from repo instead of hardcoding
  - Service can be updated via git pull
- Automate bootstrap in CI/CD
  - deploy:supervisor now auto-bootstraps on first deploy
  - Waits for SSH to be ready (retry loop)
  - Injects secrets via SSH environment
  - Idempotent: detects if already bootstrapped

Result: Push to master and supervisor "just works"

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -94,7 +94,7 @@ deploy:supervisor:
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Deploying to supervisor at ${SUPERVISOR_IP}..."
|
echo "Connecting to supervisor at ${SUPERVISOR_IP}..."
|
||||||
|
|
||||||
# Setup SSH
|
# Setup SSH
|
||||||
mkdir -p ~/.ssh
|
mkdir -p ~/.ssh
|
||||||
@@ -102,15 +102,25 @@ deploy:supervisor:
|
|||||||
chmod 600 ~/.ssh/id_rsa
|
chmod 600 ~/.ssh/id_rsa
|
||||||
ssh-keyscan -H $SUPERVISOR_IP >> ~/.ssh/known_hosts
|
ssh-keyscan -H $SUPERVISOR_IP >> ~/.ssh/known_hosts
|
||||||
|
|
||||||
|
# Wait for SSH to be ready (new instance may take a moment)
|
||||||
|
echo "Waiting for SSH to be ready..."
|
||||||
|
for i in $(seq 1 30); do
|
||||||
|
if ssh -o ConnectTimeout=5 root@${SUPERVISOR_IP} "echo 'SSH ready'"; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
echo "Attempt $i/30 failed, retrying..."
|
||||||
|
sleep 10
|
||||||
|
done
|
||||||
|
|
||||||
# Check if supervisor is bootstrapped
|
# Check if supervisor is bootstrapped
|
||||||
if ssh -o ConnectTimeout=10 root@${SUPERVISOR_IP} "test -d /opt/materia/repo/.git"; then
|
if ssh root@${SUPERVISOR_IP} "test -d /opt/materia/.git"; then
|
||||||
echo "Supervisor already bootstrapped, triggering update..."
|
echo "Supervisor already bootstrapped and will auto-update"
|
||||||
# Just signal supervisor to pull latest - it will do so on next check cycle
|
ssh root@${SUPERVISOR_IP} "systemctl status materia-supervisor --no-pager"
|
||||||
ssh root@${SUPERVISOR_IP} "systemctl is-active materia-supervisor || echo 'Service not running, may need bootstrap'"
|
|
||||||
else
|
else
|
||||||
echo "Supervisor not bootstrapped yet. Run bootstrap script:"
|
echo "Bootstrapping supervisor for the first time..."
|
||||||
echo " export PULUMI_ACCESS_TOKEN=\${PULUMI_ACCESS_TOKEN}"
|
# Export secrets and run bootstrap
|
||||||
echo " ssh root@${SUPERVISOR_IP} 'bash -s' < infra/bootstrap_supervisor.sh"
|
ssh root@${SUPERVISOR_IP} "export PULUMI_ACCESS_TOKEN='${PULUMI_ACCESS_TOKEN}' GITLAB_READ_TOKEN='${GITLAB_READ_TOKEN}' && bash -s" < infra/bootstrap_supervisor.sh
|
||||||
|
echo "Bootstrap complete!"
|
||||||
fi
|
fi
|
||||||
dependencies:
|
dependencies:
|
||||||
- deploy:infra
|
- deploy:infra
|
||||||
|
|||||||
@@ -307,3 +307,6 @@ Supervisor: uv run materia pipeline run <pipeline>
|
|||||||
Note: The dev database is large and should not be committed to git (.gitignore already configured).
|
Note: The dev database is large and should not be committed to git (.gitignore already configured).
|
||||||
- We use a monorepo with uv workspaces
|
- We use a monorepo with uv workspaces
|
||||||
- The pulumi env is called beanflows/prod
|
- The pulumi env is called beanflows/prod
|
||||||
|
- NEVER hardcode secrets in plaintext
|
||||||
|
- Never add ssh keys to the git repo!
|
||||||
|
- If there is a simpler more direct solution and there is no other tradeoff, always choose the simpler solution
|
||||||
@@ -26,8 +26,7 @@ if [ "$EUID" -ne 0 ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
MATERIA_DIR="/opt/materia"
|
REPO_DIR="/opt/materia"
|
||||||
REPO_DIR="$MATERIA_DIR/repo"
|
|
||||||
GITLAB_PROJECT="deemanone/materia"
|
GITLAB_PROJECT="deemanone/materia"
|
||||||
|
|
||||||
# GITLAB_READ_TOKEN should be set in Pulumi ESC (beanflows/prod)
|
# GITLAB_READ_TOKEN should be set in Pulumi ESC (beanflows/prod)
|
||||||
@@ -71,22 +70,20 @@ echo "--- Loading secrets from Pulumi ESC ---"
|
|||||||
eval $(esc env open beanflows/prod --format shell)
|
eval $(esc env open beanflows/prod --format shell)
|
||||||
|
|
||||||
echo "--- Cloning repository ---"
|
echo "--- Cloning repository ---"
|
||||||
mkdir -p "$MATERIA_DIR"
|
|
||||||
if [ -d "$REPO_DIR" ]; then
|
if [ -d "$REPO_DIR" ]; then
|
||||||
echo "Repository already exists, pulling latest..."
|
echo "Repository already exists, pulling latest..."
|
||||||
cd "$REPO_DIR"
|
cd "$REPO_DIR"
|
||||||
git pull origin master
|
git pull origin master
|
||||||
else
|
else
|
||||||
cd "$MATERIA_DIR"
|
git clone "$REPO_URL" "$REPO_DIR"
|
||||||
git clone "$REPO_URL" repo
|
cd "$REPO_DIR"
|
||||||
cd repo
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "--- Installing Python dependencies ---"
|
echo "--- Installing Python dependencies ---"
|
||||||
uv sync
|
uv sync
|
||||||
|
|
||||||
echo "--- Creating environment file ---"
|
echo "--- Creating environment file ---"
|
||||||
cat > "$MATERIA_DIR/.env" <<EOF
|
cat > "$REPO_DIR/.env" <<EOF
|
||||||
# Environment variables for supervisor
|
# Environment variables for supervisor
|
||||||
# Loaded from Pulumi ESC: beanflows/prod
|
# Loaded from Pulumi ESC: beanflows/prod
|
||||||
PULUMI_ACCESS_TOKEN=${PULUMI_ACCESS_TOKEN}
|
PULUMI_ACCESS_TOKEN=${PULUMI_ACCESS_TOKEN}
|
||||||
@@ -94,32 +91,7 @@ PATH=/root/.cargo/bin:/root/.pulumi/bin:/usr/local/bin:/usr/bin:/bin
|
|||||||
EOF
|
EOF
|
||||||
|
|
||||||
echo "--- Setting up systemd service ---"
|
echo "--- Setting up systemd service ---"
|
||||||
cat > /etc/systemd/system/materia-supervisor.service <<'EOF'
|
cp "$REPO_DIR/infra/supervisor/materia-supervisor.service" /etc/systemd/system/materia-supervisor.service
|
||||||
[Unit]
|
|
||||||
Description=Materia Supervisor - Pipeline Orchestration
|
|
||||||
After=network-online.target
|
|
||||||
Wants=network-online.target
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
Type=simple
|
|
||||||
User=root
|
|
||||||
WorkingDirectory=/opt/materia/repo
|
|
||||||
ExecStart=/opt/materia/repo/infra/supervisor/supervisor.sh
|
|
||||||
Restart=always
|
|
||||||
RestartSec=10
|
|
||||||
EnvironmentFile=/opt/materia/.env
|
|
||||||
|
|
||||||
# Resource limits
|
|
||||||
LimitNOFILE=65536
|
|
||||||
|
|
||||||
# Logging
|
|
||||||
StandardOutput=journal
|
|
||||||
StandardError=journal
|
|
||||||
SyslogIdentifier=materia-supervisor
|
|
||||||
|
|
||||||
[Install]
|
|
||||||
WantedBy=multi-user.target
|
|
||||||
EOF
|
|
||||||
|
|
||||||
echo "--- Enabling and starting service ---"
|
echo "--- Enabling and starting service ---"
|
||||||
systemctl daemon-reload
|
systemctl daemon-reload
|
||||||
|
|||||||
@@ -7,12 +7,10 @@ Wants=network-online.target
|
|||||||
Type=simple
|
Type=simple
|
||||||
User=root
|
User=root
|
||||||
WorkingDirectory=/opt/materia
|
WorkingDirectory=/opt/materia
|
||||||
Environment="PATH=/usr/local/bin:/usr/bin:/bin:/root/.pulumi/bin"
|
ExecStart=/opt/materia/infra/supervisor/supervisor.sh
|
||||||
EnvironmentFile=/opt/materia/.env
|
|
||||||
|
|
||||||
# Restart policy
|
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=10
|
RestartSec=10
|
||||||
|
EnvironmentFile=/opt/materia/.env
|
||||||
|
|
||||||
# Resource limits
|
# Resource limits
|
||||||
LimitNOFILE=65536
|
LimitNOFILE=65536
|
||||||
@@ -22,8 +20,5 @@ StandardOutput=journal
|
|||||||
StandardError=journal
|
StandardError=journal
|
||||||
SyslogIdentifier=materia-supervisor
|
SyslogIdentifier=materia-supervisor
|
||||||
|
|
||||||
# Execute supervisor script
|
|
||||||
ExecStart=/opt/materia/supervisor.sh
|
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|||||||
@@ -1,152 +1,34 @@
|
|||||||
#!/bin/bash
|
#!/bin/sh
|
||||||
# Materia Supervisor - Continuous pipeline orchestration
|
# Materia Supervisor - Continuous pipeline orchestration
|
||||||
# Inspired by TigerBeetle's CFO supervisor pattern
|
# Inspired by TigerBeetle's CFO supervisor: simple, resilient, easy to understand
|
||||||
# https://github.com/tigerbeetle/tigerbeetle/blob/main/src/scripts/cfo_supervisor.sh
|
# https://github.com/tigerbeetle/tigerbeetle/blob/main/src/scripts/cfo_supervisor.sh
|
||||||
#
|
|
||||||
# Git-based deployment: pulls latest code from master and runs pipelines via uv
|
|
||||||
|
|
||||||
set -euo pipefail
|
set -eu
|
||||||
|
|
||||||
# Configuration
|
readonly REPO_DIR="/opt/materia"
|
||||||
readonly CHECK_INTERVAL=900 # 15 minutes
|
|
||||||
readonly MATERIA_REPO="/opt/materia/repo"
|
|
||||||
readonly STATE_DIR="/var/lib/materia"
|
|
||||||
|
|
||||||
# Schedules (cron-style times in UTC)
|
while true
|
||||||
readonly EXTRACT_SCHEDULE_HOUR=2 # 02:00 UTC
|
do
|
||||||
readonly TRANSFORM_SCHEDULE_HOUR=3 # 03:00 UTC
|
(
|
||||||
|
# Clone repo if missing
|
||||||
# Ensure state directory exists
|
if ! [ -d "$REPO_DIR/.git" ]
|
||||||
mkdir -p "$STATE_DIR"
|
then
|
||||||
|
echo "Repository not found, bootstrap required!"
|
||||||
log() {
|
exit 1
|
||||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
|
|
||||||
}
|
|
||||||
|
|
||||||
log_error() {
|
|
||||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $*" >&2
|
|
||||||
}
|
|
||||||
|
|
||||||
# Update code from git
|
|
||||||
update_code() {
|
|
||||||
log "Checking for code updates..."
|
|
||||||
cd "$MATERIA_REPO"
|
|
||||||
|
|
||||||
# Fetch latest from master
|
|
||||||
if ! git fetch origin master 2>&1 | grep -v "^From"; then
|
|
||||||
log_error "Failed to fetch from git"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check if update available
|
|
||||||
LOCAL=$(git rev-parse HEAD)
|
|
||||||
REMOTE=$(git rev-parse origin/master)
|
|
||||||
|
|
||||||
if [ "$LOCAL" != "$REMOTE" ]; then
|
|
||||||
log "New version detected: $LOCAL -> $REMOTE"
|
|
||||||
|
|
||||||
# Pull latest code
|
|
||||||
if git pull origin master; then
|
|
||||||
log "Code updated successfully"
|
|
||||||
|
|
||||||
# Update dependencies
|
|
||||||
log "Updating dependencies with uv sync..."
|
|
||||||
if uv sync; then
|
|
||||||
log "Dependencies updated"
|
|
||||||
return 0
|
|
||||||
else
|
|
||||||
log_error "Failed to update dependencies"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
log_error "Failed to pull code"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
log "Already up to date at $(git rev-parse --short HEAD)"
|
|
||||||
return 1 # Return 1 to indicate no update (not an error)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Run pipeline using materia CLI via uv
|
|
||||||
run_pipeline() {
|
|
||||||
local pipeline=$1
|
|
||||||
local date=$(date -u +%Y-%m-%d)
|
|
||||||
local state_file="$STATE_DIR/${pipeline}_last_run"
|
|
||||||
|
|
||||||
log "Running $pipeline pipeline..."
|
|
||||||
|
|
||||||
cd "$MATERIA_REPO"
|
|
||||||
if uv run materia pipeline run "$pipeline"; then
|
|
||||||
log "$pipeline completed successfully"
|
|
||||||
echo "$date" > "$state_file"
|
|
||||||
return 0
|
|
||||||
else
|
|
||||||
log_error "$pipeline failed"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Check if pipeline should run today
|
|
||||||
should_run_pipeline() {
|
|
||||||
local pipeline=$1
|
|
||||||
local schedule_hour=$2
|
|
||||||
local current_hour=$(date -u +%H)
|
|
||||||
local current_date=$(date -u +%Y-%m-%d)
|
|
||||||
local state_file="$STATE_DIR/${pipeline}_last_run"
|
|
||||||
|
|
||||||
# Only run at the scheduled hour
|
|
||||||
if [ "$current_hour" -ne "$schedule_hour" ]; then
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check if already ran today
|
|
||||||
if [ -f "$state_file" ]; then
|
|
||||||
local last_run=$(cat "$state_file")
|
|
||||||
if [ "$last_run" = "$current_date" ]; then
|
|
||||||
return 1 # Already ran today
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
return 0 # Should run
|
|
||||||
}
|
|
||||||
|
|
||||||
# Main supervisor loop
|
|
||||||
main() {
|
|
||||||
log "Materia supervisor starting..."
|
|
||||||
log "Repository: $MATERIA_REPO"
|
|
||||||
log "Extract schedule: daily at ${EXTRACT_SCHEDULE_HOUR}:00 UTC"
|
|
||||||
log "Transform schedule: daily at ${TRANSFORM_SCHEDULE_HOUR}:00 UTC"
|
|
||||||
log "Check interval: ${CHECK_INTERVAL}s"
|
|
||||||
|
|
||||||
# Ensure repo exists
|
|
||||||
if [ ! -d "$MATERIA_REPO/.git" ]; then
|
|
||||||
log_error "Repository not found at $MATERIA_REPO"
|
|
||||||
log_error "Run bootstrap script first!"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Show initial version
|
|
||||||
cd "$MATERIA_REPO"
|
|
||||||
log "Starting at commit: $(git rev-parse --short HEAD)"
|
|
||||||
|
|
||||||
while true; do
|
|
||||||
# Check for code updates every loop
|
|
||||||
update_code || true
|
|
||||||
|
|
||||||
# Check extract schedule
|
|
||||||
if should_run_pipeline "extract" "$EXTRACT_SCHEDULE_HOUR"; then
|
|
||||||
run_pipeline extract || true
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Check transform schedule
|
cd "$REPO_DIR"
|
||||||
if should_run_pipeline "transform" "$TRANSFORM_SCHEDULE_HOUR"; then
|
|
||||||
run_pipeline transform || true
|
|
||||||
fi
|
|
||||||
|
|
||||||
sleep "$CHECK_INTERVAL"
|
# Update code from git
|
||||||
done
|
git fetch origin master
|
||||||
}
|
git switch --discard-changes --detach origin/master
|
||||||
|
uv sync
|
||||||
|
|
||||||
# Run main loop
|
# Run pipelines (SQLMesh handles scheduling)
|
||||||
main
|
uv run materia pipeline run extract
|
||||||
|
uv run materia pipeline run transform
|
||||||
|
|
||||||
|
) || sleep 600 # Sleep 10 min on failure to avoid busy-loop retries
|
||||||
|
|
||||||
|
sleep 3600 # Run pipelines every hour
|
||||||
|
done
|
||||||
|
|||||||
Reference in New Issue
Block a user