diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 53f2be3..e67f643 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -94,7 +94,7 @@ deploy:supervisor:
         exit 1
       fi
 
-      echo "Deploying to supervisor at ${SUPERVISOR_IP}..."
+      echo "Connecting to supervisor at ${SUPERVISOR_IP}..."
 
       # Setup SSH
       mkdir -p ~/.ssh
@@ -102,15 +102,42 @@ deploy:supervisor:
       chmod 600 ~/.ssh/id_rsa
-      ssh-keyscan -H $SUPERVISOR_IP >> ~/.ssh/known_hosts
 
+      # Wait for SSH to be ready (new instance may take a moment).
+      # The host key is scanned inside the loop: ssh-keyscan records nothing while
+      # sshd is still down, and without a known_hosts entry every later
+      # non-interactive ssh call fails host key verification.
+      echo "Waiting for SSH to be ready..."
+      ssh_ready=false
+      for i in $(seq 1 30); do
+        ssh-keyscan -H "${SUPERVISOR_IP}" >> ~/.ssh/known_hosts 2>/dev/null || true
+        if ssh -o ConnectTimeout=5 root@${SUPERVISOR_IP} "echo 'SSH ready'"; then
+          ssh_ready=true
+          break
+        fi
+        echo "Attempt $i/30 failed, retrying..."
+        sleep 10
+      done
+
+      # Abort instead of falling through: an unreachable host must not be
+      # mistaken for one that has simply not been bootstrapped yet.
+      if [ "$ssh_ready" != "true" ]; then
+        echo "ERROR: SSH did not become ready after 30 attempts" >&2
+        exit 1
+      fi
+
       # Check if supervisor is bootstrapped
-      if ssh -o ConnectTimeout=10 root@${SUPERVISOR_IP} "test -d /opt/materia/repo/.git"; then
-        echo "Supervisor already bootstrapped, triggering update..."
-        # Just signal supervisor to pull latest - it will do so on next check cycle
-        ssh root@${SUPERVISOR_IP} "systemctl is-active materia-supervisor || echo 'Service not running, may need bootstrap'"
+      if ssh root@${SUPERVISOR_IP} "test -d /opt/materia/.git"; then
+        echo "Supervisor already bootstrapped and will auto-update"
+        ssh root@${SUPERVISOR_IP} "systemctl status materia-supervisor --no-pager"
       else
-        echo "Supervisor not bootstrapped yet. Run bootstrap script:"
-        echo " export PULUMI_ACCESS_TOKEN=\${PULUMI_ACCESS_TOKEN}"
-        echo " ssh root@${SUPERVISOR_IP} 'bash -s' < infra/bootstrap_supervisor.sh"
+        echo "Bootstrapping supervisor for the first time..."
+        # Secrets are written to the remote shell's stdin ahead of the script,
+        # so they never appear in the ssh or remote bash command line.
+        {
+          printf "export PULUMI_ACCESS_TOKEN='%s'\n" "${PULUMI_ACCESS_TOKEN}"
+          printf "export GITLAB_READ_TOKEN='%s'\n" "${GITLAB_READ_TOKEN}"
+          cat infra/bootstrap_supervisor.sh
+        } | ssh root@${SUPERVISOR_IP} "bash -s"
+        echo "Bootstrap complete!"
fi dependencies: - deploy:infra diff --git a/CLAUDE.md b/CLAUDE.md index cc42623..0861031 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -307,3 +307,6 @@ Supervisor: uv run materia pipeline run Note: The dev database is large and should not be committed to git (.gitignore already configured). - We use a monorepo with uv workspaces - The pulumi env is called beanflows/prod +- NEVER hardcode secrets in plaintext +- Never add SSH keys to the git repo! +- If there is a simpler, more direct solution with no other trade-offs, always choose it \ No newline at end of file diff --git a/infra/bootstrap_supervisor.sh b/infra/bootstrap_supervisor.sh index d867483..3c71497 100755 --- a/infra/bootstrap_supervisor.sh +++ b/infra/bootstrap_supervisor.sh @@ -26,8 +26,7 @@ if [ "$EUID" -ne 0 ]; then fi # Configuration -MATERIA_DIR="/opt/materia" -REPO_DIR="$MATERIA_DIR/repo" +REPO_DIR="/opt/materia" GITLAB_PROJECT="deemanone/materia" # GITLAB_READ_TOKEN should be set in Pulumi ESC (beanflows/prod) @@ -71,22 +70,20 @@ echo "--- Loading secrets from Pulumi ESC ---" eval $(esc env open beanflows/prod --format shell) echo "--- Cloning repository ---" -mkdir -p "$MATERIA_DIR" if [ -d "$REPO_DIR" ]; then echo "Repository already exists, pulling latest..." 
cd "$REPO_DIR" git pull origin master else - cd "$MATERIA_DIR" - git clone "$REPO_URL" repo - cd repo + git clone "$REPO_URL" "$REPO_DIR" + cd "$REPO_DIR" fi echo "--- Installing Python dependencies ---" uv sync echo "--- Creating environment file ---" -cat > "$MATERIA_DIR/.env" < "$REPO_DIR/.env" < /etc/systemd/system/materia-supervisor.service <<'EOF' -[Unit] -Description=Materia Supervisor - Pipeline Orchestration -After=network-online.target -Wants=network-online.target - -[Service] -Type=simple -User=root -WorkingDirectory=/opt/materia/repo -ExecStart=/opt/materia/repo/infra/supervisor/supervisor.sh -Restart=always -RestartSec=10 -EnvironmentFile=/opt/materia/.env - -# Resource limits -LimitNOFILE=65536 - -# Logging -StandardOutput=journal -StandardError=journal -SyslogIdentifier=materia-supervisor - -[Install] -WantedBy=multi-user.target -EOF +cp "$REPO_DIR/infra/supervisor/materia-supervisor.service" /etc/systemd/system/materia-supervisor.service echo "--- Enabling and starting service ---" systemctl daemon-reload diff --git a/infra/supervisor/materia-supervisor.service b/infra/supervisor/materia-supervisor.service index a35aca0..f32b012 100644 --- a/infra/supervisor/materia-supervisor.service +++ b/infra/supervisor/materia-supervisor.service @@ -7,12 +7,10 @@ Wants=network-online.target Type=simple User=root WorkingDirectory=/opt/materia -Environment="PATH=/usr/local/bin:/usr/bin:/bin:/root/.pulumi/bin" -EnvironmentFile=/opt/materia/.env - -# Restart policy +ExecStart=/opt/materia/infra/supervisor/supervisor.sh Restart=always RestartSec=10 +EnvironmentFile=/opt/materia/.env # Resource limits LimitNOFILE=65536 @@ -22,8 +20,5 @@ StandardOutput=journal StandardError=journal SyslogIdentifier=materia-supervisor -# Execute supervisor script -ExecStart=/opt/materia/supervisor.sh - [Install] WantedBy=multi-user.target diff --git a/infra/supervisor/supervisor.sh b/infra/supervisor/supervisor.sh index 71e9b7e..60313fa 100644 --- a/infra/supervisor/supervisor.sh +++ 
b/infra/supervisor/supervisor.sh
@@ -1,152 +1,40 @@
-#!/bin/bash
+#!/bin/sh
 # Materia Supervisor - Continuous pipeline orchestration
-# Inspired by TigerBeetle's CFO supervisor pattern
+# Inspired by TigerBeetle's CFO supervisor: simple, resilient, easy to understand
 # https://github.com/tigerbeetle/tigerbeetle/blob/main/src/scripts/cfo_supervisor.sh
 #
-# Git-based deployment: pulls latest code from master and runs pipelines via uv
+# Each cycle syncs the checkout to origin/master, refreshes dependencies with
+# uv and runs the extract and transform pipelines. A successful cycle is
+# followed by a one-hour wait; a failed cycle is retried after ten minutes.
 
-set -euo pipefail
+set -eu
 
-# Configuration
-readonly CHECK_INTERVAL=900  # 15 minutes
-readonly MATERIA_REPO="/opt/materia/repo"
-readonly STATE_DIR="/var/lib/materia"
+readonly REPO_DIR="/opt/materia"
 
-# Schedules (cron-style times in UTC)
-readonly EXTRACT_SCHEDULE_HOUR=2  # 02:00 UTC
-readonly TRANSFORM_SCHEDULE_HOUR=3  # 03:00 UTC
-
-# Ensure state directory exists
-mkdir -p "$STATE_DIR"
-
-log() {
-    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
-}
-
-log_error() {
-    echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $*" >&2
-}
-
-# Update code from git
-update_code() {
-    log "Checking for code updates..."
-    cd "$MATERIA_REPO"
-
-    # Fetch latest from master
-    if ! git fetch origin master 2>&1 | grep -v "^From"; then
-        log_error "Failed to fetch from git"
-        return 1
-    fi
-
-    # Check if update available
-    LOCAL=$(git rev-parse HEAD)
-    REMOTE=$(git rev-parse origin/master)
-
-    if [ "$LOCAL" != "$REMOTE" ]; then
-        log "New version detected: $LOCAL -> $REMOTE"
-
-        # Pull latest code
-        if git pull origin master; then
-            log "Code updated successfully"
-
-            # Update dependencies
-            log "Updating dependencies with uv sync..."
-            if uv sync; then
-                log "Dependencies updated"
-                return 0
-            else
-                log_error "Failed to update dependencies"
-                return 1
-            fi
-        else
-            log_error "Failed to pull code"
-            return 1
-        fi
-    fi
-
-    log "Already up to date at $(git rev-parse --short HEAD)"
-    return 1  # Return 1 to indicate no update (not an error)
-}
-
-# Run pipeline using materia CLI via uv
-run_pipeline() {
-    local pipeline=$1
-    local date=$(date -u +%Y-%m-%d)
-    local state_file="$STATE_DIR/${pipeline}_last_run"
-
-    log "Running $pipeline pipeline..."
-
-    cd "$MATERIA_REPO"
-    if uv run materia pipeline run "$pipeline"; then
-        log "$pipeline completed successfully"
-        echo "$date" > "$state_file"
-        return 0
-    else
-        log_error "$pipeline failed"
-        return 1
-    fi
-}
-
-# Check if pipeline should run today
-should_run_pipeline() {
-    local pipeline=$1
-    local schedule_hour=$2
-    local current_hour=$(date -u +%H)
-    local current_date=$(date -u +%Y-%m-%d)
-    local state_file="$STATE_DIR/${pipeline}_last_run"
-
-    # Only run at the scheduled hour
-    if [ "$current_hour" -ne "$schedule_hour" ]; then
-        return 1
-    fi
-
-    # Check if already ran today
-    if [ -f "$state_file" ]; then
-        local last_run=$(cat "$state_file")
-        if [ "$last_run" = "$current_date" ]; then
-            return 1  # Already ran today
-        fi
-    fi
-
-    return 0  # Should run
-}
-
-# Main supervisor loop
-main() {
-    log "Materia supervisor starting..."
-    log "Repository: $MATERIA_REPO"
-    log "Extract schedule: daily at ${EXTRACT_SCHEDULE_HOUR}:00 UTC"
-    log "Transform schedule: daily at ${TRANSFORM_SCHEDULE_HOUR}:00 UTC"
-    log "Check interval: ${CHECK_INTERVAL}s"
-
-    # Ensure repo exists
-    if [ ! -d "$MATERIA_REPO/.git" ]; then
-        log_error "Repository not found at $MATERIA_REPO"
-        log_error "Run bootstrap script first!"
-        exit 1
-    fi
-
-    # Show initial version
-    cd "$MATERIA_REPO"
-    log "Starting at commit: $(git rev-parse --short HEAD)"
-
-    while true; do
-        # Check for code updates every loop
-        update_code || true
-
-        # Check extract schedule
-        if should_run_pipeline "extract" "$EXTRACT_SCHEDULE_HOUR"; then
-            run_pipeline extract || true
-        fi
-
-        # Check transform schedule
-        if should_run_pipeline "transform" "$TRANSFORM_SCHEDULE_HOUR"; then
-            run_pipeline transform || true
-        fi
-
-        sleep "$CHECK_INTERVAL"
-    done
-}
-
-# Run main loop
-main
+while true
+do
+    # errexit is ignored inside a subshell used as an if-condition (or as the
+    # left side of ||), so the steps are chained with && explicitly: the first
+    # failing step ends the cycle instead of letting later steps run anyway.
+    if (
+        # The supervisor never clones; infra/bootstrap_supervisor.sh does that
+        if ! [ -d "$REPO_DIR/.git" ]
+        then
+            echo "Repository not found, bootstrap required!"
+            exit 1
+        fi
+
+        # Update code from git, then run pipelines (SQLMesh handles scheduling)
+        cd "$REPO_DIR" &&
+            git fetch origin master &&
+            git switch --discard-changes --detach origin/master &&
+            uv sync &&
+            uv run materia pipeline run extract &&
+            uv run materia pipeline run transform
+    )
+    then
+        sleep 3600  # Run pipelines every hour
+    else
+        sleep 600   # Retry after 10 min on failure, without busy-looping
+    fi
+done