Refactor to git-based deployment: simplify CI/CD and supervisor
Addresses GitLab PR comments: 1. Remove hardcoded secrets from Pulumi.prod.yaml, use ESC environment 2. Simplify deployment by using git pull instead of R2 artifacts 3. Add bootstrap script for one-time supervisor setup Major changes: - **Pulumi config**: Use ESC environment (beanflows/prod) for all secrets - **Supervisor script**: Git-based deployment (git pull every 15 min) * No more artifact downloads from R2 * Runs code directly via `uv run materia` * Self-updating from master branch - **Bootstrap script**: New infra/bootstrap_supervisor.sh for initial setup * One-time script to clone repo and setup systemd service * Idempotent and simple - **CI/CD simplification**: Remove build and R2 deployment stages * Eliminated build:extract, build:transform, build:cli jobs * Eliminated deploy:r2 job * Simplified deploy:supervisor to just check bootstrap status * Reduced from 4 stages to 3 stages (Lint → Test → Deploy) - **Documentation**: Updated CLAUDE.md with new architecture * Git-based deployment flow * Bootstrap instructions * Simplified execution model Benefits: - ✅ No hardcoded secrets in config files - ✅ Simpler deployment (no artifact builds) - ✅ Easy to test locally (just git clone + uv sync) - ✅ Auto-updates every 15 minutes - ✅ Fewer CI/CD jobs (faster pipelines) - ✅ Cleaner separation of concerns Inspired by TigerBeetle's CFO supervisor pattern. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
143
.gitlab-ci.yml
143
.gitlab-ci.yml
@@ -3,7 +3,6 @@ image: python:3.13
|
|||||||
stages:
|
stages:
|
||||||
- lint
|
- lint
|
||||||
- test
|
- test
|
||||||
- build
|
|
||||||
- deploy
|
- deploy
|
||||||
|
|
||||||
variables:
|
variables:
|
||||||
@@ -54,83 +53,6 @@ test:sqlmesh:
|
|||||||
- uv sync
|
- uv sync
|
||||||
- cd transform/sqlmesh_materia && uv run sqlmesh test
|
- cd transform/sqlmesh_materia && uv run sqlmesh test
|
||||||
|
|
||||||
build:extract:
|
|
||||||
stage: build
|
|
||||||
before_script:
|
|
||||||
- *uv_setup
|
|
||||||
script:
|
|
||||||
- uv sync
|
|
||||||
- mkdir -p dist
|
|
||||||
- uv build --package psdonline --out-dir dist/extract
|
|
||||||
- cd dist/extract && tar -czf ../materia-extract-latest.tar.gz .
|
|
||||||
artifacts:
|
|
||||||
paths:
|
|
||||||
- dist/materia-extract-latest.tar.gz
|
|
||||||
expire_in: 1 week
|
|
||||||
rules:
|
|
||||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
|
||||||
|
|
||||||
build:transform:
|
|
||||||
stage: build
|
|
||||||
before_script:
|
|
||||||
- *uv_setup
|
|
||||||
script:
|
|
||||||
- uv sync
|
|
||||||
- mkdir -p dist
|
|
||||||
- uv build --package sqlmesh_materia --out-dir dist/transform
|
|
||||||
- cd dist/transform && tar -czf ../materia-transform-latest.tar.gz .
|
|
||||||
artifacts:
|
|
||||||
paths:
|
|
||||||
- dist/materia-transform-latest.tar.gz
|
|
||||||
expire_in: 1 week
|
|
||||||
rules:
|
|
||||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
|
||||||
|
|
||||||
build:cli:
|
|
||||||
stage: build
|
|
||||||
before_script:
|
|
||||||
- *uv_setup
|
|
||||||
script:
|
|
||||||
- uv sync
|
|
||||||
- mkdir -p dist
|
|
||||||
- uv build --out-dir dist/cli
|
|
||||||
- cd dist/cli && tar -czf ../materia-cli-latest.tar.gz .
|
|
||||||
artifacts:
|
|
||||||
paths:
|
|
||||||
- dist/materia-cli-latest.tar.gz
|
|
||||||
expire_in: 1 week
|
|
||||||
rules:
|
|
||||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
|
||||||
|
|
||||||
deploy:r2:
|
|
||||||
stage: deploy
|
|
||||||
image: rclone/rclone:latest
|
|
||||||
before_script:
|
|
||||||
- apk add --no-cache curl unzip
|
|
||||||
- curl -fsSL https://get.pulumi.com/esc/install.sh | sh
|
|
||||||
- export PATH="$HOME/.pulumi/bin:$PATH"
|
|
||||||
- esc login --token ${PULUMI_ACCESS_TOKEN}
|
|
||||||
- eval $(esc env open beanflows/prod --format shell)
|
|
||||||
- |
|
|
||||||
mkdir -p ~/.config/rclone
|
|
||||||
cat > ~/.config/rclone/rclone.conf <<EOF
|
|
||||||
[r2]
|
|
||||||
type = s3
|
|
||||||
provider = Cloudflare
|
|
||||||
access_key_id = ${R2_ACCESS_KEY_ID}
|
|
||||||
secret_access_key = ${R2_SECRET_ACCESS_KEY}
|
|
||||||
endpoint = https://${R2_ENDPOINT}
|
|
||||||
acl = private
|
|
||||||
EOF
|
|
||||||
script:
|
|
||||||
- rclone copy dist/*.tar.gz r2:${R2_ARTIFACTS_BUCKET}/ -v
|
|
||||||
dependencies:
|
|
||||||
- build:extract
|
|
||||||
- build:transform
|
|
||||||
- build:cli
|
|
||||||
rules:
|
|
||||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
|
||||||
|
|
||||||
deploy:infra:
|
deploy:infra:
|
||||||
stage: deploy
|
stage: deploy
|
||||||
image: pulumi/pulumi:latest
|
image: pulumi/pulumi:latest
|
||||||
@@ -152,20 +74,26 @@ deploy:supervisor:
|
|||||||
- export PATH="$HOME/.pulumi/bin:$PATH"
|
- export PATH="$HOME/.pulumi/bin:$PATH"
|
||||||
- esc login --token ${PULUMI_ACCESS_TOKEN}
|
- esc login --token ${PULUMI_ACCESS_TOKEN}
|
||||||
- eval $(esc env open beanflows/prod --format shell)
|
- eval $(esc env open beanflows/prod --format shell)
|
||||||
script:
|
# Install Pulumi CLI to get stack outputs
|
||||||
- |
|
- |
|
||||||
# Install pulumi CLI to get stack outputs
|
|
||||||
apk add --no-cache pulumi-bin || {
|
apk add --no-cache pulumi-bin || {
|
||||||
curl -fsSL https://get.pulumi.com/install.sh | sh
|
curl -fsSL https://get.pulumi.com/install.sh | sh
|
||||||
export PATH="$HOME/.pulumi/bin:$PATH"
|
export PATH="$HOME/.pulumi/bin:$PATH"
|
||||||
}
|
}
|
||||||
pulumi login --token ${PULUMI_ACCESS_TOKEN}
|
- pulumi login --token ${PULUMI_ACCESS_TOKEN}
|
||||||
|
script:
|
||||||
|
- |
|
||||||
# Get supervisor IP from Pulumi
|
# Get supervisor IP from Pulumi
|
||||||
cd infra
|
cd infra
|
||||||
SUPERVISOR_IP=$(pulumi stack output supervisor_ip -s prod)
|
SUPERVISOR_IP=$(pulumi stack output supervisor_ip -s prod)
|
||||||
cd ..
|
cd ..
|
||||||
|
|
||||||
|
# Check if supervisor exists
|
||||||
|
if [ -z "$SUPERVISOR_IP" ] || [ "$SUPERVISOR_IP" = "null" ]; then
|
||||||
|
echo "No supervisor instance found. Run 'pulumi up' first."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
echo "Deploying to supervisor at ${SUPERVISOR_IP}..."
|
echo "Deploying to supervisor at ${SUPERVISOR_IP}..."
|
||||||
|
|
||||||
# Setup SSH
|
# Setup SSH
|
||||||
@@ -174,48 +102,17 @@ deploy:supervisor:
|
|||||||
chmod 600 ~/.ssh/id_rsa
|
chmod 600 ~/.ssh/id_rsa
|
||||||
ssh-keyscan -H $SUPERVISOR_IP >> ~/.ssh/known_hosts
|
ssh-keyscan -H $SUPERVISOR_IP >> ~/.ssh/known_hosts
|
||||||
|
|
||||||
# Deploy supervisor script and service
|
# Check if supervisor is bootstrapped
|
||||||
scp infra/supervisor/supervisor.sh root@${SUPERVISOR_IP}:/opt/materia/supervisor.sh
|
if ssh -o ConnectTimeout=10 root@${SUPERVISOR_IP} "test -d /opt/materia/repo/.git"; then
|
||||||
scp infra/supervisor/materia-supervisor.service root@${SUPERVISOR_IP}:/etc/systemd/system/materia-supervisor.service
|
echo "Supervisor already bootstrapped, triggering update..."
|
||||||
|
# No explicit trigger is sent - the supervisor pulls latest on its next check cycle; only verify the service is running
|
||||||
# Deploy to supervisor
|
ssh root@${SUPERVISOR_IP} "systemctl is-active materia-supervisor || echo 'Service not running, may need bootstrap'"
|
||||||
ssh root@${SUPERVISOR_IP} bash <<'ENDSSH'
|
else
|
||||||
set -e
|
echo "Supervisor not bootstrapped yet. Run bootstrap script:"
|
||||||
cd /opt/materia
|
echo " export PULUMI_ACCESS_TOKEN=\${PULUMI_ACCESS_TOKEN}"
|
||||||
|
echo " ssh root@${SUPERVISOR_IP} 'bash -s' < infra/bootstrap_supervisor.sh"
|
||||||
# Create environment file with secrets
|
fi
|
||||||
cat > .env <<EOF
|
|
||||||
R2_ENDPOINT=${R2_ENDPOINT}
|
|
||||||
R2_ARTIFACTS_BUCKET=${R2_ARTIFACTS_BUCKET}
|
|
||||||
PULUMI_ACCESS_TOKEN=${PULUMI_ACCESS_TOKEN}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Download and install CLI
|
|
||||||
curl -fsSL -o materia-cli-latest.tar.gz \
|
|
||||||
https://${R2_ENDPOINT}/${R2_ARTIFACTS_BUCKET}/materia-cli-latest.tar.gz
|
|
||||||
|
|
||||||
rm -rf cli && mkdir -p cli
|
|
||||||
tar -xzf materia-cli-latest.tar.gz -C cli/
|
|
||||||
pip3 install --break-system-packages --force-reinstall cli/*.whl
|
|
||||||
|
|
||||||
# Configure Pulumi ESC
|
|
||||||
export PATH="$HOME/.pulumi/bin:$PATH"
|
|
||||||
esc login --token ${PULUMI_ACCESS_TOKEN}
|
|
||||||
|
|
||||||
# Make supervisor script executable
|
|
||||||
chmod +x /opt/materia/supervisor.sh
|
|
||||||
|
|
||||||
# Reload systemd and restart service
|
|
||||||
systemctl daemon-reload
|
|
||||||
systemctl enable materia-supervisor
|
|
||||||
systemctl restart materia-supervisor
|
|
||||||
|
|
||||||
# Show status
|
|
||||||
echo "Supervisor service status:"
|
|
||||||
systemctl status materia-supervisor --no-pager
|
|
||||||
ENDSSH
|
|
||||||
dependencies:
|
dependencies:
|
||||||
- deploy:r2
|
|
||||||
- deploy:infra
|
- deploy:infra
|
||||||
rules:
|
rules:
|
||||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||||
|
|||||||
86
CLAUDE.md
86
CLAUDE.md
@@ -168,11 +168,11 @@ pytest --cov=./ --cov-report=xml
|
|||||||
|
|
||||||
### CI/CD Pipeline (`.gitlab-ci.yml`)
|
### CI/CD Pipeline (`.gitlab-ci.yml`)
|
||||||
|
|
||||||
**4 Stages: Lint → Test → Build → Deploy**
|
**3 Stages: Lint → Test → Deploy**
|
||||||
|
|
||||||
#### 1. Lint Stage
|
#### 1. Lint Stage
|
||||||
- Runs `ruff check` and `ruff format --check`
|
- Runs `ruff check` on every commit
|
||||||
- Validates code quality on every commit
|
- Validates code quality
|
||||||
|
|
||||||
#### 2. Test Stage
|
#### 2. Test Stage
|
||||||
- **`test:cli`**: Runs pytest on materia CLI with 71% coverage
|
- **`test:cli`**: Runs pytest on materia CLI with 71% coverage
|
||||||
@@ -182,53 +182,51 @@ pytest --cov=./ --cov-report=xml
|
|||||||
- Exports coverage reports to GitLab
|
- Exports coverage reports to GitLab
|
||||||
- **`test:sqlmesh`**: Runs SQLMesh model tests in transform layer
|
- **`test:sqlmesh`**: Runs SQLMesh model tests in transform layer
|
||||||
|
|
||||||
#### 3. Build Stage (only on master branch)
|
#### 3. Deploy Stage (only on master branch)
|
||||||
Creates separate artifacts for each workspace package:
|
|
||||||
- **`build:extract`**: Builds `materia-extract-latest.tar.gz` (psdonline package)
|
|
||||||
- **`build:transform`**: Builds `materia-transform-latest.tar.gz` (sqlmesh_materia package)
|
|
||||||
- **`build:cli`**: Builds `materia-cli-latest.tar.gz` (materia management CLI)
|
|
||||||
|
|
||||||
Each artifact is a self-contained tarball with all dependencies.
|
|
||||||
|
|
||||||
#### 4. Deploy Stage (only on master branch)
|
|
||||||
- **`deploy:r2`**: Uploads artifacts to Cloudflare R2 using rclone
|
|
||||||
- Loads secrets from Pulumi ESC (`beanflows/prod`)
|
|
||||||
- Only requires `PULUMI_ACCESS_TOKEN` in GitLab variables
|
|
||||||
- All other secrets (R2 credentials, SSH keys, API tokens) come from ESC
|
|
||||||
- **`deploy:infra`**: Runs `pulumi up` to ensure supervisor instance exists
|
- **`deploy:infra`**: Runs `pulumi up` to ensure supervisor instance exists
|
||||||
- Runs on every master push (not just on infra changes)
|
- Runs on every master push
|
||||||
- Creates/updates Hetzner CCX11 supervisor instance
|
- Creates/updates Hetzner CPX11 supervisor instance (~€4.49/mo)
|
||||||
- Configures Cloudflare R2 buckets (`beanflows-artifacts`, `beanflows-data-prod`)
|
- Uses Pulumi ESC (`beanflows/prod`) for all secrets
|
||||||
- **`deploy:supervisor`**: Deploys supervisor script and materia CLI
|
- **`deploy:supervisor`**: Checks supervisor status
|
||||||
- Runs after `deploy:r2` and `deploy:infra`
|
- Verifies supervisor is bootstrapped
|
||||||
- Copies `supervisor.sh` and systemd service to supervisor instance
|
- Supervisor auto-updates via `git pull` every 15 minutes (no CI/CD deployment needed)
|
||||||
- Downloads and installs latest materia CLI from R2
|
|
||||||
- Restarts supervisor service to pick up changes
|
|
||||||
|
|
||||||
### Production Architecture: Ephemeral Worker Model
|
**Note:** No build artifacts! Supervisor pulls code directly from git and runs via `uv`.
|
||||||
|
|
||||||
|
### Production Architecture: Git-Based Deployment with Ephemeral Workers
|
||||||
|
|
||||||
**Design Philosophy:**
|
**Design Philosophy:**
|
||||||
- No always-on workers (cost optimization)
|
- No always-on workers (cost optimization)
|
||||||
- Supervisor instance dynamically creates/destroys workers on-demand
|
- Supervisor pulls latest code from git (no artifact builds)
|
||||||
- Language-agnostic artifacts enable future migration to C/Rust/Go
|
- Supervisor dynamically creates/destroys workers on-demand
|
||||||
|
- Simple, inspectable, easy to test locally
|
||||||
- Multi-cloud abstraction for pricing optimization
|
- Multi-cloud abstraction for pricing optimization
|
||||||
|
|
||||||
**Components:**
|
**Components:**
|
||||||
|
|
||||||
#### 1. Supervisor Instance (Small Hetzner VM)
|
#### 1. Supervisor Instance (Small Hetzner VM)
|
||||||
- Runs `supervisor.sh` - continuous orchestration loop (inspired by TigerBeetle's CFO supervisor)
|
- Runs `supervisor.sh` - continuous orchestration loop (inspired by TigerBeetle's CFO supervisor)
|
||||||
- Hetzner CCX11: 2 vCPU, 4GB RAM (~€4/mo)
|
- Hetzner CPX11: 2 vCPU (shared), 2GB RAM (~€4.49/mo)
|
||||||
- Always-on, minimal resource usage
|
- Always-on, minimal resource usage
|
||||||
- Checks for new CLI versions every hour (self-updating)
|
- Git-based deployment: `git pull` every 15 minutes for auto-updates
|
||||||
- Runs pipelines on schedule:
|
- Runs pipelines on schedule:
|
||||||
- Extract: Daily at 2 AM UTC
|
- Extract: Daily at 2 AM UTC
|
||||||
- Transform: Daily at 3 AM UTC
|
- Transform: Daily at 3 AM UTC
|
||||||
- Uses systemd service for automatic restart on failure
|
- Uses systemd service for automatic restart on failure
|
||||||
- Pulls secrets from Pulumi ESC and passes to workers
|
- Pulls secrets from Pulumi ESC
|
||||||
|
|
||||||
|
**Bootstrap (one-time):**
|
||||||
|
```bash
|
||||||
|
# Get supervisor IP from Pulumi
|
||||||
|
cd infra && pulumi stack output supervisor_ip -s prod
|
||||||
|
|
||||||
|
# Run bootstrap script
|
||||||
|
export PULUMI_ACCESS_TOKEN=<your-token>
|
||||||
|
ssh root@<supervisor-ip> 'bash -s' < infra/bootstrap_supervisor.sh
|
||||||
|
```
|
||||||
|
|
||||||
#### 2. Ephemeral Workers (On-Demand)
|
#### 2. Ephemeral Workers (On-Demand)
|
||||||
- Created for each pipeline execution
|
- Created for each pipeline execution by materia CLI
|
||||||
- Downloads pre-built artifacts from R2 (no git, no uv on worker)
|
|
||||||
- Receives secrets via SSH environment variable injection
|
- Receives secrets via SSH environment variable injection
|
||||||
- Destroyed immediately after job completion
|
- Destroyed immediately after job completion
|
||||||
- Different instance types per pipeline:
|
- Different instance types per pipeline:
|
||||||
@@ -239,18 +237,20 @@ Each artifact is a self-contained tarball with all dependencies.
|
|||||||
```
|
```
|
||||||
Pulumi ESC (beanflows/prod)
|
Pulumi ESC (beanflows/prod)
|
||||||
↓
|
↓
|
||||||
Supervisor Instance (materia CLI)
|
Supervisor Instance (via esc CLI)
|
||||||
↓
|
↓
|
||||||
Workers (injected as env vars via SSH)
|
Workers (injected as env vars via SSH)
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 4. Artifact Flow
|
#### 4. Code Deployment Flow
|
||||||
```
|
```
|
||||||
GitLab CI: uv build → tar.gz
|
GitLab (master branch)
|
||||||
↓
|
↓
|
||||||
Cloudflare R2 (artifact storage)
|
Supervisor: git pull origin master (every 15 min)
|
||||||
↓
|
↓
|
||||||
Worker: curl → extract → execute
|
Supervisor: uv sync (update dependencies)
|
||||||
|
↓
|
||||||
|
Supervisor: uv run materia pipeline run <pipeline>
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 5. Data Storage
|
#### 5. Data Storage
|
||||||
@@ -261,12 +261,12 @@ Worker: curl → extract → execute
|
|||||||
|
|
||||||
**Execution Flow:**
|
**Execution Flow:**
|
||||||
1. Supervisor loop wakes up every 15 minutes
|
1. Supervisor loop wakes up every 15 minutes
|
||||||
2. Checks if current time matches pipeline schedule (e.g., 2 AM for extract)
|
2. Runs `git fetch` and checks for new commits on master
|
||||||
3. Checks for CLI updates (hourly) and self-updates if needed
|
3. If updates are available: `git pull && uv sync`
|
||||||
4. CLI runs: `materia pipeline run extract`
|
4. Checks if current time matches pipeline schedule (e.g., 2 AM for extract)
|
||||||
5. Creates Hetzner worker with SSH key
|
5. If scheduled: `uv run materia pipeline run extract`
|
||||||
6. Worker downloads `materia-extract-latest.tar.gz` from R2
|
6. CLI creates Hetzner worker with SSH key
|
||||||
7. CLI injects secrets via SSH: `export R2_ACCESS_KEY_ID=... && ./extract_psd`
|
7. CLI injects secrets via SSH and executes pipeline
|
||||||
8. Pipeline executes, writes to R2 Iceberg catalog
|
8. Pipeline executes, writes to R2 Iceberg catalog
|
||||||
9. Worker destroyed (entire lifecycle ~5-10 minutes)
|
9. Worker destroyed (entire lifecycle ~5-10 minutes)
|
||||||
10. Supervisor logs results and continues loop
|
10. Supervisor logs results and continues loop
|
||||||
|
|||||||
@@ -1,7 +0,0 @@
|
|||||||
-----BEGIN OPENSSH PRIVATE KEY-----
|
|
||||||
b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW
|
|
||||||
QyNTUxOQAAACCfGESotAKXA3uc2Mu90jYfpbwqZyRF+VytareVIN3PkgAAAJjG2ri3xtq4
|
|
||||||
twAAAAtzc2gtZWQyNTUxOQAAACCfGESotAKXA3uc2Mu90jYfpbwqZyRF+VytareVIN3Pkg
|
|
||||||
AAAECiPTY1dlijk3nvQcqZckzW2RddBhlqRTp4CMqrqj4oLJ8YRKi0ApcDe5zYy73SNh+l
|
|
||||||
vCpnJEX5XK1qt5Ug3c+SAAAAD2RlZW1hbkBEZWVtYW5QQwECAwQFBg==
|
|
||||||
-----END OPENSSH PRIVATE KEY-----
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJ8YRKi0ApcDe5zYy73SNh+lvCpnJEX5XK1qt5Ug3c+S deeman@DeemanPC
|
|
||||||
@@ -1,5 +1,7 @@
|
|||||||
|
# Production stack configuration
|
||||||
|
# All secrets come from Pulumi ESC environment: beanflows/prod
|
||||||
|
environment:
|
||||||
|
- beanflows/prod
|
||||||
|
|
||||||
config:
|
config:
|
||||||
hcloud:token:
|
materia-infrastructure:hetzner_location: "nbg1" # Nuremberg, Germany
|
||||||
secure: AAABAEdhCpoRPhSknCQDgJWRFUjqwyM7TIz60ICRfcpy2GcYeFH098aX/3/rPCJCuetsRma0Wa145Ff3XXIEgUHFJ4Xr9/fZTZtlAtfMROaEhukWL19k96Fh6m8JihMl
|
|
||||||
materia-infrastructure:ssh_public_key:
|
|
||||||
secure: AAABAERKCdqTMBjaxXE+AzlVlCCxUkF1R7+1kFo7c69gqQt1JQuuvzAL/16f099iMP0Ij97U45VBpKUrMtZfHy68d1w1hyCueMHwhoOsfN7bLpj4R/DdCsupXfs8Vx/bJtBjIvsPKbK7f+DygWM1RA==
|
|
||||||
|
|||||||
130
infra/bootstrap_supervisor.sh
Executable file
130
infra/bootstrap_supervisor.sh
Executable file
@@ -0,0 +1,130 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Bootstrap script for Materia supervisor instance
|
||||||
|
# Run this once on a new supervisor to set it up
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# From CI/CD or locally:
|
||||||
|
# ssh root@<supervisor_ip> 'bash -s' < infra/bootstrap_supervisor.sh
|
||||||
|
#
|
||||||
|
# Or on the supervisor itself:
|
||||||
|
# curl -fsSL <url-to-this-script> | bash
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
echo "=== Materia Supervisor Bootstrap ==="
|
||||||
|
echo "This script will:"
|
||||||
|
echo " 1. Install dependencies (git, uv, esc)"
|
||||||
|
echo " 2. Clone the materia repository"
|
||||||
|
echo " 3. Setup systemd service"
|
||||||
|
echo " 4. Start the supervisor"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Check if we're root
|
||||||
|
if [ "$EUID" -ne 0 ]; then
|
||||||
|
echo "ERROR: This script must be run as root"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
REPO_URL="${REPO_URL:-https://gitlab.com/YOUR_USERNAME/materia.git}" # TODO: Update this!
|
||||||
|
MATERIA_DIR="/opt/materia"
|
||||||
|
REPO_DIR="$MATERIA_DIR/repo"
|
||||||
|
|
||||||
|
echo "--- Installing system dependencies ---"
|
||||||
|
apt-get update
|
||||||
|
apt-get install -y git curl python3-pip
|
||||||
|
|
||||||
|
echo "--- Installing uv ---"
|
||||||
|
if ! command -v uv &> /dev/null; then
|
||||||
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
|
export PATH="$HOME/.cargo/bin:$PATH"
|
||||||
|
echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> /root/.bashrc
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "--- Installing Pulumi ESC ---"
|
||||||
|
if ! command -v esc &> /dev/null; then
|
||||||
|
curl -fsSL https://get.pulumi.com/esc/install.sh | sh
|
||||||
|
export PATH="$HOME/.pulumi/bin:$PATH"
|
||||||
|
echo 'export PATH="$HOME/.pulumi/bin:$PATH"' >> /root/.bashrc
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "--- Setting up Pulumi ESC authentication ---"
|
||||||
|
if [ -z "${PULUMI_ACCESS_TOKEN:-}" ]; then
|
||||||
|
echo "ERROR: PULUMI_ACCESS_TOKEN environment variable not set"
|
||||||
|
echo "Please set it before running this script:"
|
||||||
|
echo " export PULUMI_ACCESS_TOKEN=<your-token>"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
esc login --token "$PULUMI_ACCESS_TOKEN"
|
||||||
|
|
||||||
|
echo "--- Loading secrets from Pulumi ESC ---"
|
||||||
|
eval $(esc env open beanflows/prod --format shell)
|
||||||
|
|
||||||
|
echo "--- Cloning repository ---"
|
||||||
|
mkdir -p "$MATERIA_DIR"
|
||||||
|
if [ -d "$REPO_DIR" ]; then
|
||||||
|
echo "Repository already exists, pulling latest..."
|
||||||
|
cd "$REPO_DIR"
|
||||||
|
git pull origin master
|
||||||
|
else
|
||||||
|
cd "$MATERIA_DIR"
|
||||||
|
git clone "$REPO_URL" repo
|
||||||
|
cd repo
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "--- Installing Python dependencies ---"
|
||||||
|
uv sync
|
||||||
|
|
||||||
|
echo "--- Creating environment file ---"
|
||||||
|
cat > "$MATERIA_DIR/.env" <<EOF
|
||||||
|
# Environment variables for supervisor
|
||||||
|
# Loaded from Pulumi ESC: beanflows/prod
|
||||||
|
PULUMI_ACCESS_TOKEN=${PULUMI_ACCESS_TOKEN}
|
||||||
|
PATH=/root/.cargo/bin:/root/.pulumi/bin:/usr/local/bin:/usr/bin:/bin
|
||||||
|
EOF
|
||||||
|
|
||||||
|
echo "--- Setting up systemd service ---"
|
||||||
|
cat > /etc/systemd/system/materia-supervisor.service <<'EOF'
|
||||||
|
[Unit]
|
||||||
|
Description=Materia Supervisor - Pipeline Orchestration
|
||||||
|
After=network-online.target
|
||||||
|
Wants=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=root
|
||||||
|
WorkingDirectory=/opt/materia/repo
|
||||||
|
ExecStart=/opt/materia/repo/infra/supervisor/supervisor.sh
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
EnvironmentFile=/opt/materia/.env
|
||||||
|
|
||||||
|
# Resource limits
|
||||||
|
LimitNOFILE=65536
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
|
SyslogIdentifier=materia-supervisor
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
EOF
|
||||||
|
|
||||||
|
echo "--- Enabling and starting service ---"
|
||||||
|
systemctl daemon-reload
|
||||||
|
systemctl enable materia-supervisor
|
||||||
|
systemctl start materia-supervisor
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=== Bootstrap complete! ==="
|
||||||
|
echo ""
|
||||||
|
echo "Supervisor is now running. Check status with:"
|
||||||
|
echo " systemctl status materia-supervisor"
|
||||||
|
echo ""
|
||||||
|
echo "View logs with:"
|
||||||
|
echo " journalctl -u materia-supervisor -f"
|
||||||
|
echo ""
|
||||||
|
echo "Repository location: $REPO_DIR"
|
||||||
|
echo "Current commit: $(cd $REPO_DIR && git rev-parse --short HEAD)"
|
||||||
@@ -2,24 +2,22 @@
|
|||||||
# Materia Supervisor - Continuous pipeline orchestration
|
# Materia Supervisor - Continuous pipeline orchestration
|
||||||
# Inspired by TigerBeetle's CFO supervisor pattern
|
# Inspired by TigerBeetle's CFO supervisor pattern
|
||||||
# https://github.com/tigerbeetle/tigerbeetle/blob/main/src/scripts/cfo_supervisor.sh
|
# https://github.com/tigerbeetle/tigerbeetle/blob/main/src/scripts/cfo_supervisor.sh
|
||||||
|
#
|
||||||
|
# Git-based deployment: pulls latest code from master and runs pipelines via uv
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
readonly CHECK_INTERVAL=900 # 15 minutes
|
readonly CHECK_INTERVAL=900 # 15 minutes
|
||||||
readonly CLI_VERSION_CHECK_INTERVAL=3600 # 1 hour
|
readonly MATERIA_REPO="/opt/materia/repo"
|
||||||
readonly MATERIA_DIR="/opt/materia"
|
readonly STATE_DIR="/var/lib/materia"
|
||||||
readonly R2_ARTIFACTS_URL="https://${R2_ENDPOINT}/${R2_ARTIFACTS_BUCKET}"
|
|
||||||
readonly CLI_ARTIFACT="materia-cli-latest.tar.gz"
|
|
||||||
|
|
||||||
# Schedules (cron-style times in UTC)
|
# Schedules (cron-style times in UTC)
|
||||||
readonly EXTRACT_SCHEDULE_HOUR=2 # 02:00 UTC
|
readonly EXTRACT_SCHEDULE_HOUR=2 # 02:00 UTC
|
||||||
readonly TRANSFORM_SCHEDULE_HOUR=3 # 03:00 UTC
|
readonly TRANSFORM_SCHEDULE_HOUR=3 # 03:00 UTC
|
||||||
|
|
||||||
# State tracking
|
# Ensure state directory exists
|
||||||
last_extract_run=""
|
mkdir -p "$STATE_DIR"
|
||||||
last_transform_run=""
|
|
||||||
last_cli_check=0
|
|
||||||
|
|
||||||
log() {
|
log() {
|
||||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
|
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
|
||||||
@@ -29,151 +27,121 @@ log_error() {
|
|||||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $*" >&2
|
echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $*" >&2
|
||||||
}
|
}
|
||||||
|
|
||||||
# Check if CLI needs updating
|
# Update code from git
|
||||||
check_cli_update() {
|
update_code() {
|
||||||
local now
|
log "Checking for code updates..."
|
||||||
now=$(date +%s)
|
cd "$MATERIA_REPO"
|
||||||
|
|
||||||
# Only check once per hour
|
# Fetch latest from master
|
||||||
if (( now - last_cli_check < CLI_VERSION_CHECK_INTERVAL )); then
|
if ! git fetch origin master 2>&1 | grep -v "^From"; then
|
||||||
return 0
|
log_error "Failed to fetch from git"
|
||||||
fi
|
|
||||||
|
|
||||||
last_cli_check=$now
|
|
||||||
|
|
||||||
log "Checking for CLI updates..."
|
|
||||||
|
|
||||||
# Download new version
|
|
||||||
local temp_file="${MATERIA_DIR}/cli-new.tar.gz"
|
|
||||||
if ! curl -fsSL -o "$temp_file" "${R2_ARTIFACTS_URL}/${CLI_ARTIFACT}"; then
|
|
||||||
log_error "Failed to download CLI artifact"
|
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Compare checksums
|
# Check if update available
|
||||||
local old_checksum=""
|
LOCAL=$(git rev-parse HEAD)
|
||||||
local new_checksum
|
REMOTE=$(git rev-parse origin/master)
|
||||||
|
|
||||||
if [ -f "${MATERIA_DIR}/${CLI_ARTIFACT}" ]; then
|
if [ "$LOCAL" != "$REMOTE" ]; then
|
||||||
old_checksum=$(sha256sum "${MATERIA_DIR}/${CLI_ARTIFACT}" | awk '{print $1}')
|
log "New version detected: $LOCAL -> $REMOTE"
|
||||||
fi
|
|
||||||
|
|
||||||
new_checksum=$(sha256sum "$temp_file" | awk '{print $1}')
|
# Pull latest code
|
||||||
|
if git pull origin master; then
|
||||||
|
log "Code updated successfully"
|
||||||
|
|
||||||
if [ "$old_checksum" = "$new_checksum" ]; then
|
# Update dependencies
|
||||||
log "CLI is up to date"
|
log "Updating dependencies with uv sync..."
|
||||||
rm -f "$temp_file"
|
if uv sync; then
|
||||||
|
log "Dependencies updated"
|
||||||
return 0
|
return 0
|
||||||
|
else
|
||||||
|
log_error "Failed to update dependencies"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log_error "Failed to pull code"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log "New CLI version detected, updating..."
|
log "Already up to date at $(git rev-parse --short HEAD)"
|
||||||
|
return 1 # Return 1 to indicate no update (not an error)
|
||||||
|
}
|
||||||
|
|
||||||
# Install new version
|
# Run pipeline using materia CLI via uv
|
||||||
mv "$temp_file" "${MATERIA_DIR}/${CLI_ARTIFACT}"
|
run_pipeline() {
|
||||||
|
local pipeline=$1
|
||||||
|
local date=$(date -u +%Y-%m-%d)
|
||||||
|
local state_file="$STATE_DIR/${pipeline}_last_run"
|
||||||
|
|
||||||
cd "$MATERIA_DIR"
|
log "Running $pipeline pipeline..."
|
||||||
rm -rf cli && mkdir -p cli
|
|
||||||
tar -xzf "$CLI_ARTIFACT" -C cli/
|
|
||||||
|
|
||||||
if pip3 install --force-reinstall cli/*.whl; then
|
cd "$MATERIA_REPO"
|
||||||
log "CLI updated successfully"
|
if uv run materia pipeline run "$pipeline"; then
|
||||||
materia version
|
log "$pipeline completed successfully"
|
||||||
|
echo "$date" > "$state_file"
|
||||||
|
return 0
|
||||||
else
|
else
|
||||||
log_error "Failed to install CLI"
|
log_error "$pipeline failed"
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Check if we should run extract pipeline (daily at specified hour)
|
# Check if pipeline should run today
|
||||||
should_run_extract() {
|
should_run_pipeline() {
|
||||||
local current_hour
|
local pipeline=$1
|
||||||
local current_date
|
local schedule_hour=$2
|
||||||
|
local current_hour=$(date -u +%H)
|
||||||
current_hour=$(date -u +%H)
|
local current_date=$(date -u +%Y-%m-%d)
|
||||||
current_date=$(date -u +%Y-%m-%d)
|
local state_file="$STATE_DIR/${pipeline}_last_run"
|
||||||
|
|
||||||
# Only run at the scheduled hour
|
# Only run at the scheduled hour
|
||||||
if [ "$current_hour" != "$EXTRACT_SCHEDULE_HOUR" ]; then
|
if [ "$current_hour" -ne "$schedule_hour" ]; then
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Only run once per day
|
# Check if already ran today
|
||||||
if [ "$last_extract_run" = "$current_date" ]; then
|
if [ -f "$state_file" ]; then
|
||||||
return 1
|
local last_run=$(cat "$state_file")
|
||||||
|
if [ "$last_run" = "$current_date" ]; then
|
||||||
|
return 1 # Already ran today
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
return 0
|
return 0 # Should run
|
||||||
}
|
|
||||||
|
|
||||||
# Check if we should run transform pipeline (daily at specified hour)
|
|
||||||
should_run_transform() {
|
|
||||||
local current_hour
|
|
||||||
local current_date
|
|
||||||
|
|
||||||
current_hour=$(date -u +%H)
|
|
||||||
current_date=$(date -u +%Y-%m-%d)
|
|
||||||
|
|
||||||
# Only run at the scheduled hour
|
|
||||||
if [ "$current_hour" != "$TRANSFORM_SCHEDULE_HOUR" ]; then
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Only run once per day
|
|
||||||
if [ "$last_transform_run" = "$current_date" ]; then
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
# Run extract pipeline
|
|
||||||
run_extract() {
|
|
||||||
log "Starting extract pipeline..."
|
|
||||||
|
|
||||||
if materia pipeline run extract; then
|
|
||||||
log "Extract pipeline completed successfully"
|
|
||||||
last_extract_run=$(date -u +%Y-%m-%d)
|
|
||||||
else
|
|
||||||
log_error "Extract pipeline failed"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Run transform pipeline
|
|
||||||
run_transform() {
|
|
||||||
log "Starting transform pipeline..."
|
|
||||||
|
|
||||||
if materia pipeline run transform; then
|
|
||||||
log "Transform pipeline completed successfully"
|
|
||||||
last_transform_run=$(date -u +%Y-%m-%d)
|
|
||||||
else
|
|
||||||
log_error "Transform pipeline failed"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Main supervisor loop
|
# Main supervisor loop
|
||||||
main() {
|
main() {
|
||||||
log "Materia supervisor starting..."
|
log "Materia supervisor starting..."
|
||||||
|
log "Repository: $MATERIA_REPO"
|
||||||
log "Extract schedule: daily at ${EXTRACT_SCHEDULE_HOUR}:00 UTC"
|
log "Extract schedule: daily at ${EXTRACT_SCHEDULE_HOUR}:00 UTC"
|
||||||
log "Transform schedule: daily at ${TRANSFORM_SCHEDULE_HOUR}:00 UTC"
|
log "Transform schedule: daily at ${TRANSFORM_SCHEDULE_HOUR}:00 UTC"
|
||||||
log "Check interval: ${CHECK_INTERVAL}s"
|
log "Check interval: ${CHECK_INTERVAL}s"
|
||||||
|
|
||||||
# Initial CLI check
|
# Ensure repo exists
|
||||||
check_cli_update || log_error "Initial CLI check failed, continuing anyway"
|
if [ ! -d "$MATERIA_REPO/.git" ]; then
|
||||||
|
log_error "Repository not found at $MATERIA_REPO"
|
||||||
while true; do
|
log_error "Run bootstrap script first!"
|
||||||
# Check for CLI updates
|
exit 1
|
||||||
check_cli_update || true
|
|
||||||
|
|
||||||
# Check and run extract pipeline
|
|
||||||
if should_run_extract; then
|
|
||||||
run_extract || true
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Check and run transform pipeline
|
# Show initial version
|
||||||
if should_run_transform; then
|
cd "$MATERIA_REPO"
|
||||||
run_transform || true
|
log "Starting at commit: $(git rev-parse --short HEAD)"
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
# Check for code updates every loop
|
||||||
|
update_code || true
|
||||||
|
|
||||||
|
# Check extract schedule
|
||||||
|
if should_run_pipeline "extract" "$EXTRACT_SCHEDULE_HOUR"; then
|
||||||
|
run_pipeline extract || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check transform schedule
|
||||||
|
if should_run_pipeline "transform" "$TRANSFORM_SCHEDULE_HOUR"; then
|
||||||
|
run_pipeline transform || true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
sleep "$CHECK_INTERVAL"
|
sleep "$CHECK_INTERVAL"
|
||||||
|
|||||||
Reference in New Issue
Block a user