Add supervisor deployment with continuous pipeline orchestration

Implements automated supervisor instance deployment that runs scheduled
pipelines using a TigerBeetle-inspired continuous orchestration pattern.

Infrastructure changes:
- Update Pulumi to use existing R2 buckets (beanflows-artifacts, beanflows-data-prod)
- Rename scheduler → supervisor, optimize to CCX11 (€4/mo)
- Remove always-on worker (workers are now ephemeral only)
- Add artifacts bucket resource for CLI/pipeline packages

Supervisor architecture:
- supervisor.sh: Continuous loop checking schedules every 15 minutes
- Self-updating: Checks for new CLI versions hourly
- Fixed schedules: Extract at 2 AM UTC, Transform at 3 AM UTC
- systemd service for automatic restart on failure
- Logs to systemd journal for observability

CI/CD changes:
- deploy:infra now runs on every master push (not just on changes)
- New deploy:supervisor job:
  * Deploys supervisor.sh and systemd service
  * Installs latest materia CLI from R2
  * Configures environment with Pulumi ESC secrets
  * Restarts supervisor service

Future enhancements documented:
- SQLMesh-aware scheduling (check models before running)
- Model tags for worker sizing (heavy/distributed hints)
- Multi-pipeline support, distributed execution
- Cost optimization with multi-cloud spot pricing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Deeman
2025-10-12 22:23:55 +02:00
parent 7e6ff29dea
commit f207fb441d
6 changed files with 648 additions and 79 deletions

View File

@@ -16,31 +16,26 @@ hetzner_location = config.get("hetzner_location") or "nbg1" # Nuremberg datacenter
# Cloudflare R2 Storage + Data Catalog (Iceberg)
# ============================================================
# R2 bucket for raw data (extraction outputs)
raw_bucket = cloudflare.R2Bucket(
"materia-raw",
# R2 bucket for artifacts (CLI + extract/transform packages)
# Note: Import existing bucket with:
# pulumi import cloudflare:index/r2Bucket:R2Bucket materia-artifacts <account_id>/beanflows-artifacts
artifacts_bucket = cloudflare.R2Bucket(
"materia-artifacts",
account_id=cloudflare_account_id,
name="materia-raw",
name="beanflows-artifacts",
location="weur", # Western Europe
)
# R2 bucket for lakehouse (Iceberg tables)
# Note: Import existing bucket with:
# pulumi import cloudflare:index/r2Bucket:R2Bucket materia-lakehouse <account_id>/beanflows-data-prod
lakehouse_bucket = cloudflare.R2Bucket(
"materia-lakehouse",
account_id=cloudflare_account_id,
name="materia-lakehouse",
name="beanflows-data-prod",
location="weur",
)
# TODO: Enable R2 Data Catalog (Iceberg) on lakehouse bucket
# Note: As of Oct 2025, R2 Data Catalog is in public beta
# May need to enable via Cloudflare dashboard or API once SDK supports it
# For now, document manual step in README
# API token for R2 access (needs R2 + Data Catalog permissions)
# Note: Create this manually in Cloudflare dashboard and store in Pulumi config
# pulumi config set --secret cloudflare_r2_token <token>
# ============================================================
# Hetzner Cloud Infrastructure
# ============================================================
@@ -52,57 +47,41 @@ ssh_key = hcloud.SshKey(
public_key=config.require_secret("ssh_public_key"),
)
# Small CCX instance for scheduler/orchestrator
# This runs the cron scheduler + lightweight tasks
scheduler_server = hcloud.Server(
"materia-scheduler",
name="materia-scheduler",
server_type="ccx12", # 2 vCPU, 8GB RAM, ~€6/mo
# Small CCX instance for supervisor (runs materia CLI to orchestrate pipelines)
# This is an always-on instance that creates/destroys ephemeral workers on-demand
supervisor_server = hcloud.Server(
"materia-supervisor",
name="materia-supervisor",
server_type="ccx11", # 2 vCPU, 4GB RAM, ~€4/mo (cheapest option)
image="ubuntu-24.04",
location=hetzner_location,
ssh_keys=[ssh_key.id],
labels={
"role": "scheduler",
"role": "supervisor",
"project": "materia",
},
user_data="""#!/bin/bash
set -e
# Basic server setup
apt-get update
apt-get install -y python3.13 python3-pip git curl
apt-get install -y python3.13 python3-pip curl unzip
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh
# Install Pulumi ESC CLI
curl -fsSL https://get.pulumi.com/esc/install.sh | sh
export PATH="$HOME/.pulumi/bin:$PATH"
echo 'export PATH="$HOME/.pulumi/bin:$PATH"' >> /root/.bashrc
# Create deployment directory
mkdir -p /opt/materia
# Configure environment
echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> /root/.bashrc
echo 'Setup complete. Materia CLI will be deployed via CI/CD.' > /opt/materia/README.txt
""",
)
# Larger CCX instance for heavy SQLMesh workloads
# This gets spun up on-demand for big transformations
worker_server = hcloud.Server(
"materia-worker-01",
name="materia-worker-01",
server_type="ccx22", # 4 vCPU, 16GB RAM, ~€24/mo
image="ubuntu-24.04",
location=hetzner_location,
ssh_keys=[ssh_key.id],
labels={
"role": "worker",
"project": "materia",
},
user_data="""#!/bin/bash
# Basic server setup
apt-get update
apt-get install -y python3.13 python3-pip git curl
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh
# Configure environment
echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> /root/.bashrc
""",
)
# Note: Workers are created on-demand by the materia CLI
# No always-on worker instances in this architecture
# Firewall for servers (restrict to SSH + outbound only)
firewall = hcloud.Firewall(
@@ -132,27 +111,20 @@ firewall = hcloud.Firewall(
],
)
# Apply firewall to all servers
scheduler_firewall = hcloud.FirewallAttachment(
"scheduler-firewall",
# Apply firewall to supervisor
supervisor_firewall = hcloud.FirewallAttachment(
"supervisor-firewall",
firewall_id=firewall.id,
server_ids=[scheduler_server.id],
)
worker_firewall = hcloud.FirewallAttachment(
"worker-firewall",
firewall_id=firewall.id,
server_ids=[worker_server.id],
server_ids=[supervisor_server.id],
)
# ============================================================
# Outputs
# ============================================================
pulumi.export("raw_bucket_name", raw_bucket.name)
pulumi.export("artifacts_bucket_name", artifacts_bucket.name)
pulumi.export("lakehouse_bucket_name", lakehouse_bucket.name)
pulumi.export("scheduler_ip", scheduler_server.ipv4_address)
pulumi.export("worker_ip", worker_server.ipv4_address)
pulumi.export("supervisor_ip", supervisor_server.ipv4_address)
# Export connection info for DuckDB
pulumi.export(