Add supervisor deployment with continuous pipeline orchestration

Implements automated supervisor instance deployment that runs scheduled
pipelines using a TigerBeetle-inspired continuous orchestration pattern.

Infrastructure changes:
- Update Pulumi to use existing R2 buckets (beanflows-artifacts, beanflows-data-prod)
- Rename scheduler → supervisor, optimize to CCX11 (€4/mo)
- Remove always-on worker (workers are now ephemeral only)
- Add artifacts bucket resource for CLI/pipeline packages

Supervisor architecture:
- supervisor.sh: Continuous loop checking schedules every 15 minutes
- Self-updating: Checks for new CLI versions hourly
- Fixed schedules: Extract at 2 AM UTC, Transform at 3 AM UTC
- systemd service for automatic restart on failure
- Logs to systemd journal for observability

CI/CD changes:
- deploy:infra now runs on every master push (not just on changes)
- New deploy:supervisor job:
  * Deploys supervisor.sh and systemd service
  * Installs latest materia CLI from R2
  * Configures environment with Pulumi ESC secrets
  * Restarts supervisor service

Future enhancements documented:
- SQLMesh-aware scheduling (check models before running)
- Model tags for worker sizing (heavy/distributed hints)
- Multi-pipeline support, distributed execution
- Cost optimization with multi-cloud spot pricing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Deeman
2025-10-12 22:23:55 +02:00
parent 7e6ff29dea
commit f207fb441d
6 changed files with 648 additions and 79 deletions

View File

@@ -16,31 +16,26 @@ hetzner_location = config.get("hetzner_location") or "nbg1" # Nuremberg datacenter
# Cloudflare R2 Storage + Data Catalog (Iceberg)
# ============================================================
# R2 bucket for raw data (extraction outputs)
raw_bucket = cloudflare.R2Bucket(
"materia-raw",
# R2 bucket for artifacts (CLI + extract/transform packages)
# Note: Import existing bucket with:
# pulumi import cloudflare:index/r2Bucket:R2Bucket materia-artifacts <account_id>/beanflows-artifacts
artifacts_bucket = cloudflare.R2Bucket(
"materia-artifacts",
account_id=cloudflare_account_id,
name="materia-raw",
name="beanflows-artifacts",
location="weur", # Western Europe
)
# R2 bucket for lakehouse (Iceberg tables)
# Note: Import existing bucket with:
# pulumi import cloudflare:index/r2Bucket:R2Bucket materia-lakehouse <account_id>/beanflows-data-prod
lakehouse_bucket = cloudflare.R2Bucket(
"materia-lakehouse",
account_id=cloudflare_account_id,
name="materia-lakehouse",
name="beanflows-data-prod",
location="weur",
)
# TODO: Enable R2 Data Catalog (Iceberg) on lakehouse bucket
# Note: As of Oct 2025, R2 Data Catalog is in public beta
# May need to enable via Cloudflare dashboard or API once SDK supports it
# For now, document manual step in README
# API token for R2 access (needs R2 + Data Catalog permissions)
# Note: Create this manually in Cloudflare dashboard and store in Pulumi config
# pulumi config set --secret cloudflare_r2_token <token>
# ============================================================
# Hetzner Cloud Infrastructure
# ============================================================
@@ -52,57 +47,41 @@ ssh_key = hcloud.SshKey(
public_key=config.require_secret("ssh_public_key"),
)
# Small CCX instance for scheduler/orchestrator
# This runs the cron scheduler + lightweight tasks
scheduler_server = hcloud.Server(
"materia-scheduler",
name="materia-scheduler",
server_type="ccx12", # 2 vCPU, 8GB RAM, ~€6/mo
# Small CCX instance for supervisor (runs materia CLI to orchestrate pipelines)
# This is an always-on instance that creates/destroys ephemeral workers on-demand
supervisor_server = hcloud.Server(
"materia-supervisor",
name="materia-supervisor",
server_type="ccx11", # 2 vCPU, 4GB RAM, ~€4/mo (cheapest option)
image="ubuntu-24.04",
location=hetzner_location,
ssh_keys=[ssh_key.id],
labels={
"role": "scheduler",
"role": "supervisor",
"project": "materia",
},
user_data="""#!/bin/bash
set -e
# Basic server setup
apt-get update
apt-get install -y python3.13 python3-pip git curl
apt-get install -y python3.13 python3-pip curl unzip
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh
# Install Pulumi ESC CLI
curl -fsSL https://get.pulumi.com/esc/install.sh | sh
export PATH="$HOME/.pulumi/bin:$PATH"
echo 'export PATH="$HOME/.pulumi/bin:$PATH"' >> /root/.bashrc
# Create deployment directory
mkdir -p /opt/materia
# Configure environment
echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> /root/.bashrc
echo 'Setup complete. Materia CLI will be deployed via CI/CD.' > /opt/materia/README.txt
""",
)
# Larger CCX instance for heavy SQLMesh workloads
# This gets spun up on-demand for big transformations
worker_server = hcloud.Server(
"materia-worker-01",
name="materia-worker-01",
server_type="ccx22", # 4 vCPU, 16GB RAM, ~€24/mo
image="ubuntu-24.04",
location=hetzner_location,
ssh_keys=[ssh_key.id],
labels={
"role": "worker",
"project": "materia",
},
user_data="""#!/bin/bash
# Basic server setup
apt-get update
apt-get install -y python3.13 python3-pip git curl
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh
# Configure environment
echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> /root/.bashrc
""",
)
# Note: Workers are created on-demand by the materia CLI
# No always-on worker instances in this architecture
# Firewall for servers (restrict to SSH + outbound only)
firewall = hcloud.Firewall(
@@ -132,27 +111,20 @@ firewall = hcloud.Firewall(
],
)
# Apply firewall to all servers
scheduler_firewall = hcloud.FirewallAttachment(
"scheduler-firewall",
# Apply firewall to supervisor
supervisor_firewall = hcloud.FirewallAttachment(
"supervisor-firewall",
firewall_id=firewall.id,
server_ids=[scheduler_server.id],
)
worker_firewall = hcloud.FirewallAttachment(
"worker-firewall",
firewall_id=firewall.id,
server_ids=[worker_server.id],
server_ids=[supervisor_server.id],
)
# ============================================================
# Outputs
# ============================================================
pulumi.export("raw_bucket_name", raw_bucket.name)
pulumi.export("artifacts_bucket_name", artifacts_bucket.name)
pulumi.export("lakehouse_bucket_name", lakehouse_bucket.name)
pulumi.export("scheduler_ip", scheduler_server.ipv4_address)
pulumi.export("worker_ip", worker_server.ipv4_address)
pulumi.export("supervisor_ip", supervisor_server.ipv4_address)
# Export connection info for DuckDB
pulumi.export(