# Source file: beanflows/infra/__main__.py
# Snapshot metadata: 2025-10-12 14:26:55 +02:00 — 168 lines, 4.9 KiB, Python
"""
BeanFlows.coffee Infrastructure
Cloudflare R2 + Iceberg + Hetzner compute stack
"""
import pulumi
import pulumi_cloudflare as cloudflare
import pulumi_hcloud as hcloud
# ------------------------------------------------------------
# Stack configuration
# ------------------------------------------------------------
config = pulumi.Config()

# Cloudflare account that owns the R2 buckets (required setting).
cloudflare_account_id = config.require("cloudflare_account_id")

# Hetzner datacenter; falls back to Nuremberg when unset or empty.
_location = config.get("hetzner_location")
hetzner_location = _location if _location else "nbg1"
# ============================================================
# Cloudflare R2 Storage + Data Catalog (Iceberg)
# ============================================================
# Landing zone for raw extraction outputs, before any transformation.
raw_bucket = cloudflare.R2Bucket(
    "materia-raw",
    name="materia-raw",
    account_id=cloudflare_account_id,
    location="weur",  # keep data resident in Western Europe
)
# Lakehouse bucket backing the Iceberg tables (R2 Data Catalog).
lakehouse_bucket = cloudflare.R2Bucket(
    "materia-lakehouse",
    name="materia-lakehouse",
    account_id=cloudflare_account_id,
    location="weur",  # co-located with the raw bucket in Western Europe
)
# TODO: Enable R2 Data Catalog (Iceberg) on lakehouse bucket
# Note: As of Oct 2025, R2 Data Catalog is in public beta
# May need to enable via Cloudflare dashboard or API once SDK supports it
# For now, document manual step in README
# API token for R2 access (needs R2 + Data Catalog permissions)
# Note: Create this manually in Cloudflare dashboard and store in Pulumi config
# pulumi config set --secret cloudflare_r2_token <token>
# ============================================================
# Hetzner Cloud Infrastructure
# ============================================================
# Public key registered with Hetzner so every server below is reachable
# over SSH; the key material itself lives encrypted in Pulumi config.
ssh_key = hcloud.SshKey(
    "materia-ssh-key",
    public_key=config.require_secret("ssh_public_key"),
    name="materia-deployment-key",
)
# Shared cloud-init script for all compute nodes — defined once so the
# scheduler and worker provisioning cannot drift apart.
#
# NOTE(review): the original script installed "python3.13", which is not
# in the Ubuntu 24.04 apt archive (24.04 ships Python 3.12), so
# cloud-init would fail; we install the distro python3 and let uv
# provision exact interpreter versions as needed. The original also
# exported only ~/.cargo/bin, but current uv installers place the binary
# in ~/.local/bin — both are added for compatibility with old installers.
_SERVER_USER_DATA = """#!/bin/bash
# Basic server setup
apt-get update
apt-get install -y python3 python3-pip git curl
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh
# Configure environment
echo 'export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"' >> /root/.bashrc
"""

# Small CCX instance for the scheduler/orchestrator.
# Runs the cron scheduler + lightweight tasks.
scheduler_server = hcloud.Server(
    "materia-scheduler",
    name="materia-scheduler",
    server_type="ccx12",  # 2 vCPU, 8GB RAM, ~€6/mo
    image="ubuntu-24.04",
    location=hetzner_location,
    ssh_keys=[ssh_key.id],
    labels={
        "role": "scheduler",
        "project": "materia",
    },
    user_data=_SERVER_USER_DATA,
)

# Larger CCX instance for heavy SQLMesh workloads.
# Spun up on-demand for big transformations.
worker_server = hcloud.Server(
    "materia-worker-01",
    name="materia-worker-01",
    server_type="ccx22",  # 4 vCPU, 16GB RAM, ~€24/mo
    image="ubuntu-24.04",
    location=hetzner_location,
    ssh_keys=[ssh_key.id],
    labels={
        "role": "worker",
        "project": "materia",
    },
    user_data=_SERVER_USER_DATA,
)
# Firewall: inbound SSH only; outbound TCP/UDP/ICMP.
#
# Hetzner Cloud firewalls default-deny any direction that has at least
# one rule, so outbound ICMP must be allowed explicitly — the original
# rules only covered TCP and UDP, which silently breaks ping,
# traceroute, and path-MTU discovery from the servers.
firewall = hcloud.Firewall(
    "materia-firewall",
    name="materia-firewall",
    rules=[
        # Allow SSH from anywhere (consider restricting to your IP)
        hcloud.FirewallRuleArgs(
            direction="in",
            protocol="tcp",
            port="22",
            source_ips=["0.0.0.0/0", "::/0"],
        ),
        # Allow all outbound traffic
        hcloud.FirewallRuleArgs(
            direction="out",
            protocol="tcp",
            port="any",
            destination_ips=["0.0.0.0/0", "::/0"],
        ),
        hcloud.FirewallRuleArgs(
            direction="out",
            protocol="udp",
            port="any",
            destination_ips=["0.0.0.0/0", "::/0"],
        ),
        # ICMP has no port concept, so `port` is omitted entirely.
        hcloud.FirewallRuleArgs(
            direction="out",
            protocol="icmp",
            destination_ips=["0.0.0.0/0", "::/0"],
        ),
    ],
)
# Apply the firewall to all servers.
#
# NOTE(review): the hcloud provider supports only ONE FirewallAttachment
# resource per firewall — each attachment manages the firewall's
# *complete* set of attached servers, so the original two resources (one
# per server) would overwrite each other's attachment list on every
# update. A single resource listing both servers is the supported shape.
firewall_attachment = hcloud.FirewallAttachment(
    "materia-firewall-attachment",
    firewall_id=firewall.id,
    server_ids=[scheduler_server.id, worker_server.id],
)
# ============================================================
# Outputs
# ============================================================
pulumi.export("raw_bucket_name", raw_bucket.name)
pulumi.export("lakehouse_bucket_name", lakehouse_bucket.name)
pulumi.export("scheduler_ip", scheduler_server.ipv4_address)
pulumi.export("worker_ip", worker_server.ipv4_address)


def _duckdb_r2_config(resolved):
    # Build the connection details DuckDB needs for the R2 Data Catalog
    # once the account id and bucket name have resolved.
    account_id, bucket = resolved
    return {
        "account_id": account_id,
        "bucket": bucket,
        "catalog_uri": f"https://catalog.cloudflarestorage.com/{account_id}/r2-data-catalog",
    }


# Export connection info for DuckDB
pulumi.export(
    "duckdb_r2_config",
    pulumi.Output.all(cloudflare_account_id, lakehouse_bucket.name).apply(
        _duckdb_r2_config
    ),
)