# Source file: beanflows/infra/__main__.py
# Snapshot metadata: 2025-10-12 14:26:55 +02:00 — 168 lines, 4.9 KiB, Python
"""
BeanFlows.coffee Infrastructure
Cloudflare R2 + Iceberg + Hetzner compute stack
"""
import pulumi
import pulumi_cloudflare as cloudflare
import pulumi_hcloud as hcloud
# ------------------------------------------------------------
# Stack configuration
# ------------------------------------------------------------
config = pulumi.Config()

# Cloudflare account that owns the R2 buckets (required setting).
cloudflare_account_id = config.require("cloudflare_account_id")

# Hetzner datacenter; falls back to Nuremberg when unset or empty.
_location = config.get("hetzner_location")
hetzner_location = _location if _location else "nbg1"
# ============================================================
# Cloudflare R2 Storage + Data Catalog (Iceberg)
# ============================================================
# Landing zone for raw extraction outputs, before any transformation.
raw_bucket = cloudflare.R2Bucket(
    "materia-raw",
    name="materia-raw",
    account_id=cloudflare_account_id,
    location="weur",  # keep data resident in Western Europe
)
# Lakehouse bucket backing the Iceberg tables (R2 Data Catalog).
lakehouse_bucket = cloudflare.R2Bucket(
    "materia-lakehouse",
    name="materia-lakehouse",
    account_id=cloudflare_account_id,
    location="weur",  # co-located with the raw bucket in Western Europe
)
# TODO: Enable R2 Data Catalog (Iceberg) on lakehouse bucket
# Note: As of Oct 2025, R2 Data Catalog is in public beta
# May need to enable via Cloudflare dashboard or API once SDK supports it
# For now, document manual step in README
# API token for R2 access (needs R2 + Data Catalog permissions)
# Note: Create this manually in Cloudflare dashboard and store in Pulumi config
# pulumi config set --secret cloudflare_r2_token <token>
# ============================================================
# Hetzner Cloud Infrastructure
# ============================================================
# Public key registered with Hetzner so every server below is reachable
# over SSH; the key material itself lives encrypted in Pulumi config.
ssh_key = hcloud.SshKey(
    "materia-ssh-key",
    public_key=config.require_secret("ssh_public_key"),
    name="materia-deployment-key",
)
# Shared cloud-init script for all compute nodes — defined once so the
# scheduler and worker provisioning cannot drift apart.
#
# NOTE(review): the original script installed "python3.13", which is not
# in the Ubuntu 24.04 apt archive (24.04 ships Python 3.12), so
# cloud-init would fail; we install the distro python3 and let uv
# provision exact interpreter versions as needed. The original also
# exported only ~/.cargo/bin, but current uv installers place the binary
# in ~/.local/bin — both are added for compatibility with old installers.
_SERVER_USER_DATA = """#!/bin/bash
# Basic server setup
apt-get update
apt-get install -y python3 python3-pip git curl
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh
# Configure environment
echo 'export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"' >> /root/.bashrc
"""

# Small CCX instance for the scheduler/orchestrator.
# Runs the cron scheduler + lightweight tasks.
scheduler_server = hcloud.Server(
    "materia-scheduler",
    name="materia-scheduler",
    server_type="ccx12",  # 2 vCPU, 8GB RAM, ~€6/mo
    image="ubuntu-24.04",
    location=hetzner_location,
    ssh_keys=[ssh_key.id],
    labels={
        "role": "scheduler",
        "project": "materia",
    },
    user_data=_SERVER_USER_DATA,
)

# Larger CCX instance for heavy SQLMesh workloads.
# Spun up on-demand for big transformations.
worker_server = hcloud.Server(
    "materia-worker-01",
    name="materia-worker-01",
    server_type="ccx22",  # 4 vCPU, 16GB RAM, ~€24/mo
    image="ubuntu-24.04",
    location=hetzner_location,
    ssh_keys=[ssh_key.id],
    labels={
        "role": "worker",
        "project": "materia",
    },
    user_data=_SERVER_USER_DATA,
)
# Firewall: inbound SSH only; outbound TCP/UDP/ICMP.
#
# Hetzner Cloud firewalls default-deny any direction that has at least
# one rule, so outbound ICMP must be allowed explicitly — the original
# rules only covered TCP and UDP, which silently breaks ping,
# traceroute, and path-MTU discovery from the servers.
firewall = hcloud.Firewall(
    "materia-firewall",
    name="materia-firewall",
    rules=[
        # Allow SSH from anywhere (consider restricting to your IP)
        hcloud.FirewallRuleArgs(
            direction="in",
            protocol="tcp",
            port="22",
            source_ips=["0.0.0.0/0", "::/0"],
        ),
        # Allow all outbound traffic
        hcloud.FirewallRuleArgs(
            direction="out",
            protocol="tcp",
            port="any",
            destination_ips=["0.0.0.0/0", "::/0"],
        ),
        hcloud.FirewallRuleArgs(
            direction="out",
            protocol="udp",
            port="any",
            destination_ips=["0.0.0.0/0", "::/0"],
        ),
        # ICMP has no port concept, so `port` is omitted entirely.
        hcloud.FirewallRuleArgs(
            direction="out",
            protocol="icmp",
            destination_ips=["0.0.0.0/0", "::/0"],
        ),
    ],
)
# Apply the firewall to all servers.
#
# NOTE(review): the hcloud provider supports only ONE FirewallAttachment
# resource per firewall — each attachment manages the firewall's
# *complete* set of attached servers, so the original two resources (one
# per server) would overwrite each other's attachment list on every
# update. A single resource listing both servers is the supported shape.
firewall_attachment = hcloud.FirewallAttachment(
    "materia-firewall-attachment",
    firewall_id=firewall.id,
    server_ids=[scheduler_server.id, worker_server.id],
)
# ============================================================
# Outputs
# ============================================================
pulumi.export("raw_bucket_name", raw_bucket.name)
pulumi.export("lakehouse_bucket_name", lakehouse_bucket.name)
pulumi.export("scheduler_ip", scheduler_server.ipv4_address)
pulumi.export("worker_ip", worker_server.ipv4_address)


def _duckdb_r2_config(resolved):
    # Build the connection details DuckDB needs for the R2 Data Catalog
    # once the account id and bucket name have resolved.
    account_id, bucket = resolved
    return {
        "account_id": account_id,
        "bucket": bucket,
        "catalog_uri": f"https://catalog.cloudflarestorage.com/{account_id}/r2-data-catalog",
    }


# Export connection info for DuckDB
pulumi.export(
    "duckdb_r2_config",
    pulumi.Output.all(cloudflare_account_id, lakehouse_bucket.name).apply(
        _duckdb_r2_config
    ),
)