Refactor to local-first architecture on Hetzner NVMe
Remove distributed R2/Iceberg/SSH pipeline architecture in favor of
local subprocess execution with NVMe storage. Landing data backed up
to R2 via rclone timer.
- Strip Iceberg catalog, httpfs, boto3, paramiko, prefect, pyarrow
- Pipelines run via subprocess.run() with bounded timeouts
- Extract writes to {LANDING_DIR}/psd/{year}/{month}/{etag}.csv.gzip
- SQLMesh reads LANDING_DIR variable, writes to DUCKDB_PATH
- Delete unused provider stubs (ovh, scaleway, oracle)
- Add rclone systemd timer for R2 backup every 6h
- Update supervisor to run pipelines with env vars
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
9
infra/backup/materia-backup.service
Normal file
@@ -0,0 +1,9 @@
[Unit]
Description=Materia Landing Data Backup to R2
After=network-online.target
Wants=network-online.target

[Service]
Type=oneshot
ExecStart=/usr/bin/rclone sync /data/materia/landing/ r2:materia-raw/landing/ --log-level INFO
TimeoutStartSec=1800
10
infra/backup/materia-backup.timer
Normal file
@@ -0,0 +1,10 @@
[Unit]
Description=Materia Landing Data Backup Timer

[Timer]
OnCalendar=*-*-* 00/6:00:00
RandomizedDelaySec=300
Persistent=true

[Install]
WantedBy=timers.target
14
infra/backup/rclone.conf.example
Normal file
@@ -0,0 +1,14 @@
# Cloudflare R2 remote for landing data backup
# Copy to /root/.config/rclone/rclone.conf and fill in credentials
#
# Get credentials from: Cloudflare Dashboard → R2 → Manage R2 API Tokens
# Or from Pulumi ESC: esc env open beanflows/prod --format shell

[r2]
type = s3
provider = Cloudflare
access_key_id = <R2_ACCESS_KEY_ID>
secret_access_key = <R2_SECRET_ACCESS_KEY>
endpoint = https://<CLOUDFLARE_ACCOUNT_ID>.r2.cloudflarestorage.com
acl = private
no_check_bucket = true
@@ -79,6 +79,9 @@ else
cd "$REPO_DIR"
fi

echo "--- Creating data directories ---"
mkdir -p /data/materia/landing/psd

echo "--- Installing Python dependencies ---"
uv sync

@@ -88,6 +91,8 @@ cat > "$REPO_DIR/.env" <<EOF
# Loaded from Pulumi ESC: beanflows/prod
PULUMI_ACCESS_TOKEN=${PULUMI_ACCESS_TOKEN}
PATH=/root/.cargo/bin:/root/.pulumi/bin:/usr/local/bin:/usr/bin:/bin
LANDING_DIR=/data/materia/landing
DUCKDB_PATH=/data/materia/lakehouse.duckdb
EOF

echo "--- Setting up systemd service ---"
@@ -1,80 +0,0 @@
services:
  postgres:
    image: postgres:14
    environment:
      POSTGRES_USER: prefect
      POSTGRES_PASSWORD: prefect
      POSTGRES_DB: prefect
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U prefect"]
      interval: 5s
      timeout: 5s
      retries: 5

  dragonfly:
    image: 'docker.dragonflydb.io/dragonflydb/dragonfly'
    ulimits:
      memlock: -1
    volumes:
      - dragonflydata:/data
    healthcheck:
      test: ["CMD-SHELL", "redis-cli ping"]
      interval: 5s
      timeout: 5s
      retries: 5

  prefect-server:
    image: prefecthq/prefect:3-latest
    depends_on:
      postgres:
        condition: service_healthy
      dragonfly:
        condition: service_healthy
    environment:
      PREFECT_API_DATABASE_CONNECTION_URL: postgresql+asyncpg://prefect:prefect@postgres:5432/prefect
      PREFECT_SERVER_API_HOST: 0.0.0.0
      PREFECT_UI_API_URL: http://localhost:4200/api
      PREFECT_MESSAGING_BROKER: prefect_redis.messaging
      PREFECT_MESSAGING_CACHE: prefect_redis.messaging
      PREFECT_REDIS_MESSAGING_HOST: dragonfly
      PREFECT_REDIS_MESSAGING_PORT: 6379
      PREFECT_REDIS_MESSAGING_DB: 0
    command: prefect server start --no-services
    ports:
      - "4200:4200"
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request as u; u.urlopen('http://localhost:4200/api/health', timeout=1)"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  prefect-services:
    image: prefecthq/prefect:3-latest
    depends_on:
      prefect-server:
        condition: service_healthy
    environment:
      PREFECT_API_DATABASE_CONNECTION_URL: postgresql+asyncpg://prefect:prefect@postgres:5432/prefect
      PREFECT_MESSAGING_BROKER: prefect_redis.messaging
      PREFECT_MESSAGING_CACHE: prefect_redis.messaging
      PREFECT_REDIS_MESSAGING_HOST: dragonfly
      PREFECT_REDIS_MESSAGING_PORT: 6379
      PREFECT_REDIS_MESSAGING_DB: 0
    command: prefect server services start

  prefect-worker:
    image: prefecthq/prefect:3-latest
    depends_on:
      prefect-server:
        condition: service_healthy
    environment:
      PREFECT_API_URL: http://prefect-server:4200/api
    command: prefect worker start --pool local-pool
    restart: on-failure

volumes:
  postgres_data:
  dragonflydata:
200
infra/readme.md
@@ -1,161 +1,85 @@
|
||||
# Materia Infrastructure
|
||||
|
||||
Pulumi-managed infrastructure for BeanFlows.coffee
|
||||
Single-server local-first setup for BeanFlows.coffee on Hetzner NVMe.
|
||||
|
||||
## Stack Overview
|
||||
## Architecture
|
||||
|
||||
- **Storage:** Cloudflare R2 buckets with Iceberg Data Catalog
|
||||
- **Compute:** Hetzner Cloud CCX dedicated vCPU instances
|
||||
- **Orchestration:** Custom Python scheduler (see `src/orchestrator/`)
|
||||
```
|
||||
Hetzner Server (NVMe)
|
||||
├── /opt/materia/ # Git repo, code, uv environment
|
||||
├── /data/materia/landing/ # Extracted USDA data (year/month subdirs)
|
||||
├── /data/materia/lakehouse.duckdb # SQLMesh output database
|
||||
└── systemd services:
|
||||
├── materia-supervisor # Pulls git, runs extract + transform daily
|
||||
└── materia-backup.timer # Syncs landing/ to R2 every 6 hours
|
||||
```
|
||||
|
||||
## Prerequisites
|
||||
## Data Flow
|
||||
|
||||
1. **Cloudflare Account**
|
||||
- Sign up at https://dash.cloudflare.com
|
||||
- Create API token with R2 + Data Catalog permissions
|
||||
- Get your Account ID from dashboard
|
||||
1. **Extract**: USDA API → `/data/materia/landing/psd/{year}/{month}/{etag}.csv.gzip`
|
||||
2. **Transform**: SQLMesh reads landing CSVs → writes to `/data/materia/lakehouse.duckdb`
|
||||
3. **Backup**: rclone syncs `/data/materia/landing/` → R2 `materia-raw/landing/`
|
||||
4. **Web**: Reads `lakehouse.duckdb` (read-only)
|
||||
|
||||
2. **Hetzner Cloud Account**
|
||||
- Sign up at https://console.hetzner.cloud
|
||||
- Create API token with Read & Write permissions
|
||||
## Setup
|
||||
|
||||
3. **Pulumi Account** (optional, can use local state)
|
||||
- Sign up at https://app.pulumi.com
|
||||
- Or use local state with `pulumi login --local`
|
||||
### Prerequisites
|
||||
|
||||
4. **SSH Key**
|
||||
- Generate if needed: `ssh-keygen -t ed25519 -C "materia-deploy"`
|
||||
- Hetzner server with NVMe storage
|
||||
- Pulumi ESC configured (`beanflows/prod` environment)
|
||||
- `GITLAB_READ_TOKEN` and `PULUMI_ACCESS_TOKEN` set
|
||||
|
||||
## Initial Setup
|
||||
### Bootstrap
|
||||
|
||||
```bash
|
||||
# From local machine or CI:
|
||||
ssh root@<server_ip> 'bash -s' < infra/bootstrap_supervisor.sh
|
||||
```
|
||||
|
||||
This installs dependencies, clones the repo, creates data directories, and starts the supervisor service.
|
||||
|
||||
### R2 Backup
|
||||
|
||||
1. Install rclone: `apt install rclone`
|
||||
2. Copy and configure: `cp infra/backup/rclone.conf.example /root/.config/rclone/rclone.conf`
|
||||
3. Fill in R2 credentials from Pulumi ESC
|
||||
4. Install systemd units:
|
||||
|
||||
```bash
|
||||
cp infra/backup/materia-backup.service /etc/systemd/system/
|
||||
cp infra/backup/materia-backup.timer /etc/systemd/system/
|
||||
systemctl daemon-reload
|
||||
systemctl enable --now materia-backup.timer
|
||||
```
|
||||
|
||||
## Pulumi IaC
|
||||
|
||||
Still manages Cloudflare R2 buckets and can provision Hetzner instances:
|
||||
|
||||
```bash
|
||||
cd infra
|
||||
|
||||
# Login to Pulumi (local or cloud)
|
||||
pulumi login # or: pulumi login --local
|
||||
|
||||
# Initialize the stack
|
||||
pulumi stack init dev
|
||||
|
||||
# Configure secrets
|
||||
pulumi config set --secret cloudflare:apiToken <your-cloudflare-token>
|
||||
pulumi config set cloudflare_account_id <your-account-id>
|
||||
pulumi config set --secret hcloud:token <your-hetzner-token>
|
||||
pulumi config set --secret ssh_public_key "$(cat ~/.ssh/id_ed25519.pub)"
|
||||
|
||||
# Preview changes
|
||||
pulumi preview
|
||||
|
||||
# Deploy infrastructure
|
||||
pulumi login
|
||||
pulumi stack select prod
|
||||
pulumi up
|
||||
```
|
||||
|
||||
## What Gets Provisioned
|
||||
|
||||
### Cloudflare R2 Buckets
|
||||
|
||||
1. **materia-raw** - Raw data from extraction (immutable archives)
|
||||
2. **materia-lakehouse** - Iceberg tables for SQLMesh (ACID transactions)
|
||||
|
||||
### Hetzner Cloud Servers
|
||||
|
||||
1. **materia-scheduler** (CCX12: 2 vCPU, 8GB RAM)
|
||||
- Runs cron scheduler
|
||||
- Lightweight orchestration tasks
|
||||
- Always-on, low cost (~€6/mo)
|
||||
|
||||
2. **materia-worker-01** (CCX22: 4 vCPU, 16GB RAM)
|
||||
- Heavy SQLMesh transformations
|
||||
- Can be stopped when not in use
|
||||
- Scale up to CCX32/CCX42 for larger workloads (~€24-90/mo)
|
||||
|
||||
3. **materia-firewall**
|
||||
- SSH access (port 22)
|
||||
- All outbound traffic allowed
|
||||
- No inbound HTTP/HTTPS (we're not running web services yet)
|
||||
|
||||
## Enabling R2 Data Catalog (Iceberg)
|
||||
|
||||
As of October 2025, R2 Data Catalog is in public beta. Enable it manually:
|
||||
|
||||
1. Go to Cloudflare Dashboard → R2
|
||||
2. Select the `materia-lakehouse` bucket
|
||||
3. Navigate to Settings → Data Catalog
|
||||
4. Click "Enable Data Catalog"
|
||||
|
||||
Once enabled, you can connect DuckDB to the Iceberg REST catalog:
|
||||
|
||||
```python
|
||||
import duckdb
|
||||
|
||||
# Get catalog URI from Pulumi outputs
|
||||
# pulumi stack output duckdb_r2_config
|
||||
|
||||
conn = duckdb.connect()
|
||||
conn.execute("INSTALL iceberg; LOAD iceberg;")
|
||||
conn.execute(f"""
|
||||
ATTACH 'iceberg_rest://catalog.cloudflarestorage.com/<account_id>/r2-data-catalog'
|
||||
AS lakehouse (
|
||||
TYPE ICEBERG_REST,
|
||||
SECRET '<r2_api_token>'
|
||||
);
|
||||
""")
|
||||
```
|
||||
|
||||
## Server Access
|
||||
|
||||
Get server IPs from Pulumi outputs:
|
||||
## Monitoring
|
||||
|
||||
```bash
|
||||
pulumi stack output scheduler_ip
|
||||
pulumi stack output worker_ip
|
||||
# Supervisor status and logs
|
||||
systemctl status materia-supervisor
|
||||
journalctl -u materia-supervisor -f
|
||||
|
||||
# Backup timer status
|
||||
systemctl list-timers materia-backup.timer
|
||||
journalctl -u materia-backup -f
|
||||
```
|
||||
|
||||
SSH into servers:
|
||||
|
||||
```bash
|
||||
ssh root@<scheduler_ip>
|
||||
ssh root@<worker_ip>
|
||||
```
|
||||
|
||||
## Cost Estimates (Monthly)
|
||||
## Cost
|
||||
|
||||
| Resource | Type | Cost |
|
||||
|----------|------|------|
|
||||
| R2 Storage | 10 GB | $0.15 |
|
||||
| R2 Operations | 1M reads | $0.36 |
|
||||
| R2 Egress | Unlimited | $0.00 (zero egress!) |
|
||||
| Scheduler | CCX12 | €6.00 |
|
||||
| Worker (on-demand) | CCX22 | €24.00 |
|
||||
| **Total** | | **~€30/mo (~$33)** |
|
||||
|
||||
Compare to AWS equivalent: ~$300-500/mo with S3 + EC2 + egress fees.
|
||||
|
||||
## Scaling Workers
|
||||
|
||||
To add more worker capacity or different instance sizes:
|
||||
|
||||
1. Edit `infra/__main__.py` to add new server resources
|
||||
2. Update worker config in `src/orchestrator/workers.yaml`
|
||||
3. Run `pulumi up` to provision
|
||||
|
||||
Example worker sizes:
|
||||
- CCX12: 2 vCPU, 8GB RAM (light workloads)
|
||||
- CCX22: 4 vCPU, 16GB RAM (medium workloads)
|
||||
- CCX32: 8 vCPU, 32GB RAM (heavy workloads)
|
||||
- CCX42: 16 vCPU, 64GB RAM (very heavy workloads)
|
||||
|
||||
## Destroying Infrastructure
|
||||
|
||||
```bash
|
||||
cd infra
|
||||
pulumi destroy
|
||||
```
|
||||
|
||||
**Warning:** This will delete all buckets and servers. Backup data first!
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Deploy orchestrator to scheduler server (see `src/orchestrator/README.md`)
|
||||
2. Configure SQLMesh to use R2 lakehouse (see `transform/sqlmesh_materia/config.yaml`)
|
||||
3. Set up CI/CD pipeline to deploy on push (see `.gitlab-ci.yml`)
|
||||
| Hetzner Server | CCX22 (4 vCPU, 16GB) | ~€24/mo |
|
||||
| R2 Storage | Backup (~10 GB) | $0.15/mo |
|
||||
| R2 Egress | Zero | $0.00 |
|
||||
| **Total** | | **~€24/mo (~$26)** |
|
||||
|
||||
@@ -11,6 +11,8 @@ ExecStart=/opt/materia/infra/supervisor/supervisor.sh
Restart=always
RestartSec=10
EnvironmentFile=/opt/materia/.env
Environment=LANDING_DIR=/data/materia/landing
Environment=DUCKDB_PATH=/data/materia/lakehouse.duckdb

# Resource limits
LimitNOFILE=65536
@@ -24,9 +24,14 @@ do
git switch --discard-changes --detach origin/master
uv sync

# Run pipelines (SQLMesh handles scheduling)
#uv run materia pipeline run extract
#uv run materia pipeline run transform
# Run pipelines
LANDING_DIR="${LANDING_DIR:-/data/materia/landing}" \
DUCKDB_PATH="${DUCKDB_PATH:-/data/materia/lakehouse.duckdb}" \
uv run materia pipeline run extract

LANDING_DIR="${LANDING_DIR:-/data/materia/landing}" \
DUCKDB_PATH="${DUCKDB_PATH:-/data/materia/lakehouse.duckdb}" \
uv run materia pipeline run transform

) || sleep 600 # Sleep 10 min on failure to avoid busy-loop retries
done
Reference in New Issue
Block a user