From 790e802edd4fa4bd89027b4d980bef0d59f95c19 Mon Sep 17 00:00:00 2001 From: Deeman Date: Sun, 12 Oct 2025 14:26:55 +0200 Subject: [PATCH] updates --- VISION.md | 261 ++++++++++++++++++++++++++++++++ infra/Pulumi.dev.yaml | 15 ++ infra/Pulumi.yaml | 6 + infra/README.md | 161 ++++++++++++++++++++ infra/__main__.py | 167 ++++++++++++++++++++ src/orchestrator/pipelines.yaml | 98 ++++++++++++ 6 files changed, 708 insertions(+) create mode 100644 VISION.md create mode 100644 infra/Pulumi.dev.yaml create mode 100644 infra/Pulumi.yaml create mode 100644 infra/README.md create mode 100644 infra/__main__.py create mode 100644 src/orchestrator/pipelines.yaml diff --git a/VISION.md b/VISION.md new file mode 100644 index 0000000..638cab7 --- /dev/null +++ b/VISION.md @@ -0,0 +1,261 @@ +# VISION.md + +## Mission + +Build the fastest, most accurate, and most affordable commodity analytics platform for independent traders and small firms—without enterprise sales bullshit. + +## Product: BeanFlows.coffee + +**Tagline:** Real-time commodity intelligence for traders who think for themselves. + +**Beachhead Market:** Coffee commodities +**Long-term Vision:** Expand to all major commodity markets (~35-40 global contracts) + +## Why We Exist + +Platforms like Kpler dominate the commodity analytics space but are: +- Slow and complex +- Prohibitively expensive +- Designed for enterprise buyers with bloated sales processes +- Built on legacy infrastructure that prioritizes features over performance + +We're building the anti-Kpler: **better, faster, cheaper**. + +## Who We Are + +A two-person indie hacker startup: +- **Data Engineer:** Building the platform +- **Commodity Trader:** Domain expertise and product direction + +We move fast, ship incrementally, and prioritize value over vanity metrics. + +## Technical Philosophy + +### Core Principles + +1. **Simplicity over complexity** + - Minimal dependencies + - Clear, readable code + - Avoid premature abstraction + +2. 
**Performance over features** + - DuckDB over Spark + - Hetzner/Cloudflare over AWS + - SQL/Python/C over heavyweight frameworks + +3. **Accuracy over speed-to-market** + - Data quality is non-negotiable + - Rigorous validation at every layer + - Build trust through reliability + +4. **Build over buy** + - We're not afraid to write code from scratch + - Third-party tools must earn their place + - Control our destiny, minimize vendor lock-in + +### Technology Stack + +**Languages:** +- SQL (primary transformation language) +- Python (orchestration, extraction, APIs) +- C (performance-critical extensions) + +**Infrastructure:** +- **Storage:** Cloudflare R2 (not S3) +- **Compute:** Hetzner bare metal (not AWS/GCP) +- **Database:** DuckDB (not Spark/Snowflake) +- **Orchestration:** SQLMesh + custom Python (not Airflow) + +**Development:** +- **Monorepo:** uv workspace +- **Package Manager:** uv (not pip/poetry) +- **Version Control:** Git (GitLab) +- **CI/CD:** GitLab CI + +### Architectural Philosophy + +**Data-Oriented Design:** +- No OOP spaghetti +- Data flows are explicit and traceable +- Functions transform data, not objects with hidden state + +**Layered Architecture:** +- Raw → Staging → Cleaned → Serving +- Each layer has a single, clear purpose +- Immutable raw data, reproducible transformations + +**Incremental Everything:** +- Models update incrementally by time ranges +- Avoid full table scans +- Pay only for what changed + +## Current State (October 2025) + +### What's Working +- USDA PSD Online extraction (2006-present, monthly archives) +- 4-layer SQLMesh pipeline (raw → staging → cleaned → serving) +- DuckDB backend with 13GB dev database +- Incremental-by-time-range models with deduplication +- Development environment with pre-commit hooks, linting, formatting + +### What We Have +- Comprehensive commodity supply/demand data (USDA PSD) +- Established naming conventions and data quality patterns +- GitLab CI pipeline (lint, test, build) +- 
Documentation (CLAUDE.md, layer conventions) + +## Roadmap + +### Phase 1: Coffee Market Foundation (Current) +**Goal:** Build complete coffee analytics from supply to price + +**Data Sources to Integrate:** +- ✅ USDA PSD Online (production, stocks, consumption) +- ⬜ ICO (International Coffee Organization) data +- ⬜ Yahoo Finance / Alpha Vantage (coffee futures prices - KC=F) +- ⬜ Weather data for coffee-growing regions (OpenWeatherMap, NOAA) +- ⬜ CFTC COT data (trader positioning) +- ⬜ ICE warehouse stocks (web scraping) + +**Features to Build:** +- ⬜ Historical price correlation analysis +- ⬜ Supply/demand balance modeling +- ⬜ Weather impact scoring +- ⬜ Trader sentiment indicators (COT) +- ⬜ Simple web dashboard (read-only analytics) +- ⬜ Data export APIs (JSON, CSV, Parquet) + +**Infrastructure:** +- ⬜ Move to Cloudflare R2 for raw data storage +- ⬜ Deploy SQLMesh to Hetzner production environment +- ⬜ Set up automated daily extraction + transformation pipeline +- ⬜ Implement monitoring and alerting + +### Phase 2: Product Market Fit +**Goal:** Validate with real traders, iterate on feedback + +- ⬜ Beta access for small group of coffee traders +- ⬜ Usage analytics (what queries matter?) +- ⬜ Performance optimization based on real workloads +- ⬜ Pricing model experimentation ($X/month, pay-as-you-go?) + +### Phase 3: Expand Commodity Coverage +**Goal:** Prove architecture scales across commodities + +**Priority Markets:** +1. Other softs (cocoa, sugar, cotton, OJ) +2. Grains (corn, wheat, soybeans) +3. Energy (crude oil, natural gas) +4. 
Metals (gold, silver, copper) + +**Reusable Patterns:** +- Abstract extraction logic (API connectors, scrapers) +- Standardized staging layer for price/volume data +- Common serving models (time series, correlations, anomalies) + +### Phase 4: Advanced Analytics +**Goal:** Differentiation through unique insights + +- ⬜ Satellite imagery integration (NASA, Planet) for crop monitoring +- ⬜ Custom yield forecasting models +- ⬜ Real-time alert system (price thresholds, supply shocks) +- ⬜ Historical backtesting framework for trading strategies +- ⬜ Sentiment analysis from news/reports (USDA GAIN, FAO) + +### Phase 5: Scale & Polish +**Goal:** Handle growth, maintain performance advantage + +- ⬜ Multi-region deployment (low latency globally) +- ⬜ Advanced caching strategies +- ⬜ Self-service onboarding (no sales calls) +- ⬜ Public documentation and API reference +- ⬜ Community/forum for traders + +## Key Decisions & Trade-offs + +### Why DuckDB over Spark? +- **Speed:** In-process OLAP is faster for our workloads +- **Simplicity:** No cluster management, no JVM +- **Cost:** Runs on a single beefy server, not 100 nodes +- **Developer experience:** SQL-first, Python-friendly + +### Why SQLMesh over dbt/Airflow? +- **Unified:** Orchestration + transformation in one tool +- **Performance:** Built for incremental execution +- **Virtual environments:** Test changes without breaking prod +- **Python-native:** Extend with custom macros + +### Why Cloudflare R2 over S3? +- **Cost:** No egress fees (huge for data-heavy platform) +- **Performance:** Global edge network +- **Simplicity:** S3-compatible API, easy migration path + +### Why Hetzner over AWS? +- **Cost:** 10x cheaper for equivalent compute +- **Performance:** Bare metal = no noisy neighbors +- **Simplicity:** Less surface area, fewer services to manage + +### Why Monorepo? 
+- **Atomic changes:** Update extraction + transformation together +- **Shared code:** Reusable utilities across packages +- **Simplified CI:** One pipeline, consistent tooling + +## Anti-Goals + +Things we explicitly do NOT want: + +- ❌ Enterprise sales team +- ❌ Complex onboarding processes +- ❌ Vendor lock-in (AWS, Snowflake, etc.) +- ❌ OOP frameworks (Django ORM, SQLAlchemy magic) +- ❌ Microservices (until we need them, which is not now) +- ❌ Kubernetes (overkill for our scale) +- ❌ Feature bloat (every feature has a performance cost) + +## Success Metrics + +**Phase 1 (Foundation):** +- All coffee data sources integrated +- Daily pipeline runs reliably (<5% failure rate) +- Query latency <500ms for common analytics + +**Phase 2 (PMF):** +- 10+ paying beta users +- 90%+ data accuracy (validated against spot checks) +- Monthly churn <10% + +**Phase 3 (Expansion):** +- 5+ commodity markets covered +- 100+ active users +- Break-even on infrastructure costs + +**Long-term (Scale):** +- Cover all ~35-40 major commodity contracts +- 1000+ traders using the platform +- Recognized as the go-to alternative to Kpler for indie traders + +## Guiding Questions + +When making decisions, ask: + +1. **Does this make us faster?** (Performance) +2. **Does this make us more accurate?** (Data quality) +3. **Does this make us simpler?** (Maintainability) +4. **Does this help traders make better decisions?** (Value) +5. **Can we afford to run this at scale?** (Unit economics) + +If the answer to any of these is "no," reconsider. + +## Current Priorities (Q4 2025) + +1. Integrate coffee futures price data (Yahoo Finance) +2. Build time-series serving models for price/supply correlation +3. Deploy production pipeline to Hetzner +4. Set up Cloudflare R2 for raw data storage +5. Create simple read-only dashboard for coffee analytics +6. 
Document API for beta testers + +--- + +**Last Updated:** October 2025 +**Next Review:** End of Q4 2025 (adjust based on Phase 1 progress) diff --git a/infra/Pulumi.dev.yaml b/infra/Pulumi.dev.yaml new file mode 100644 index 0000000..f35b984 --- /dev/null +++ b/infra/Pulumi.dev.yaml @@ -0,0 +1,15 @@ +# Development stack configuration +# Set actual values with: pulumi config set +# Set secrets with: pulumi config set --secret + +config: + # Cloudflare configuration + cloudflare:apiToken: # Set with: pulumi config set --secret cloudflare:apiToken + materia-infrastructure:cloudflare_account_id: # Set with: pulumi config set cloudflare_account_id + + # Hetzner configuration + hcloud:token: # Set with: pulumi config set --secret hcloud:token + materia-infrastructure:hetzner_location: "nbg1" # Nuremberg, Germany + + # SSH key for server access + materia-infrastructure:ssh_public_key: # Set with: pulumi config set --secret ssh_public_key "$(cat ~/.ssh/id_rsa.pub)" diff --git a/infra/Pulumi.yaml b/infra/Pulumi.yaml new file mode 100644 index 0000000..1f7ec04 --- /dev/null +++ b/infra/Pulumi.yaml @@ -0,0 +1,6 @@ +name: materia-infrastructure +runtime: + name: python + options: + virtualenv: ../.venv +description: BeanFlows.coffee infrastructure on Cloudflare R2 + Hetzner diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 0000000..bc74e80 --- /dev/null +++ b/infra/README.md @@ -0,0 +1,161 @@ +# Materia Infrastructure + +Pulumi-managed infrastructure for BeanFlows.coffee + +## Stack Overview + +- **Storage:** Cloudflare R2 buckets with Iceberg Data Catalog +- **Compute:** Hetzner Cloud CCX dedicated vCPU instances +- **Orchestration:** Custom Python scheduler (see `src/orchestrator/`) + +## Prerequisites + +1. **Cloudflare Account** + - Sign up at https://dash.cloudflare.com + - Create API token with R2 + Data Catalog permissions + - Get your Account ID from dashboard + +2. 
**Hetzner Cloud Account** + - Sign up at https://console.hetzner.cloud + - Create API token with Read & Write permissions + +3. **Pulumi Account** (optional, can use local state) + - Sign up at https://app.pulumi.com + - Or use local state with `pulumi login --local` + +4. **SSH Key** + - Generate if needed: `ssh-keygen -t ed25519 -C "materia-deploy"` + +## Initial Setup + +```bash +cd infra + +# Login to Pulumi (local or cloud) +pulumi login # or: pulumi login --local + +# Initialize the stack +pulumi stack init dev + +# Configure secrets +pulumi config set --secret cloudflare:apiToken +pulumi config set cloudflare_account_id +pulumi config set --secret hcloud:token +pulumi config set --secret ssh_public_key "$(cat ~/.ssh/id_ed25519.pub)" + +# Preview changes +pulumi preview + +# Deploy infrastructure +pulumi up +``` + +## What Gets Provisioned + +### Cloudflare R2 Buckets + +1. **materia-raw** - Raw data from extraction (immutable archives) +2. **materia-lakehouse** - Iceberg tables for SQLMesh (ACID transactions) + +### Hetzner Cloud Servers + +1. **materia-scheduler** (CCX12: 2 vCPU, 8GB RAM) + - Runs cron scheduler + - Lightweight orchestration tasks + - Always-on, low cost (~€6/mo) + +2. **materia-worker-01** (CCX22: 4 vCPU, 16GB RAM) + - Heavy SQLMesh transformations + - Can be stopped when not in use + - Scale up to CCX32/CCX42 for larger workloads (~€24-90/mo) + +3. **materia-firewall** + - SSH access (port 22) + - All outbound traffic allowed + - No inbound HTTP/HTTPS (we're not running web services yet) + +## Enabling R2 Data Catalog (Iceberg) + +As of October 2025, R2 Data Catalog is in public beta. Enable it manually: + +1. Go to Cloudflare Dashboard → R2 +2. Select the `materia-lakehouse` bucket +3. Navigate to Settings → Data Catalog +4. 
Click "Enable Data Catalog" + +Once enabled, you can connect DuckDB to the Iceberg REST catalog: + +```python +import duckdb + +# Get catalog URI from Pulumi outputs +# pulumi stack output duckdb_r2_config + +conn = duckdb.connect() +conn.execute("INSTALL iceberg; LOAD iceberg;") +conn.execute(f""" + ATTACH 'iceberg_rest://catalog.cloudflarestorage.com/<account_id>/r2-data-catalog' + AS lakehouse ( + TYPE ICEBERG_REST, + SECRET '<r2_api_token>' + ); +""") +``` + +## Server Access + +Get server IPs from Pulumi outputs: + +```bash +pulumi stack output scheduler_ip +pulumi stack output worker_ip +``` + +SSH into servers: + +```bash +ssh root@<scheduler_ip> +ssh root@<worker_ip> +``` + +## Cost Estimates (Monthly) + +| Resource | Type | Cost | +|----------|------|------| +| R2 Storage | 10 GB | $0.15 | +| R2 Operations | 1M reads | $0.36 | +| R2 Egress | Unlimited | $0.00 (zero egress!) | +| Scheduler | CCX12 | €6.00 | +| Worker (on-demand) | CCX22 | €24.00 | +| **Total** | | **~€30/mo (~$33)** | + +Compare to AWS equivalent: ~$300-500/mo with S3 + EC2 + egress fees. + +## Scaling Workers + +To add more worker capacity or different instance sizes: + +1. Edit `infra/__main__.py` to add new server resources +2. Update worker config in `src/orchestrator/workers.yaml` +3. Run `pulumi up` to provision + +Example worker sizes: +- CCX12: 2 vCPU, 8GB RAM (light workloads) +- CCX22: 4 vCPU, 16GB RAM (medium workloads) +- CCX32: 8 vCPU, 32GB RAM (heavy workloads) +- CCX42: 16 vCPU, 64GB RAM (very heavy workloads) + +## Destroying Infrastructure + +```bash +cd infra +pulumi destroy +``` + +**Warning:** This will delete all buckets and servers. Backup data first! + +## Next Steps + +1. Deploy orchestrator to scheduler server (see `src/orchestrator/README.md`) +2. Configure SQLMesh to use R2 lakehouse (see `transform/sqlmesh_materia/config.yaml`) +3. 
Set up CI/CD pipeline to deploy on push (see `.gitlab-ci.yml`) diff --git a/infra/__main__.py b/infra/__main__.py new file mode 100644 index 0000000..85e6054 --- /dev/null +++ b/infra/__main__.py @@ -0,0 +1,167 @@ +""" +BeanFlows.coffee Infrastructure +Cloudflare R2 + Iceberg + Hetzner compute stack +""" + +import pulumi +import pulumi_cloudflare as cloudflare +import pulumi_hcloud as hcloud + +# Load configuration +config = pulumi.Config() +cloudflare_account_id = config.require("cloudflare_account_id") +hetzner_location = config.get("hetzner_location") or "nbg1" # Nuremberg datacenter + +# ============================================================ +# Cloudflare R2 Storage + Data Catalog (Iceberg) +# ============================================================ + +# R2 bucket for raw data (extraction outputs) +raw_bucket = cloudflare.R2Bucket( + "materia-raw", + account_id=cloudflare_account_id, + name="materia-raw", + location="weur", # Western Europe +) + +# R2 bucket for lakehouse (Iceberg tables) +lakehouse_bucket = cloudflare.R2Bucket( + "materia-lakehouse", + account_id=cloudflare_account_id, + name="materia-lakehouse", + location="weur", +) + +# TODO: Enable R2 Data Catalog (Iceberg) on lakehouse bucket +# Note: As of Oct 2025, R2 Data Catalog is in public beta +# May need to enable via Cloudflare dashboard or API once SDK supports it +# For now, document manual step in README + +# API token for R2 access (needs R2 + Data Catalog permissions) +# Note: Create this manually in Cloudflare dashboard and store in Pulumi config +# pulumi config set --secret cloudflare_r2_token + +# ============================================================ +# Hetzner Cloud Infrastructure +# ============================================================ + +# SSH key for server access +ssh_key = hcloud.SshKey( + "materia-ssh-key", + name="materia-deployment-key", + public_key=config.require_secret("ssh_public_key"), +) + +# Small CCX instance for scheduler/orchestrator +# This 
runs the cron scheduler + lightweight tasks +scheduler_server = hcloud.Server( + "materia-scheduler", + name="materia-scheduler", + server_type="ccx12", # 2 vCPU, 8GB RAM, ~€6/mo + image="ubuntu-24.04", + location=hetzner_location, + ssh_keys=[ssh_key.id], + labels={ + "role": "scheduler", + "project": "materia", + }, + user_data="""#!/bin/bash +# Basic server setup +apt-get update +apt-get install -y python3.13 python3-pip git curl + +# Install uv +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Configure environment +echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> /root/.bashrc +""", +) + +# Larger CCX instance for heavy SQLMesh workloads +# This gets spun up on-demand for big transformations +worker_server = hcloud.Server( + "materia-worker-01", + name="materia-worker-01", + server_type="ccx22", # 4 vCPU, 16GB RAM, ~€24/mo + image="ubuntu-24.04", + location=hetzner_location, + ssh_keys=[ssh_key.id], + labels={ + "role": "worker", + "project": "materia", + }, + user_data="""#!/bin/bash +# Basic server setup +apt-get update +apt-get install -y python3.13 python3-pip git curl + +# Install uv +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Configure environment +echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> /root/.bashrc +""", +) + +# Firewall for servers (restrict to SSH + outbound only) +firewall = hcloud.Firewall( + "materia-firewall", + name="materia-firewall", + rules=[ + # Allow SSH from anywhere (consider restricting to your IP) + hcloud.FirewallRuleArgs( + direction="in", + protocol="tcp", + port="22", + source_ips=["0.0.0.0/0", "::/0"], + ), + # Allow all outbound traffic + hcloud.FirewallRuleArgs( + direction="out", + protocol="tcp", + port="any", + destination_ips=["0.0.0.0/0", "::/0"], + ), + hcloud.FirewallRuleArgs( + direction="out", + protocol="udp", + port="any", + destination_ips=["0.0.0.0/0", "::/0"], + ), + ], +) + +# Apply firewall to all servers +scheduler_firewall = hcloud.FirewallAttachment( + "scheduler-firewall", + 
firewall_id=firewall.id, + server_ids=[scheduler_server.id], +) + +worker_firewall = hcloud.FirewallAttachment( + "worker-firewall", + firewall_id=firewall.id, + server_ids=[worker_server.id], +) + +# ============================================================ +# Outputs +# ============================================================ + +pulumi.export("raw_bucket_name", raw_bucket.name) +pulumi.export("lakehouse_bucket_name", lakehouse_bucket.name) +pulumi.export("scheduler_ip", scheduler_server.ipv4_address) +pulumi.export("worker_ip", worker_server.ipv4_address) + +# Export connection info for DuckDB +pulumi.export( + "duckdb_r2_config", + pulumi.Output.all(cloudflare_account_id, lakehouse_bucket.name).apply( + lambda args: { + "account_id": args[0], + "bucket": args[1], + "catalog_uri": f"https://catalog.cloudflarestorage.com/{args[0]}/r2-data-catalog", + } + ), +) diff --git a/src/orchestrator/pipelines.yaml b/src/orchestrator/pipelines.yaml new file mode 100644 index 0000000..0948e07 --- /dev/null +++ b/src/orchestrator/pipelines.yaml @@ -0,0 +1,98 @@ +# Pipeline Configuration +# Defines SQLMesh pipelines, schedules, and worker requirements + +pipelines: + # Daily extraction of USDA PSD data + - name: extract_psd + type: extraction + schedule: "0 2 * * *" # 2 AM UTC daily + command: "extract_psd" + worker: + instance_type: scheduler # Runs on lightweight scheduler instance + timeout_minutes: 30 + on_success: + - trigger: transform_psd_staging + + # Transform raw PSD data to staging layer + - name: transform_psd_staging + type: transformation + schedule: "0 3 * * *" # 3 AM UTC daily (or triggered after extraction) + command: "cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:staging" + worker: + instance_type: worker # Needs more resources for DuckDB + min_memory_gb: 8 + timeout_minutes: 60 + on_success: + - trigger: transform_psd_cleaned + + # Transform staging to cleaned layer + - name: transform_psd_cleaned + type: transformation + schedule: "0 
4 * * *" # 4 AM UTC daily + command: "cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:cleaned" + worker: + instance_type: worker + min_memory_gb: 16 # Larger transformations + timeout_minutes: 120 + on_success: + - trigger: transform_psd_serving + + # Transform cleaned to serving layer + - name: transform_psd_serving + type: transformation + schedule: "0 5 * * *" # 5 AM UTC daily + command: "cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:serving" + worker: + instance_type: worker + min_memory_gb: 8 + timeout_minutes: 60 + on_success: + - notify: slack # TODO: Add Slack webhook + + # Full refresh pipeline (weekly) + - name: full_refresh + type: maintenance + schedule: "0 1 * * 0" # 1 AM UTC every Sunday + command: "cd transform/sqlmesh_materia && sqlmesh plan --no-auto-apply --select-model * --full-refresh" + worker: + instance_type: worker + min_memory_gb: 32 # Needs big instance for full refresh + timeout_minutes: 360 # 6 hours max + enabled: false # Disabled by default, enable manually when needed + +# Worker instance mapping +# Maps instance types to actual Hetzner server IPs/names +workers: + scheduler: + type: persistent + server: materia-scheduler # Always running + max_concurrent_jobs: 3 + + worker: + type: on_demand + servers: + - name: materia-worker-01 + instance_type: ccx22 # 4 vCPU, 16GB RAM + memory_gb: 16 + max_concurrent_jobs: 2 + # Add more workers as needed: + # - name: materia-worker-02 + # instance_type: ccx32 # 8 vCPU, 32GB RAM + # memory_gb: 32 + # max_concurrent_jobs: 4 + +# Notification channels +notifications: + slack: + enabled: false + webhook_url_secret: SLACK_WEBHOOK_URL + notify_on: + - failure + - success_after_failure + + email: + enabled: false + recipients: + - hendrik.note@gmail.com + notify_on: + - failure