From 790e802edd4fa4bd89027b4d980bef0d59f95c19 Mon Sep 17 00:00:00 2001 From: Deeman Date: Sun, 12 Oct 2025 14:26:55 +0200 Subject: [PATCH] updates --- VISION.md | 261 ++++++++++++++++++++++++++++++++ infra/Pulumi.dev.yaml | 15 ++ infra/Pulumi.yaml | 6 + infra/README.md | 161 ++++++++++++++++++++ infra/__main__.py | 167 ++++++++++++++++++++ src/orchestrator/pipelines.yaml | 98 ++++++++++++ 6 files changed, 708 insertions(+) create mode 100644 VISION.md create mode 100644 infra/Pulumi.dev.yaml create mode 100644 infra/Pulumi.yaml create mode 100644 infra/README.md create mode 100644 infra/__main__.py create mode 100644 src/orchestrator/pipelines.yaml diff --git a/VISION.md b/VISION.md new file mode 100644 index 0000000..638cab7 --- /dev/null +++ b/VISION.md @@ -0,0 +1,261 @@ +# VISION.md + +## Mission + +Build the fastest, most accurate, and most affordable commodity analytics platform for independent traders and small firms—without enterprise sales bullshit. + +## Product: BeanFlows.coffee + +**Tagline:** Real-time commodity intelligence for traders who think for themselves. + +**Beachhead Market:** Coffee commodities +**Long-term Vision:** Expand to all major commodity markets (~35-40 global contracts) + +## Why We Exist + +Platforms like Kpler dominate the commodity analytics space but are: +- Slow and complex +- Prohibitively expensive +- Designed for enterprise buyers with bloated sales processes +- Built on legacy infrastructure that prioritizes features over performance + +We're building the anti-Kpler: **better, faster, cheaper**. + +## Who We Are + +A two-person indie hacker startup: +- **Data Engineer:** Building the platform +- **Commodity Trader:** Domain expertise and product direction + +We move fast, ship incrementally, and prioritize value over vanity metrics. + +## Technical Philosophy + +### Core Principles + +1. **Simplicity over complexity** + - Minimal dependencies + - Clear, readable code + - Avoid premature abstraction + +2. 
**Performance over features** + - DuckDB over Spark + - Hetzner/Cloudflare over AWS + - SQL/Python/C over heavyweight frameworks + +3. **Accuracy over speed-to-market** + - Data quality is non-negotiable + - Rigorous validation at every layer + - Build trust through reliability + +4. **Build over buy** + - We're not afraid to write code from scratch + - Third-party tools must earn their place + - Control our destiny, minimize vendor lock-in + +### Technology Stack + +**Languages:** +- SQL (primary transformation language) +- Python (orchestration, extraction, APIs) +- C (performance-critical extensions) + +**Infrastructure:** +- **Storage:** Cloudflare R2 (not S3) +- **Compute:** Hetzner bare metal (not AWS/GCP) +- **Database:** DuckDB (not Spark/Snowflake) +- **Orchestration:** SQLMesh + custom Python (not Airflow) + +**Development:** +- **Monorepo:** uv workspace +- **Package Manager:** uv (not pip/poetry) +- **Version Control:** Git (GitLab) +- **CI/CD:** GitLab CI + +### Architectural Philosophy + +**Data-Oriented Design:** +- No OOP spaghetti +- Data flows are explicit and traceable +- Functions transform data, not objects with hidden state + +**Layered Architecture:** +- Raw → Staging → Cleaned → Serving +- Each layer has a single, clear purpose +- Immutable raw data, reproducible transformations + +**Incremental Everything:** +- Models update incrementally by time ranges +- Avoid full table scans +- Pay only for what changed + +## Current State (October 2025) + +### What's Working +- USDA PSD Online extraction (2006-present, monthly archives) +- 4-layer SQLMesh pipeline (raw → staging → cleaned → serving) +- DuckDB backend with 13GB dev database +- Incremental-by-time-range models with deduplication +- Development environment with pre-commit hooks, linting, formatting + +### What We Have +- Comprehensive commodity supply/demand data (USDA PSD) +- Established naming conventions and data quality patterns +- GitLab CI pipeline (lint, test, build) +- 
Documentation (CLAUDE.md, layer conventions) + +## Roadmap + +### Phase 1: Coffee Market Foundation (Current) +**Goal:** Build complete coffee analytics from supply to price + +**Data Sources to Integrate:** +- ✅ USDA PSD Online (production, stocks, consumption) +- ⬜ ICO (International Coffee Organization) data +- ⬜ Yahoo Finance / Alpha Vantage (coffee futures prices - KC=F) +- ⬜ Weather data for coffee-growing regions (OpenWeatherMap, NOAA) +- ⬜ CFTC COT data (trader positioning) +- ⬜ ICE warehouse stocks (web scraping) + +**Features to Build:** +- ⬜ Historical price correlation analysis +- ⬜ Supply/demand balance modeling +- ⬜ Weather impact scoring +- ⬜ Trader sentiment indicators (COT) +- ⬜ Simple web dashboard (read-only analytics) +- ⬜ Data export APIs (JSON, CSV, Parquet) + +**Infrastructure:** +- ⬜ Move to Cloudflare R2 for raw data storage +- ⬜ Deploy SQLMesh to Hetzner production environment +- ⬜ Set up automated daily extraction + transformation pipeline +- ⬜ Implement monitoring and alerting + +### Phase 2: Product Market Fit +**Goal:** Validate with real traders, iterate on feedback + +- ⬜ Beta access for small group of coffee traders +- ⬜ Usage analytics (what queries matter?) +- ⬜ Performance optimization based on real workloads +- ⬜ Pricing model experimentation ($X/month, pay-as-you-go?) + +### Phase 3: Expand Commodity Coverage +**Goal:** Prove architecture scales across commodities + +**Priority Markets:** +1. Other softs (cocoa, sugar, cotton, OJ) +2. Grains (corn, wheat, soybeans) +3. Energy (crude oil, natural gas) +4. 
Metals (gold, silver, copper) + +**Reusable Patterns:** +- Abstract extraction logic (API connectors, scrapers) +- Standardized staging layer for price/volume data +- Common serving models (time series, correlations, anomalies) + +### Phase 4: Advanced Analytics +**Goal:** Differentiation through unique insights + +- ⬜ Satellite imagery integration (NASA, Planet) for crop monitoring +- ⬜ Custom yield forecasting models +- ⬜ Real-time alert system (price thresholds, supply shocks) +- ⬜ Historical backtesting framework for trading strategies +- ⬜ Sentiment analysis from news/reports (USDA GAIN, FAO) + +### Phase 5: Scale & Polish +**Goal:** Handle growth, maintain performance advantage + +- ⬜ Multi-region deployment (low latency globally) +- ⬜ Advanced caching strategies +- ⬜ Self-service onboarding (no sales calls) +- ⬜ Public documentation and API reference +- ⬜ Community/forum for traders + +## Key Decisions & Trade-offs + +### Why DuckDB over Spark? +- **Speed:** In-process OLAP is faster for our workloads +- **Simplicity:** No cluster management, no JVM +- **Cost:** Runs on a single beefy server, not 100 nodes +- **Developer experience:** SQL-first, Python-friendly + +### Why SQLMesh over dbt/Airflow? +- **Unified:** Orchestration + transformation in one tool +- **Performance:** Built for incremental execution +- **Virtual environments:** Test changes without breaking prod +- **Python-native:** Extend with custom macros + +### Why Cloudflare R2 over S3? +- **Cost:** No egress fees (huge for data-heavy platform) +- **Performance:** Global edge network +- **Simplicity:** S3-compatible API, easy migration path + +### Why Hetzner over AWS? +- **Cost:** 10x cheaper for equivalent compute +- **Performance:** Bare metal = no noisy neighbors +- **Simplicity:** Less surface area, fewer services to manage + +### Why Monorepo? 
+- **Atomic changes:** Update extraction + transformation together +- **Shared code:** Reusable utilities across packages +- **Simplified CI:** One pipeline, consistent tooling + +## Anti-Goals + +Things we explicitly do NOT want: + +- ❌ Enterprise sales team +- ❌ Complex onboarding processes +- ❌ Vendor lock-in (AWS, Snowflake, etc.) +- ❌ OOP frameworks (Django ORM, SQLAlchemy magic) +- ❌ Microservices (until we need them, which is not now) +- ❌ Kubernetes (overkill for our scale) +- ❌ Feature bloat (every feature has a performance cost) + +## Success Metrics + +**Phase 1 (Foundation):** +- All coffee data sources integrated +- Daily pipeline runs reliably (<5% failure rate) +- Query latency <500ms for common analytics + +**Phase 2 (PMF):** +- 10+ paying beta users +- 90%+ data accuracy (validated against spot checks) +- Monthly churn <10% + +**Phase 3 (Expansion):** +- 5+ commodity markets covered +- 100+ active users +- Break-even on infrastructure costs + +**Long-term (Scale):** +- Cover all ~35-40 major commodity contracts +- 1000+ traders using the platform +- Recognized as the go-to alternative to Kpler for indie traders + +## Guiding Questions + +When making decisions, ask: + +1. **Does this make us faster?** (Performance) +2. **Does this make us more accurate?** (Data quality) +3. **Does this make us simpler?** (Maintainability) +4. **Does this help traders make better decisions?** (Value) +5. **Can we afford to run this at scale?** (Unit economics) + +If the answer to any of these is "no," reconsider. + +## Current Priorities (Q4 2025) + +1. Integrate coffee futures price data (Yahoo Finance) +2. Build time-series serving models for price/supply correlation +3. Deploy production pipeline to Hetzner +4. Set up Cloudflare R2 for raw data storage +5. Create simple read-only dashboard for coffee analytics +6. 
Document API for beta testers + +--- + +**Last Updated:** October 2025 +**Next Review:** End of Q4 2025 (adjust based on Phase 1 progress) diff --git a/infra/Pulumi.dev.yaml b/infra/Pulumi.dev.yaml new file mode 100644 index 0000000..f35b984 --- /dev/null +++ b/infra/Pulumi.dev.yaml @@ -0,0 +1,15 @@ +# Development stack configuration +# Set actual values with: pulumi config set +# Set secrets with: pulumi config set --secret + +config: + # Cloudflare configuration + cloudflare:apiToken: # Set with: pulumi config set --secret cloudflare:apiToken + materia-infrastructure:cloudflare_account_id: # Set with: pulumi config set cloudflare_account_id + + # Hetzner configuration + hcloud:token: # Set with: pulumi config set --secret hcloud:token + materia-infrastructure:hetzner_location: "nbg1" # Nuremberg, Germany + + # SSH key for server access + materia-infrastructure:ssh_public_key: # Set with: pulumi config set --secret ssh_public_key "$(cat ~/.ssh/id_rsa.pub)" diff --git a/infra/Pulumi.yaml b/infra/Pulumi.yaml new file mode 100644 index 0000000..1f7ec04 --- /dev/null +++ b/infra/Pulumi.yaml @@ -0,0 +1,6 @@ +name: materia-infrastructure +runtime: + name: python + options: + virtualenv: ../.venv +description: BeanFlows.coffee infrastructure on Cloudflare R2 + Hetzner diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 0000000..bc74e80 --- /dev/null +++ b/infra/README.md @@ -0,0 +1,161 @@ +# Materia Infrastructure + +Pulumi-managed infrastructure for BeanFlows.coffee + +## Stack Overview + +- **Storage:** Cloudflare R2 buckets with Iceberg Data Catalog +- **Compute:** Hetzner Cloud CCX dedicated vCPU instances +- **Orchestration:** Custom Python scheduler (see `src/orchestrator/`) + +## Prerequisites + +1. **Cloudflare Account** + - Sign up at https://dash.cloudflare.com + - Create API token with R2 + Data Catalog permissions + - Get your Account ID from dashboard + +2. 
**Hetzner Cloud Account** + - Sign up at https://console.hetzner.cloud + - Create API token with Read & Write permissions + +3. **Pulumi Account** (optional, can use local state) + - Sign up at https://app.pulumi.com + - Or use local state with `pulumi login --local` + +4. **SSH Key** + - Generate if needed: `ssh-keygen -t ed25519 -C "materia-deploy"` + +## Initial Setup + +```bash +cd infra + +# Login to Pulumi (local or cloud) +pulumi login # or: pulumi login --local + +# Initialize the stack +pulumi stack init dev + +# Configure secrets +pulumi config set --secret cloudflare:apiToken +pulumi config set cloudflare_account_id +pulumi config set --secret hcloud:token +pulumi config set --secret ssh_public_key "$(cat ~/.ssh/id_ed25519.pub)" + +# Preview changes +pulumi preview + +# Deploy infrastructure +pulumi up +``` + +## What Gets Provisioned + +### Cloudflare R2 Buckets + +1. **materia-raw** - Raw data from extraction (immutable archives) +2. **materia-lakehouse** - Iceberg tables for SQLMesh (ACID transactions) + +### Hetzner Cloud Servers + +1. **materia-scheduler** (CCX12: 2 vCPU, 8GB RAM) + - Runs cron scheduler + - Lightweight orchestration tasks + - Always-on, low cost (~€6/mo) + +2. **materia-worker-01** (CCX22: 4 vCPU, 16GB RAM) + - Heavy SQLMesh transformations + - Can be stopped when not in use + - Scale up to CCX32/CCX42 for larger workloads (~€24-90/mo) + +3. **materia-firewall** + - SSH access (port 22) + - All outbound traffic allowed + - No inbound HTTP/HTTPS (we're not running web services yet) + +## Enabling R2 Data Catalog (Iceberg) + +As of October 2025, R2 Data Catalog is in public beta. Enable it manually: + +1. Go to Cloudflare Dashboard → R2 +2. Select the `materia-lakehouse` bucket +3. Navigate to Settings → Data Catalog +4. 
Click "Enable Data Catalog" + +Once enabled, you can connect DuckDB to the Iceberg REST catalog: + +```python +import duckdb + +# Get catalog URI from Pulumi outputs +# pulumi stack output duckdb_r2_config + +conn = duckdb.connect() +conn.execute("INSTALL iceberg; LOAD iceberg;") +conn.execute(f""" + ATTACH 'iceberg_rest://catalog.cloudflarestorage.com/<account_id>/r2-data-catalog' + AS lakehouse ( + TYPE ICEBERG_REST, + SECRET '<r2_api_token>' + ); +""") +``` + +## Server Access + +Get server IPs from Pulumi outputs: + +```bash +pulumi stack output scheduler_ip +pulumi stack output worker_ip +``` + +SSH into servers: + +```bash +ssh root@<scheduler_ip> +ssh root@<worker_ip> +``` + +## Cost Estimates (Monthly) + +| Resource | Type | Cost | +|----------|------|------| +| R2 Storage | 10 GB | $0.15 | +| R2 Operations | 1M reads | $0.36 | +| R2 Egress | Unlimited | $0.00 (zero egress!) | +| Scheduler | CCX12 | €6.00 | +| Worker (on-demand) | CCX22 | €24.00 | +| **Total** | | **~€30/mo (~$33)** | + +Compare to AWS equivalent: ~$300-500/mo with S3 + EC2 + egress fees. + +## Scaling Workers + +To add more worker capacity or different instance sizes: + +1. Edit `infra/__main__.py` to add new server resources +2. Update worker config in `src/orchestrator/workers.yaml` +3. Run `pulumi up` to provision + +Example worker sizes: +- CCX12: 2 vCPU, 8GB RAM (light workloads) +- CCX22: 4 vCPU, 16GB RAM (medium workloads) +- CCX32: 8 vCPU, 32GB RAM (heavy workloads) +- CCX42: 16 vCPU, 64GB RAM (very heavy workloads) + +## Destroying Infrastructure + +```bash +cd infra +pulumi destroy +``` + +**Warning:** This will delete all buckets and servers. Backup data first! + +## Next Steps + +1. Deploy orchestrator to scheduler server (see `src/orchestrator/README.md`) +2. Configure SQLMesh to use R2 lakehouse (see `transform/sqlmesh_materia/config.yaml`) +3. 
Set up CI/CD pipeline to deploy on push (see `.gitlab-ci.yml`) diff --git a/infra/__main__.py b/infra/__main__.py new file mode 100644 index 0000000..85e6054 --- /dev/null +++ b/infra/__main__.py @@ -0,0 +1,167 @@ +""" +BeanFlows.coffee Infrastructure +Cloudflare R2 + Iceberg + Hetzner compute stack +""" + +import pulumi +import pulumi_cloudflare as cloudflare +import pulumi_hcloud as hcloud + +# Load configuration +config = pulumi.Config() +cloudflare_account_id = config.require("cloudflare_account_id") +hetzner_location = config.get("hetzner_location") or "nbg1" # Nuremberg datacenter + +# ============================================================ +# Cloudflare R2 Storage + Data Catalog (Iceberg) +# ============================================================ + +# R2 bucket for raw data (extraction outputs) +raw_bucket = cloudflare.R2Bucket( + "materia-raw", + account_id=cloudflare_account_id, + name="materia-raw", + location="weur", # Western Europe +) + +# R2 bucket for lakehouse (Iceberg tables) +lakehouse_bucket = cloudflare.R2Bucket( + "materia-lakehouse", + account_id=cloudflare_account_id, + name="materia-lakehouse", + location="weur", +) + +# TODO: Enable R2 Data Catalog (Iceberg) on lakehouse bucket +# Note: As of Oct 2025, R2 Data Catalog is in public beta +# May need to enable via Cloudflare dashboard or API once SDK supports it +# For now, document manual step in README + +# API token for R2 access (needs R2 + Data Catalog permissions) +# Note: Create this manually in Cloudflare dashboard and store in Pulumi config +# pulumi config set --secret cloudflare_r2_token + +# ============================================================ +# Hetzner Cloud Infrastructure +# ============================================================ + +# SSH key for server access +ssh_key = hcloud.SshKey( + "materia-ssh-key", + name="materia-deployment-key", + public_key=config.require_secret("ssh_public_key"), +) + +# Small CCX instance for scheduler/orchestrator +# This 
runs the cron scheduler + lightweight tasks +scheduler_server = hcloud.Server( + "materia-scheduler", + name="materia-scheduler", + server_type="ccx12", # 2 vCPU, 8GB RAM, ~€6/mo + image="ubuntu-24.04", + location=hetzner_location, + ssh_keys=[ssh_key.id], + labels={ + "role": "scheduler", + "project": "materia", + }, + user_data="""#!/bin/bash +# Basic server setup +apt-get update +apt-get install -y python3.13 python3-pip git curl + +# Install uv +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Configure environment +echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> /root/.bashrc +""", +) + +# Larger CCX instance for heavy SQLMesh workloads +# This gets spun up on-demand for big transformations +worker_server = hcloud.Server( + "materia-worker-01", + name="materia-worker-01", + server_type="ccx22", # 4 vCPU, 16GB RAM, ~€24/mo + image="ubuntu-24.04", + location=hetzner_location, + ssh_keys=[ssh_key.id], + labels={ + "role": "worker", + "project": "materia", + }, + user_data="""#!/bin/bash +# Basic server setup +apt-get update +apt-get install -y python3.13 python3-pip git curl + +# Install uv +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Configure environment +echo 'export PATH="$HOME/.cargo/bin:$PATH"' >> /root/.bashrc +""", +) + +# Firewall for servers (restrict to SSH + outbound only) +firewall = hcloud.Firewall( + "materia-firewall", + name="materia-firewall", + rules=[ + # Allow SSH from anywhere (consider restricting to your IP) + hcloud.FirewallRuleArgs( + direction="in", + protocol="tcp", + port="22", + source_ips=["0.0.0.0/0", "::/0"], + ), + # Allow all outbound traffic + hcloud.FirewallRuleArgs( + direction="out", + protocol="tcp", + port="any", + destination_ips=["0.0.0.0/0", "::/0"], + ), + hcloud.FirewallRuleArgs( + direction="out", + protocol="udp", + port="any", + destination_ips=["0.0.0.0/0", "::/0"], + ), + ], +) + +# Apply firewall to all servers +scheduler_firewall = hcloud.FirewallAttachment( + "scheduler-firewall", + 
firewall_id=firewall.id, + server_ids=[scheduler_server.id], +) + +worker_firewall = hcloud.FirewallAttachment( + "worker-firewall", + firewall_id=firewall.id, + server_ids=[worker_server.id], +) + +# ============================================================ +# Outputs +# ============================================================ + +pulumi.export("raw_bucket_name", raw_bucket.name) +pulumi.export("lakehouse_bucket_name", lakehouse_bucket.name) +pulumi.export("scheduler_ip", scheduler_server.ipv4_address) +pulumi.export("worker_ip", worker_server.ipv4_address) + +# Export connection info for DuckDB +pulumi.export( + "duckdb_r2_config", + pulumi.Output.all(cloudflare_account_id, lakehouse_bucket.name).apply( + lambda args: { + "account_id": args[0], + "bucket": args[1], + "catalog_uri": f"https://catalog.cloudflarestorage.com/{args[0]}/r2-data-catalog", + } + ), +) diff --git a/src/orchestrator/pipelines.yaml b/src/orchestrator/pipelines.yaml new file mode 100644 index 0000000..0948e07 --- /dev/null +++ b/src/orchestrator/pipelines.yaml @@ -0,0 +1,98 @@ +# Pipeline Configuration +# Defines SQLMesh pipelines, schedules, and worker requirements + +pipelines: + # Daily extraction of USDA PSD data + - name: extract_psd + type: extraction + schedule: "0 2 * * *" # 2 AM UTC daily + command: "extract_psd" + worker: + instance_type: scheduler # Runs on lightweight scheduler instance + timeout_minutes: 30 + on_success: + - trigger: transform_psd_staging + + # Transform raw PSD data to staging layer + - name: transform_psd_staging + type: transformation + schedule: "0 3 * * *" # 3 AM UTC daily (or triggered after extraction) + command: "cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:staging" + worker: + instance_type: worker # Needs more resources for DuckDB + min_memory_gb: 8 + timeout_minutes: 60 + on_success: + - trigger: transform_psd_cleaned + + # Transform staging to cleaned layer + - name: transform_psd_cleaned + type: transformation + schedule: "0 
4 * * *" # 4 AM UTC daily + command: "cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:cleaned" + worker: + instance_type: worker + min_memory_gb: 16 # Larger transformations + timeout_minutes: 120 + on_success: + - trigger: transform_psd_serving + + # Transform cleaned to serving layer + - name: transform_psd_serving + type: transformation + schedule: "0 5 * * *" # 5 AM UTC daily + command: "cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:serving" + worker: + instance_type: worker + min_memory_gb: 8 + timeout_minutes: 60 + on_success: + - notify: slack # TODO: Add Slack webhook + + # Full refresh pipeline (weekly) + - name: full_refresh + type: maintenance + schedule: "0 1 * * 0" # 1 AM UTC every Sunday + command: "cd transform/sqlmesh_materia && sqlmesh plan --no-auto-apply --select-model * --full-refresh" + worker: + instance_type: worker + min_memory_gb: 32 # Needs big instance for full refresh + timeout_minutes: 360 # 6 hours max + enabled: false # Disabled by default, enable manually when needed + +# Worker instance mapping +# Maps instance types to actual Hetzner server IPs/names +workers: + scheduler: + type: persistent + server: materia-scheduler # Always running + max_concurrent_jobs: 3 + + worker: + type: on_demand + servers: + - name: materia-worker-01 + instance_type: ccx22 # 4 vCPU, 16GB RAM + memory_gb: 16 + max_concurrent_jobs: 2 + # Add more workers as needed: + # - name: materia-worker-02 + # instance_type: ccx32 # 8 vCPU, 32GB RAM + # memory_gb: 32 + # max_concurrent_jobs: 4 + +# Notification channels +notifications: + slack: + enabled: false + webhook_url_secret: SLACK_WEBHOOK_URL + notify_on: + - failure + - success_after_failure + + email: + enabled: false + recipients: + - hendrik.note@gmail.com + notify_on: + - failure