This commit is contained in:
Deeman
2025-10-12 14:26:55 +02:00
parent 77dd277ebf
commit 790e802edd
6 changed files with 708 additions and 0 deletions

View File

@@ -0,0 +1,98 @@
# Pipeline Configuration
# Defines SQLMesh pipelines, schedules, and worker requirements
pipelines:
# extract_psd: nightly extraction of the USDA PSD data.
# When it succeeds it triggers transform_psd_staging.
- name: extract_psd
  type: extraction
  schedule: '0 2 * * *'  # cron: every day at 02:00 UTC
  command: 'extract_psd'
  worker:
    instance_type: scheduler  # executes on the lightweight scheduler instance
  # NOTE(review): the source lost its indentation; timeout_minutes is assumed
  # to be a pipeline-level key (not nested under worker) - confirm.
  timeout_minutes: 30
  on_success:
  - trigger: transform_psd_staging
# transform_psd_staging: turns the raw PSD data into the staging layer by
# planning the SQLMesh models selected with tag:staging.
# NOTE(review): this entry has its own 03:00 cron AND is triggered by
# extract_psd's on_success - confirm the runner does not execute it twice.
# NOTE(review): confirm "sqlmesh plan" can run unattended here (no
# --auto-apply is passed).
- name: transform_psd_staging
  type: transformation
  schedule: '0 3 * * *'  # cron: 03:00 UTC daily (or chained after extraction)
  command: 'cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:staging'
  worker:
    instance_type: worker  # DuckDB needs more resources
    min_memory_gb: 8
  # NOTE(review): the source lost its indentation; timeout_minutes is assumed
  # to be a pipeline-level key (not nested under worker) - confirm.
  timeout_minutes: 60
  on_success:
  - trigger: transform_psd_cleaned
# transform_psd_cleaned: builds the cleaned layer from staging by planning the
# SQLMesh models selected with tag:cleaned.
# NOTE(review): this entry has its own 04:00 cron AND is triggered by
# transform_psd_staging's on_success - confirm it is not executed twice.
# NOTE(review): confirm "sqlmesh plan" can run unattended here (no
# --auto-apply is passed).
- name: transform_psd_cleaned
  type: transformation
  schedule: '0 4 * * *'  # cron: 04:00 UTC daily
  command: 'cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:cleaned'
  worker:
    instance_type: worker
    min_memory_gb: 16  # larger transformations
  # NOTE(review): the source lost its indentation; timeout_minutes is assumed
  # to be a pipeline-level key (not nested under worker) - confirm.
  timeout_minutes: 120
  on_success:
  - trigger: transform_psd_serving
# transform_psd_serving: builds the serving layer from the cleaned layer by
# planning the SQLMesh models selected with tag:serving. Last step of the
# chain; on success it requests a Slack notification.
# NOTE(review): this entry has its own 05:00 cron AND is triggered by
# transform_psd_cleaned's on_success - confirm it is not executed twice.
# NOTE(review): confirm "sqlmesh plan" can run unattended here (no
# --auto-apply is passed).
- name: transform_psd_serving
  type: transformation
  schedule: '0 5 * * *'  # cron: 05:00 UTC daily
  command: 'cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:serving'
  worker:
    instance_type: worker
    min_memory_gb: 8
  # NOTE(review): the source lost its indentation; timeout_minutes is assumed
  # to be a pipeline-level key (not nested under worker) - confirm.
  timeout_minutes: 60
  on_success:
  - notify: slack  # TODO: add the Slack webhook
# full_refresh: weekly maintenance job that plans every model ('*' selector).
# Ships disabled (enabled: false); switch it on manually when it is needed.
# NOTE(review): this job asks for min_memory_gb: 32, while the only active
# server under workers.worker (materia-worker-01) declares memory_gb: 16 -
# enabling the job presumably also requires provisioning a larger worker.
# NOTE(review): --no-auto-apply and --full-refresh do not appear to be among
# the documented "sqlmesh plan" options - verify against the installed
# SQLMesh version before enabling.
- name: full_refresh
  type: maintenance
  schedule: '0 1 * * 0'  # cron: 01:00 UTC every Sunday
  # The model selector is single-quoted inside the command so that the shell
  # needed for "cd ... && ..." passes a literal * to sqlmesh instead of
  # expanding it to the file names in transform/sqlmesh_materia.
  command: "cd transform/sqlmesh_materia && sqlmesh plan --no-auto-apply --select-model '*' --full-refresh"
  worker:
    instance_type: worker
    min_memory_gb: 32  # full refresh needs a big instance
  # NOTE(review): the source lost its indentation; timeout_minutes is assumed
  # to be a pipeline-level key (not nested under worker) - confirm.
  timeout_minutes: 360  # 6 hours max
  enabled: false  # disabled by default; enable manually when needed
# Worker instance mapping.
# Each key under `workers` matches an instance_type value used by the
# pipelines (scheduler / worker) and names the Hetzner server(s) behind it.
workers:
  scheduler:
    type: persistent
    server: materia-scheduler  # always running
    max_concurrent_jobs: 3
  worker:
    type: on_demand
    servers:
    - name: materia-worker-01
      instance_type: ccx22  # 4 vCPU, 16GB RAM
      memory_gb: 16
      max_concurrent_jobs: 2
    # To add capacity, append further entries, e.g.:
    # - name: materia-worker-02
    #   instance_type: ccx32  # 8 vCPU, 32GB RAM
    #   memory_gb: 32
    #   max_concurrent_jobs: 4
# Notification channels. Both channels are disabled in this file.
notifications:
  slack:
    enabled: false
    # Holds the name of a secret rather than a URL - presumably resolved to
    # the real webhook at runtime; confirm against the consumer.
    webhook_url_secret: SLACK_WEBHOOK_URL
    notify_on:
    - failure
    - success_after_failure
  email:
    enabled: false
    recipients:
    - hendrik.note@gmail.com
    notify_on:
    - failure