updates
This commit is contained in:
98
src/orchestrator/pipelines.yaml
Normal file
98
src/orchestrator/pipelines.yaml
Normal file
@@ -0,0 +1,98 @@
# Pipeline Configuration
# Defines SQLMesh pipelines, schedules, and worker requirements

pipelines:
# Daily extraction of USDA PSD data; on success it triggers the staging transform.
- name: extract_psd
  type: extraction
  # Cron: 02:00 UTC, every day.
  schedule: '0 2 * * *'
  command: extract_psd
  worker:
    # Runs on the lightweight scheduler instance.
    instance_type: scheduler
    timeout_minutes: 30
  on_success:
  - trigger: transform_psd_staging

# Staging layer: transforms raw PSD data into the staging models.
# NOTE(review): this pipeline has its own cron entry and is also triggered by
# extract_psd on success - confirm the orchestrator does not run it twice a day.
# NOTE(review): `sqlmesh plan` normally prompts for confirmation - confirm the
# runner executes it non-interactively.
- name: transform_psd_staging
  type: transformation
  # Cron: 03:00 UTC, every day (or triggered after extraction).
  schedule: '0 3 * * *'
  command: 'cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:staging'
  worker:
    # Needs more resources for DuckDB.
    instance_type: worker
    min_memory_gb: 8
    timeout_minutes: 60
  on_success:
  - trigger: transform_psd_cleaned

# Cleaned layer: transforms the staging models into the cleaned models.
- name: transform_psd_cleaned
  type: transformation
  # Cron: 04:00 UTC, every day.
  schedule: '0 4 * * *'
  command: 'cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:cleaned'
  worker:
    instance_type: worker
    # Larger transformations.
    min_memory_gb: 16
    timeout_minutes: 120
  on_success:
  - trigger: transform_psd_serving

# Serving layer: transforms the cleaned models into the serving models.
- name: transform_psd_serving
  type: transformation
  # Cron: 05:00 UTC, every day.
  schedule: '0 5 * * *'
  command: 'cd transform/sqlmesh_materia && sqlmesh plan --select-model tag:serving'
  worker:
    instance_type: worker
    min_memory_gb: 8
    timeout_minutes: 60
  on_success:
  # TODO: Add Slack webhook
  - notify: slack

# Full refresh pipeline (weekly maintenance job).
# NOTE(review): min_memory_gb is 32, but the only active server listed under
# `workers` has 16 GB - confirm a large enough server exists before enabling.
- name: full_refresh
  type: maintenance
  # Cron: 01:00 UTC every Sunday.
  schedule: '0 1 * * 0'
  # The `*` selector is single-quoted so the shell passes it to sqlmesh
  # literally instead of expanding it to the file names in the project directory.
  # NOTE(review): verify --no-auto-apply and --full-refresh against the
  # installed SQLMesh CLI.
  command: "cd transform/sqlmesh_materia && sqlmesh plan --no-auto-apply --select-model '*' --full-refresh"
  worker:
    instance_type: worker
    # Needs a big instance for a full refresh.
    min_memory_gb: 32
    # 6 hours max.
    timeout_minutes: 360
  # Disabled by default; enable manually when needed.
  enabled: false

# Worker instance mapping
# Maps instance types to actual Hetzner server IPs/names
workers:
  scheduler:
    type: persistent
    # Always running.
    server: materia-scheduler
    max_concurrent_jobs: 3

  worker:
    type: on_demand
    servers:
    # ccx22: 4 vCPU, 16GB RAM
    - name: materia-worker-01
      instance_type: ccx22
      memory_gb: 16
      max_concurrent_jobs: 2
    # Add more workers as needed:
    # - name: materia-worker-02
    #   instance_type: ccx32  # 8 vCPU, 32GB RAM
    #   memory_gb: 32
    #   max_concurrent_jobs: 4

# Notification channels
notifications:
  slack:
    enabled: false
    # Presumably the name of the secret that holds the webhook URL - confirm
    # against the orchestrator code.
    webhook_url_secret: SLACK_WEBHOOK_URL
    notify_on:
    - failure
    - success_after_failure

  email:
    enabled: false
    recipients:
    - hendrik.note@gmail.com
    notify_on:
    - failure
Reference in New Issue
Block a user