Refactor to local-first architecture on Hetzner NVMe

Remove the distributed R2/Iceberg/SSH pipeline architecture in favor of
local subprocess execution with NVMe storage. Landing data is backed up
to R2 via an rclone timer.

- Strip Iceberg catalog, httpfs, boto3, paramiko, prefect, pyarrow
- Pipelines run via subprocess.run() with bounded timeouts
- Extract writes to {LANDING_DIR}/psd/{year}/{month}/{etag}.csv.gzip
- SQLMesh reads LANDING_DIR variable, writes to DUCKDB_PATH
- Delete unused provider stubs (ovh, scaleway, oracle)
- Add rclone systemd timer for R2 backup every 6h
- Update supervisor to run pipelines with env vars

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-18 18:05:41 +01:00
parent 910424c956
commit c1d00dcdc4
25 changed files with 231 additions and 1807 deletions

View File

@@ -80,15 +80,12 @@ app.add_typer(pipeline_app, name="pipeline")
@pipeline_app.command("run")
def pipeline_run(
name: Annotated[str, typer.Argument(help="Pipeline name (extract, transform)")],
worker_type: Annotated[str | None, typer.Option("--worker", "-w")] = None,
provider: Annotated[str, typer.Option("--provider", "-p")] = "hetzner",
keep: Annotated[bool, typer.Option("--keep", help="Keep worker after completion")] = False,
):
"""Run a pipeline on an ephemeral worker."""
"""Run a pipeline locally."""
from materia.pipelines import run_pipeline
typer.echo(f"Running pipeline '{name}'...")
result = run_pipeline(name, worker_type, auto_destroy=not keep, provider=provider)
result = run_pipeline(name)
if result.success:
typer.echo(result.output)
@@ -105,7 +102,8 @@ def pipeline_list():
typer.echo("Available pipelines:")
for name, config in PIPELINES.items():
typer.echo(f"{name:<15} (worker: {config.worker_type}, artifact: {config.artifact})")
cmd = " ".join(config["command"])
typer.echo(f"{name:<15} (command: {cmd}, timeout: {config['timeout_seconds']}s)")
secrets_app = typer.Typer(help="Manage secrets via Pulumi ESC")