Refactor to local-first architecture on Hetzner NVMe

Remove the distributed R2/Iceberg/SSH pipeline architecture in favor of
local subprocess execution with NVMe storage. Landing data is backed up
to R2 via an rclone timer.

- Strip Iceberg catalog, httpfs, boto3, paramiko, prefect, pyarrow
- Pipelines run via subprocess.run() with bounded timeouts
- Extract writes to {LANDING_DIR}/psd/{year}/{month}/{etag}.csv.gzip
- SQLMesh reads LANDING_DIR variable, writes to DUCKDB_PATH
- Delete unused provider stubs (ovh, scaleway, oracle)
- Add rclone systemd timer for R2 backup every 6h
- Update supervisor to run pipelines with env vars

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-18 18:05:41 +01:00
parent 910424c956
commit c1d00dcdc4
25 changed files with 231 additions and 1807 deletions

View File

@@ -80,15 +80,12 @@ app.add_typer(pipeline_app, name="pipeline")
@pipeline_app.command("run")
def pipeline_run(
name: Annotated[str, typer.Argument(help="Pipeline name (extract, transform)")],
worker_type: Annotated[str | None, typer.Option("--worker", "-w")] = None,
provider: Annotated[str, typer.Option("--provider", "-p")] = "hetzner",
keep: Annotated[bool, typer.Option("--keep", help="Keep worker after completion")] = False,
):
"""Run a pipeline on an ephemeral worker."""
"""Run a pipeline locally."""
from materia.pipelines import run_pipeline
typer.echo(f"Running pipeline '{name}'...")
result = run_pipeline(name, worker_type, auto_destroy=not keep, provider=provider)
result = run_pipeline(name)
if result.success:
typer.echo(result.output)
@@ -105,7 +102,8 @@ def pipeline_list():
typer.echo("Available pipelines:")
for name, config in PIPELINES.items():
typer.echo(f"{name:<15} (worker: {config.worker_type}, artifact: {config.artifact})")
cmd = " ".join(config["command"])
typer.echo(f"{name:<15} (command: {cmd}, timeout: {config['timeout_seconds']}s)")
secrets_app = typer.Typer(help="Manage secrets via Pulumi ESC")

View File

@@ -1,21 +1,8 @@
"""Pipeline execution on ephemeral workers."""
"""Pipeline execution via local subprocess."""
import contextlib
import subprocess
from dataclasses import dataclass
import paramiko
from materia.secrets import get_secret
from materia.workers import create_worker, destroy_worker
@dataclass
class PipelineConfig:
    """Settings describing how one named pipeline is executed.

    Instances are the values of the ``PIPELINES`` mapping, keyed by
    pipeline name.
    """

    # Worker size used when the caller does not override it (e.g. "ccx12").
    worker_type: str
    # File name of the tarball fetched onto the worker before the run.
    artifact: str
    # Command line executed on the worker after the artifact is unpacked.
    command: str
    # Names of the secrets exported as environment variables for the command.
    secrets: list[str]
@dataclass
class PipelineResult:
@@ -25,56 +12,20 @@ class PipelineResult:
PIPELINES = {
"extract": PipelineConfig(
worker_type="ccx12",
artifact="materia-extract-latest.tar.gz",
command="./extract_psd",
secrets=["R2_ACCESS_KEY_ID", "R2_SECRET_ACCESS_KEY", "R2_ENDPOINT", "R2_ARTIFACTS_BUCKET"],
),
"transform": PipelineConfig(
worker_type="ccx22",
artifact="materia-transform-latest.tar.gz",
command="cd sqlmesh_materia && ./sqlmesh plan prod",
secrets=[
"CLOUDFLARE_API_TOKEN",
"ICEBERG_REST_URI",
"R2_WAREHOUSE_NAME",
],
),
"extract": {
"command": ["uv", "run", "--package", "psdonline", "extract_psd"],
"timeout_seconds": 1800,
},
"transform": {
"command": ["uv", "run", "--package", "sqlmesh_materia", "sqlmesh", "-p", "transform/sqlmesh_materia", "plan", "prod", "--no-prompts", "--auto-apply"],
"timeout_seconds": 3600,
},
}
def _execute_ssh_command(ip: str, command: str, env_vars: dict[str, str]) -> tuple[str, str, int]:
ssh_key_path = get_secret("SSH_PRIVATE_KEY_PATH")
if not ssh_key_path:
raise ValueError("SSH_PRIVATE_KEY_PATH not found in secrets")
def run_pipeline(pipeline_name: str) -> PipelineResult:
assert pipeline_name, "pipeline_name must not be empty"
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
pkey = paramiko.RSAKey.from_private_key_file(ssh_key_path)
client.connect(ip, username="root", pkey=pkey)
env_string = " ".join([f"export {k}='{v}' &&" for k, v in env_vars.items()])
full_command = f"{env_string} {command}" if env_vars else command
stdin, stdout, stderr = client.exec_command(full_command)
exit_code = stdout.channel.recv_exit_status()
output = stdout.read().decode()
error = stderr.read().decode()
client.close()
return output, error, exit_code
def run_pipeline(
pipeline_name: str,
worker_type: str | None = None,
auto_destroy: bool = True,
provider: str = "hetzner",
) -> PipelineResult:
if pipeline_name not in PIPELINES:
return PipelineResult(
success=False,
@@ -82,58 +33,24 @@ def run_pipeline(
error=f"Unknown pipeline: {pipeline_name}. Available: {', '.join(PIPELINES.keys())}",
)
pipeline_config = PIPELINES[pipeline_name]
worker_type = worker_type or pipeline_config.worker_type
worker_name = f"materia-{pipeline_name}-worker"
pipeline = PIPELINES[pipeline_name]
timeout_seconds = pipeline["timeout_seconds"]
r2_bucket = get_secret("R2_ARTIFACTS_BUCKET") or "materia-artifacts"
r2_endpoint = get_secret("R2_ENDPOINT")
if not r2_endpoint:
try:
result = subprocess.run(
pipeline["command"],
capture_output=True,
text=True,
timeout=timeout_seconds,
)
return PipelineResult(
success=result.returncode == 0,
output=result.stdout,
error=result.stderr if result.returncode != 0 else None,
)
except subprocess.TimeoutExpired:
return PipelineResult(
success=False,
output="",
error="R2_ENDPOINT not configured in secrets",
error=f"Pipeline '{pipeline_name}' timed out after {timeout_seconds} seconds",
)
try:
worker = create_worker(worker_name, worker_type, provider)
artifact_url = f"https://{r2_endpoint}/{r2_bucket}/{pipeline_config.artifact}"
bootstrap_commands = [
f"curl -fsSL -o artifact.tar.gz {artifact_url}",
"tar -xzf artifact.tar.gz",
"chmod +x -R .",
]
for cmd in bootstrap_commands:
_, error, exit_code = _execute_ssh_command(worker.ip, cmd, {})
if exit_code != 0:
return PipelineResult(
success=False,
output="",
error=f"Bootstrap failed: {error}",
)
env_vars = {}
for secret_key in pipeline_config.secrets:
value = get_secret(secret_key)
if value:
env_vars[secret_key] = value
command = pipeline_config.command
output, error, exit_code = _execute_ssh_command(worker.ip, command, env_vars)
success = exit_code == 0
return PipelineResult(
success=success,
output=output,
error=error if not success else None,
)
finally:
if auto_destroy:
with contextlib.suppress(Exception):
destroy_worker(worker_name, provider)

View File

@@ -1,7 +1,6 @@
"""Cloud provider abstraction for worker management."""
"""Cloud provider for worker management."""
from dataclasses import dataclass
from typing import Protocol
@dataclass
@@ -14,35 +13,10 @@ class Instance:
type: str
class ProviderModule(Protocol):
    """Structural type of a provider implementation returned by ``get_provider``.

    Provider implementations are plain modules (``get_provider`` returns the
    imported module itself). When a module is checked against a protocol, the
    ``self`` parameter of each protocol method is dropped and the remaining
    parameters must match the module-level function. Each method therefore
    declares ``self`` followed by the function's real parameters; annotating
    the first parameter as ``self: str`` would hide that first argument from
    the type checker.
    """

    def create_instance(
        self,
        name: str,
        instance_type: str,
        ssh_key: str,
        location: str | None = None,
    ) -> Instance: ...

    def destroy_instance(self, instance_id: str) -> None: ...

    def list_instances(self, label: str | None = None) -> list[Instance]: ...

    def get_instance(self, name: str) -> Instance | None: ...

    def wait_for_ssh(self, ip: str, timeout: int = 300) -> bool: ...
def get_provider(provider_name: str) -> ProviderModule:
def get_provider(provider_name: str):
if provider_name == "hetzner":
from materia.providers import hetzner
return hetzner
elif provider_name == "ovh":
from materia.providers import ovh
return ovh
elif provider_name == "scaleway":
from materia.providers import scaleway
return scaleway
elif provider_name == "oracle":
from materia.providers import oracle
return oracle
else:
raise ValueError(f"Unknown provider: {provider_name}")

View File

@@ -1,28 +0,0 @@
"""Oracle Cloud provider implementation."""
from materia.providers import Instance
# Single source for the message raised by every placeholder below.
_NOT_IMPLEMENTED_MESSAGE = "Oracle Cloud provider not yet implemented"


def create_instance(
    name: str,
    instance_type: str,
    ssh_key: str,
    location: str | None = None,
) -> Instance:
    """Placeholder for instance creation; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def destroy_instance(instance_id: str) -> None:
    """Placeholder for instance removal; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def list_instances(label: str | None = None) -> list[Instance]:
    """Placeholder for instance listing; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def get_instance(name: str) -> Instance | None:
    """Placeholder for instance lookup; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def wait_for_ssh(ip: str, timeout: int = 300) -> bool:
    """Placeholder for the SSH readiness wait; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)

View File

@@ -1,28 +0,0 @@
"""OVH Cloud provider implementation."""
from materia.providers import Instance
# Single source for the message raised by every placeholder below.
_NOT_IMPLEMENTED_MESSAGE = "OVH provider not yet implemented"


def create_instance(
    name: str,
    instance_type: str,
    ssh_key: str,
    location: str | None = None,
) -> Instance:
    """Placeholder for instance creation; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def destroy_instance(instance_id: str) -> None:
    """Placeholder for instance removal; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def list_instances(label: str | None = None) -> list[Instance]:
    """Placeholder for instance listing; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def get_instance(name: str) -> Instance | None:
    """Placeholder for instance lookup; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def wait_for_ssh(ip: str, timeout: int = 300) -> bool:
    """Placeholder for the SSH readiness wait; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)

View File

@@ -1,28 +0,0 @@
"""Scaleway provider implementation."""
from materia.providers import Instance
# Single source for the message raised by every placeholder below.
_NOT_IMPLEMENTED_MESSAGE = "Scaleway provider not yet implemented"


def create_instance(
    name: str,
    instance_type: str,
    ssh_key: str,
    location: str | None = None,
) -> Instance:
    """Placeholder for instance creation; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def destroy_instance(instance_id: str) -> None:
    """Placeholder for instance removal; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def list_instances(label: str | None = None) -> list[Instance]:
    """Placeholder for instance listing; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def get_instance(name: str) -> Instance | None:
    """Placeholder for instance lookup; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)


def wait_for_ssh(ip: str, timeout: int = 300) -> bool:
    """Placeholder for the SSH readiness wait; always raises ``NotImplementedError``."""
    raise NotImplementedError(_NOT_IMPLEMENTED_MESSAGE)