Refactor to local-first architecture on Hetzner NVMe
Remove distributed R2/Iceberg/SSH pipeline architecture in favor of
local subprocess execution with NVMe storage. Landing data is backed up
to R2 via an rclone timer.
- Strip Iceberg catalog, httpfs, boto3, paramiko, prefect, pyarrow
- Pipelines run via subprocess.run() with bounded timeouts
- Extract writes to {LANDING_DIR}/psd/{year}/{month}/{etag}.csv.gzip
- SQLMesh reads LANDING_DIR variable, writes to DUCKDB_PATH
- Delete unused provider stubs (ovh, scaleway, oracle)
- Add rclone systemd timer for R2 backup every 6h
- Update supervisor to run pipelines with env vars
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -80,15 +80,12 @@ app.add_typer(pipeline_app, name="pipeline")
|
||||
@pipeline_app.command("run")
|
||||
def pipeline_run(
|
||||
name: Annotated[str, typer.Argument(help="Pipeline name (extract, transform)")],
|
||||
worker_type: Annotated[str | None, typer.Option("--worker", "-w")] = None,
|
||||
provider: Annotated[str, typer.Option("--provider", "-p")] = "hetzner",
|
||||
keep: Annotated[bool, typer.Option("--keep", help="Keep worker after completion")] = False,
|
||||
):
|
||||
"""Run a pipeline on an ephemeral worker."""
|
||||
"""Run a pipeline locally."""
|
||||
from materia.pipelines import run_pipeline
|
||||
|
||||
typer.echo(f"Running pipeline '{name}'...")
|
||||
result = run_pipeline(name, worker_type, auto_destroy=not keep, provider=provider)
|
||||
result = run_pipeline(name)
|
||||
|
||||
if result.success:
|
||||
typer.echo(result.output)
|
||||
@@ -105,7 +102,8 @@ def pipeline_list():
|
||||
|
||||
typer.echo("Available pipelines:")
|
||||
for name, config in PIPELINES.items():
|
||||
typer.echo(f" • {name:<15} (worker: {config.worker_type}, artifact: {config.artifact})")
|
||||
cmd = " ".join(config["command"])
|
||||
typer.echo(f" • {name:<15} (command: {cmd}, timeout: {config['timeout_seconds']}s)")
|
||||
|
||||
|
||||
secrets_app = typer.Typer(help="Manage secrets via Pulumi ESC")
|
||||
|
||||
@@ -1,21 +1,8 @@
|
||||
"""Pipeline execution on ephemeral workers."""
|
||||
"""Pipeline execution via local subprocess."""
|
||||
|
||||
import contextlib
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
|
||||
import paramiko
|
||||
|
||||
from materia.secrets import get_secret
|
||||
from materia.workers import create_worker, destroy_worker
|
||||
|
||||
|
||||
@dataclass
class PipelineConfig:
    """Static description of one pipeline run on a worker.

    Field order is part of the interface: instances may be built
    positionally as ``PipelineConfig(worker_type, artifact, command, secrets)``.
    """

    # Worker/server type identifier (the PIPELINES table uses e.g. "ccx12").
    worker_type: str
    # File name of the artifact archive fetched before running the command.
    artifact: str
    # Shell command string executed for the pipeline.
    command: str
    # Names of the secrets exported as environment variables for the command.
    secrets: list[str]
|
||||
|
||||
|
||||
@dataclass
|
||||
class PipelineResult:
|
||||
@@ -25,56 +12,20 @@ class PipelineResult:
|
||||
|
||||
|
||||
PIPELINES = {
|
||||
"extract": PipelineConfig(
|
||||
worker_type="ccx12",
|
||||
artifact="materia-extract-latest.tar.gz",
|
||||
command="./extract_psd",
|
||||
secrets=["R2_ACCESS_KEY_ID", "R2_SECRET_ACCESS_KEY", "R2_ENDPOINT", "R2_ARTIFACTS_BUCKET"],
|
||||
),
|
||||
"transform": PipelineConfig(
|
||||
worker_type="ccx22",
|
||||
artifact="materia-transform-latest.tar.gz",
|
||||
command="cd sqlmesh_materia && ./sqlmesh plan prod",
|
||||
secrets=[
|
||||
"CLOUDFLARE_API_TOKEN",
|
||||
"ICEBERG_REST_URI",
|
||||
"R2_WAREHOUSE_NAME",
|
||||
],
|
||||
),
|
||||
"extract": {
|
||||
"command": ["uv", "run", "--package", "psdonline", "extract_psd"],
|
||||
"timeout_seconds": 1800,
|
||||
},
|
||||
"transform": {
|
||||
"command": ["uv", "run", "--package", "sqlmesh_materia", "sqlmesh", "-p", "transform/sqlmesh_materia", "plan", "prod", "--no-prompts", "--auto-apply"],
|
||||
"timeout_seconds": 3600,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _execute_ssh_command(ip: str, command: str, env_vars: dict[str, str]) -> tuple[str, str, int]:
|
||||
ssh_key_path = get_secret("SSH_PRIVATE_KEY_PATH")
|
||||
if not ssh_key_path:
|
||||
raise ValueError("SSH_PRIVATE_KEY_PATH not found in secrets")
|
||||
def run_pipeline(pipeline_name: str) -> PipelineResult:
|
||||
assert pipeline_name, "pipeline_name must not be empty"
|
||||
|
||||
client = paramiko.SSHClient()
|
||||
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
|
||||
pkey = paramiko.RSAKey.from_private_key_file(ssh_key_path)
|
||||
client.connect(ip, username="root", pkey=pkey)
|
||||
|
||||
env_string = " ".join([f"export {k}='{v}' &&" for k, v in env_vars.items()])
|
||||
full_command = f"{env_string} {command}" if env_vars else command
|
||||
|
||||
stdin, stdout, stderr = client.exec_command(full_command)
|
||||
exit_code = stdout.channel.recv_exit_status()
|
||||
|
||||
output = stdout.read().decode()
|
||||
error = stderr.read().decode()
|
||||
|
||||
client.close()
|
||||
|
||||
return output, error, exit_code
|
||||
|
||||
|
||||
def run_pipeline(
|
||||
pipeline_name: str,
|
||||
worker_type: str | None = None,
|
||||
auto_destroy: bool = True,
|
||||
provider: str = "hetzner",
|
||||
) -> PipelineResult:
|
||||
if pipeline_name not in PIPELINES:
|
||||
return PipelineResult(
|
||||
success=False,
|
||||
@@ -82,58 +33,24 @@ def run_pipeline(
|
||||
error=f"Unknown pipeline: {pipeline_name}. Available: {', '.join(PIPELINES.keys())}",
|
||||
)
|
||||
|
||||
pipeline_config = PIPELINES[pipeline_name]
|
||||
worker_type = worker_type or pipeline_config.worker_type
|
||||
worker_name = f"materia-{pipeline_name}-worker"
|
||||
pipeline = PIPELINES[pipeline_name]
|
||||
timeout_seconds = pipeline["timeout_seconds"]
|
||||
|
||||
r2_bucket = get_secret("R2_ARTIFACTS_BUCKET") or "materia-artifacts"
|
||||
r2_endpoint = get_secret("R2_ENDPOINT")
|
||||
|
||||
if not r2_endpoint:
|
||||
try:
|
||||
result = subprocess.run(
|
||||
pipeline["command"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout_seconds,
|
||||
)
|
||||
return PipelineResult(
|
||||
success=result.returncode == 0,
|
||||
output=result.stdout,
|
||||
error=result.stderr if result.returncode != 0 else None,
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
return PipelineResult(
|
||||
success=False,
|
||||
output="",
|
||||
error="R2_ENDPOINT not configured in secrets",
|
||||
error=f"Pipeline '{pipeline_name}' timed out after {timeout_seconds} seconds",
|
||||
)
|
||||
|
||||
try:
|
||||
worker = create_worker(worker_name, worker_type, provider)
|
||||
|
||||
artifact_url = f"https://{r2_endpoint}/{r2_bucket}/{pipeline_config.artifact}"
|
||||
|
||||
bootstrap_commands = [
|
||||
f"curl -fsSL -o artifact.tar.gz {artifact_url}",
|
||||
"tar -xzf artifact.tar.gz",
|
||||
"chmod +x -R .",
|
||||
]
|
||||
|
||||
for cmd in bootstrap_commands:
|
||||
_, error, exit_code = _execute_ssh_command(worker.ip, cmd, {})
|
||||
if exit_code != 0:
|
||||
return PipelineResult(
|
||||
success=False,
|
||||
output="",
|
||||
error=f"Bootstrap failed: {error}",
|
||||
)
|
||||
|
||||
env_vars = {}
|
||||
for secret_key in pipeline_config.secrets:
|
||||
value = get_secret(secret_key)
|
||||
if value:
|
||||
env_vars[secret_key] = value
|
||||
|
||||
command = pipeline_config.command
|
||||
output, error, exit_code = _execute_ssh_command(worker.ip, command, env_vars)
|
||||
|
||||
success = exit_code == 0
|
||||
|
||||
return PipelineResult(
|
||||
success=success,
|
||||
output=output,
|
||||
error=error if not success else None,
|
||||
)
|
||||
|
||||
finally:
|
||||
if auto_destroy:
|
||||
with contextlib.suppress(Exception):
|
||||
destroy_worker(worker_name, provider)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
"""Cloud provider abstraction for worker management."""
|
||||
"""Cloud provider for worker management."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Protocol
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -14,35 +13,10 @@ class Instance:
|
||||
type: str
|
||||
|
||||
|
||||
class ProviderModule(Protocol):
    """Structural type for the provider modules returned by ``get_provider``.

    Provider modules expose these operations as plain module-level
    functions. When a module is matched against a protocol, the ``self``
    parameter is not part of the compared signature, so each method takes
    an ordinary ``self`` followed by the real parameters. Using the first
    data argument as ``self`` (e.g. ``self: str``) would remove that
    argument from the interface, and a module defining
    ``create_instance(name, instance_type, ssh_key, location)`` would no
    longer satisfy the protocol.
    """

    def create_instance(
        self,
        name: str,
        instance_type: str,
        ssh_key: str,
        location: str | None = None,
    ) -> Instance:
        """Create an instance and return its ``Instance`` record."""
        ...

    def destroy_instance(self, instance_id: str) -> None:
        """Destroy the instance identified by *instance_id*."""
        ...

    def list_instances(self, label: str | None = None) -> list[Instance]:
        """Return instances; *label* is an optional filter (``None`` = no label given)."""
        ...

    def get_instance(self, name: str) -> Instance | None:
        """Return the instance called *name*, or ``None``."""
        ...

    def wait_for_ssh(self, ip: str, timeout: int = 300) -> bool:
        """Wait for SSH on *ip*; *timeout* defaults to 300 (presumably seconds — confirm)."""
        ...
|
||||
|
||||
|
||||
def get_provider(provider_name: str) -> ProviderModule:
|
||||
def get_provider(provider_name: str):
|
||||
if provider_name == "hetzner":
|
||||
from materia.providers import hetzner
|
||||
|
||||
return hetzner
|
||||
elif provider_name == "ovh":
|
||||
from materia.providers import ovh
|
||||
return ovh
|
||||
elif provider_name == "scaleway":
|
||||
from materia.providers import scaleway
|
||||
return scaleway
|
||||
elif provider_name == "oracle":
|
||||
from materia.providers import oracle
|
||||
return oracle
|
||||
else:
|
||||
raise ValueError(f"Unknown provider: {provider_name}")
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
"""Oracle Cloud provider implementation."""
|
||||
|
||||
from materia.providers import Instance
|
||||
|
||||
|
||||
def create_instance(
    name: str,
    instance_type: str,
    ssh_key: str,
    location: str | None = None,
) -> Instance:
    """Placeholder for instance creation on Oracle Cloud.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "Oracle Cloud provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def destroy_instance(instance_id: str) -> None:
    """Placeholder for instance deletion on Oracle Cloud.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "Oracle Cloud provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def list_instances(label: str | None = None) -> list[Instance]:
    """Placeholder for instance listing on Oracle Cloud.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "Oracle Cloud provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def get_instance(name: str) -> Instance | None:
    """Placeholder for instance lookup on Oracle Cloud.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "Oracle Cloud provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def wait_for_ssh(ip: str, timeout: int = 300) -> bool:
    """Placeholder for the SSH wait on Oracle Cloud.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "Oracle Cloud provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
@@ -1,28 +0,0 @@
|
||||
"""OVH Cloud provider implementation."""
|
||||
|
||||
from materia.providers import Instance
|
||||
|
||||
|
||||
def create_instance(
    name: str,
    instance_type: str,
    ssh_key: str,
    location: str | None = None,
) -> Instance:
    """Placeholder for instance creation on OVH.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "OVH provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def destroy_instance(instance_id: str) -> None:
    """Placeholder for instance deletion on OVH.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "OVH provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def list_instances(label: str | None = None) -> list[Instance]:
    """Placeholder for instance listing on OVH.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "OVH provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def get_instance(name: str) -> Instance | None:
    """Placeholder for instance lookup on OVH.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "OVH provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def wait_for_ssh(ip: str, timeout: int = 300) -> bool:
    """Placeholder for the SSH wait on OVH.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "OVH provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
@@ -1,28 +0,0 @@
|
||||
"""Scaleway provider implementation."""
|
||||
|
||||
from materia.providers import Instance
|
||||
|
||||
|
||||
def create_instance(
    name: str,
    instance_type: str,
    ssh_key: str,
    location: str | None = None,
) -> Instance:
    """Placeholder for instance creation on Scaleway.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "Scaleway provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def destroy_instance(instance_id: str) -> None:
    """Placeholder for instance deletion on Scaleway.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "Scaleway provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def list_instances(label: str | None = None) -> list[Instance]:
    """Placeholder for instance listing on Scaleway.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "Scaleway provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def get_instance(name: str) -> Instance | None:
    """Placeholder for instance lookup on Scaleway.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "Scaleway provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
|
||||
|
||||
def wait_for_ssh(ip: str, timeout: int = 300) -> bool:
    """Placeholder for the SSH wait on Scaleway.

    Raises:
        NotImplementedError: always; the provider is not implemented.
    """
    reason = "Scaleway provider not yet implemented"
    raise NotImplementedError(reason)
|
||||
Reference in New Issue
Block a user