Refactor to local-first architecture on Hetzner NVMe

Remove distributed R2/Iceberg/SSH pipeline architecture in favor of
local subprocess execution with NVMe storage. Landing data backed up
to R2 via rclone timer.

- Strip Iceberg catalog, httpfs, boto3, paramiko, prefect, pyarrow
- Pipelines run via subprocess.run() with bounded timeouts
- Extract writes to {LANDING_DIR}/psd/{year}/{month}/{etag}.csv.gzip
- SQLMesh reads LANDING_DIR variable, writes to DUCKDB_PATH
- Delete unused provider stubs (ovh, scaleway, oracle)
- Add rclone systemd timer for R2 backup every 6h
- Update supervisor to run pipelines with env vars

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-18 18:05:41 +01:00
parent 910424c956
commit c1d00dcdc4
25 changed files with 231 additions and 1807 deletions

View File

@@ -2,16 +2,13 @@
name = "psdonline"
version = "0.1.0"
description = "Extractor for the USDA PSD Online dataset (local-first NVMe pipeline)"
readme = "README.md"
authors = [
{ name = "Deeman", email = "hendriknote@gmail.com" }
]
requires-python = ">=3.13"
dependencies = [
"boto3>=1.40.55",
"niquests>=3.14.1",
"pendulum>=3.1.0",
]
[project.scripts]
extract_psd = "psdonline.execute:extract_psd_dataset"

View File

@@ -5,63 +5,32 @@ import pathlib
import sys
from datetime import datetime
import boto3
import niquests
from botocore.exceptions import ClientError
# NOTE(review): this span is a rendered git diff — the logging.basicConfig
# call shows BOTH the removed kwargs (single-quoted, L52-L56 style) and their
# added replacements (double-quoted, one-line handlers list). As rendered it
# is not valid Python; code lines are reproduced verbatim below with review
# tags only.
logging.basicConfig(
level=logging.INFO,
# -- removed kwargs --
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
handlers=[
logging.StreamHandler(sys.stdout)
]
# -- added kwargs (same values, reformatted) --
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger("PSDOnline Extractor")
# -- removed: legacy output dir next to the module file --
OUTPUT_DIR = pathlib.Path(__file__).parent / "data"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Output dir: {OUTPUT_DIR}")
# -- removed: R2 credential plumbing (replaced by rclone backup per commit msg) --
# R2 configuration from environment
R2_ENDPOINT = os.getenv('R2_ENDPOINT')
R2_BUCKET = os.getenv('R2_BUCKET')
R2_ACCESS_KEY = os.getenv('R2_ACCESS_KEY') or os.getenv('R2_ADMIN_ACCESS_KEY_ID')
R2_SECRET_KEY = os.getenv('R2_SECRET_KEY') or os.getenv('R2_ADMIN_SECRET_ACCESS_KEY')
# -- added: local landing directory, overridable via LANDING_DIR env var --
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
LANDING_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Landing dir: {LANDING_DIR}")
# USDA publishes monthly archives; URL is parameterized by year and 2-digit month.
PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month:02d}/psd_alldata_csv.zip"
# Earliest archive available on the USDA server (2006-08) — presumably the
# first published month; TODO confirm against the USDA archive listing.
FIRST_YEAR = 2006
FIRST_MONTH = 8
def check_r2_file_exists(etag: str, s3_client) -> bool:
    """Return True when the landing object for *etag* already exists in R2.

    Issues a HEAD request against the derived landing key. A 404 from the
    service means the object is absent; any other client error is re-raised
    for the caller to handle.
    """
    object_key = f"landing/psd/{etag}.csv.gzip"
    try:
        s3_client.head_object(Bucket=R2_BUCKET, Key=object_key)
    except ClientError as exc:
        # head_object surfaces a missing key as error code "404".
        if exc.response["Error"]["Code"] == "404":
            return False
        raise
    logger.info(f"File {object_key} already exists in R2, skipping")
    return True
HTTP_TIMEOUT_SECONDS = 60
def upload_to_r2(content: bytes, etag: str, s3_client):
    """Write *content* to the R2 bucket under the landing key derived from *etag*."""
    destination = f"landing/psd/{etag}.csv.gzip"
    logger.info(f"Uploading to R2: {destination}")
    s3_client.put_object(Bucket=R2_BUCKET, Key=destination, Body=content)
    logger.info("Upload complete")
# NOTE(review): this span is a rendered git diff, not runnable Python — it
# interleaves the removed R2-capable implementation with its local-only
# replacement (two `def` lines, duplicated HEAD/GET calls, one embedded hunk
# header). Review tags below mark which version each run of lines belongs to;
# the code lines themselves are reproduced verbatim.
# -- removed signature/docstring (R2 mode) --
def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niquests.Session, s3_client=None):
"""
Extract PSD file either to local storage or R2.
If s3_client is provided, uploads to R2 only (no local storage).
If s3_client is None, downloads to local storage.
"""
# -- added signature/docstring (local mode) --
def extract_psd_file(url: str, year: int, month: int, http_session: niquests.Session):
"""Extract PSD file to local year/month subdirectory."""
logger.info(f"Requesting file {url} ...")
# removed: HEAD request with no timeout
response = http_session.head(url)
# added: HEAD request with a bounded timeout
response = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
if response.status_code == 404:
logger.error("File doesn't exist on server, received status code 404 Not Found")
return
@@ -69,55 +38,31 @@ def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niqu
logger.error(f"Status code not ok, STATUS={response.status_code}")
return
# removed: raises AttributeError if the etag header is absent
etag = response.headers.get("etag").replace('"',"").replace(":","_")
# added: defaults to "" then asserts — NOTE assert is stripped under `python -O`
etag = response.headers.get("etag", "").replace('"', "").replace(":", "_")
assert etag, "USDA response missing etag header"
# -- removed block: R2 upload path --
# R2 mode: check R2 and upload if needed
if s3_client:
if check_r2_file_exists(etag, s3_client):
return
response = http_session.get(url)
normalized_content = normalize_zipped_csv(response.content)
upload_to_r2(normalized_content, etag, s3_client)
return
# Local mode: check local and download if needed
# added: landing path derived from year/month instead of a caller-supplied dir
extract_to_path = LANDING_DIR / "psd" / str(year) / f"{month:02d}"
local_file = extract_to_path / f"{etag}.csv.gzip"
if local_file.exists():
# removed/added pair: log text corrected from ".zip" to ".csv.gzip"
logger.info(f"File {etag}.zip already exists locally, skipping")
logger.info(f"File {etag}.csv.gzip already exists locally, skipping")
return
# removed/added pair: GET now carries the bounded timeout
response = http_session.get(url)
response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
logger.info(f"Storing file to {local_file}")
extract_to_path.mkdir(parents=True, exist_ok=True)
# normalize_zipped_csv is defined elsewhere in the file — presumably re-packs
# the USDA zip payload as gzip'd CSV; TODO confirm against its definition.
normalized_content = normalize_zipped_csv(response.content)
local_file.write_bytes(normalized_content)
assert local_file.exists(), f"File was not written: {local_file}"
logger.info("Download complete")
# NOTE(review): rendered diff span — the removed R2-client setup and the old
# extract_psd_file call site appear alongside their replacements, with a hunk
# header embedded mid-body. Code lines reproduced verbatim with review tags.
def extract_psd_dataset():
# NOTE(review): datetime.now() is naive local time — presumably acceptable
# for month arithmetic; confirm server timezone assumptions.
today = datetime.now()
# -- removed block: optional R2 client construction --
# Check if R2 credentials are configured
use_r2 = all([R2_ENDPOINT, R2_BUCKET, R2_ACCESS_KEY, R2_SECRET_KEY])
if use_r2:
logger.info("R2 credentials found, uploading to R2")
s3_client = boto3.client(
's3',
endpoint_url=R2_ENDPOINT,
aws_access_key_id=R2_ACCESS_KEY,
aws_secret_access_key=R2_SECRET_KEY
)
else:
logger.info("R2 credentials not found, downloading to local storage")
s3_client = None
# Try current month and previous 3 months (USDA data is published with lag)
with niquests.Session() as session:
for months_back in range(4):
year = today.year
month = today.month - months_back
# Handle year rollover
while month < 1:
month += 12
year -= 1
@@ -125,11 +70,10 @@ def extract_psd_dataset():
url = PSD_HISTORICAL_URL.format(year=year, month=month)
logger.info(f"Trying {year}-{month:02d}...")
# Check if URL exists
# removed/added pair: HEAD probe gains a bounded timeout
response = session.head(url)
response = session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
if response.status_code == 200:
logger.info(f"Found latest data at {year}-{month:02d}")
# removed/added pair: call site updated to the new local-only signature
extract_psd_file(url=url, http_session=session, extract_to_path=OUTPUT_DIR, s3_client=s3_client)
extract_psd_file(url=url, year=year, month=month, http_session=session)
return
elif response.status_code == 404:
logger.info(f"Month {year}-{month:02d} not found, trying earlier...")
@@ -141,5 +85,3 @@ def extract_psd_dataset():
# Script entry point: run a single extraction pass when executed directly.
if __name__ == "__main__":
    extract_psd_dataset()