Refactor to local-first architecture on Hetzner NVMe

Remove distributed R2/Iceberg/SSH pipeline architecture in favor of
local subprocess execution with NVMe storage. Landing data backed up
to R2 via rclone timer.

- Strip Iceberg catalog, httpfs, boto3, paramiko, prefect, pyarrow
- Pipelines run via subprocess.run() with bounded timeouts
- Extract writes to {LANDING_DIR}/psd/{year}/{month}/{etag}.csv.gzip
- SQLMesh reads LANDING_DIR variable, writes to DUCKDB_PATH
- Delete unused provider stubs (ovh, scaleway, oracle)
- Add rclone systemd timer for R2 backup every 6h
- Update supervisor to run pipelines with env vars

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-18 18:05:41 +01:00
parent 910424c956
commit c1d00dcdc4
25 changed files with 231 additions and 1807 deletions

View File

@@ -2,16 +2,13 @@
name = "psdonline"
version = "0.1.0"
description = "Extractor for the USDA PSD Online dataset (local-first NVMe pipeline)"
readme = "README.md"
authors = [
{ name = "Deeman", email = "hendriknote@gmail.com" }
]
requires-python = ">=3.13"
dependencies = [
"boto3>=1.40.55",
"niquests>=3.14.1",
"pendulum>=3.1.0",
]
[project.scripts]
extract_psd = "psdonline.execute:extract_psd_dataset"

View File

@@ -5,63 +5,32 @@ import pathlib
import sys
from datetime import datetime
import boto3
import niquests
from botocore.exceptions import ClientError
# NOTE(review): this span is a rendered git diff — the logging.basicConfig
# call shows BOTH the removed kwargs (single-quoted, L52-L56 style) and their
# added replacements (double-quoted, one-line handlers list). As rendered it
# is not valid Python; code lines are reproduced verbatim below with review
# tags only.
logging.basicConfig(
level=logging.INFO,
# -- removed kwargs --
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
handlers=[
logging.StreamHandler(sys.stdout)
]
# -- added kwargs (same values, reformatted) --
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger("PSDOnline Extractor")
# -- removed: legacy output dir next to the module file --
OUTPUT_DIR = pathlib.Path(__file__).parent / "data"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Output dir: {OUTPUT_DIR}")
# -- removed: R2 credential plumbing (replaced by rclone backup per commit msg) --
# R2 configuration from environment
R2_ENDPOINT = os.getenv('R2_ENDPOINT')
R2_BUCKET = os.getenv('R2_BUCKET')
R2_ACCESS_KEY = os.getenv('R2_ACCESS_KEY') or os.getenv('R2_ADMIN_ACCESS_KEY_ID')
R2_SECRET_KEY = os.getenv('R2_SECRET_KEY') or os.getenv('R2_ADMIN_SECRET_ACCESS_KEY')
# -- added: local landing directory, overridable via LANDING_DIR env var --
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
LANDING_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Landing dir: {LANDING_DIR}")
# USDA publishes monthly archives; URL is parameterized by year and 2-digit month.
PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month:02d}/psd_alldata_csv.zip"
# Earliest archive available on the USDA server (2006-08) — presumably the
# first published month; TODO confirm against the USDA archive listing.
FIRST_YEAR = 2006
FIRST_MONTH = 8
def check_r2_file_exists(etag: str, s3_client) -> bool:
    """Return True when the landing object for *etag* already exists in R2.

    Issues a HEAD request against the derived landing key. A 404 from the
    service means the object is absent; any other client error is re-raised
    for the caller to handle.
    """
    object_key = f"landing/psd/{etag}.csv.gzip"
    try:
        s3_client.head_object(Bucket=R2_BUCKET, Key=object_key)
    except ClientError as exc:
        # head_object surfaces a missing key as error code "404".
        if exc.response["Error"]["Code"] == "404":
            return False
        raise
    logger.info(f"File {object_key} already exists in R2, skipping")
    return True
HTTP_TIMEOUT_SECONDS = 60
def upload_to_r2(content: bytes, etag: str, s3_client):
    """Write *content* to the R2 bucket under the landing key derived from *etag*."""
    destination = f"landing/psd/{etag}.csv.gzip"
    logger.info(f"Uploading to R2: {destination}")
    s3_client.put_object(Bucket=R2_BUCKET, Key=destination, Body=content)
    logger.info("Upload complete")
# NOTE(review): this span is a rendered git diff, not runnable Python — it
# interleaves the removed R2-capable implementation with its local-only
# replacement (two `def` lines, duplicated HEAD/GET calls, one embedded hunk
# header). Review tags below mark which version each run of lines belongs to;
# the code lines themselves are reproduced verbatim.
# -- removed signature/docstring (R2 mode) --
def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niquests.Session, s3_client=None):
"""
Extract PSD file either to local storage or R2.
If s3_client is provided, uploads to R2 only (no local storage).
If s3_client is None, downloads to local storage.
"""
# -- added signature/docstring (local mode) --
def extract_psd_file(url: str, year: int, month: int, http_session: niquests.Session):
"""Extract PSD file to local year/month subdirectory."""
logger.info(f"Requesting file {url} ...")
# removed: HEAD request with no timeout
response = http_session.head(url)
# added: HEAD request with a bounded timeout
response = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
if response.status_code == 404:
logger.error("File doesn't exist on server, received status code 404 Not Found")
return
@@ -69,55 +38,31 @@ def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niqu
logger.error(f"Status code not ok, STATUS={response.status_code}")
return
# removed: raises AttributeError if the etag header is absent
etag = response.headers.get("etag").replace('"',"").replace(":","_")
# added: defaults to "" then asserts — NOTE assert is stripped under `python -O`
etag = response.headers.get("etag", "").replace('"', "").replace(":", "_")
assert etag, "USDA response missing etag header"
# -- removed block: R2 upload path --
# R2 mode: check R2 and upload if needed
if s3_client:
if check_r2_file_exists(etag, s3_client):
return
response = http_session.get(url)
normalized_content = normalize_zipped_csv(response.content)
upload_to_r2(normalized_content, etag, s3_client)
return
# Local mode: check local and download if needed
# added: landing path derived from year/month instead of a caller-supplied dir
extract_to_path = LANDING_DIR / "psd" / str(year) / f"{month:02d}"
local_file = extract_to_path / f"{etag}.csv.gzip"
if local_file.exists():
# removed/added pair: log text corrected from ".zip" to ".csv.gzip"
logger.info(f"File {etag}.zip already exists locally, skipping")
logger.info(f"File {etag}.csv.gzip already exists locally, skipping")
return
# removed/added pair: GET now carries the bounded timeout
response = http_session.get(url)
response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
logger.info(f"Storing file to {local_file}")
extract_to_path.mkdir(parents=True, exist_ok=True)
# normalize_zipped_csv is defined elsewhere in the file — presumably re-packs
# the USDA zip payload as gzip'd CSV; TODO confirm against its definition.
normalized_content = normalize_zipped_csv(response.content)
local_file.write_bytes(normalized_content)
assert local_file.exists(), f"File was not written: {local_file}"
logger.info("Download complete")
# NOTE(review): rendered diff span — the removed R2-client setup and the old
# extract_psd_file call site appear alongside their replacements, with a hunk
# header embedded mid-body. Code lines reproduced verbatim with review tags.
def extract_psd_dataset():
# NOTE(review): datetime.now() is naive local time — presumably acceptable
# for month arithmetic; confirm server timezone assumptions.
today = datetime.now()
# -- removed block: optional R2 client construction --
# Check if R2 credentials are configured
use_r2 = all([R2_ENDPOINT, R2_BUCKET, R2_ACCESS_KEY, R2_SECRET_KEY])
if use_r2:
logger.info("R2 credentials found, uploading to R2")
s3_client = boto3.client(
's3',
endpoint_url=R2_ENDPOINT,
aws_access_key_id=R2_ACCESS_KEY,
aws_secret_access_key=R2_SECRET_KEY
)
else:
logger.info("R2 credentials not found, downloading to local storage")
s3_client = None
# Try current month and previous 3 months (USDA data is published with lag)
with niquests.Session() as session:
for months_back in range(4):
year = today.year
month = today.month - months_back
# Handle year rollover
while month < 1:
month += 12
year -= 1
@@ -125,11 +70,10 @@ def extract_psd_dataset():
url = PSD_HISTORICAL_URL.format(year=year, month=month)
logger.info(f"Trying {year}-{month:02d}...")
# Check if URL exists
# removed/added pair: HEAD probe gains a bounded timeout
response = session.head(url)
response = session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
if response.status_code == 200:
logger.info(f"Found latest data at {year}-{month:02d}")
# removed/added pair: call site updated to the new local-only signature
extract_psd_file(url=url, http_session=session, extract_to_path=OUTPUT_DIR, s3_client=s3_client)
extract_psd_file(url=url, year=year, month=month, http_session=session)
return
elif response.status_code == 404:
logger.info(f"Month {year}-{month:02d} not found, trying earlier...")
@@ -141,5 +85,3 @@ def extract_psd_dataset():
# Script entry point: run a single extraction pass when executed directly.
if __name__ == "__main__":
    extract_psd_dataset()