Refactor to local-first architecture on Hetzner NVMe
Remove the distributed R2/Iceberg/SSH pipeline architecture in favor of
local subprocess execution with NVMe storage. Landing data is backed up
to R2 via an rclone timer.
- Strip Iceberg catalog, httpfs, boto3, paramiko, prefect, pyarrow
- Pipelines run via subprocess.run() with bounded timeouts
- Extract writes to {LANDING_DIR}/psd/{year}/{month}/{etag}.csv.gzip
- SQLMesh reads LANDING_DIR variable, writes to DUCKDB_PATH
- Delete unused provider stubs (ovh, scaleway, oracle)
- Add rclone systemd timer for R2 backup every 6h
- Update supervisor to run pipelines with env vars
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,16 +2,13 @@
|
||||
name = "psdonline"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
authors = [
|
||||
{ name = "Deeman", email = "hendriknote@gmail.com" }
|
||||
]
|
||||
requires-python = ">=3.13"
|
||||
|
||||
dependencies = [
|
||||
"boto3>=1.40.55",
|
||||
"niquests>=3.14.1",
|
||||
"pendulum>=3.1.0",
|
||||
]
|
||||
[project.scripts]
|
||||
extract_psd = "psdonline.execute:extract_psd_dataset"
|
||||
|
||||
@@ -5,63 +5,32 @@ import pathlib
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
import boto3
|
||||
import niquests
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S',
|
||||
handlers=[
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
handlers=[logging.StreamHandler(sys.stdout)],
|
||||
)
|
||||
logger = logging.getLogger("PSDOnline Extractor")
|
||||
OUTPUT_DIR = pathlib.Path(__file__).parent / "data"
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
logger.info(f"Output dir: {OUTPUT_DIR}")
|
||||
|
||||
# R2 configuration from environment
|
||||
R2_ENDPOINT = os.getenv('R2_ENDPOINT')
|
||||
R2_BUCKET = os.getenv('R2_BUCKET')
|
||||
R2_ACCESS_KEY = os.getenv('R2_ACCESS_KEY') or os.getenv('R2_ADMIN_ACCESS_KEY_ID')
|
||||
R2_SECRET_KEY = os.getenv('R2_SECRET_KEY') or os.getenv('R2_ADMIN_SECRET_ACCESS_KEY')
|
||||
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
|
||||
LANDING_DIR.mkdir(parents=True, exist_ok=True)
|
||||
logger.info(f"Landing dir: {LANDING_DIR}")
|
||||
|
||||
PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month:02d}/psd_alldata_csv.zip"
|
||||
FIRST_YEAR = 2006
|
||||
FIRST_MONTH = 8
|
||||
|
||||
def check_r2_file_exists(etag: str, s3_client) -> bool:
|
||||
"""Check if file exists in R2."""
|
||||
r2_key = f"landing/psd/{etag}.csv.gzip"
|
||||
try:
|
||||
s3_client.head_object(Bucket=R2_BUCKET, Key=r2_key)
|
||||
logger.info(f"File {r2_key} already exists in R2, skipping")
|
||||
return True
|
||||
except ClientError as e:
|
||||
if e.response['Error']['Code'] == '404':
|
||||
return False
|
||||
raise
|
||||
HTTP_TIMEOUT_SECONDS = 60
|
||||
|
||||
|
||||
def upload_to_r2(content: bytes, etag: str, s3_client):
|
||||
"""Upload file content to R2."""
|
||||
r2_key = f"landing/psd/{etag}.csv.gzip"
|
||||
logger.info(f"Uploading to R2: {r2_key}")
|
||||
s3_client.put_object(Bucket=R2_BUCKET, Key=r2_key, Body=content)
|
||||
logger.info("Upload complete")
|
||||
|
||||
|
||||
def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niquests.Session, s3_client=None):
|
||||
"""
|
||||
Extract PSD file either to local storage or R2.
|
||||
If s3_client is provided, uploads to R2 only (no local storage).
|
||||
If s3_client is None, downloads to local storage.
|
||||
"""
|
||||
def extract_psd_file(url: str, year: int, month: int, http_session: niquests.Session):
|
||||
"""Extract PSD file to local year/month subdirectory."""
|
||||
logger.info(f"Requesting file {url} ...")
|
||||
|
||||
response = http_session.head(url)
|
||||
response = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||
if response.status_code == 404:
|
||||
logger.error("File doesn't exist on server, received status code 404 Not Found")
|
||||
return
|
||||
@@ -69,55 +38,31 @@ def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niqu
|
||||
logger.error(f"Status code not ok, STATUS={response.status_code}")
|
||||
return
|
||||
|
||||
etag = response.headers.get("etag").replace('"',"").replace(":","_")
|
||||
etag = response.headers.get("etag", "").replace('"', "").replace(":", "_")
|
||||
assert etag, "USDA response missing etag header"
|
||||
|
||||
# R2 mode: check R2 and upload if needed
|
||||
if s3_client:
|
||||
if check_r2_file_exists(etag, s3_client):
|
||||
return
|
||||
response = http_session.get(url)
|
||||
normalized_content = normalize_zipped_csv(response.content)
|
||||
upload_to_r2(normalized_content, etag, s3_client)
|
||||
return
|
||||
|
||||
# Local mode: check local and download if needed
|
||||
extract_to_path = LANDING_DIR / "psd" / str(year) / f"{month:02d}"
|
||||
local_file = extract_to_path / f"{etag}.csv.gzip"
|
||||
if local_file.exists():
|
||||
logger.info(f"File {etag}.zip already exists locally, skipping")
|
||||
logger.info(f"File {etag}.csv.gzip already exists locally, skipping")
|
||||
return
|
||||
|
||||
response = http_session.get(url)
|
||||
response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||
logger.info(f"Storing file to {local_file}")
|
||||
extract_to_path.mkdir(parents=True, exist_ok=True)
|
||||
normalized_content = normalize_zipped_csv(response.content)
|
||||
local_file.write_bytes(normalized_content)
|
||||
assert local_file.exists(), f"File was not written: {local_file}"
|
||||
logger.info("Download complete")
|
||||
|
||||
|
||||
def extract_psd_dataset():
|
||||
today = datetime.now()
|
||||
|
||||
# Check if R2 credentials are configured
|
||||
use_r2 = all([R2_ENDPOINT, R2_BUCKET, R2_ACCESS_KEY, R2_SECRET_KEY])
|
||||
|
||||
if use_r2:
|
||||
logger.info("R2 credentials found, uploading to R2")
|
||||
s3_client = boto3.client(
|
||||
's3',
|
||||
endpoint_url=R2_ENDPOINT,
|
||||
aws_access_key_id=R2_ACCESS_KEY,
|
||||
aws_secret_access_key=R2_SECRET_KEY
|
||||
)
|
||||
else:
|
||||
logger.info("R2 credentials not found, downloading to local storage")
|
||||
s3_client = None
|
||||
|
||||
# Try current month and previous 3 months (USDA data is published with lag)
|
||||
with niquests.Session() as session:
|
||||
for months_back in range(4):
|
||||
year = today.year
|
||||
month = today.month - months_back
|
||||
# Handle year rollover
|
||||
while month < 1:
|
||||
month += 12
|
||||
year -= 1
|
||||
@@ -125,11 +70,10 @@ def extract_psd_dataset():
|
||||
url = PSD_HISTORICAL_URL.format(year=year, month=month)
|
||||
logger.info(f"Trying {year}-{month:02d}...")
|
||||
|
||||
# Check if URL exists
|
||||
response = session.head(url)
|
||||
response = session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||
if response.status_code == 200:
|
||||
logger.info(f"Found latest data at {year}-{month:02d}")
|
||||
extract_psd_file(url=url, http_session=session, extract_to_path=OUTPUT_DIR, s3_client=s3_client)
|
||||
extract_psd_file(url=url, year=year, month=month, http_session=session)
|
||||
return
|
||||
elif response.status_code == 404:
|
||||
logger.info(f"Month {year}-{month:02d} not found, trying earlier...")
|
||||
@@ -141,5 +85,3 @@ def extract_psd_dataset():
|
||||
|
||||
if __name__ == "__main__":
|
||||
extract_psd_dataset()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user