feat: extraction framework overhaul — extract_core shared package + SQLite state tracking

- Add extract/extract_core/ workspace package with three modules:
  - state.py: SQLite run tracking (open_state_db, start_run, end_run, get_last_cursor)
  - http.py: niquests session factory + etag normalization helpers
  - files.py: landing_path, content_hash, write_bytes_atomic (atomic gzip writes)
- State lives at {LANDING_DIR}/.state.sqlite — no extra env var needed
- SQLite chosen over DuckDB: state tracking is OLTP (row inserts/updates), not analytical
- Refactor all 4 extractors (psdonline, cftc_cot, coffee_prices, ice_stocks):
  - Replace inline boilerplate with extract_core helpers
  - Add start_run/end_run tracking to every extraction entry point
  - extract_cot_year returns int (bytes_written) instead of bool
- Update tests: assert result == 0 (not `is False`) for the return type change

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-22 14:37:50 +01:00
parent fc4121183c
commit 80c1163a7f
16 changed files with 702 additions and 290 deletions

View File

@@ -8,6 +8,7 @@ authors = [
requires-python = ">=3.13"
dependencies = [
"extract_core",
"niquests>=3.14.1",
]
[project.scripts]

View File

@@ -1,12 +1,14 @@
from .normalize import normalize_zipped_csv
import logging
import os
import pathlib
import sys
from datetime import datetime
from io import BytesIO
from pathlib import Path
import niquests
from extract_core import end_run, landing_path, normalize_etag, open_state_db, start_run
from extract_core import write_bytes_atomic
logging.basicConfig(
level=logging.INFO,
@@ -16,7 +18,7 @@ logging.basicConfig(
)
logger = logging.getLogger("PSDOnline Extractor")
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
LANDING_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Landing dir: {LANDING_DIR}")
@@ -27,61 +29,87 @@ FIRST_MONTH = 8
HTTP_TIMEOUT_SECONDS = 60
def extract_psd_file(url: str, year: int, month: int, http_session: niquests.Session) -> int:
    """Extract PSD file to local year/month subdirectory.

    Returns bytes_written (0 if the file already existed and was skipped,
    or if the server responded with a non-200 status).
    """
    # NOTE(review): this span was a garbled diff view (pre- and post-change
    # lines interleaved, indentation lost); reconstructed as the post-commit
    # version — confirm against the committed file.
    logger.info(f"Requesting file {url} ...")
    # HEAD first: the etag alone decides whether the body must be fetched.
    response = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
    if response.status_code == 404:
        logger.error("File doesn't exist on server, received status code 404 Not Found")
        return 0
    elif response.status_code != 200:
        logger.error(f"Status code not ok, STATUS={response.status_code}")
        return 0
    raw_etag = response.headers.get("etag", "")
    assert raw_etag, "USDA response missing etag header"
    etag = normalize_etag(raw_etag)
    # landing_path builds (and presumably creates) the destination directory —
    # the old inline mkdir was removed in this commit.
    dest_dir = landing_path(LANDING_DIR, "psd", str(year), f"{month:02d}")
    local_file = dest_dir / f"{etag}.csv.gzip"
    if local_file.exists():
        # Same etag already landed — idempotent skip.
        logger.info(f"File {etag}.csv.gzip already exists locally, skipping")
        return 0
    response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
    logger.info(f"Storing file to {local_file}")
    normalized_content = normalize_zipped_csv(BytesIO(response.content))
    bytes_written = write_bytes_atomic(local_file, normalized_content.read())
    logger.info("Download complete")
    return bytes_written
def extract_psd_dataset():
    """Find and extract the most recent PSD dataset, tracking the run in SQLite.

    Walks back up to 4 months from today, extracts the first month present on
    the server, and records the outcome via start_run/end_run. Closes the
    state-DB connection in all cases; re-raises after marking the run failed.
    """
    # NOTE(review): reconstructed from a garbled diff view (old and new lines
    # interleaved, indentation lost). The post-loop error log appeared
    # unguarded in the diff; guarded here on cursor_value so a successful
    # break does not also log "could not find" — confirm against the commit.
    conn = open_state_db(LANDING_DIR)
    run_id = start_run(conn, "psdonline")
    files_written = 0
    files_skipped = 0
    bytes_written = 0
    cursor_value = None  # "YYYY-MM" of the month actually extracted, if any
    try:
        today = datetime.now()
        with niquests.Session() as session:
            for months_back in range(4):
                year = today.year
                month = today.month - months_back
                # Normalize zero/negative months into the previous year.
                while month < 1:
                    month += 12
                    year -= 1
                url = PSD_HISTORICAL_URL.format(year=year, month=month)
                logger.info(f"Trying {year}-{month:02d}...")
                response = session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
                if response.status_code == 200:
                    logger.info(f"Found latest data at {year}-{month:02d}")
                    result = extract_psd_file(url=url, year=year, month=month, http_session=session)
                    if result > 0:
                        files_written = 1
                        bytes_written = result
                    else:
                        # 0 bytes means skipped (already landed) or a
                        # per-file HTTP error; either way nothing was written.
                        files_skipped = 1
                    cursor_value = f"{year}-{month:02d}"
                    break
                elif response.status_code == 404:
                    logger.info(f"Month {year}-{month:02d} not found, trying earlier...")
                else:
                    logger.warning(f"Unexpected status code {response.status_code} for {year}-{month:02d}")
        if cursor_value is None:
            logger.error("Could not find any available data in the last 4 months")
        end_run(
            conn, run_id, status="success",
            files_written=files_written, files_skipped=files_skipped,
            bytes_written=bytes_written, cursor_value=cursor_value,
        )
    except Exception as e:
        end_run(conn, run_id, status="failed", error_message=str(e))
        raise
    finally:
        conn.close()
if __name__ == "__main__":