feat: extraction framework overhaul — extract_core shared package + SQLite state tracking
- Add extract/extract_core/ workspace package with three modules:
- state.py: SQLite run tracking (open_state_db, start_run, end_run, get_last_cursor)
- http.py: niquests session factory + etag normalization helpers
- files.py: landing_path, content_hash, write_bytes_atomic (atomic gzip writes)
- State lives at {LANDING_DIR}/.state.sqlite — no extra env var needed
- SQLite chosen over DuckDB: state tracking is OLTP (row inserts/updates), not analytical
- Refactor all 4 extractors (psdonline, cftc_cot, coffee_prices, ice_stocks):
- Replace inline boilerplate with extract_core helpers
- Add start_run/end_run tracking to every extraction entry point
- extract_cot_year returns int (bytes_written) instead of bool
- Update tests: assert result == 0 (not `is False`) for the return type change
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,7 @@ version = "0.1.0"
|
||||
description = "KC=F Coffee C futures price extractor"
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"extract_core",
|
||||
"yfinance>=0.2.55",
|
||||
]
|
||||
|
||||
|
||||
@@ -8,14 +8,15 @@ Landing path: LANDING_DIR/prices/coffee_kc/{hash8}.csv.gzip
|
||||
"""
|
||||
|
||||
import gzip
|
||||
import hashlib
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import yfinance as yf
|
||||
from extract_core import content_hash, end_run, landing_path, open_state_db, start_run
|
||||
from extract_core import write_bytes_atomic
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -25,7 +26,7 @@ logging.basicConfig(
|
||||
)
|
||||
logger = logging.getLogger("Coffee Prices Extractor")

# Root for all landed files; the extract_core state DB (.state.sqlite)
# also lives under this directory, so no extra env var is needed.
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
# Yahoo Finance symbol for Coffee C futures.
TICKER = "KC=F"
# Subdirectory of LANDING_DIR where the gzipped CSV snapshots are written.
DEST_SUBDIR = "prices/coffee_kc"
|
||||
def extract_coffee_prices() -> None:
    """Download KC=F daily OHLCV history and land it as a gzipped CSV.

    On first run downloads full history (period='max'). On subsequent runs
    the hash matches if no new trading days have closed since last run, so
    the existing file is detected and nothing is rewritten (idempotent).

    The whole extraction is tracked in the extract_core state DB: a run row
    is opened with start_run() and closed with end_run() on success, skip,
    or failure; failures are recorded and then re-raised.
    """
    conn = open_state_db(LANDING_DIR)
    run_id = start_run(conn, "coffee_prices")
    try:
        logger.info(f"Downloading {TICKER} daily OHLCV from Yahoo Finance...")

        ticker = yf.Ticker(TICKER)
        df = ticker.history(period="max", interval="1d", auto_adjust=False, timeout=DOWNLOAD_TIMEOUT_SECONDS)

        assert df is not None and len(df) > 0, f"yfinance returned empty DataFrame for {TICKER}"

        # Reset index so Date becomes a plain column
        df = df.reset_index()

        # Keep standard OHLCV columns only; yfinance may return extra columns
        keep_cols = [c for c in ["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume"] if c in df.columns]
        df = df[keep_cols]

        # Normalize Date to ISO string for CSV stability across timezones
        df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")

        # Serialize to CSV bytes
        csv_buf = io.StringIO()
        df.to_csv(csv_buf, index=False)
        csv_bytes = csv_buf.getvalue().encode("utf-8")

        assert len(csv_bytes) > 0, "CSV serialization produced empty output"

        # Hash-based idempotency: the filename derives from the content hash,
        # so an unchanged dataset maps onto an already-existing file.
        etag = content_hash(csv_bytes)
        dest_dir = landing_path(LANDING_DIR, DEST_SUBDIR)
        local_file = dest_dir / f"{etag}.csv.gzip"

        if local_file.exists():
            logger.info(f"File {local_file.name} already exists — no new data, skipping")
            end_run(conn, run_id, status="success", files_skipped=1)
            return

        # write_bytes_atomic handles temp-file + rename, so a crash mid-write
        # can never leave a partial landing file behind.
        compressed = gzip.compress(csv_bytes)
        bytes_written = write_bytes_atomic(local_file, compressed)

        logger.info(
            f"Stored {local_file} ({bytes_written:,} bytes, {len(df):,} rows)"
        )
        end_run(conn, run_id, status="success", files_written=1, bytes_written=bytes_written)
    except Exception as e:
        # Record the failure against this run before propagating it.
        end_run(conn, run_id, status="failed", error_message=str(e))
        raise
    finally:
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user