feat: extraction framework overhaul — extract_core shared package + SQLite state tracking
- Add extract/extract_core/ workspace package with three modules:
  - state.py: SQLite run tracking (open_state_db, start_run, end_run, get_last_cursor); see the sketch below
- http.py: niquests session factory + etag normalization helpers
  - files.py: landing_path, content_hash, write_bytes_atomic (atomic gzip writes); see the second sketch below
- State lives at {LANDING_DIR}/.state.sqlite — no extra env var needed
- SQLite chosen over DuckDB: state tracking is OLTP (row inserts/updates), not analytical
- Refactor all 4 extractors (psdonline, cftc_cot, coffee_prices, ice_stocks):
- Replace inline boilerplate with extract_core helpers
- Add start_run/end_run tracking to every extraction entry point
- extract_cot_year returns int (bytes_written) instead of bool
- Update tests: assert result == 0 (not `is False`) for the return type change
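
For reference, a minimal sketch of what the state.py helpers could look like. The signatures mirror the call sites in the diff below (open_state_db(LANDING_DIR), start_run(conn, source), end_run(conn, run_id, status=..., files_written=..., files_skipped=..., bytes_written=..., cursor_value=..., error_message=...), get_last_cursor(conn, source)); the table name, columns, and schema are assumptions, not the shipped implementation.

# Hypothetical sketch of extract_core/state.py — schema details are assumed.
import sqlite3
from pathlib import Path


def open_state_db(landing_dir: Path) -> sqlite3.Connection:
    """Open (or create) {LANDING_DIR}/.state.sqlite and ensure the runs table exists."""
    landing_dir.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(landing_dir / ".state.sqlite")
    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS runs (
            run_id INTEGER PRIMARY KEY AUTOINCREMENT,
            source TEXT NOT NULL,
            started_at TEXT NOT NULL DEFAULT (datetime('now')),
            ended_at TEXT,
            status TEXT,
            files_written INTEGER DEFAULT 0,
            files_skipped INTEGER DEFAULT 0,
            bytes_written INTEGER DEFAULT 0,
            cursor_value TEXT,
            error_message TEXT
        )
        """
    )
    conn.commit()
    return conn


def start_run(conn: sqlite3.Connection, source: str) -> int:
    """Insert a new run row for this source and return its run_id."""
    cur = conn.execute("INSERT INTO runs (source) VALUES (?)", (source,))
    conn.commit()
    return cur.lastrowid


def end_run(
    conn: sqlite3.Connection,
    run_id: int,
    *,
    status: str,
    files_written: int = 0,
    files_skipped: int = 0,
    bytes_written: int = 0,
    cursor_value: str | None = None,
    error_message: str | None = None,
) -> None:
    """Close out a run with its final status and counters."""
    conn.execute(
        """
        UPDATE runs SET ended_at = datetime('now'), status = ?, files_written = ?,
            files_skipped = ?, bytes_written = ?, cursor_value = ?, error_message = ?
        WHERE run_id = ?
        """,
        (status, files_written, files_skipped, bytes_written, cursor_value, error_message, run_id),
    )
    conn.commit()


def get_last_cursor(conn: sqlite3.Connection, source: str) -> str | None:
    """Return the cursor_value of the most recent successful run for a source."""
    row = conn.execute(
        "SELECT cursor_value FROM runs WHERE source = ? AND status = 'success' "
        "ORDER BY run_id DESC LIMIT 1",
        (source,),
    ).fetchone()
    return row[0] if row else None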
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
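
Similarly, a rough sketch of the files.py helpers, inferred only from how the diff uses them: content_hash is assumed to return a short sha256 prefix (matching the replaced etag = sha256[:8] behaviour), landing_path to join {LANDING_DIR}/{subdir}/{year}, and write_bytes_atomic to write already-gzipped bytes via a temp file plus rename and return the byte count (the caller still gzip-compresses and mkdirs, as the diff shows). Treat the bodies as assumptions.

# Hypothetical sketch of extract_core/files.py — behaviour inferred from call sites.
import hashlib
import os
from pathlib import Path


def content_hash(data: bytes) -> str:
    """Short content fingerprint used in landing-file names (sha256 prefix, like the old etag)."""
    return hashlib.sha256(data).hexdigest()[:8]


def landing_path(landing_dir: Path, dest_subdir: str, year: str) -> Path:
    """Destination directory {LANDING_DIR}/{dest_subdir}/{year}; callers create it before writing."""
    return landing_dir / dest_subdir / year


def write_bytes_atomic(dest: Path, data: bytes) -> int:
    """Write data to a temp file in the same directory, fsync, then rename over dest.

    Returns the number of bytes written.
    """
    tmp = dest.with_suffix(dest.suffix + ".tmp")
    with open(tmp, "wb") as fh:
        fh.write(data)
        fh.flush()
        os.fsync(fh.fileno())
    os.replace(tmp, dest)  # atomic on POSIX within the same filesystem
    return len(data)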
@@ -4,6 +4,7 @@ version = "0.1.0"
description = "ICE certified warehouse stocks extractor"
requires-python = ">=3.13"
dependencies = [
"extract_core",
"niquests>=3.14.1",
"xlrd>=2.0.1",
]
@@ -20,16 +20,23 @@ CSV schemas:

import csv
import gzip
import hashlib
import io
import logging
import os
import pathlib
import sys
from datetime import datetime
from pathlib import Path

import niquests
import xlrd
from extract_core import (
content_hash,
end_run,
landing_path,
open_state_db,
start_run,
write_bytes_atomic,
)

from ice_stocks.ice_api import find_all_reports, find_latest_report
from ice_stocks.xls_parse import OLE2_MAGIC, detect_file_format, xls_to_rows
@@ -42,7 +49,7 @@ logging.basicConfig(
)
logger = logging.getLogger("ICE Stocks Extractor")

LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))

# ── ice_stocks (daily rolling) ──────────────────────────────────────────────
DEST_SUBDIR = "ice_stocks"
@@ -105,31 +112,30 @@ HISTORICAL_PORT_COLS = [

# ── shared helpers ───────────────────────────────────────────────────────────

def _write_landing_file(canonical_csv: bytes, dest_subdir: str, date_label: str) -> None:
"""SHA256-hash canonical_csv, skip if exists, else gzip and write."""
def _write_landing_file(canonical_csv: bytes, dest_subdir: str, date_label: str) -> int:
"""SHA256-hash canonical_csv, skip if exists, else gzip and write atomically.

Returns bytes_written (0 if skipped).
"""
assert canonical_csv, "canonical_csv must not be empty"
assert dest_subdir, "dest_subdir must not be empty"
assert date_label, "date_label must not be empty"

sha256 = hashlib.sha256(canonical_csv).hexdigest()
etag = sha256[:8]
etag = content_hash(canonical_csv)
year = date_label[:4]

dest_dir = LANDING_DIR / dest_subdir / year
dest_dir = landing_path(LANDING_DIR, dest_subdir, year)
local_file = dest_dir / f"{date_label}_{etag}.csv.gzip"

if local_file.exists():
logger.info(f"File {local_file.name} already exists — content unchanged, skipping")
return
return 0

dest_dir.mkdir(parents=True, exist_ok=True)
compressed = gzip.compress(canonical_csv)
local_file.write_bytes(compressed)
bytes_written = write_bytes_atomic(local_file, compressed)

assert local_file.exists(), f"File was not written: {local_file}"
assert local_file.stat().st_size > 0, f"Written file is empty: {local_file}"

logger.info(f"Stored {local_file} ({local_file.stat().st_size:,} bytes)")
logger.info(f"Stored {local_file} ({bytes_written:,} bytes)")
return bytes_written


def _build_csv_bytes(fieldnames: list[str], rows: list[dict]) -> bytes:
@@ -243,47 +249,66 @@ def extract_ice_stocks() -> None:
discovery to find the latest 'Daily Warehouse Stocks' report.
Idempotent: skips if content hash already on disk.
"""
with niquests.Session() as session:
logger.info(f"Trying ICE rolling CSV: {ICE_ROLLING_CSV_URL}")
try:
response = session.get(ICE_ROLLING_CSV_URL, timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.warning(f"Rolling CSV fetch failed: {e} — trying API discovery")
response = None

use_api = response is None or response.status_code == 404

if use_api:
logger.info("Falling back to ICE API discovery for Daily Warehouse Stocks")
report = find_latest_report(session, ICE_STOCKS_LABEL)
if not report:
logger.error("ICE API: no 'Daily Warehouse Stocks' report found")
return
logger.info(f"Found report via API: {report['download_label']} ({report['publish_date']})")
conn = open_state_db(LANDING_DIR)
run_id = start_run(conn, "ice_stocks")
try:
with niquests.Session() as session:
logger.info(f"Trying ICE rolling CSV: {ICE_ROLLING_CSV_URL}")
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
response = session.get(ICE_ROLLING_CSV_URL, timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download report from API URL: {e}")
logger.warning(f"Rolling CSV fetch failed: {e} — trying API discovery")
response = None

use_api = response is None or response.status_code == 404

if use_api:
logger.info("Falling back to ICE API discovery for Daily Warehouse Stocks")
report = find_latest_report(session, ICE_STOCKS_LABEL)
if not report:
logger.error("ICE API: no 'Daily Warehouse Stocks' report found")
end_run(conn, run_id, status="failed", error_message="No report found via API")
return
logger.info(f"Found report via API: {report['download_label']} ({report['publish_date']})")
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download report from API URL: {e}")
end_run(conn, run_id, status="failed", error_message=str(e))
return

if response.status_code != 200:
logger.error(f"Unexpected status {response.status_code}")
end_run(conn, run_id, status="failed", error_message=f"HTTP {response.status_code}")
return

if response.status_code != 200:
logger.error(f"Unexpected status {response.status_code}")
assert len(response.content) > 0, "Downloaded empty file from ICE"

fmt = detect_file_format(response.content)
if fmt == "xls":
canonical_csv = _build_canonical_csv_from_xls(response.content)
else:
canonical_csv = _build_canonical_csv_from_csv(response.content)

if not canonical_csv:
logger.warning("ICE stocks parsed to 0 rows — check column mapping or XLS structure")
end_run(conn, run_id, status="failed", error_message="Parsed 0 rows")
return

assert len(response.content) > 0, "Downloaded empty file from ICE"

fmt = detect_file_format(response.content)
if fmt == "xls":
canonical_csv = _build_canonical_csv_from_xls(response.content)
else:
canonical_csv = _build_canonical_csv_from_csv(response.content)

if not canonical_csv:
logger.warning("ICE stocks parsed to 0 rows — check column mapping or XLS structure")
return

today = datetime.now().strftime("%Y-%m-%d")
_write_landing_file(canonical_csv, DEST_SUBDIR, today)
today = datetime.now().strftime("%Y-%m-%d")
bytes_written = _write_landing_file(canonical_csv, DEST_SUBDIR, today)
end_run(
conn, run_id, status="success",
files_written=1 if bytes_written > 0 else 0,
files_skipped=1 if bytes_written == 0 else 0,
bytes_written=bytes_written,
cursor_value=today,
)
except Exception as e:
end_run(conn, run_id, status="failed", error_message=str(e))
raise
finally:
conn.close()


# ── ice_aging (monthly aging report) ────────────────────────────────────────
@@ -309,65 +334,85 @@ def extract_ice_aging() -> None:
Monthly report: stock quantities by age bucket × port.
Idempotent: skips if content hash already on disk.
"""
with niquests.Session() as session:
logger.info("Fetching latest ICE Aging Report via API")
report = find_latest_report(session, ICE_AGING_LABEL)
if not report:
logger.error(f"ICE API: no report matching {ICE_AGING_LABEL!r}")
conn = open_state_db(LANDING_DIR)
run_id = start_run(conn, "ice_aging")
try:
with niquests.Session() as session:
logger.info("Fetching latest ICE Aging Report via API")
report = find_latest_report(session, ICE_AGING_LABEL)
if not report:
logger.error(f"ICE API: no report matching {ICE_AGING_LABEL!r}")
end_run(conn, run_id, status="failed", error_message="No aging report found via API")
return

logger.info(f"Downloading: {report['download_label']} ({report['publish_date']})")
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download aging report: {e}")
end_run(conn, run_id, status="failed", error_message=str(e))
return

assert response.status_code == 200, f"HTTP {response.status_code}"
assert response.content[:4] == OLE2_MAGIC, "Aging report is not an XLS file"

rows = xls_to_rows(response.content)

report_date = _parse_aging_date(str(rows[0][0]) if rows else "")
if not report_date:
msg = f"Could not parse aging report date from row 0: {rows[0] if rows else '(empty)'!r}"
logger.error(msg)
end_run(conn, run_id, status="failed", error_message=msg)
return

logger.info(f"Downloading: {report['download_label']} ({report['publish_date']})")
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download aging report: {e}")
return
# Row 3+ are data rows; stop at row labelled "Total"
fieldnames = ["report_date", "age_bucket"] + AGING_PORT_HEADERS
data_rows = []

assert response.status_code == 200, f"HTTP {response.status_code}"
assert response.content[:4] == OLE2_MAGIC, "Aging report is not an XLS file"
for row in rows[3:]:
if not row or not str(row[0]).strip():
continue
label = str(row[0]).strip()
if label.lower() == "total":
break

rows = xls_to_rows(response.content)
port_values = []
for cell in row[1:]:
if isinstance(cell, float):
port_values.append(str(int(cell)))
elif str(cell).strip() in ("-", ""):
port_values.append("0")
else:
port_values.append(str(cell).replace(",", "").strip())

report_date = _parse_aging_date(str(rows[0][0]) if rows else "")
if not report_date:
logger.error(f"Could not parse aging report date from row 0: {rows[0] if rows else '(empty)'!r}")
return

# Row 3+ are data rows; stop at row labelled "Total"
fieldnames = ["report_date", "age_bucket"] + AGING_PORT_HEADERS
data_rows = []

for row in rows[3:]:
if not row or not str(row[0]).strip():
continue
label = str(row[0]).strip()
if label.lower() == "total":
break

port_values = []
for cell in row[1:]:
if isinstance(cell, float):
port_values.append(str(int(cell)))
elif str(cell).strip() in ("-", ""):
while len(port_values) < len(AGING_PORT_HEADERS):
port_values.append("0")
else:
port_values.append(str(cell).replace(",", "").strip())
port_values = port_values[:len(AGING_PORT_HEADERS)]

while len(port_values) < len(AGING_PORT_HEADERS):
port_values.append("0")
port_values = port_values[:len(AGING_PORT_HEADERS)]
record = {"report_date": report_date, "age_bucket": label}
for col, val in zip(AGING_PORT_HEADERS, port_values):
record[col] = val
data_rows.append(record)

record = {"report_date": report_date, "age_bucket": label}
for col, val in zip(AGING_PORT_HEADERS, port_values):
record[col] = val
data_rows.append(record)
if not data_rows:
logger.warning("Aging report parsed to 0 data rows")
end_run(conn, run_id, status="failed", error_message="Parsed 0 data rows")
return

if not data_rows:
logger.warning("Aging report parsed to 0 data rows")
return

canonical_csv = _build_csv_bytes(fieldnames, data_rows)
_write_landing_file(canonical_csv, AGING_DEST_SUBDIR, report_date)
canonical_csv = _build_csv_bytes(fieldnames, data_rows)
bytes_written = _write_landing_file(canonical_csv, AGING_DEST_SUBDIR, report_date)
end_run(
conn, run_id, status="success",
files_written=1 if bytes_written > 0 else 0,
files_skipped=1 if bytes_written == 0 else 0,
bytes_written=bytes_written,
cursor_value=report_date,
)
except Exception as e:
end_run(conn, run_id, status="failed", error_message=str(e))
raise
finally:
conn.close()


# ── ice_stocks_by_port (historical) ─────────────────────────────────────────
@@ -387,63 +432,80 @@ def extract_ice_historical() -> None:
Static URL updated monthly. Covers Nov 1996 to present.
Idempotent: skips if content hash already on disk.
"""
logger.info(f"Downloading ICE historical by-port XLS: {ICE_HISTORICAL_URL}")
conn = open_state_db(LANDING_DIR)
run_id = start_run(conn, "ice_historical")
try:
logger.info(f"Downloading ICE historical by-port XLS: {ICE_HISTORICAL_URL}")

with niquests.Session() as session:
try:
response = session.get(ICE_HISTORICAL_URL, timeout=HISTORICAL_HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download historical XLS: {e}")
with niquests.Session() as session:
try:
response = session.get(ICE_HISTORICAL_URL, timeout=HISTORICAL_HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download historical XLS: {e}")
end_run(conn, run_id, status="failed", error_message=str(e))
return

assert response.status_code == 200, f"HTTP {response.status_code}"
assert response.content[:4] == OLE2_MAGIC, "Historical file is not an XLS"

book = xlrd.open_workbook(file_contents=response.content)
datemode = book.datemode
rows = xls_to_rows(response.content)

# Data starts at row 8 (0-indexed); rows 0-7 are headers
fieldnames = ["report_date"] + HISTORICAL_PORT_COLS
data_rows = []

for row in rows[8:]:
if not row or len(row) < 2:
continue

serial_cell = row[1]
if not isinstance(serial_cell, float) or serial_cell <= 0:
continue

report_date = _excel_serial_to_date(serial_cell, datemode)
if not report_date:
continue

port_cells = row[2:2 + len(HISTORICAL_PORT_COLS)]
port_values = []
for cell in port_cells:
if cell == "" or str(cell).strip() in ("-", ""):
port_values.append("0")
elif isinstance(cell, float):
port_values.append(str(int(cell)))
else:
port_values.append(str(cell).replace(",", "").strip())

while len(port_values) < len(HISTORICAL_PORT_COLS):
port_values.append("0")

record = {"report_date": report_date}
for col, val in zip(HISTORICAL_PORT_COLS, port_values):
record[col] = val
data_rows.append(record)

if not data_rows:
logger.warning("Historical XLS parsed to 0 data rows")
end_run(conn, run_id, status="failed", error_message="Parsed 0 data rows")
return

assert response.status_code == 200, f"HTTP {response.status_code}"
assert response.content[:4] == OLE2_MAGIC, "Historical file is not an XLS"

book = xlrd.open_workbook(file_contents=response.content)
datemode = book.datemode
rows = xls_to_rows(response.content)

# Data starts at row 8 (0-indexed); rows 0-7 are headers
fieldnames = ["report_date"] + HISTORICAL_PORT_COLS
data_rows = []

for row in rows[8:]:
if not row or len(row) < 2:
continue

serial_cell = row[1]
if not isinstance(serial_cell, float) or serial_cell <= 0:
continue

report_date = _excel_serial_to_date(serial_cell, datemode)
if not report_date:
continue

port_cells = row[2:2 + len(HISTORICAL_PORT_COLS)]
port_values = []
for cell in port_cells:
if cell == "" or str(cell).strip() in ("-", ""):
port_values.append("0")
elif isinstance(cell, float):
port_values.append(str(int(cell)))
else:
port_values.append(str(cell).replace(",", "").strip())

while len(port_values) < len(HISTORICAL_PORT_COLS):
port_values.append("0")

record = {"report_date": report_date}
for col, val in zip(HISTORICAL_PORT_COLS, port_values):
record[col] = val
data_rows.append(record)

if not data_rows:
logger.warning("Historical XLS parsed to 0 data rows")
return

canonical_csv = _build_csv_bytes(fieldnames, data_rows)
today = datetime.now().strftime("%Y-%m-%d")
_write_landing_file(canonical_csv, HISTORICAL_DEST_SUBDIR, today)
canonical_csv = _build_csv_bytes(fieldnames, data_rows)
today = datetime.now().strftime("%Y-%m-%d")
bytes_written = _write_landing_file(canonical_csv, HISTORICAL_DEST_SUBDIR, today)
end_run(
conn, run_id, status="success",
files_written=1 if bytes_written > 0 else 0,
files_skipped=1 if bytes_written == 0 else 0,
bytes_written=bytes_written,
cursor_value=today,
)
except Exception as e:
end_run(conn, run_id, status="failed", error_message=str(e))
raise
finally:
conn.close()


def extract_ice_stocks_backfill(max_pages: int = 3) -> None:
@@ -458,50 +520,63 @@ def extract_ice_stocks_backfill(max_pages: int = 3) -> None:
"""
assert max_pages > 0, f"max_pages must be positive, got {max_pages}"

with niquests.Session() as session:
logger.info(f"Fetching all available Daily Warehouse Stocks reports (max {max_pages} pages)...")
reports = find_all_reports(session, ICE_STOCKS_LABEL, max_pages=max_pages)
conn = open_state_db(LANDING_DIR)
run_id = start_run(conn, "ice_stocks_backfill")
files_written = 0
files_skipped = 0
bytes_written_total = 0
try:
with niquests.Session() as session:
logger.info(f"Fetching all available Daily Warehouse Stocks reports (max {max_pages} pages)...")
reports = find_all_reports(session, ICE_STOCKS_LABEL, max_pages=max_pages)

if not reports:
logger.error("ICE API: no 'Daily Warehouse Stocks' reports found")
return
if not reports:
logger.error("ICE API: no 'Daily Warehouse Stocks' reports found")
end_run(conn, run_id, status="failed", error_message="No reports found via API")
return

logger.info(f"Found {len(reports)} reports: {reports[-1]['publish_date']} → {reports[0]['publish_date']}")
downloaded = 0
skipped = 0
logger.info(f"Found {len(reports)} reports: {reports[-1]['publish_date']} → {reports[0]['publish_date']}")

for report in reports:
publish_date = report["publish_date"]
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.warning(f"Failed to download {publish_date}: {e}")
continue
for report in reports:
publish_date = report["publish_date"]
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.warning(f"Failed to download {publish_date}: {e}")
continue

if response.status_code != 200:
logger.warning(f"HTTP {response.status_code} for {publish_date} — skipping")
continue
if response.status_code != 200:
logger.warning(f"HTTP {response.status_code} for {publish_date} — skipping")
continue

fmt = detect_file_format(response.content)
if fmt == "xls":
canonical_csv = _build_canonical_csv_from_xls(response.content)
else:
canonical_csv = _build_canonical_csv_from_csv(response.content)
fmt = detect_file_format(response.content)
if fmt == "xls":
canonical_csv = _build_canonical_csv_from_xls(response.content)
else:
canonical_csv = _build_canonical_csv_from_csv(response.content)

if not canonical_csv:
logger.warning(f"Parsed 0 rows for {publish_date} — skipping")
continue
if not canonical_csv:
logger.warning(f"Parsed 0 rows for {publish_date} — skipping")
continue

# Use the report's publish date as the file date label
file_count_before = sum(1 for _ in (LANDING_DIR / DEST_SUBDIR).rglob("*.csv.gzip"))
_write_landing_file(canonical_csv, DEST_SUBDIR, publish_date)
file_count_after = sum(1 for _ in (LANDING_DIR / DEST_SUBDIR).rglob("*.csv.gzip"))
if file_count_after > file_count_before:
downloaded += 1
else:
skipped += 1
result = _write_landing_file(canonical_csv, DEST_SUBDIR, publish_date)
if result > 0:
files_written += 1
bytes_written_total += result
else:
files_skipped += 1

logger.info(f"Backfill complete: {downloaded} new files downloaded, {skipped} already existed")
logger.info(f"Backfill complete: {files_written} new files downloaded, {files_skipped} already existed")
end_run(
conn, run_id, status="success",
files_written=files_written, files_skipped=files_skipped,
bytes_written=bytes_written_total,
)
except Exception as e:
end_run(conn, run_id, status="failed", error_message=str(e))
raise
finally:
conn.close()


def extract_ice_all() -> None: