feat: extraction framework overhaul — extract_core shared package + SQLite state tracking

- Add extract/extract_core/ workspace package with three modules:
  - state.py: SQLite run tracking (open_state_db, start_run, end_run, get_last_cursor)
  - http.py: niquests session factory + etag normalization helpers
  - files.py: landing_path, content_hash, write_bytes_atomic (atomic gzip writes)
- State lives at {LANDING_DIR}/.state.sqlite — no extra env var needed
- SQLite chosen over DuckDB: state tracking is OLTP (row inserts/updates), not analytical
- Refactor all 4 extractors (psdonline, cftc_cot, coffee_prices, ice_stocks):
  - Replace inline boilerplate with extract_core helpers
  - Add start_run/end_run tracking to every extraction entry point
  - extract_cot_year returns int (bytes_written) instead of bool
- Update tests: assert result == 0 (not `is False`) for the return type change

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-22 14:37:50 +01:00
parent fc4121183c
commit 80c1163a7f
16 changed files with 702 additions and 290 deletions

View File

@@ -0,0 +1,15 @@
# Packaging metadata for the extract_core workspace package
# (shared extraction utilities: SQLite state tracking, HTTP helpers, file I/O).
[project]
name = "extract_core"
version = "0.1.0"
description = "Shared extraction utilities: SQLite state tracking, HTTP helpers, file I/O"
requires-python = ">=3.13"
dependencies = [
    # HTTP client used by extract_core.http (session factory, HEAD/etag helpers).
    "niquests>=3.14.1",
]

# Hatchling builds the wheel from the src/ layout below.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/extract_core"]

View File

@@ -0,0 +1,17 @@
"""Public API of extract_core: SQLite run-state tracking, HTTP/etag helpers,
and landing-zone file I/O shared by all extractors."""

from .files import content_hash, landing_path, write_bytes_atomic
from .http import create_session, head_etag, normalize_etag
from .state import end_run, get_last_cursor, get_recent_runs, open_state_db, start_run

# Explicit export list, grouped by submodule: state, http, files.
__all__ = [
    "open_state_db",
    "start_run",
    "end_run",
    "get_last_cursor",
    "get_recent_runs",
    "create_session",
    "head_etag",
    "normalize_etag",
    "landing_path",
    "content_hash",
    "write_bytes_atomic",
]

View File

@@ -0,0 +1,53 @@
"""Landing zone file I/O helpers for extraction pipelines."""
import hashlib
from pathlib import Path
def landing_path(landing_dir: str | Path, *parts: str) -> Path:
"""Return path to a subdirectory of landing_dir, creating it if absent.
Example:
dest_dir = landing_path("data/landing", "psd", "2025", "07")
# Returns Path("data/landing/psd/2025/07"), all directories created.
Use this instead of manually calling mkdir — it makes the intent clear
and ensures the directory always exists before the caller writes to it.
"""
assert landing_dir, "landing_dir must not be empty"
path = Path(landing_dir).joinpath(*parts)
path.mkdir(parents=True, exist_ok=True)
return path
def content_hash(data: bytes, prefix_bytes: int = 8) -> str:
    """Return a short hex prefix of SHA-256(data) for content addressing.

    Acts as an idempotency key in filenames: identical payloads always map to
    the same hash, so a file already on disk proves its content is unchanged
    between runs.

    prefix_bytes=8 yields 32-bit collision resistance — enough for a few
    thousand files per extractor; raise to 16 for very high-volume sources.
    """
    assert data, "data must not be empty"
    assert 1 <= prefix_bytes <= 64, f"prefix_bytes must be 1-64, got {prefix_bytes}"
    digest = hashlib.sha256(data).hexdigest()
    return digest[:prefix_bytes]
def write_bytes_atomic(path: Path, data: bytes) -> int:
    """Write data to path atomically via a .tmp sibling file.

    Writes to {path}.tmp first, then moves it over path with Path.replace
    (os.replace), which is atomic on POSIX when src and dst share a filesystem
    and — unlike rename — also overwrites an existing destination on Windows.
    A reader never observes a partial file. Returns the number of bytes written.

    Callers are responsible for compression (e.g. gzip.compress) before calling
    this function.
    """
    assert data, "data must not be empty"
    # with_name keeps the tmp file in the same directory (hence same
    # filesystem) as the target, which is what makes replace() atomic, and
    # sidesteps with_suffix() edge cases on multi-dot or suffix-less names.
    tmp = path.with_name(path.name + ".tmp")
    try:
        tmp.write_bytes(data)
        tmp.replace(path)
    except OSError:
        # Best-effort cleanup so a failed run does not leave .tmp litter behind.
        tmp.unlink(missing_ok=True)
        raise
    assert path.exists(), f"File was not written: {path}"
    return len(data)

View File

@@ -0,0 +1,43 @@
"""HTTP session factory and etag helpers for extraction pipelines."""
import niquests
def create_session(timeout_seconds: int = 60, max_retries: int = 3) -> niquests.Session:
    """Return a new niquests Session with retrying transport adapters.

    max_retries is applied via HTTPAdapter(max_retries=...) mounted on both
    http:// and https://, so transient connection failures are retried
    automatically. (The previous version validated max_retries but never
    used it.)

    timeout_seconds is stored as an attribute so callers that need a consistent
    timeout can reference it via session.timeout_seconds.
    """
    assert timeout_seconds > 0, f"timeout_seconds must be positive, got {timeout_seconds}"
    assert max_retries >= 0, f"max_retries must be non-negative, got {max_retries}"
    # Local import keeps the module-level dependency surface unchanged.
    from niquests.adapters import HTTPAdapter

    session = niquests.Session()
    adapter = HTTPAdapter(max_retries=max_retries)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    session.timeout_seconds = timeout_seconds  # type: ignore[attr-defined]
    return session
def normalize_etag(raw: str) -> str:
    """Normalize an HTTP etag for use as a filename component.

    Drops the weak-validator prefix ("W/"), strips surrounding quotes, and
    replaces colons and slashes with underscores so the result is safe as a
    filename on all platforms. (The previous version left weak etags like
    'W/"abc"' with a '/' and '"' in the result, which is not filename-safe.)

    Examples: '"abc:def"' -> 'abc_def', 'W/"abc"' -> 'abc'
    """
    assert raw, "raw etag must not be empty"
    value = raw.strip().removeprefix("W/").strip('"')
    return value.replace(":", "_").replace("/", "_")
def head_etag(session: niquests.Session, url: str, timeout_seconds: int = 60) -> str | None:
    """Issue a HEAD request and return the server's etag, normalized.

    When the server omits the etag header, this returns None rather than
    raising, so callers can fall back to a synthetic key (e.g. content-length
    plus last-modified).
    """
    assert url, "url must not be empty"
    assert timeout_seconds > 0, f"timeout_seconds must be positive, got {timeout_seconds}"
    raw = session.head(url, timeout=timeout_seconds).headers.get("etag", "")
    return normalize_etag(raw) if raw else None

View File

@@ -0,0 +1,122 @@
"""SQLite-backed extraction run state.
State table lives at {LANDING_DIR}/.state.sqlite — derived from LANDING_DIR,
no extra env var required. SQLite is used (not DuckDB) because state tracking
is a transactional workload: single-row inserts and point-lookup updates, not
analytical scans. WAL mode allows concurrent readers while a run is in progress.
Schema is generic: extractor is a free-text name, cursor_value is a TEXT field
that can hold any cursor type (date string, etag, page number, or JSON for
multi-dimensional cursors). The schema should never need changing — add new
extractor names by just using them.
"""
import sqlite3
from pathlib import Path
_CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS extraction_runs (
run_id INTEGER PRIMARY KEY AUTOINCREMENT,
extractor TEXT NOT NULL,
started_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')),
finished_at TEXT,
status TEXT NOT NULL DEFAULT 'running',
files_written INTEGER DEFAULT 0,
files_skipped INTEGER DEFAULT 0,
bytes_written INTEGER DEFAULT 0,
cursor_value TEXT,
error_message TEXT
)
"""
def open_state_db(landing_dir: str | Path) -> sqlite3.Connection:
"""Open (or create) the state DB at {landing_dir}/.state.sqlite.
Enables WAL mode so concurrent readers (e.g. monitoring scripts) can query
the DB while an extraction run is in progress. Caller must close when done.
"""
assert landing_dir, "landing_dir must not be empty"
db_path = Path(landing_dir) / ".state.sqlite"
db_path.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute(_CREATE_TABLE_SQL)
conn.commit()
return conn
def start_run(conn: sqlite3.Connection, extractor: str) -> int:
    """Record the start of an extraction run and return its new run_id.

    Inserts a row with status 'running'; end_run() later finalizes it.
    """
    assert extractor, "extractor name must not be empty"
    inserted = conn.execute(
        "INSERT INTO extraction_runs (extractor, status) VALUES (?, 'running')",
        (extractor,),
    )
    conn.commit()
    run_id = inserted.lastrowid
    assert run_id is not None, "INSERT did not return a row ID"
    return run_id
def end_run(
conn: sqlite3.Connection,
run_id: int,
*,
status: str,
files_written: int = 0,
files_skipped: int = 0,
bytes_written: int = 0,
cursor_value: str | None = None,
error_message: str | None = None,
) -> None:
"""Update the run row to its final state (success or failed)."""
assert status in ("success", "failed"), f"status must be 'success' or 'failed', got {status!r}"
assert run_id > 0, f"run_id must be positive, got {run_id}"
assert files_written >= 0, f"files_written must be non-negative, got {files_written}"
assert files_skipped >= 0, f"files_skipped must be non-negative, got {files_skipped}"
assert bytes_written >= 0, f"bytes_written must be non-negative, got {bytes_written}"
conn.execute(
"""
UPDATE extraction_runs
SET finished_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
status = ?,
files_written = ?,
files_skipped = ?,
bytes_written = ?,
cursor_value = ?,
error_message = ?
WHERE run_id = ?
""",
(status, files_written, files_skipped, bytes_written, cursor_value, error_message, run_id),
)
conn.commit()
def get_last_cursor(conn: sqlite3.Connection, extractor: str) -> str | None:
"""Return the cursor_value from the most recent successful run, or None."""
row = conn.execute(
"""
SELECT cursor_value FROM extraction_runs
WHERE extractor = ? AND status = 'success' AND cursor_value IS NOT NULL
ORDER BY run_id DESC
LIMIT 1
""",
(extractor,),
).fetchone()
return row["cursor_value"] if row else None
def get_recent_runs(conn: sqlite3.Connection, extractor: str, limit: int = 10) -> list[dict]:
    """Return up to limit runs for extractor as dicts, newest first."""
    assert limit > 0, f"limit must be positive, got {limit}"
    query = """
        SELECT * FROM extraction_runs
        WHERE extractor = ?
        ORDER BY run_id DESC
        LIMIT ?
    """
    return [dict(row) for row in conn.execute(query, (extractor, limit))]