feat: extraction framework overhaul — extract_core shared package + SQLite state tracking

- Add extract/extract_core/ workspace package with three modules:
  - state.py: SQLite run tracking (open_state_db, start_run, end_run, get_last_cursor)
  - http.py: niquests session factory + etag normalization helpers
  - files.py: landing_path, content_hash, write_bytes_atomic (atomic gzip writes)
- State lives at {LANDING_DIR}/.state.sqlite — no extra env var needed
- SQLite chosen over DuckDB: state tracking is OLTP (row inserts/updates), not analytical
- Refactor all 4 extractors (psdonline, cftc_cot, coffee_prices, ice_stocks):
  - Replace inline boilerplate with extract_core helpers
  - Add start_run/end_run tracking to every extraction entry point
  - extract_cot_year returns int (bytes_written) instead of bool
- Update tests: assert result == 0 (not `is False`) for the return type change

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-22 14:37:50 +01:00
parent fc4121183c
commit 80c1163a7f
16 changed files with 702 additions and 290 deletions

View File

@@ -0,0 +1,15 @@
# Packaging metadata for the extract_core workspace package
# (shared extraction utilities: SQLite state tracking, HTTP helpers, file I/O).
[project]
name = "extract_core"
version = "0.1.0"
description = "Shared extraction utilities: SQLite state tracking, HTTP helpers, file I/O"
requires-python = ">=3.13"
dependencies = [
    # HTTP client used by extract_core.http (session factory, HEAD/etag helpers).
    "niquests>=3.14.1",
]

# Hatchling builds the wheel from the src/ layout below.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/extract_core"]

View File

@@ -0,0 +1,17 @@
"""Public API of extract_core: SQLite run-state tracking, HTTP/etag helpers,
and landing-zone file I/O shared by all extractors."""

from .files import content_hash, landing_path, write_bytes_atomic
from .http import create_session, head_etag, normalize_etag
from .state import end_run, get_last_cursor, get_recent_runs, open_state_db, start_run

# Explicit export list, grouped by submodule: state, http, files.
__all__ = [
    "open_state_db",
    "start_run",
    "end_run",
    "get_last_cursor",
    "get_recent_runs",
    "create_session",
    "head_etag",
    "normalize_etag",
    "landing_path",
    "content_hash",
    "write_bytes_atomic",
]

View File

@@ -0,0 +1,53 @@
"""Landing zone file I/O helpers for extraction pipelines."""
import hashlib
from pathlib import Path
def landing_path(landing_dir: str | Path, *parts: str) -> Path:
"""Return path to a subdirectory of landing_dir, creating it if absent.
Example:
dest_dir = landing_path("data/landing", "psd", "2025", "07")
# Returns Path("data/landing/psd/2025/07"), all directories created.
Use this instead of manually calling mkdir — it makes the intent clear
and ensures the directory always exists before the caller writes to it.
"""
assert landing_dir, "landing_dir must not be empty"
path = Path(landing_dir).joinpath(*parts)
path.mkdir(parents=True, exist_ok=True)
return path
def content_hash(data: bytes, prefix_bytes: int = 8) -> str:
    """Return a short hex prefix of SHA-256(data) for content addressing.

    Acts as an idempotency key in filenames: identical payloads always map to
    the same hash, so a file already on disk proves its content is unchanged
    between runs.

    prefix_bytes=8 yields 32-bit collision resistance — enough for a few
    thousand files per extractor; raise to 16 for very high-volume sources.
    """
    assert data, "data must not be empty"
    assert 1 <= prefix_bytes <= 64, f"prefix_bytes must be 1-64, got {prefix_bytes}"
    digest = hashlib.sha256(data).hexdigest()
    return digest[:prefix_bytes]
def write_bytes_atomic(path: Path, data: bytes) -> int:
    """Write data to path atomically via a .tmp sibling file.

    Writes to {path}.tmp first, then moves it over path with Path.replace
    (os.replace), which is atomic on POSIX when src and dst share a filesystem
    and — unlike rename — also overwrites an existing destination on Windows.
    A reader never observes a partial file. Returns the number of bytes written.

    Callers are responsible for compression (e.g. gzip.compress) before calling
    this function.
    """
    assert data, "data must not be empty"
    # with_name keeps the tmp file in the same directory (hence same
    # filesystem) as the target, which is what makes replace() atomic, and
    # sidesteps with_suffix() edge cases on multi-dot or suffix-less names.
    tmp = path.with_name(path.name + ".tmp")
    try:
        tmp.write_bytes(data)
        tmp.replace(path)
    except OSError:
        # Best-effort cleanup so a failed run does not leave .tmp litter behind.
        tmp.unlink(missing_ok=True)
        raise
    assert path.exists(), f"File was not written: {path}"
    return len(data)

View File

@@ -0,0 +1,43 @@
"""HTTP session factory and etag helpers for extraction pipelines."""
import niquests
def create_session(timeout_seconds: int = 60, max_retries: int = 3) -> niquests.Session:
    """Return a new niquests Session with retrying transport adapters.

    max_retries is applied via HTTPAdapter(max_retries=...) mounted on both
    http:// and https://, so transient connection failures are retried
    automatically. (The previous version validated max_retries but never
    used it.)

    timeout_seconds is stored as an attribute so callers that need a consistent
    timeout can reference it via session.timeout_seconds.
    """
    assert timeout_seconds > 0, f"timeout_seconds must be positive, got {timeout_seconds}"
    assert max_retries >= 0, f"max_retries must be non-negative, got {max_retries}"
    # Local import keeps the module-level dependency surface unchanged.
    from niquests.adapters import HTTPAdapter

    session = niquests.Session()
    adapter = HTTPAdapter(max_retries=max_retries)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    session.timeout_seconds = timeout_seconds  # type: ignore[attr-defined]
    return session
def normalize_etag(raw: str) -> str:
    """Normalize an HTTP etag for use as a filename component.

    Drops the weak-validator prefix ("W/"), strips surrounding quotes, and
    replaces colons and slashes with underscores so the result is safe as a
    filename on all platforms. (The previous version left weak etags like
    'W/"abc"' with a '/' and '"' in the result, which is not filename-safe.)

    Examples: '"abc:def"' -> 'abc_def', 'W/"abc"' -> 'abc'
    """
    assert raw, "raw etag must not be empty"
    value = raw.strip().removeprefix("W/").strip('"')
    return value.replace(":", "_").replace("/", "_")
def head_etag(session: niquests.Session, url: str, timeout_seconds: int = 60) -> str | None:
    """Issue a HEAD request and return the server's etag, normalized.

    When the server omits the etag header, this returns None rather than
    raising, so callers can fall back to a synthetic key (e.g. content-length
    plus last-modified).
    """
    assert url, "url must not be empty"
    assert timeout_seconds > 0, f"timeout_seconds must be positive, got {timeout_seconds}"
    raw = session.head(url, timeout=timeout_seconds).headers.get("etag", "")
    return normalize_etag(raw) if raw else None

View File

@@ -0,0 +1,122 @@
"""SQLite-backed extraction run state.
State table lives at {LANDING_DIR}/.state.sqlite — derived from LANDING_DIR,
no extra env var required. SQLite is used (not DuckDB) because state tracking
is a transactional workload: single-row inserts and point-lookup updates, not
analytical scans. WAL mode allows concurrent readers while a run is in progress.
Schema is generic: extractor is a free-text name, cursor_value is a TEXT field
that can hold any cursor type (date string, etag, page number, or JSON for
multi-dimensional cursors). The schema should never need changing — add new
extractor names by just using them.
"""
import sqlite3
from pathlib import Path
_CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS extraction_runs (
run_id INTEGER PRIMARY KEY AUTOINCREMENT,
extractor TEXT NOT NULL,
started_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')),
finished_at TEXT,
status TEXT NOT NULL DEFAULT 'running',
files_written INTEGER DEFAULT 0,
files_skipped INTEGER DEFAULT 0,
bytes_written INTEGER DEFAULT 0,
cursor_value TEXT,
error_message TEXT
)
"""
def open_state_db(landing_dir: str | Path) -> sqlite3.Connection:
"""Open (or create) the state DB at {landing_dir}/.state.sqlite.
Enables WAL mode so concurrent readers (e.g. monitoring scripts) can query
the DB while an extraction run is in progress. Caller must close when done.
"""
assert landing_dir, "landing_dir must not be empty"
db_path = Path(landing_dir) / ".state.sqlite"
db_path.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute(_CREATE_TABLE_SQL)
conn.commit()
return conn
def start_run(conn: sqlite3.Connection, extractor: str) -> int:
    """Record the start of an extraction run and return its new run_id.

    Inserts a row with status 'running'; end_run() later finalizes it.
    """
    assert extractor, "extractor name must not be empty"
    inserted = conn.execute(
        "INSERT INTO extraction_runs (extractor, status) VALUES (?, 'running')",
        (extractor,),
    )
    conn.commit()
    run_id = inserted.lastrowid
    assert run_id is not None, "INSERT did not return a row ID"
    return run_id
def end_run(
conn: sqlite3.Connection,
run_id: int,
*,
status: str,
files_written: int = 0,
files_skipped: int = 0,
bytes_written: int = 0,
cursor_value: str | None = None,
error_message: str | None = None,
) -> None:
"""Update the run row to its final state (success or failed)."""
assert status in ("success", "failed"), f"status must be 'success' or 'failed', got {status!r}"
assert run_id > 0, f"run_id must be positive, got {run_id}"
assert files_written >= 0, f"files_written must be non-negative, got {files_written}"
assert files_skipped >= 0, f"files_skipped must be non-negative, got {files_skipped}"
assert bytes_written >= 0, f"bytes_written must be non-negative, got {bytes_written}"
conn.execute(
"""
UPDATE extraction_runs
SET finished_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
status = ?,
files_written = ?,
files_skipped = ?,
bytes_written = ?,
cursor_value = ?,
error_message = ?
WHERE run_id = ?
""",
(status, files_written, files_skipped, bytes_written, cursor_value, error_message, run_id),
)
conn.commit()
def get_last_cursor(conn: sqlite3.Connection, extractor: str) -> str | None:
"""Return the cursor_value from the most recent successful run, or None."""
row = conn.execute(
"""
SELECT cursor_value FROM extraction_runs
WHERE extractor = ? AND status = 'success' AND cursor_value IS NOT NULL
ORDER BY run_id DESC
LIMIT 1
""",
(extractor,),
).fetchone()
return row["cursor_value"] if row else None
def get_recent_runs(conn: sqlite3.Connection, extractor: str, limit: int = 10) -> list[dict]:
    """Return up to limit runs for extractor as dicts, newest first."""
    assert limit > 0, f"limit must be positive, got {limit}"
    query = """
        SELECT * FROM extraction_runs
        WHERE extractor = ?
        ORDER BY run_id DESC
        LIMIT ?
    """
    return [dict(row) for row in conn.execute(query, (extractor, limit))]