feat: extraction framework overhaul — extract_core shared package + SQLite state tracking
- Add extract/extract_core/ workspace package with three modules:
  - state.py: SQLite run tracking (open_state_db, start_run, end_run, get_last_cursor)
  - http.py: niquests session factory + etag normalization helpers
  - files.py: landing_path, content_hash, write_bytes_atomic (atomic gzip writes)
- State lives at {LANDING_DIR}/.state.sqlite — no extra env var needed
- SQLite chosen over DuckDB: state tracking is OLTP (row inserts/updates), not analytical
- Refactor all 4 extractors (psdonline, cftc_cot, coffee_prices, ice_stocks):
  - Replace inline boilerplate with extract_core helpers
  - Add start_run/end_run tracking to every extraction entry point
- extract_cot_year returns int (bytes_written) instead of bool
- Update tests: assert result == 0 (not `is False`) for the return type change
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,7 @@ version = "0.1.0"
|
||||
description = "CFTC Commitment of Traders data extractor"
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"extract_core",
|
||||
"niquests>=3.14.1",
|
||||
]
|
||||
|
||||
|
||||
@@ -10,12 +10,20 @@ Landing path: LANDING_DIR/cot/{year}/{etag}.csv.gzip
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import niquests
|
||||
from extract_core import (
|
||||
end_run,
|
||||
landing_path,
|
||||
normalize_etag,
|
||||
open_state_db,
|
||||
start_run,
|
||||
write_bytes_atomic,
|
||||
)
|
||||
|
||||
from .normalize import find_csv_inner_filename, normalize_zipped_csv
|
||||
|
||||
@@ -27,7 +35,7 @@ logging.basicConfig(
|
||||
)
|
||||
logger = logging.getLogger("CFTC COT Extractor")
|
||||
|
||||
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
|
||||
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
|
||||
|
||||
# CFTC publishes yearly ZIPs for the disaggregated futures-only report.
|
||||
# The file for the current year is updated each Friday at 3:30 PM ET.
|
||||
@@ -52,10 +60,10 @@ def _synthetic_etag(year: int, headers: dict) -> str:
|
||||
return etag
|
||||
|
||||
|
||||
def extract_cot_year(year: int, http_session: niquests.Session) -> bool:
|
||||
def extract_cot_year(year: int, http_session: niquests.Session) -> int:
|
||||
"""Download and store COT data for a single year.
|
||||
|
||||
Returns True if a new file was written, False if skipped or unavailable.
|
||||
Returns bytes_written (0 if skipped or unavailable).
|
||||
"""
|
||||
url = COT_URL_TEMPLATE.format(year=year)
|
||||
logger.info(f"Checking COT data for {year}: {url}")
|
||||
@@ -63,20 +71,20 @@ def extract_cot_year(year: int, http_session: niquests.Session) -> bool:
|
||||
head = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||
if head.status_code == 404:
|
||||
logger.info(f"Year {year} not available (404) — skipping")
|
||||
return False
|
||||
return 0
|
||||
assert head.status_code == 200, (
|
||||
f"Unexpected HEAD status {head.status_code} for {url}"
|
||||
)
|
||||
|
||||
raw_etag = head.headers.get("etag", "")
|
||||
etag = raw_etag.replace('"', "").replace(":", "_") if raw_etag else _synthetic_etag(year, head.headers)
|
||||
etag = normalize_etag(raw_etag) if raw_etag else _synthetic_etag(year, head.headers)
|
||||
|
||||
dest_dir = LANDING_DIR / "cot" / str(year)
|
||||
dest_dir = landing_path(LANDING_DIR, "cot", str(year))
|
||||
local_file = dest_dir / f"{etag}.csv.gzip"
|
||||
|
||||
if local_file.exists():
|
||||
logger.info(f"Year {year}: {etag}.csv.gzip already exists, skipping")
|
||||
return False
|
||||
return 0
|
||||
|
||||
logger.info(f"Downloading COT data for {year}...")
|
||||
response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||
@@ -89,14 +97,11 @@ def extract_cot_year(year: int, http_session: niquests.Session) -> bool:
|
||||
inner_filename = find_csv_inner_filename(BytesIO(response.content))
|
||||
normalized = normalize_zipped_csv(zip_buffer, inner_filename)
|
||||
|
||||
dest_dir.mkdir(parents=True, exist_ok=True)
|
||||
local_file.write_bytes(normalized.read())
|
||||
|
||||
assert local_file.exists(), f"File was not written: {local_file}"
|
||||
bytes_written = write_bytes_atomic(local_file, normalized.read())
|
||||
assert local_file.stat().st_size > 0, f"Written file is empty: {local_file}"
|
||||
|
||||
logger.info(f"Stored {local_file} ({local_file.stat().st_size:,} bytes)")
|
||||
return True
|
||||
logger.info(f"Stored {local_file} ({bytes_written:,} bytes)")
|
||||
return bytes_written
|
||||
|
||||
|
||||
def extract_cot_dataset():
|
||||
@@ -113,16 +118,36 @@ def extract_cot_dataset():
|
||||
f"Year range {len(years)} exceeds MAX_YEARS={MAX_YEARS}"
|
||||
)
|
||||
|
||||
new_count = 0
|
||||
with niquests.Session() as session:
|
||||
for year in years:
|
||||
try:
|
||||
if extract_cot_year(year, session):
|
||||
new_count += 1
|
||||
except Exception:
|
||||
logger.exception(f"Failed to extract COT data for {year}, continuing")
|
||||
conn = open_state_db(LANDING_DIR)
|
||||
run_id = start_run(conn, "cftc_cot")
|
||||
files_written = 0
|
||||
files_skipped = 0
|
||||
bytes_written_total = 0
|
||||
try:
|
||||
with niquests.Session() as session:
|
||||
for year in years:
|
||||
try:
|
||||
result = extract_cot_year(year, session)
|
||||
if result > 0:
|
||||
files_written += 1
|
||||
bytes_written_total += result
|
||||
else:
|
||||
files_skipped += 1
|
||||
except Exception:
|
||||
logger.exception(f"Failed to extract COT data for {year}, continuing")
|
||||
|
||||
logger.info(f"COT extraction complete: {new_count} new file(s) downloaded")
|
||||
logger.info(f"COT extraction complete: {files_written} new file(s) downloaded")
|
||||
end_run(
|
||||
conn, run_id, status="success",
|
||||
files_written=files_written, files_skipped=files_skipped,
|
||||
bytes_written=bytes_written_total,
|
||||
cursor_value=str(current_year),
|
||||
)
|
||||
except Exception as e:
|
||||
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||
raise
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user