feat: extraction framework overhaul — extract_core shared package + SQLite state tracking
- Add extract/extract_core/ workspace package with three modules:
- state.py: SQLite run tracking (open_state_db, start_run, end_run, get_last_cursor)
- http.py: niquests session factory + etag normalization helpers
- files.py: landing_path, content_hash, write_bytes_atomic (atomic gzip writes)
- State lives at {LANDING_DIR}/.state.sqlite — no extra env var needed
- SQLite chosen over DuckDB: state tracking is OLTP (row inserts/updates), not analytical
- Refactor all 4 extractors (psdonline, cftc_cot, coffee_prices, ice_stocks):
- Replace inline boilerplate with extract_core helpers
- Add start_run/end_run tracking to every extraction entry point
- extract_cot_year returns int (bytes_written) instead of bool
- Update tests: assert result == 0 (not `is False`) for the return type change
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,7 @@ version = "0.1.0"
|
||||
description = "KC=F Coffee C futures price extractor"
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"extract_core",
|
||||
"yfinance>=0.2.55",
|
||||
]
|
||||
|
||||
|
||||
@@ -8,14 +8,15 @@ Landing path: LANDING_DIR/prices/coffee_kc/{hash8}.csv.gzip
|
||||
"""
|
||||
|
||||
import gzip
|
||||
import hashlib
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import yfinance as yf
|
||||
from extract_core import content_hash, end_run, landing_path, open_state_db, start_run
|
||||
from extract_core import write_bytes_atomic
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -25,7 +26,7 @@ logging.basicConfig(
|
||||
)
|
||||
logger = logging.getLogger("Coffee Prices Extractor")

# Root for all landed files; the extract_core state DB (.state.sqlite)
# also lives under this directory, so no extra env var is needed.
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
# Yahoo Finance symbol for Coffee C futures.
TICKER = "KC=F"
# Subdirectory of LANDING_DIR where the gzipped CSV snapshots are written.
DEST_SUBDIR = "prices/coffee_kc"
|
||||
def extract_coffee_prices() -> None:
    """Download KC=F daily OHLCV history and land it as a gzipped CSV.

    On first run downloads full history (period='max'). On subsequent runs
    the hash matches if no new trading days have closed since last run, so
    the existing file is detected and nothing is rewritten (idempotent).

    The whole extraction is tracked in the extract_core state DB: a run row
    is opened with start_run() and closed with end_run() on success, skip,
    or failure; failures are recorded and then re-raised.
    """
    conn = open_state_db(LANDING_DIR)
    run_id = start_run(conn, "coffee_prices")
    try:
        logger.info(f"Downloading {TICKER} daily OHLCV from Yahoo Finance...")

        ticker = yf.Ticker(TICKER)
        df = ticker.history(period="max", interval="1d", auto_adjust=False, timeout=DOWNLOAD_TIMEOUT_SECONDS)

        assert df is not None and len(df) > 0, f"yfinance returned empty DataFrame for {TICKER}"

        # Reset index so Date becomes a plain column
        df = df.reset_index()

        # Keep standard OHLCV columns only; yfinance may return extra columns
        keep_cols = [c for c in ["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume"] if c in df.columns]
        df = df[keep_cols]

        # Normalize Date to ISO string for CSV stability across timezones
        df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")

        # Serialize to CSV bytes
        csv_buf = io.StringIO()
        df.to_csv(csv_buf, index=False)
        csv_bytes = csv_buf.getvalue().encode("utf-8")

        assert len(csv_bytes) > 0, "CSV serialization produced empty output"

        # Hash-based idempotency: the filename derives from the content hash,
        # so an unchanged dataset maps onto an already-existing file.
        etag = content_hash(csv_bytes)
        dest_dir = landing_path(LANDING_DIR, DEST_SUBDIR)
        local_file = dest_dir / f"{etag}.csv.gzip"

        if local_file.exists():
            logger.info(f"File {local_file.name} already exists — no new data, skipping")
            end_run(conn, run_id, status="success", files_skipped=1)
            return

        # write_bytes_atomic handles temp-file + rename, so a crash mid-write
        # can never leave a partial landing file behind.
        compressed = gzip.compress(csv_bytes)
        bytes_written = write_bytes_atomic(local_file, compressed)

        logger.info(
            f"Stored {local_file} ({bytes_written:,} bytes, {len(df):,} rows)"
        )
        end_run(conn, run_id, status="success", files_written=1, bytes_written=bytes_written)
    except Exception as e:
        # Record the failure against this run before propagating it.
        end_run(conn, run_id, status="failed", error_message=str(e))
        raise
    finally:
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user