feat: extraction framework overhaul — extract_core shared package + SQLite state tracking

- Add extract/extract_core/ workspace package with three modules:
  - state.py: SQLite run tracking (open_state_db, start_run, end_run, get_last_cursor)
  - http.py: niquests session factory + etag normalization helpers
  - files.py: landing_path, content_hash, write_bytes_atomic (atomic gzip writes)
- State lives at {LANDING_DIR}/.state.sqlite — no extra env var needed
- SQLite chosen over DuckDB: state tracking is OLTP (row inserts/updates), not analytical
- Refactor all 4 extractors (psdonline, cftc_cot, coffee_prices, ice_stocks):
  - Replace inline boilerplate with extract_core helpers
  - Add start_run/end_run tracking to every extraction entry point
  - extract_cot_year returns int (bytes_written) instead of bool
- Update tests: assert result == 0 (not `is False`) for the return type change

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-22 14:37:50 +01:00
parent fc4121183c
commit 80c1163a7f
16 changed files with 702 additions and 290 deletions

View File

@@ -8,6 +8,7 @@ authors = [
requires-python = ">=3.13"
dependencies = [
"extract_core",
"niquests>=3.14.1",
]
[project.scripts]

View File

@@ -1,12 +1,14 @@
from .normalize import normalize_zipped_csv
import logging
import os
import pathlib
import sys
from datetime import datetime
from io import BytesIO
from pathlib import Path
import niquests
from extract_core import end_run, landing_path, normalize_etag, open_state_db, start_run
from extract_core import write_bytes_atomic
logging.basicConfig(
level=logging.INFO,
@@ -16,7 +18,7 @@ logging.basicConfig(
)
logger = logging.getLogger("PSDOnline Extractor")
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
LANDING_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Landing dir: {LANDING_DIR}")
@@ -27,61 +29,87 @@ FIRST_MONTH = 8
HTTP_TIMEOUT_SECONDS = 60
def extract_psd_file(url: str, year: int, month: int, http_session: niquests.Session) -> int:
    """Extract PSD file to local year/month subdirectory.

    Returns bytes_written (0 if the file already existed and was skipped,
    or if the server responded with a non-200 status).
    """
    # NOTE(review): this span was a garbled diff view (pre- and post-change
    # lines interleaved, indentation lost); reconstructed as the post-commit
    # version — confirm against the committed file.
    logger.info(f"Requesting file {url} ...")
    # HEAD first: the etag alone decides whether the body must be fetched.
    response = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
    if response.status_code == 404:
        logger.error("File doesn't exist on server, received status code 404 Not Found")
        return 0
    elif response.status_code != 200:
        logger.error(f"Status code not ok, STATUS={response.status_code}")
        return 0
    raw_etag = response.headers.get("etag", "")
    assert raw_etag, "USDA response missing etag header"
    etag = normalize_etag(raw_etag)
    # landing_path builds (and presumably creates) the destination directory —
    # the old inline mkdir was removed in this commit.
    dest_dir = landing_path(LANDING_DIR, "psd", str(year), f"{month:02d}")
    local_file = dest_dir / f"{etag}.csv.gzip"
    if local_file.exists():
        # Same etag already landed — idempotent skip.
        logger.info(f"File {etag}.csv.gzip already exists locally, skipping")
        return 0
    response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
    logger.info(f"Storing file to {local_file}")
    normalized_content = normalize_zipped_csv(BytesIO(response.content))
    bytes_written = write_bytes_atomic(local_file, normalized_content.read())
    logger.info("Download complete")
    return bytes_written
def extract_psd_dataset():
    """Find and extract the most recent PSD dataset, tracking the run in SQLite.

    Walks back up to 4 months from today, extracts the first month present on
    the server, and records the outcome via start_run/end_run. Closes the
    state-DB connection in all cases; re-raises after marking the run failed.
    """
    # NOTE(review): reconstructed from a garbled diff view (old and new lines
    # interleaved, indentation lost). The post-loop error log appeared
    # unguarded in the diff; guarded here on cursor_value so a successful
    # break does not also log "could not find" — confirm against the commit.
    conn = open_state_db(LANDING_DIR)
    run_id = start_run(conn, "psdonline")
    files_written = 0
    files_skipped = 0
    bytes_written = 0
    cursor_value = None  # "YYYY-MM" of the month actually extracted, if any
    try:
        today = datetime.now()
        with niquests.Session() as session:
            for months_back in range(4):
                year = today.year
                month = today.month - months_back
                # Normalize zero/negative months into the previous year.
                while month < 1:
                    month += 12
                    year -= 1
                url = PSD_HISTORICAL_URL.format(year=year, month=month)
                logger.info(f"Trying {year}-{month:02d}...")
                response = session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
                if response.status_code == 200:
                    logger.info(f"Found latest data at {year}-{month:02d}")
                    result = extract_psd_file(url=url, year=year, month=month, http_session=session)
                    if result > 0:
                        files_written = 1
                        bytes_written = result
                    else:
                        # 0 bytes means skipped (already landed) or a
                        # per-file HTTP error; either way nothing was written.
                        files_skipped = 1
                    cursor_value = f"{year}-{month:02d}"
                    break
                elif response.status_code == 404:
                    logger.info(f"Month {year}-{month:02d} not found, trying earlier...")
                else:
                    logger.warning(f"Unexpected status code {response.status_code} for {year}-{month:02d}")
        if cursor_value is None:
            logger.error("Could not find any available data in the last 4 months")
        end_run(
            conn, run_id, status="success",
            files_written=files_written, files_skipped=files_skipped,
            bytes_written=bytes_written, cursor_value=cursor_value,
        )
    except Exception as e:
        end_run(conn, run_id, status="failed", error_message=str(e))
        raise
    finally:
        conn.close()
if __name__ == "__main__":