feat: extraction framework overhaul — extract_core shared package + SQLite state tracking
- Add extract/extract_core/ workspace package with three modules:
  - state.py: SQLite run tracking (open_state_db, start_run, end_run, get_last_cursor)
  - http.py: niquests session factory + etag normalization helpers
  - files.py: landing_path, content_hash, write_bytes_atomic (atomic gzip writes)
- State lives at {LANDING_DIR}/.state.sqlite — no extra env var needed
- SQLite chosen over DuckDB: state tracking is OLTP (row inserts/updates), not analytical
- Refactor all 4 extractors (psdonline, cftc_cot, coffee_prices, ice_stocks):
  - Replace inline boilerplate with extract_core helpers
  - Add start_run/end_run tracking to every extraction entry point
  - extract_cot_year returns int (bytes_written) instead of bool
- Update tests: assert result == 0 (not `is False`) for the return type change
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,7 @@ authors = [
|
||||
requires-python = ">=3.13"
|
||||
|
||||
dependencies = [
|
||||
"extract_core",
|
||||
"niquests>=3.14.1",
|
||||
]
|
||||
[project.scripts]
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
from .normalize import normalize_zipped_csv
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import niquests
|
||||
from extract_core import end_run, landing_path, normalize_etag, open_state_db, start_run
|
||||
from extract_core import write_bytes_atomic
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -16,7 +18,7 @@ logging.basicConfig(
|
||||
)
|
||||
logger = logging.getLogger("PSDOnline Extractor")
|
||||
|
||||
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
|
||||
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
|
||||
LANDING_DIR.mkdir(parents=True, exist_ok=True)
|
||||
logger.info(f"Landing dir: {LANDING_DIR}")
|
||||
|
||||
@@ -27,61 +29,87 @@ FIRST_MONTH = 8
|
||||
HTTP_TIMEOUT_SECONDS = 60
|
||||
|
||||
|
||||
def extract_psd_file(url: str, year: int, month: int, http_session: niquests.Session):
|
||||
"""Extract PSD file to local year/month subdirectory."""
|
||||
def extract_psd_file(url: str, year: int, month: int, http_session: niquests.Session) -> int:
|
||||
"""Extract PSD file to local year/month subdirectory.
|
||||
|
||||
Returns bytes_written (0 if the file already existed and was skipped).
|
||||
"""
|
||||
logger.info(f"Requesting file {url} ...")
|
||||
|
||||
response = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||
if response.status_code == 404:
|
||||
logger.error("File doesn't exist on server, received status code 404 Not Found")
|
||||
return
|
||||
return 0
|
||||
elif response.status_code != 200:
|
||||
logger.error(f"Status code not ok, STATUS={response.status_code}")
|
||||
return
|
||||
return 0
|
||||
|
||||
etag = response.headers.get("etag", "").replace('"', "").replace(":", "_")
|
||||
assert etag, "USDA response missing etag header"
|
||||
raw_etag = response.headers.get("etag", "")
|
||||
assert raw_etag, "USDA response missing etag header"
|
||||
etag = normalize_etag(raw_etag)
|
||||
|
||||
extract_to_path = LANDING_DIR / "psd" / str(year) / f"{month:02d}"
|
||||
local_file = extract_to_path / f"{etag}.csv.gzip"
|
||||
dest_dir = landing_path(LANDING_DIR, "psd", str(year), f"{month:02d}")
|
||||
local_file = dest_dir / f"{etag}.csv.gzip"
|
||||
if local_file.exists():
|
||||
logger.info(f"File {etag}.csv.gzip already exists locally, skipping")
|
||||
return
|
||||
return 0
|
||||
|
||||
response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||
logger.info(f"Storing file to {local_file}")
|
||||
extract_to_path.mkdir(parents=True, exist_ok=True)
|
||||
normalized_content = normalize_zipped_csv(BytesIO(response.content))
|
||||
local_file.write_bytes(normalized_content.read())
|
||||
assert local_file.exists(), f"File was not written: {local_file}"
|
||||
bytes_written = write_bytes_atomic(local_file, normalized_content.read())
|
||||
logger.info("Download complete")
|
||||
return bytes_written
|
||||
|
||||
|
||||
def extract_psd_dataset():
|
||||
today = datetime.now()
|
||||
conn = open_state_db(LANDING_DIR)
|
||||
run_id = start_run(conn, "psdonline")
|
||||
files_written = 0
|
||||
files_skipped = 0
|
||||
bytes_written = 0
|
||||
cursor_value = None
|
||||
try:
|
||||
today = datetime.now()
|
||||
with niquests.Session() as session:
|
||||
for months_back in range(4):
|
||||
year = today.year
|
||||
month = today.month - months_back
|
||||
while month < 1:
|
||||
month += 12
|
||||
year -= 1
|
||||
|
||||
with niquests.Session() as session:
|
||||
for months_back in range(4):
|
||||
year = today.year
|
||||
month = today.month - months_back
|
||||
while month < 1:
|
||||
month += 12
|
||||
year -= 1
|
||||
url = PSD_HISTORICAL_URL.format(year=year, month=month)
|
||||
logger.info(f"Trying {year}-{month:02d}...")
|
||||
|
||||
url = PSD_HISTORICAL_URL.format(year=year, month=month)
|
||||
logger.info(f"Trying {year}-{month:02d}...")
|
||||
|
||||
response = session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||
if response.status_code == 200:
|
||||
logger.info(f"Found latest data at {year}-{month:02d}")
|
||||
extract_psd_file(url=url, year=year, month=month, http_session=session)
|
||||
return
|
||||
elif response.status_code == 404:
|
||||
logger.info(f"Month {year}-{month:02d} not found, trying earlier...")
|
||||
response = session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||
if response.status_code == 200:
|
||||
logger.info(f"Found latest data at {year}-{month:02d}")
|
||||
result = extract_psd_file(url=url, year=year, month=month, http_session=session)
|
||||
if result > 0:
|
||||
files_written = 1
|
||||
bytes_written = result
|
||||
else:
|
||||
files_skipped = 1
|
||||
cursor_value = f"{year}-{month:02d}"
|
||||
break
|
||||
elif response.status_code == 404:
|
||||
logger.info(f"Month {year}-{month:02d} not found, trying earlier...")
|
||||
else:
|
||||
logger.warning(f"Unexpected status code {response.status_code} for {year}-{month:02d}")
|
||||
else:
|
||||
logger.warning(f"Unexpected status code {response.status_code} for {year}-{month:02d}")
|
||||
logger.error("Could not find any available data in the last 4 months")
|
||||
|
||||
logger.error("Could not find any available data in the last 4 months")
|
||||
end_run(
|
||||
conn, run_id, status="success",
|
||||
files_written=files_written, files_skipped=files_skipped,
|
||||
bytes_written=bytes_written, cursor_value=cursor_value,
|
||||
)
|
||||
except Exception as e:
|
||||
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||
raise
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user