feat: extraction framework overhaul — extract_core shared package + SQLite state tracking
- Add extract/extract_core/ workspace package with three modules:
  - state.py: SQLite run tracking (open_state_db, start_run, end_run, get_last_cursor); see the sketch below
- http.py: niquests session factory + etag normalization helpers
  - files.py: landing_path, content_hash, write_bytes_atomic (atomic gzip writes); see the second sketch below
- State lives at {LANDING_DIR}/.state.sqlite — no extra env var needed
- SQLite chosen over DuckDB: state tracking is OLTP (row inserts/updates), not analytical
- Refactor all 4 extractors (psdonline, cftc_cot, coffee_prices, ice_stocks):
- Replace inline boilerplate with extract_core helpers
- Add start_run/end_run tracking to every extraction entry point
- extract_cot_year returns int (bytes_written) instead of bool
- Update tests: assert result == 0 (not `is False`) for the return type change
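
For reference, a minimal sketch of what the state.py helpers could look like. The signatures mirror the call sites in the diff below (open_state_db(LANDING_DIR), start_run(conn, source), end_run(conn, run_id, status=..., files_written=..., files_skipped=..., bytes_written=..., cursor_value=..., error_message=...), get_last_cursor(conn, source)); the table name, columns, and schema are assumptions, not the shipped implementation.

# Hypothetical sketch of extract_core/state.py — schema details are assumed.
import sqlite3
from pathlib import Path


def open_state_db(landing_dir: Path) -> sqlite3.Connection:
    """Open (or create) {LANDING_DIR}/.state.sqlite and ensure the runs table exists."""
    landing_dir.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(landing_dir / ".state.sqlite")
    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS runs (
            run_id INTEGER PRIMARY KEY AUTOINCREMENT,
            source TEXT NOT NULL,
            started_at TEXT NOT NULL DEFAULT (datetime('now')),
            ended_at TEXT,
            status TEXT,
            files_written INTEGER DEFAULT 0,
            files_skipped INTEGER DEFAULT 0,
            bytes_written INTEGER DEFAULT 0,
            cursor_value TEXT,
            error_message TEXT
        )
        """
    )
    conn.commit()
    return conn


def start_run(conn: sqlite3.Connection, source: str) -> int:
    """Insert a new run row for this source and return its run_id."""
    cur = conn.execute("INSERT INTO runs (source) VALUES (?)", (source,))
    conn.commit()
    return cur.lastrowid


def end_run(
    conn: sqlite3.Connection,
    run_id: int,
    *,
    status: str,
    files_written: int = 0,
    files_skipped: int = 0,
    bytes_written: int = 0,
    cursor_value: str | None = None,
    error_message: str | None = None,
) -> None:
    """Close out a run with its final status and counters."""
    conn.execute(
        """
        UPDATE runs SET ended_at = datetime('now'), status = ?, files_written = ?,
            files_skipped = ?, bytes_written = ?, cursor_value = ?, error_message = ?
        WHERE run_id = ?
        """,
        (status, files_written, files_skipped, bytes_written, cursor_value, error_message, run_id),
    )
    conn.commit()


def get_last_cursor(conn: sqlite3.Connection, source: str) -> str | None:
    """Return the cursor_value of the most recent successful run for a source."""
    row = conn.execute(
        "SELECT cursor_value FROM runs WHERE source = ? AND status = 'success' "
        "ORDER BY run_id DESC LIMIT 1",
        (source,),
    ).fetchone()
    return row[0] if row else None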
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
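
Similarly, a rough sketch of the files.py helpers, inferred only from how the diff uses them: content_hash is assumed to return a short sha256 prefix (matching the replaced etag = sha256[:8] behaviour), landing_path to join {LANDING_DIR}/{subdir}/{year}, and write_bytes_atomic to write already-gzipped bytes via a temp file plus rename and return the byte count (the caller still gzip-compresses and mkdirs, as the diff shows). Treat the bodies as assumptions.

# Hypothetical sketch of extract_core/files.py — behaviour inferred from call sites.
import hashlib
import os
from pathlib import Path


def content_hash(data: bytes) -> str:
    """Short content fingerprint used in landing-file names (sha256 prefix, like the old etag)."""
    return hashlib.sha256(data).hexdigest()[:8]


def landing_path(landing_dir: Path, dest_subdir: str, year: str) -> Path:
    """Destination directory {LANDING_DIR}/{dest_subdir}/{year}; callers create it before writing."""
    return landing_dir / dest_subdir / year


def write_bytes_atomic(dest: Path, data: bytes) -> int:
    """Write data to a temp file in the same directory, fsync, then rename over dest.

    Returns the number of bytes written.
    """
    tmp = dest.with_suffix(dest.suffix + ".tmp")
    with open(tmp, "wb") as fh:
        fh.write(data)
        fh.flush()
        os.fsync(fh.fileno())
    os.replace(tmp, dest)  # atomic on POSIX within the same filesystem
    return len(data)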
@@ -4,6 +4,7 @@ version = "0.1.0"
description = "ICE certified warehouse stocks extractor"
requires-python = ">=3.13"
dependencies = [
"extract_core",
"niquests>=3.14.1",
"xlrd>=2.0.1",
]
@@ -20,16 +20,23 @@ CSV schemas:

import csv
import gzip
import hashlib
import io
import logging
import os
import pathlib
import sys
from datetime import datetime
from pathlib import Path

import niquests
import xlrd
from extract_core import (
content_hash,
end_run,
landing_path,
open_state_db,
start_run,
write_bytes_atomic,
)

from ice_stocks.ice_api import find_all_reports, find_latest_report
from ice_stocks.xls_parse import OLE2_MAGIC, detect_file_format, xls_to_rows
@@ -42,7 +49,7 @@ logging.basicConfig(
)
logger = logging.getLogger("ICE Stocks Extractor")

LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))

# ── ice_stocks (daily rolling) ──────────────────────────────────────────────
DEST_SUBDIR = "ice_stocks"
@@ -105,31 +112,30 @@ HISTORICAL_PORT_COLS = [

# ── shared helpers ───────────────────────────────────────────────────────────

def _write_landing_file(canonical_csv: bytes, dest_subdir: str, date_label: str) -> None:
"""SHA256-hash canonical_csv, skip if exists, else gzip and write."""
def _write_landing_file(canonical_csv: bytes, dest_subdir: str, date_label: str) -> int:
"""SHA256-hash canonical_csv, skip if exists, else gzip and write atomically.

Returns bytes_written (0 if skipped).
"""
assert canonical_csv, "canonical_csv must not be empty"
assert dest_subdir, "dest_subdir must not be empty"
assert date_label, "date_label must not be empty"

sha256 = hashlib.sha256(canonical_csv).hexdigest()
etag = sha256[:8]
etag = content_hash(canonical_csv)
year = date_label[:4]

dest_dir = LANDING_DIR / dest_subdir / year
dest_dir = landing_path(LANDING_DIR, dest_subdir, year)
local_file = dest_dir / f"{date_label}_{etag}.csv.gzip"

if local_file.exists():
logger.info(f"File {local_file.name} already exists — content unchanged, skipping")
return
return 0

dest_dir.mkdir(parents=True, exist_ok=True)
compressed = gzip.compress(canonical_csv)
local_file.write_bytes(compressed)
bytes_written = write_bytes_atomic(local_file, compressed)

assert local_file.exists(), f"File was not written: {local_file}"
assert local_file.stat().st_size > 0, f"Written file is empty: {local_file}"

logger.info(f"Stored {local_file} ({local_file.stat().st_size:,} bytes)")
logger.info(f"Stored {local_file} ({bytes_written:,} bytes)")
return bytes_written


def _build_csv_bytes(fieldnames: list[str], rows: list[dict]) -> bytes:
@@ -243,47 +249,66 @@ def extract_ice_stocks() -> None:
discovery to find the latest 'Daily Warehouse Stocks' report.
Idempotent: skips if content hash already on disk.
"""
with niquests.Session() as session:
logger.info(f"Trying ICE rolling CSV: {ICE_ROLLING_CSV_URL}")
try:
response = session.get(ICE_ROLLING_CSV_URL, timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.warning(f"Rolling CSV fetch failed: {e} — trying API discovery")
response = None

use_api = response is None or response.status_code == 404

if use_api:
logger.info("Falling back to ICE API discovery for Daily Warehouse Stocks")
report = find_latest_report(session, ICE_STOCKS_LABEL)
if not report:
logger.error("ICE API: no 'Daily Warehouse Stocks' report found")
return
logger.info(f"Found report via API: {report['download_label']} ({report['publish_date']})")
conn = open_state_db(LANDING_DIR)
run_id = start_run(conn, "ice_stocks")
try:
with niquests.Session() as session:
logger.info(f"Trying ICE rolling CSV: {ICE_ROLLING_CSV_URL}")
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
response = session.get(ICE_ROLLING_CSV_URL, timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download report from API URL: {e}")
logger.warning(f"Rolling CSV fetch failed: {e} — trying API discovery")
response = None

use_api = response is None or response.status_code == 404

if use_api:
logger.info("Falling back to ICE API discovery for Daily Warehouse Stocks")
report = find_latest_report(session, ICE_STOCKS_LABEL)
if not report:
logger.error("ICE API: no 'Daily Warehouse Stocks' report found")
end_run(conn, run_id, status="failed", error_message="No report found via API")
return
logger.info(f"Found report via API: {report['download_label']} ({report['publish_date']})")
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download report from API URL: {e}")
end_run(conn, run_id, status="failed", error_message=str(e))
return

if response.status_code != 200:
logger.error(f"Unexpected status {response.status_code}")
end_run(conn, run_id, status="failed", error_message=f"HTTP {response.status_code}")
return

if response.status_code != 200:
logger.error(f"Unexpected status {response.status_code}")
assert len(response.content) > 0, "Downloaded empty file from ICE"

fmt = detect_file_format(response.content)
if fmt == "xls":
canonical_csv = _build_canonical_csv_from_xls(response.content)
else:
canonical_csv = _build_canonical_csv_from_csv(response.content)

if not canonical_csv:
logger.warning("ICE stocks parsed to 0 rows — check column mapping or XLS structure")
end_run(conn, run_id, status="failed", error_message="Parsed 0 rows")
return

assert len(response.content) > 0, "Downloaded empty file from ICE"

fmt = detect_file_format(response.content)
if fmt == "xls":
canonical_csv = _build_canonical_csv_from_xls(response.content)
else:
canonical_csv = _build_canonical_csv_from_csv(response.content)

if not canonical_csv:
logger.warning("ICE stocks parsed to 0 rows — check column mapping or XLS structure")
return

today = datetime.now().strftime("%Y-%m-%d")
_write_landing_file(canonical_csv, DEST_SUBDIR, today)
today = datetime.now().strftime("%Y-%m-%d")
bytes_written = _write_landing_file(canonical_csv, DEST_SUBDIR, today)
end_run(
conn, run_id, status="success",
files_written=1 if bytes_written > 0 else 0,
files_skipped=1 if bytes_written == 0 else 0,
bytes_written=bytes_written,
cursor_value=today,
)
except Exception as e:
end_run(conn, run_id, status="failed", error_message=str(e))
raise
finally:
conn.close()


# ── ice_aging (monthly aging report) ────────────────────────────────────────
@@ -309,65 +334,85 @@ def extract_ice_aging() -> None:
Monthly report: stock quantities by age bucket × port.
Idempotent: skips if content hash already on disk.
"""
with niquests.Session() as session:
logger.info("Fetching latest ICE Aging Report via API")
report = find_latest_report(session, ICE_AGING_LABEL)
if not report:
logger.error(f"ICE API: no report matching {ICE_AGING_LABEL!r}")
conn = open_state_db(LANDING_DIR)
run_id = start_run(conn, "ice_aging")
try:
with niquests.Session() as session:
logger.info("Fetching latest ICE Aging Report via API")
report = find_latest_report(session, ICE_AGING_LABEL)
if not report:
logger.error(f"ICE API: no report matching {ICE_AGING_LABEL!r}")
end_run(conn, run_id, status="failed", error_message="No aging report found via API")
return

logger.info(f"Downloading: {report['download_label']} ({report['publish_date']})")
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download aging report: {e}")
end_run(conn, run_id, status="failed", error_message=str(e))
return

assert response.status_code == 200, f"HTTP {response.status_code}"
assert response.content[:4] == OLE2_MAGIC, "Aging report is not an XLS file"

rows = xls_to_rows(response.content)

report_date = _parse_aging_date(str(rows[0][0]) if rows else "")
if not report_date:
msg = f"Could not parse aging report date from row 0: {rows[0] if rows else '(empty)'!r}"
logger.error(msg)
end_run(conn, run_id, status="failed", error_message=msg)
return

logger.info(f"Downloading: {report['download_label']} ({report['publish_date']})")
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download aging report: {e}")
return
# Row 3+ are data rows; stop at row labelled "Total"
fieldnames = ["report_date", "age_bucket"] + AGING_PORT_HEADERS
data_rows = []

assert response.status_code == 200, f"HTTP {response.status_code}"
assert response.content[:4] == OLE2_MAGIC, "Aging report is not an XLS file"
for row in rows[3:]:
if not row or not str(row[0]).strip():
continue
label = str(row[0]).strip()
if label.lower() == "total":
break

rows = xls_to_rows(response.content)
port_values = []
for cell in row[1:]:
if isinstance(cell, float):
port_values.append(str(int(cell)))
elif str(cell).strip() in ("-", ""):
port_values.append("0")
else:
port_values.append(str(cell).replace(",", "").strip())

report_date = _parse_aging_date(str(rows[0][0]) if rows else "")
if not report_date:
logger.error(f"Could not parse aging report date from row 0: {rows[0] if rows else '(empty)'!r}")
return

# Row 3+ are data rows; stop at row labelled "Total"
fieldnames = ["report_date", "age_bucket"] + AGING_PORT_HEADERS
data_rows = []

for row in rows[3:]:
if not row or not str(row[0]).strip():
continue
label = str(row[0]).strip()
if label.lower() == "total":
break

port_values = []
for cell in row[1:]:
if isinstance(cell, float):
port_values.append(str(int(cell)))
elif str(cell).strip() in ("-", ""):
while len(port_values) < len(AGING_PORT_HEADERS):
port_values.append("0")
else:
port_values.append(str(cell).replace(",", "").strip())
port_values = port_values[:len(AGING_PORT_HEADERS)]

while len(port_values) < len(AGING_PORT_HEADERS):
port_values.append("0")
port_values = port_values[:len(AGING_PORT_HEADERS)]
record = {"report_date": report_date, "age_bucket": label}
for col, val in zip(AGING_PORT_HEADERS, port_values):
record[col] = val
data_rows.append(record)

record = {"report_date": report_date, "age_bucket": label}
for col, val in zip(AGING_PORT_HEADERS, port_values):
record[col] = val
data_rows.append(record)
if not data_rows:
logger.warning("Aging report parsed to 0 data rows")
end_run(conn, run_id, status="failed", error_message="Parsed 0 data rows")
return

if not data_rows:
logger.warning("Aging report parsed to 0 data rows")
return

canonical_csv = _build_csv_bytes(fieldnames, data_rows)
_write_landing_file(canonical_csv, AGING_DEST_SUBDIR, report_date)
canonical_csv = _build_csv_bytes(fieldnames, data_rows)
bytes_written = _write_landing_file(canonical_csv, AGING_DEST_SUBDIR, report_date)
end_run(
conn, run_id, status="success",
files_written=1 if bytes_written > 0 else 0,
files_skipped=1 if bytes_written == 0 else 0,
bytes_written=bytes_written,
cursor_value=report_date,
)
except Exception as e:
end_run(conn, run_id, status="failed", error_message=str(e))
raise
finally:
conn.close()


# ── ice_stocks_by_port (historical) ─────────────────────────────────────────
@@ -387,63 +432,80 @@ def extract_ice_historical() -> None:
Static URL updated monthly. Covers Nov 1996 to present.
Idempotent: skips if content hash already on disk.
"""
logger.info(f"Downloading ICE historical by-port XLS: {ICE_HISTORICAL_URL}")
conn = open_state_db(LANDING_DIR)
run_id = start_run(conn, "ice_historical")
try:
logger.info(f"Downloading ICE historical by-port XLS: {ICE_HISTORICAL_URL}")

with niquests.Session() as session:
try:
response = session.get(ICE_HISTORICAL_URL, timeout=HISTORICAL_HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download historical XLS: {e}")
with niquests.Session() as session:
try:
response = session.get(ICE_HISTORICAL_URL, timeout=HISTORICAL_HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.error(f"Failed to download historical XLS: {e}")
end_run(conn, run_id, status="failed", error_message=str(e))
return

assert response.status_code == 200, f"HTTP {response.status_code}"
assert response.content[:4] == OLE2_MAGIC, "Historical file is not an XLS"

book = xlrd.open_workbook(file_contents=response.content)
datemode = book.datemode
rows = xls_to_rows(response.content)

# Data starts at row 8 (0-indexed); rows 0-7 are headers
fieldnames = ["report_date"] + HISTORICAL_PORT_COLS
data_rows = []

for row in rows[8:]:
if not row or len(row) < 2:
continue

serial_cell = row[1]
if not isinstance(serial_cell, float) or serial_cell <= 0:
continue

report_date = _excel_serial_to_date(serial_cell, datemode)
if not report_date:
continue

port_cells = row[2:2 + len(HISTORICAL_PORT_COLS)]
port_values = []
for cell in port_cells:
if cell == "" or str(cell).strip() in ("-", ""):
port_values.append("0")
elif isinstance(cell, float):
port_values.append(str(int(cell)))
else:
port_values.append(str(cell).replace(",", "").strip())

while len(port_values) < len(HISTORICAL_PORT_COLS):
port_values.append("0")

record = {"report_date": report_date}
for col, val in zip(HISTORICAL_PORT_COLS, port_values):
record[col] = val
data_rows.append(record)

if not data_rows:
logger.warning("Historical XLS parsed to 0 data rows")
end_run(conn, run_id, status="failed", error_message="Parsed 0 data rows")
return

assert response.status_code == 200, f"HTTP {response.status_code}"
assert response.content[:4] == OLE2_MAGIC, "Historical file is not an XLS"

book = xlrd.open_workbook(file_contents=response.content)
datemode = book.datemode
rows = xls_to_rows(response.content)

# Data starts at row 8 (0-indexed); rows 0-7 are headers
fieldnames = ["report_date"] + HISTORICAL_PORT_COLS
data_rows = []

for row in rows[8:]:
if not row or len(row) < 2:
continue

serial_cell = row[1]
if not isinstance(serial_cell, float) or serial_cell <= 0:
continue

report_date = _excel_serial_to_date(serial_cell, datemode)
if not report_date:
continue

port_cells = row[2:2 + len(HISTORICAL_PORT_COLS)]
port_values = []
for cell in port_cells:
if cell == "" or str(cell).strip() in ("-", ""):
port_values.append("0")
elif isinstance(cell, float):
port_values.append(str(int(cell)))
else:
port_values.append(str(cell).replace(",", "").strip())

while len(port_values) < len(HISTORICAL_PORT_COLS):
port_values.append("0")

record = {"report_date": report_date}
for col, val in zip(HISTORICAL_PORT_COLS, port_values):
record[col] = val
data_rows.append(record)

if not data_rows:
logger.warning("Historical XLS parsed to 0 data rows")
return

canonical_csv = _build_csv_bytes(fieldnames, data_rows)
today = datetime.now().strftime("%Y-%m-%d")
_write_landing_file(canonical_csv, HISTORICAL_DEST_SUBDIR, today)
canonical_csv = _build_csv_bytes(fieldnames, data_rows)
today = datetime.now().strftime("%Y-%m-%d")
bytes_written = _write_landing_file(canonical_csv, HISTORICAL_DEST_SUBDIR, today)
end_run(
conn, run_id, status="success",
files_written=1 if bytes_written > 0 else 0,
files_skipped=1 if bytes_written == 0 else 0,
bytes_written=bytes_written,
cursor_value=today,
)
except Exception as e:
end_run(conn, run_id, status="failed", error_message=str(e))
raise
finally:
conn.close()


def extract_ice_stocks_backfill(max_pages: int = 3) -> None:
@@ -458,50 +520,63 @@ def extract_ice_stocks_backfill(max_pages: int = 3) -> None:
"""
assert max_pages > 0, f"max_pages must be positive, got {max_pages}"

with niquests.Session() as session:
logger.info(f"Fetching all available Daily Warehouse Stocks reports (max {max_pages} pages)...")
reports = find_all_reports(session, ICE_STOCKS_LABEL, max_pages=max_pages)
conn = open_state_db(LANDING_DIR)
run_id = start_run(conn, "ice_stocks_backfill")
files_written = 0
files_skipped = 0
bytes_written_total = 0
try:
with niquests.Session() as session:
logger.info(f"Fetching all available Daily Warehouse Stocks reports (max {max_pages} pages)...")
reports = find_all_reports(session, ICE_STOCKS_LABEL, max_pages=max_pages)

if not reports:
logger.error("ICE API: no 'Daily Warehouse Stocks' reports found")
return
if not reports:
logger.error("ICE API: no 'Daily Warehouse Stocks' reports found")
end_run(conn, run_id, status="failed", error_message="No reports found via API")
return

logger.info(f"Found {len(reports)} reports: {reports[-1]['publish_date']} → {reports[0]['publish_date']}")
downloaded = 0
skipped = 0
logger.info(f"Found {len(reports)} reports: {reports[-1]['publish_date']} → {reports[0]['publish_date']}")

for report in reports:
publish_date = report["publish_date"]
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.warning(f"Failed to download {publish_date}: {e}")
continue
for report in reports:
publish_date = report["publish_date"]
try:
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
except Exception as e:
logger.warning(f"Failed to download {publish_date}: {e}")
continue

if response.status_code != 200:
logger.warning(f"HTTP {response.status_code} for {publish_date} — skipping")
continue
if response.status_code != 200:
logger.warning(f"HTTP {response.status_code} for {publish_date} — skipping")
continue

fmt = detect_file_format(response.content)
if fmt == "xls":
canonical_csv = _build_canonical_csv_from_xls(response.content)
else:
canonical_csv = _build_canonical_csv_from_csv(response.content)
fmt = detect_file_format(response.content)
if fmt == "xls":
canonical_csv = _build_canonical_csv_from_xls(response.content)
else:
canonical_csv = _build_canonical_csv_from_csv(response.content)

if not canonical_csv:
logger.warning(f"Parsed 0 rows for {publish_date} — skipping")
continue
if not canonical_csv:
logger.warning(f"Parsed 0 rows for {publish_date} — skipping")
continue

# Use the report's publish date as the file date label
file_count_before = sum(1 for _ in (LANDING_DIR / DEST_SUBDIR).rglob("*.csv.gzip"))
_write_landing_file(canonical_csv, DEST_SUBDIR, publish_date)
file_count_after = sum(1 for _ in (LANDING_DIR / DEST_SUBDIR).rglob("*.csv.gzip"))
if file_count_after > file_count_before:
downloaded += 1
else:
skipped += 1
result = _write_landing_file(canonical_csv, DEST_SUBDIR, publish_date)
if result > 0:
files_written += 1
bytes_written_total += result
else:
files_skipped += 1

logger.info(f"Backfill complete: {downloaded} new files downloaded, {skipped} already existed")
logger.info(f"Backfill complete: {files_written} new files downloaded, {files_skipped} already existed")
end_run(
conn, run_id, status="success",
files_written=files_written, files_skipped=files_skipped,
bytes_written=bytes_written_total,
)
except Exception as e:
end_run(conn, run_id, status="failed", error_message=str(e))
raise
finally:
conn.close()


def extract_ice_all() -> None: