ICE extraction overhaul: API discovery + aging report + historical backfill

- Replace brittle ICE_STOCKS_URL env var with API-based URL discovery via
  the private ICE Report Center JSON API (no auth required)
- Add rolling CSV → XLS fallback in extract_ice_stocks() using
  find_latest_report() from ice_api.py
- Add ice_api.py: fetch_report_listings(), find_latest_report() with
  pagination up to MAX_API_PAGES
- Add xls_parse.py: detect_file_format() (magic bytes), xls_to_rows()
  using xlrd for OLE2/BIFF XLS files
- Add extract_ice_aging(): monthly certified stock aging report by
  age bucket × port → ice_aging/ landing dir
- Add extract_ice_historical(): 30-year end-of-month (EOM) by-port stocks from static
  ICE URL → ice_stocks_by_port/ landing dir
- Add xlrd>=2.0.1 (parse XLS), xlwt>=1.3.0 (dev, test fixtures)
- Add SQLMesh raw + foundation models for both new datasets
- Add ice_aging_glob(), ice_stocks_by_port_glob() macros
- Add extract_ice_aging + extract_ice_historical pipeline entries
- Add 12 unit tests (format detection, XLS roundtrip, API mock, CSV output)

Seed files (data/landing/ice_aging/seed/ and ice_stocks_by_port/seed/)
must be created locally — data/ is gitignored.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-21 21:13:18 +01:00
parent ff39d65dc6
commit ff7301d6a8
13 changed files with 944 additions and 98 deletions

View File

@@ -0,0 +1,58 @@
-- Foundation fact: ICE certified Coffee C (Arabica) aging report.
--
-- Casts raw varchar columns to proper types and deduplicates to the declared
-- grain. Grain: one row per (report_date, age_bucket).
-- Age buckets represent how long coffee has been in certified storage.
-- Port columns are in bags (60kg).
MODEL (
    name foundation.fct_ice_aging_stocks,
    kind INCREMENTAL_BY_TIME_RANGE (
        time_column report_date
    ),
    grain (report_date, age_bucket),
    start '2020-01-01',
    cron '@daily'
);
-- Cast raw varchar columns and drop rows whose report_date does not parse or
-- whose age bucket is missing/blank.
WITH cast_and_clean AS (
    SELECT
        TRY_CAST(report_date AS date) AS report_date,
        age_bucket,
        TRY_CAST(antwerp_bags AS bigint) AS antwerp_bags,
        TRY_CAST(hamburg_bremen_bags AS bigint) AS hamburg_bremen_bags,
        TRY_CAST(houston_bags AS bigint) AS houston_bags,
        TRY_CAST(miami_bags AS bigint) AS miami_bags,
        TRY_CAST(new_orleans_bags AS bigint) AS new_orleans_bags,
        TRY_CAST(new_york_bags AS bigint) AS new_york_bags,
        TRY_CAST(total_bags AS bigint) AS total_bags,
        filename AS source_file,
        -- Retained as an output column for lineage/debugging; NOT the dedup
        -- key (see deduplicated CTE below).
        hash(report_date, age_bucket, total_bags) AS hkey
    FROM raw.ice_aging_stocks
    WHERE TRY_CAST(report_date AS date) IS NOT NULL
        AND age_bucket IS NOT NULL
        AND age_bucket != ''
),
-- Deduplicate on the declared grain (report_date, age_bucket) rather than on
-- hkey. Grouping by hash(report_date, age_bucket, total_bags) would let two
-- rows with the same grain key but different totals BOTH survive (violating
-- the grain), while collapsing rows that share a total but differ in their
-- per-port breakdown. any_value picks an arbitrary representative when the
-- same key was landed more than once (e.g. a re-downloaded file).
deduplicated AS (
    SELECT
        report_date,
        age_bucket,
        any_value(antwerp_bags) AS antwerp_bags,
        any_value(hamburg_bremen_bags) AS hamburg_bremen_bags,
        any_value(houston_bags) AS houston_bags,
        any_value(miami_bags) AS miami_bags,
        any_value(new_orleans_bags) AS new_orleans_bags,
        any_value(new_york_bags) AS new_york_bags,
        any_value(total_bags) AS total_bags,
        any_value(source_file) AS source_file,
        any_value(hkey) AS hkey
    FROM cast_and_clean
    GROUP BY report_date, age_bucket
)
SELECT *
FROM deduplicated
-- @start_ds/@end_ds are SQLMesh interval macros; BETWEEN is safe here because
-- report_date is a DATE (day grain), not a timestamp.
WHERE report_date BETWEEN @start_ds AND @end_ds

View File

@@ -0,0 +1,60 @@
-- Foundation fact: ICE historical end-of-month Coffee C certified warehouse stocks by port.
--
-- Covers November 1996 to present (30-year history). Casts raw varchar columns
-- to proper types and deduplicates to the declared grain.
--
-- Grain: one row per report_date (end-of-month).
-- Port columns are in bags (60kg).
MODEL (
    name foundation.fct_ice_warehouse_stocks_by_port,
    kind INCREMENTAL_BY_TIME_RANGE (
        time_column report_date
    ),
    grain (report_date),
    start '1996-11-01',
    cron '@daily'
);
-- Cast raw varchar columns and drop rows whose report_date or total fails to
-- parse (header/footer junk from the source spreadsheet).
WITH cast_and_clean AS (
    SELECT
        TRY_CAST(report_date AS date) AS report_date,
        TRY_CAST(new_york_bags AS bigint) AS new_york_bags,
        TRY_CAST(new_orleans_bags AS bigint) AS new_orleans_bags,
        TRY_CAST(houston_bags AS bigint) AS houston_bags,
        TRY_CAST(miami_bags AS bigint) AS miami_bags,
        TRY_CAST(antwerp_bags AS bigint) AS antwerp_bags,
        TRY_CAST(hamburg_bremen_bags AS bigint) AS hamburg_bremen_bags,
        TRY_CAST(barcelona_bags AS bigint) AS barcelona_bags,
        TRY_CAST(virginia_bags AS bigint) AS virginia_bags,
        TRY_CAST(total_bags AS bigint) AS total_bags,
        filename AS source_file,
        -- Retained as an output column for lineage/debugging; NOT the dedup
        -- key (see deduplicated CTE below).
        hash(report_date, total_bags) AS hkey
    FROM raw.ice_warehouse_stocks_by_port
    WHERE TRY_CAST(report_date AS date) IS NOT NULL
        AND TRY_CAST(total_bags AS bigint) IS NOT NULL
),
-- Deduplicate on the declared grain (report_date) rather than on hkey.
-- Grouping by hash(report_date, total_bags) would let two rows for the same
-- date with different totals BOTH survive, producing duplicate report_date
-- rows and violating the grain. any_value picks an arbitrary representative
-- when the same date was landed more than once (e.g. a re-downloaded file).
deduplicated AS (
    SELECT
        report_date,
        any_value(new_york_bags) AS new_york_bags,
        any_value(new_orleans_bags) AS new_orleans_bags,
        any_value(houston_bags) AS houston_bags,
        any_value(miami_bags) AS miami_bags,
        any_value(antwerp_bags) AS antwerp_bags,
        any_value(hamburg_bremen_bags) AS hamburg_bremen_bags,
        any_value(barcelona_bags) AS barcelona_bags,
        any_value(virginia_bags) AS virginia_bags,
        any_value(total_bags) AS total_bags,
        any_value(source_file) AS source_file,
        any_value(hkey) AS hkey
    FROM cast_and_clean
    GROUP BY report_date
)
SELECT *
FROM deduplicated
-- @start_ds/@end_ds are SQLMesh interval macros; BETWEEN is safe here because
-- report_date is a DATE (day grain), not a timestamp.
WHERE report_date BETWEEN @start_ds AND @end_ds