ICE extraction overhaul: API discovery + aging report + historical backfill
- Replace brittle ICE_STOCKS_URL env var with API-based URL discovery via the private ICE Report Center JSON API (no auth required)
- Add rolling CSV → XLS fallback in extract_ice_stocks() using find_latest_report() from ice_api.py
- Add ice_api.py: fetch_report_listings(), find_latest_report() with pagination up to MAX_API_PAGES
- Add xls_parse.py: detect_file_format() (magic bytes), xls_to_rows() using xlrd for OLE2/BIFF XLS files
- Add extract_ice_aging(): monthly certified stock aging report by age bucket × port → ice_aging/ landing dir
- Add extract_ice_historical(): 30-year EOM by-port stocks from static ICE URL → ice_stocks_by_port/ landing dir
- Add xlrd>=2.0.1 (parse XLS), xlwt>=1.3.0 (dev, test fixtures)
- Add SQLMesh raw + foundation models for both new datasets
- Add ice_aging_glob(), ice_stocks_by_port_glob() macros
- Add extract_ice_aging + extract_ice_historical pipeline entries
- Add 12 unit tests (format detection, XLS roundtrip, API mock, CSV output)

Seed files (data/landing/ice_aging/seed/ and ice_stocks_by_port/seed/) must be created locally — data/ is gitignored.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -29,3 +29,17 @@ def ice_stocks_glob(evaluator) -> str:
|
||||
"""Return a quoted glob path for all ICE warehouse stock CSV gzip files under LANDING_DIR."""
|
||||
landing_dir = evaluator.var("LANDING_DIR") or os.environ.get("LANDING_DIR", "data/landing")
|
||||
return f"'{landing_dir}/ice_stocks/**/*.csv.gzip'"
|
||||
|
||||
|
||||
@macro()
def ice_aging_glob(evaluator) -> str:
    """Return a quoted glob path for all ICE aging report CSV gzip files under LANDING_DIR."""
    # Resolve the landing directory: SQLMesh variable wins, then the
    # LANDING_DIR environment variable, then the repo-relative default.
    base = evaluator.var("LANDING_DIR")
    if not base:
        base = os.environ.get("LANDING_DIR", "data/landing")
    # Single-quote the glob so it is embedded as a SQL string literal.
    return "'" + base + "/ice_aging/**/*.csv.gzip'"
|
||||
|
||||
|
||||
@macro()
def ice_stocks_by_port_glob(evaluator) -> str:
    """Return a quoted glob path for all ICE historical by-port CSV gzip files under LANDING_DIR."""
    # Prefer the SQLMesh-configured LANDING_DIR; fall back to the
    # environment, then to the default repo-relative landing path.
    base = evaluator.var("LANDING_DIR")
    if not base:
        base = os.environ.get("LANDING_DIR", "data/landing")
    # Quoted so the macro expands to a SQL string literal.
    return "'" + base + "/ice_stocks_by_port/**/*.csv.gzip'"
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
-- Foundation fact: ICE certified Coffee C (Arabica) aging report.
--
-- Casts raw varchar columns to proper types and deduplicates via hash key.
-- Grain: one row per (report_date, age_bucket).
-- Age buckets represent how long coffee has been in certified storage.
-- Port columns are in bags (60kg).

MODEL (
    name foundation.fct_ice_aging_stocks,
    -- Incremental by report_date: each run (re)processes only the
    -- @start_ds..@end_ds window SQLMesh schedules.
    kind INCREMENTAL_BY_TIME_RANGE (
        time_column report_date
    ),
    grain (report_date, age_bucket),
    start '2020-01-01',
    cron '@daily'
);

-- Cast raw varchars to typed columns. TRY_CAST yields NULL (instead of
-- raising) on malformed values; unusable rows are dropped in WHERE below.
WITH cast_and_clean AS (
    SELECT
        TRY_CAST(report_date AS date) AS report_date,
        age_bucket,
        TRY_CAST(antwerp_bags AS bigint) AS antwerp_bags,
        TRY_CAST(hamburg_bremen_bags AS bigint) AS hamburg_bremen_bags,
        TRY_CAST(houston_bags AS bigint) AS houston_bags,
        TRY_CAST(miami_bags AS bigint) AS miami_bags,
        TRY_CAST(new_orleans_bags AS bigint) AS new_orleans_bags,
        TRY_CAST(new_york_bags AS bigint) AS new_york_bags,
        TRY_CAST(total_bags AS bigint) AS total_bags,

        -- Provenance: landing file the row was read from (read_csv filename=true).
        filename AS source_file,

        -- Dedup key computed over the raw (uncast) values.
        -- NOTE(review): the key includes total_bags, so a corrected total for
        -- the same (report_date, age_bucket) survives as a second row, which
        -- would violate the declared grain — confirm that is intended.
        hash(report_date, age_bucket, total_bags) AS hkey
    FROM raw.ice_aging_stocks
    WHERE TRY_CAST(report_date AS date) IS NOT NULL
        AND age_bucket IS NOT NULL
        AND age_bucket != ''
),

-- Collapse duplicate hkeys (same report landed via multiple files).
-- any_value() picks an arbitrary value per column; results are only
-- guaranteed self-consistent when all rows in a group are fully identical
-- (rows sharing an hkey could still differ in per-port columns).
deduplicated AS (
    SELECT
        any_value(report_date) AS report_date,
        any_value(age_bucket) AS age_bucket,
        any_value(antwerp_bags) AS antwerp_bags,
        any_value(hamburg_bremen_bags) AS hamburg_bremen_bags,
        any_value(houston_bags) AS houston_bags,
        any_value(miami_bags) AS miami_bags,
        any_value(new_orleans_bags) AS new_orleans_bags,
        any_value(new_york_bags) AS new_york_bags,
        any_value(total_bags) AS total_bags,
        any_value(source_file) AS source_file,
        hkey
    FROM cast_and_clean
    GROUP BY hkey
)

SELECT *
FROM deduplicated
-- @start_ds / @end_ds are SQLMesh-supplied inclusive date-string bounds,
-- so BETWEEN is the intended half of the incremental contract here.
WHERE report_date BETWEEN @start_ds AND @end_ds
|
||||
@@ -0,0 +1,60 @@
|
||||
-- Foundation fact: ICE historical end-of-month Coffee C certified warehouse stocks by port.
--
-- Covers November 1996 to present (30-year history). Casts raw varchar columns
-- to proper types and deduplicates via hash key.
--
-- Grain: one row per report_date (end-of-month).
-- Port columns are in bags (60kg).

MODEL (
    name foundation.fct_ice_warehouse_stocks_by_port,
    -- Incremental by report_date: each run processes only the scheduled
    -- @start_ds..@end_ds window.
    kind INCREMENTAL_BY_TIME_RANGE (
        time_column report_date
    ),
    grain (report_date),
    start '1996-11-01',
    cron '@daily'
);

-- Cast raw varchars to typed columns. TRY_CAST yields NULL on malformed
-- values; rows with unusable report_date or total_bags are dropped below.
WITH cast_and_clean AS (
    SELECT
        TRY_CAST(report_date AS date) AS report_date,
        TRY_CAST(new_york_bags AS bigint) AS new_york_bags,
        TRY_CAST(new_orleans_bags AS bigint) AS new_orleans_bags,
        TRY_CAST(houston_bags AS bigint) AS houston_bags,
        TRY_CAST(miami_bags AS bigint) AS miami_bags,
        TRY_CAST(antwerp_bags AS bigint) AS antwerp_bags,
        TRY_CAST(hamburg_bremen_bags AS bigint) AS hamburg_bremen_bags,
        TRY_CAST(barcelona_bags AS bigint) AS barcelona_bags,
        TRY_CAST(virginia_bags AS bigint) AS virginia_bags,
        TRY_CAST(total_bags AS bigint) AS total_bags,

        -- Provenance: landing file the row was read from (read_csv filename=true).
        filename AS source_file,

        -- Dedup key computed over the raw (uncast) values.
        -- NOTE(review): the key includes total_bags, so a revised total for
        -- the same report_date survives as a second row — that would violate
        -- the one-row-per-report_date grain; confirm this is intended.
        hash(report_date, total_bags) AS hkey
    FROM raw.ice_warehouse_stocks_by_port
    WHERE TRY_CAST(report_date AS date) IS NOT NULL
        AND TRY_CAST(total_bags AS bigint) IS NOT NULL
),

-- Collapse duplicate hkeys (the static ICE file is re-downloaded monthly,
-- so every history row lands many times). any_value() picks an arbitrary
-- value per column; values are only guaranteed self-consistent when all
-- rows sharing an hkey are fully identical.
deduplicated AS (
    SELECT
        any_value(report_date) AS report_date,
        any_value(new_york_bags) AS new_york_bags,
        any_value(new_orleans_bags) AS new_orleans_bags,
        any_value(houston_bags) AS houston_bags,
        any_value(miami_bags) AS miami_bags,
        any_value(antwerp_bags) AS antwerp_bags,
        any_value(hamburg_bremen_bags) AS hamburg_bremen_bags,
        any_value(barcelona_bags) AS barcelona_bags,
        any_value(virginia_bags) AS virginia_bags,
        any_value(total_bags) AS total_bags,
        any_value(source_file) AS source_file,
        hkey
    FROM cast_and_clean
    GROUP BY hkey
)

SELECT *
FROM deduplicated
-- @start_ds / @end_ds are SQLMesh-supplied inclusive date-string bounds.
WHERE report_date BETWEEN @start_ds AND @end_ds
|
||||
49
transform/sqlmesh_materia/models/raw/ice_aging_stocks.sql
Normal file
51
transform/sqlmesh_materia/models/raw/ice_warehouse_stocks_by_port.sql
Normal file
@@ -0,0 +1,49 @@
|
||||
-- Raw ICE certified stock aging report — technical ingestion layer.
--
-- Reads monthly aging report gzip CSVs from the landing directory.
-- All values are varchar; casting happens in foundation.fct_ice_aging_stocks.
--
-- Source: ICE Report Center (Certified Stock Aging Report)
-- Coverage: varies by download history
-- Frequency: monthly (ICE updates after each delivery month)

MODEL (
    name raw.ice_aging_stocks,
    -- FULL: re-scan every landing file on each run; dedup is deferred
    -- to the foundation model.
    kind FULL,
    cron '@daily',
    -- Explicit schema so the model contract is stable even when the
    -- glob matches no files. All columns varchar by design.
    columns (
        report_date varchar,
        age_bucket varchar,
        antwerp_bags varchar,
        hamburg_bremen_bags varchar,
        houston_bags varchar,
        miami_bags varchar,
        new_orleans_bags varchar,
        new_york_bags varchar,
        total_bags varchar,
        filename varchar
    )
);

SELECT
    report_date,
    age_bucket,
    antwerp_bags,
    hamburg_bremen_bags,
    houston_bags,
    miami_bags,
    new_orleans_bags,
    new_york_bags,
    total_bags,
    filename
FROM read_csv(
    -- Macro expands to '<LANDING_DIR>/ice_aging/**/*.csv.gzip'.
    @ice_aging_glob(),
    delim = ',',
    encoding = 'utf-8',
    compression = 'gzip',
    header = true,
    -- Match columns by header name so files with reordered/extra
    -- columns still load.
    union_by_name = true,
    -- Adds a 'filename' column recording each row's source file.
    filename = true,
    -- Defer all typing to the foundation layer.
    all_varchar = true,
    -- NOTE(review): silently skips malformed rows — bad extracts will
    -- shrink the dataset without any error surfacing here.
    ignore_errors = true
)
|
||||
@@ -0,0 +1,51 @@
|
||||
-- Raw ICE historical end-of-month warehouse stocks by port — technical ingestion layer.
--
-- Reads historical by-port stock gzip CSVs from the landing directory.
-- All values are varchar; casting happens in foundation.fct_ice_warehouse_stocks_by_port.
--
-- Source: ICE (EOM_KC_cert_stox_by_port_nov96-present.xls)
-- Coverage: November 1996 to present
-- Frequency: monthly (ICE updates the static file monthly)

MODEL (
    name raw.ice_warehouse_stocks_by_port,
    -- FULL: re-scan every landing file on each run; dedup is deferred
    -- to the foundation model.
    kind FULL,
    cron '@daily',
    -- Explicit schema keeps the model contract stable even when the
    -- glob matches no files. All columns varchar by design.
    columns (
        report_date varchar,
        new_york_bags varchar,
        new_orleans_bags varchar,
        houston_bags varchar,
        miami_bags varchar,
        antwerp_bags varchar,
        hamburg_bremen_bags varchar,
        barcelona_bags varchar,
        virginia_bags varchar,
        total_bags varchar,
        filename varchar
    )
);

SELECT
    report_date,
    new_york_bags,
    new_orleans_bags,
    houston_bags,
    miami_bags,
    antwerp_bags,
    hamburg_bremen_bags,
    barcelona_bags,
    virginia_bags,
    total_bags,
    filename
FROM read_csv(
    -- Macro expands to '<LANDING_DIR>/ice_stocks_by_port/**/*.csv.gzip'.
    @ice_stocks_by_port_glob(),
    delim = ',',
    encoding = 'utf-8',
    compression = 'gzip',
    header = true,
    -- Match columns by header name to tolerate column reordering.
    union_by_name = true,
    -- Adds a 'filename' column recording each row's source file.
    filename = true,
    -- Defer all typing to the foundation layer.
    all_varchar = true,
    -- NOTE(review): silently skips malformed rows — parsing regressions
    -- will shrink the dataset without any error surfacing here.
    ignore_errors = true
)
|
||||
Reference in New Issue
Block a user