refactor(transform): remove raw layer, read landing zone directly
- Delete 6 raw data models (coffee_prices, cot_disaggregated, ice_*, psd_data) — pure read_csv passthroughs with no added value
- Move 3 PSD seed models raw/ → seeds/, rename schema raw.* → seeds.*
- Update staging.psdalldata__commodity: read_csv(@psd_glob()) directly, join seeds.psd_* instead of raw.psd_*
- Update 5 foundation models: inline read_csv() with src CTE, removing raw.* dependency (fct_coffee_prices, fct_cot_positioning, fct_ice_*)
- Remove fixture-based SQLMesh test that depended on raw.cot_disaggregated (unit tests incompatible with inline read_csv; integration run covers this)
- Update readme.md: 3-layer architecture (staging/foundation → serving)

Landing files are immutable and content-addressed — the landing directory is the audit trail. A raw SQL layer duplicated file bytes into DuckDB with no added value.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,8 +6,8 @@
|
||||
-- As new commodities are added (cocoa, sugar), rows are added here.
|
||||
--
|
||||
-- References:
|
||||
-- usda_commodity_code → raw.psd_alldata.commodity_code (numeric string, e.g. '0711100')
|
||||
-- cftc_commodity_code → raw.cot_disaggregated.cftc_commodity_code (3-char, e.g. '083')
|
||||
-- usda_commodity_code → staging.psdalldata__commodity.commodity_code (numeric string, e.g. '0711100')
|
||||
-- cftc_commodity_code → foundation.fct_cot_positioning.cftc_commodity_code (3-char, e.g. '083')
|
||||
--
|
||||
-- NOTE: Defined as FULL model (not SEED) to guarantee leading-zero preservation.
|
||||
-- Pandas CSV loading converts '083' → 83 even with varchar column declarations.
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
-- Foundation fact: daily KC=F Coffee C futures prices.
|
||||
--
|
||||
-- Casts raw varchar columns to proper types and deduplicates via hash key.
|
||||
-- Reads directly from the landing zone, casts varchar columns to proper types,
|
||||
-- and deduplicates via hash key.
|
||||
-- Covers all available history from the landing directory.
|
||||
--
|
||||
-- Grain: one row per trade_date.
|
||||
@@ -17,7 +18,18 @@ MODEL (
|
||||
cron '@daily'
|
||||
);
|
||||
|
||||
WITH cast_and_clean AS (
|
||||
WITH src AS (
|
||||
SELECT * FROM read_csv(
|
||||
@prices_glob(),
|
||||
compression = 'gzip',
|
||||
header = true,
|
||||
union_by_name = true,
|
||||
filename = true,
|
||||
all_varchar = true
|
||||
)
|
||||
),
|
||||
|
||||
cast_and_clean AS (
|
||||
SELECT
|
||||
TRY_CAST(Date AS date) AS trade_date,
|
||||
TRY_CAST(Open AS double) AS open,
|
||||
@@ -32,7 +44,7 @@ WITH cast_and_clean AS (
|
||||
|
||||
-- Dedup key: trade date + close price
|
||||
hash(Date, Close) AS hkey
|
||||
FROM raw.coffee_prices
|
||||
FROM src
|
||||
WHERE TRY_CAST(Date AS date) IS NOT NULL
|
||||
AND TRY_CAST(Close AS double) IS NOT NULL
|
||||
),
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
-- Foundation fact: CFTC COT positioning, weekly grain, all commodities.
|
||||
--
|
||||
-- Casts raw varchar columns to proper types, cleans column names,
|
||||
-- computes net positions (long - short) per trader category, and
|
||||
-- deduplicates via hash key. Covers all commodities — filtering to
|
||||
-- Reads directly from the landing zone, casts varchar columns to proper types,
|
||||
-- cleans column names, computes net positions (long - short) per trader category,
|
||||
-- and deduplicates via hash key. Covers all commodities — filtering to
|
||||
-- a specific commodity happens in the serving layer.
|
||||
--
|
||||
-- Grain: one row per (cftc_commodity_code, report_date, cftc_contract_market_code)
|
||||
@@ -19,7 +19,19 @@ MODEL (
|
||||
cron '@daily'
|
||||
);
|
||||
|
||||
WITH cast_and_clean AS (
|
||||
WITH src AS (
|
||||
SELECT * FROM read_csv(
|
||||
@cot_glob(),
|
||||
compression = 'gzip',
|
||||
header = true,
|
||||
union_by_name = true,
|
||||
filename = true,
|
||||
all_varchar = true,
|
||||
max_line_size = 10000000
|
||||
)
|
||||
),
|
||||
|
||||
cast_and_clean AS (
|
||||
SELECT
|
||||
-- Identifiers
|
||||
trim(market_and_exchange_names) AS market_and_exchange_name,
|
||||
@@ -103,7 +115,7 @@ WITH cast_and_clean AS (
|
||||
prod_merc_positions_long_all,
|
||||
prod_merc_positions_short_all
|
||||
) AS hkey
|
||||
FROM raw.cot_disaggregated
|
||||
FROM src
|
||||
-- Reject rows with null commodity code or malformed date
|
||||
WHERE trim(cftc_commodity_code) IS NOT NULL
|
||||
AND len(trim(cftc_commodity_code)) > 0
|
||||
|
||||
@@ -1,58 +1,70 @@
|
||||
-- Foundation fact: ICE certified Coffee C (Arabica) aging report.
--
-- Casts raw varchar columns to proper types and deduplicates via hash key.
-- Grain: one row per (report_date, age_bucket).
-- Age buckets represent how long coffee has been in certified storage.
-- Port columns are in bags (60kg).

MODEL (
    name foundation.fct_ice_aging_stocks,
    kind INCREMENTAL_BY_TIME_RANGE (
        time_column report_date
    ),
    grain (report_date, age_bucket),
    start '2020-01-01',
    cron '@daily'
);

-- Type the varchar inputs; TRY_CAST yields NULL instead of failing on bad values.
WITH cast_and_clean AS (
    SELECT
        TRY_CAST(report_date AS date)           AS report_date,
        age_bucket,
        TRY_CAST(antwerp_bags AS bigint)        AS antwerp_bags,
        TRY_CAST(hamburg_bremen_bags AS bigint) AS hamburg_bremen_bags,
        TRY_CAST(houston_bags AS bigint)        AS houston_bags,
        TRY_CAST(miami_bags AS bigint)          AS miami_bags,
        TRY_CAST(new_orleans_bags AS bigint)    AS new_orleans_bags,
        TRY_CAST(new_york_bags AS bigint)       AS new_york_bags,
        TRY_CAST(total_bags AS bigint)          AS total_bags,

        filename AS source_file,

        -- Dedup key: report date + age bucket + total bags
        hash(report_date, age_bucket, total_bags) AS hkey
    FROM raw.ice_aging_stocks
    -- Reject rows without a parseable date or without an age bucket
    WHERE TRY_CAST(report_date AS date) IS NOT NULL
      AND age_bucket IS NOT NULL
      AND age_bucket != ''
),

-- Collapse rows sharing an hkey to one surviving row.
-- row_number() is used rather than per-column any_value() so that every column
-- comes from the same input row and the pick is repeatable between runs:
-- lowest source_file wins, port values break any remaining ties.
-- NOTE(review): hkey includes total_bags, so two files reporting different
-- totals for the same (report_date, age_bucket) both survive — confirm this is
-- intended given the declared grain.
deduplicated AS (
    SELECT
        report_date,
        age_bucket,
        antwerp_bags,
        hamburg_bremen_bags,
        houston_bags,
        miami_bags,
        new_orleans_bags,
        new_york_bags,
        total_bags,
        source_file,
        hkey
    FROM cast_and_clean
    QUALIFY row_number() OVER (
        PARTITION BY hkey
        ORDER BY
            source_file,
            antwerp_bags,
            hamburg_bremen_bags,
            houston_bags,
            miami_bags,
            new_orleans_bags,
            new_york_bags
    ) = 1
)

-- Restrict to the interval SQLMesh is currently processing (inclusive bounds).
SELECT
    report_date,
    age_bucket,
    antwerp_bags,
    hamburg_bremen_bags,
    houston_bags,
    miami_bags,
    new_orleans_bags,
    new_york_bags,
    total_bags,
    source_file,
    hkey
FROM deduplicated
WHERE report_date BETWEEN @start_ds AND @end_ds
|
||||
-- Foundation fact: ICE certified Coffee C (Arabica) aging report.
--
-- Reads directly from the landing zone, casts varchar columns to proper types,
-- and deduplicates via hash key.
-- Grain: one row per (report_date, age_bucket).
-- Age buckets represent how long coffee has been in certified storage.
-- Port columns are in bags (60kg).

MODEL (
    name foundation.fct_ice_aging_stocks,
    kind INCREMENTAL_BY_TIME_RANGE (
        time_column report_date
    ),
    grain (report_date, age_bucket),
    start '2020-01-01',
    cron '@daily'
);

-- Gzip-compressed landing CSVs matched by @ice_aging_glob(), every column read
-- as varchar; `filename` carries the path of the file each row came from.
WITH src AS (
    SELECT *
    FROM read_csv(
        @ice_aging_glob(),
        compression = 'gzip',
        header = true,
        union_by_name = true,
        filename = true,
        all_varchar = true
    )
),

-- Type the varchar inputs; TRY_CAST yields NULL instead of failing on bad values.
cast_and_clean AS (
    SELECT
        TRY_CAST(report_date AS date)           AS report_date,
        age_bucket,
        TRY_CAST(antwerp_bags AS bigint)        AS antwerp_bags,
        TRY_CAST(hamburg_bremen_bags AS bigint) AS hamburg_bremen_bags,
        TRY_CAST(houston_bags AS bigint)        AS houston_bags,
        TRY_CAST(miami_bags AS bigint)          AS miami_bags,
        TRY_CAST(new_orleans_bags AS bigint)    AS new_orleans_bags,
        TRY_CAST(new_york_bags AS bigint)       AS new_york_bags,
        TRY_CAST(total_bags AS bigint)          AS total_bags,

        filename AS source_file,

        -- Dedup key: report date + age bucket + total bags
        hash(report_date, age_bucket, total_bags) AS hkey
    FROM src
    -- Reject rows without a parseable date or without an age bucket
    WHERE TRY_CAST(report_date AS date) IS NOT NULL
      AND age_bucket IS NOT NULL
      AND age_bucket != ''
),

-- Collapse rows sharing an hkey to one surviving row.
-- row_number() is used rather than per-column any_value() so that every column
-- comes from the same input row and the pick is repeatable between runs:
-- lowest source_file wins, port values break any remaining ties.
-- NOTE(review): hkey includes total_bags, so two files reporting different
-- totals for the same (report_date, age_bucket) both survive — confirm this is
-- intended given the declared grain.
deduplicated AS (
    SELECT
        report_date,
        age_bucket,
        antwerp_bags,
        hamburg_bremen_bags,
        houston_bags,
        miami_bags,
        new_orleans_bags,
        new_york_bags,
        total_bags,
        source_file,
        hkey
    FROM cast_and_clean
    QUALIFY row_number() OVER (
        PARTITION BY hkey
        ORDER BY
            source_file,
            antwerp_bags,
            hamburg_bremen_bags,
            houston_bags,
            miami_bags,
            new_orleans_bags,
            new_york_bags
    ) = 1
)

-- Restrict to the interval SQLMesh is currently processing (inclusive bounds).
SELECT
    report_date,
    age_bucket,
    antwerp_bags,
    hamburg_bremen_bags,
    houston_bags,
    miami_bags,
    new_orleans_bags,
    new_york_bags,
    total_bags,
    source_file,
    hkey
FROM deduplicated
WHERE report_date BETWEEN @start_ds AND @end_ds
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
-- Foundation fact: ICE certified Coffee C (Arabica) warehouse stocks.
|
||||
--
|
||||
-- Casts raw varchar columns to proper types and deduplicates via hash key.
|
||||
-- Reads directly from the landing zone, casts varchar columns to proper types,
|
||||
-- and deduplicates via hash key.
|
||||
-- "Certified" means Coffee C graded and stamped as delivery-eligible
|
||||
-- against ICE futures contracts — a key physical supply indicator.
|
||||
--
|
||||
@@ -16,7 +17,18 @@ MODEL (
|
||||
cron '@daily'
|
||||
);
|
||||
|
||||
WITH cast_and_clean AS (
|
||||
WITH src AS (
|
||||
SELECT * FROM read_csv(
|
||||
@ice_stocks_glob(),
|
||||
compression = 'gzip',
|
||||
header = true,
|
||||
union_by_name = true,
|
||||
filename = true,
|
||||
all_varchar = true
|
||||
)
|
||||
),
|
||||
|
||||
cast_and_clean AS (
|
||||
SELECT
|
||||
TRY_CAST(report_date AS date) AS report_date,
|
||||
TRY_CAST(total_certified_bags AS bigint) AS total_certified_bags,
|
||||
@@ -26,7 +38,7 @@ WITH cast_and_clean AS (
|
||||
|
||||
-- Dedup key: report date + total bags
|
||||
hash(report_date, total_certified_bags) AS hkey
|
||||
FROM raw.ice_warehouse_stocks
|
||||
FROM src
|
||||
WHERE TRY_CAST(report_date AS date) IS NOT NULL
|
||||
AND TRY_CAST(total_certified_bags AS bigint) IS NOT NULL
|
||||
),
|
||||
|
||||
@@ -1,60 +1,72 @@
|
||||
-- Foundation fact: ICE historical end-of-month Coffee C certified warehouse stocks by port.
--
-- Covers November 1996 to present (30-year history). Casts raw varchar columns
-- to proper types and deduplicates via hash key.
--
-- Grain: one row per report_date (end-of-month).
-- Port columns are in bags (60kg).

MODEL (
    name foundation.fct_ice_warehouse_stocks_by_port,
    kind INCREMENTAL_BY_TIME_RANGE (
        time_column report_date
    ),
    grain (report_date),
    start '1996-11-01',
    cron '@daily'
);

-- Type the varchar inputs; TRY_CAST yields NULL instead of failing on bad values.
WITH cast_and_clean AS (
    SELECT
        TRY_CAST(report_date AS date)           AS report_date,
        TRY_CAST(new_york_bags AS bigint)       AS new_york_bags,
        TRY_CAST(new_orleans_bags AS bigint)    AS new_orleans_bags,
        TRY_CAST(houston_bags AS bigint)        AS houston_bags,
        TRY_CAST(miami_bags AS bigint)          AS miami_bags,
        TRY_CAST(antwerp_bags AS bigint)        AS antwerp_bags,
        TRY_CAST(hamburg_bremen_bags AS bigint) AS hamburg_bremen_bags,
        TRY_CAST(barcelona_bags AS bigint)      AS barcelona_bags,
        TRY_CAST(virginia_bags AS bigint)       AS virginia_bags,
        TRY_CAST(total_bags AS bigint)          AS total_bags,

        filename AS source_file,

        -- Dedup key: report date + total bags
        hash(report_date, total_bags) AS hkey
    FROM raw.ice_warehouse_stocks_by_port
    -- Reject rows without a parseable date or a numeric total
    WHERE TRY_CAST(report_date AS date) IS NOT NULL
      AND TRY_CAST(total_bags AS bigint) IS NOT NULL
),

-- Collapse rows sharing an hkey to one surviving row.
-- row_number() is used rather than per-column any_value() so that every column
-- comes from the same input row and the pick is repeatable between runs:
-- lowest source_file wins, port values break any remaining ties.
-- NOTE(review): hkey includes total_bags, so two files reporting different
-- totals for the same report_date both survive — confirm this is intended
-- given the declared grain.
deduplicated AS (
    SELECT
        report_date,
        new_york_bags,
        new_orleans_bags,
        houston_bags,
        miami_bags,
        antwerp_bags,
        hamburg_bremen_bags,
        barcelona_bags,
        virginia_bags,
        total_bags,
        source_file,
        hkey
    FROM cast_and_clean
    QUALIFY row_number() OVER (
        PARTITION BY hkey
        ORDER BY
            source_file,
            new_york_bags,
            new_orleans_bags,
            houston_bags,
            miami_bags,
            antwerp_bags,
            hamburg_bremen_bags,
            barcelona_bags,
            virginia_bags
    ) = 1
)

-- Restrict to the interval SQLMesh is currently processing (inclusive bounds).
SELECT
    report_date,
    new_york_bags,
    new_orleans_bags,
    houston_bags,
    miami_bags,
    antwerp_bags,
    hamburg_bremen_bags,
    barcelona_bags,
    virginia_bags,
    total_bags,
    source_file,
    hkey
FROM deduplicated
WHERE report_date BETWEEN @start_ds AND @end_ds
|
||||
-- Foundation fact: ICE historical end-of-month Coffee C certified warehouse stocks by port.
--
-- Reads directly from the landing zone, casts varchar columns to proper types,
-- and deduplicates via hash key.
-- Covers November 1996 to present (30-year history).
--
-- Grain: one row per report_date (end-of-month).
-- Port columns are in bags (60kg).

MODEL (
    name foundation.fct_ice_warehouse_stocks_by_port,
    kind INCREMENTAL_BY_TIME_RANGE (
        time_column report_date
    ),
    grain (report_date),
    start '1996-11-01',
    cron '@daily'
);

-- Gzip-compressed landing CSVs matched by @ice_stocks_by_port_glob(), every
-- column read as varchar; `filename` carries the path of the file each row came from.
WITH src AS (
    SELECT *
    FROM read_csv(
        @ice_stocks_by_port_glob(),
        compression = 'gzip',
        header = true,
        union_by_name = true,
        filename = true,
        all_varchar = true
    )
),

-- Type the varchar inputs; TRY_CAST yields NULL instead of failing on bad values.
cast_and_clean AS (
    SELECT
        TRY_CAST(report_date AS date)           AS report_date,
        TRY_CAST(new_york_bags AS bigint)       AS new_york_bags,
        TRY_CAST(new_orleans_bags AS bigint)    AS new_orleans_bags,
        TRY_CAST(houston_bags AS bigint)        AS houston_bags,
        TRY_CAST(miami_bags AS bigint)          AS miami_bags,
        TRY_CAST(antwerp_bags AS bigint)        AS antwerp_bags,
        TRY_CAST(hamburg_bremen_bags AS bigint) AS hamburg_bremen_bags,
        TRY_CAST(barcelona_bags AS bigint)      AS barcelona_bags,
        TRY_CAST(virginia_bags AS bigint)       AS virginia_bags,
        TRY_CAST(total_bags AS bigint)          AS total_bags,

        filename AS source_file,

        -- Dedup key: report date + total bags
        hash(report_date, total_bags) AS hkey
    FROM src
    -- Reject rows without a parseable date or a numeric total
    WHERE TRY_CAST(report_date AS date) IS NOT NULL
      AND TRY_CAST(total_bags AS bigint) IS NOT NULL
),

-- Collapse rows sharing an hkey to one surviving row.
-- row_number() is used rather than per-column any_value() so that every column
-- comes from the same input row and the pick is repeatable between runs:
-- lowest source_file wins, port values break any remaining ties.
-- NOTE(review): hkey includes total_bags, so two files reporting different
-- totals for the same report_date both survive — confirm this is intended
-- given the declared grain.
deduplicated AS (
    SELECT
        report_date,
        new_york_bags,
        new_orleans_bags,
        houston_bags,
        miami_bags,
        antwerp_bags,
        hamburg_bremen_bags,
        barcelona_bags,
        virginia_bags,
        total_bags,
        source_file,
        hkey
    FROM cast_and_clean
    QUALIFY row_number() OVER (
        PARTITION BY hkey
        ORDER BY
            source_file,
            new_york_bags,
            new_orleans_bags,
            houston_bags,
            miami_bags,
            antwerp_bags,
            hamburg_bremen_bags,
            barcelona_bags,
            virginia_bags
    ) = 1
)

-- Restrict to the interval SQLMesh is currently processing (inclusive bounds).
SELECT
    report_date,
    new_york_bags,
    new_orleans_bags,
    houston_bags,
    miami_bags,
    antwerp_bags,
    hamburg_bremen_bags,
    barcelona_bags,
    virginia_bags,
    total_bags,
    source_file,
    hkey
FROM deduplicated
WHERE report_date BETWEEN @start_ds AND @end_ds
|
||||
|
||||
Reference in New Issue
Block a user