feat(cot): add combined (futures+options) COT extractor and transform models

- extract/cftc_cot: refactor extract_cot_year() to accept url_template and
  landing_subdir params; add _extract_cot() shared loop; add extract_cot_combined()
  entry point using com_disagg_txt_{year}.zip → landing/cot_combined/
- pyproject.toml: add extract_cot_combined script entry point
- macros/__init__.py: add @cot_combined_glob() for cot_combined/**/*.csv.gzip
- fct_cot_positioning.sql: union cot_glob and cot_combined_glob in src CTE;
  add report_type column (FutOnly_or_Combined) to cast_and_clean + deduplicated;
  include FutOnly_or_Combined in hkey to avoid key collisions; add report_type to grain
- obt_cot_positioning.sql: add report_type = 'FutOnly' filter to preserve
  existing serving behavior
- obt_cot_positioning_combined.sql: new serving model filtered to report_type =
  'Combined'; identical analytics (COT index, net %, windows) on combined data
- pipelines.py: register extract_cot_combined; add to extract_all meta-pipeline

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-26 11:24:56 +01:00
parent 8628496881
commit b884bc2b4a
7 changed files with 205 additions and 16 deletions

View File

@@ -10,6 +10,7 @@ dependencies = [
[project.scripts] [project.scripts]
extract_cot = "cftc_cot.execute:extract_cot_dataset" extract_cot = "cftc_cot.execute:extract_cot_dataset"
extract_cot_combined = "cftc_cot.execute:extract_cot_combined"
[build-system] [build-system]
requires = ["hatchling"] requires = ["hatchling"]

View File

@@ -1,11 +1,13 @@
"""CFTC COT Disaggregated Futures data extraction. """CFTC COT Disaggregated data extraction.
Downloads yearly ZIP files from CFTC and stores as gzip CSV in the landing Downloads yearly ZIP files from CFTC and stores as gzip CSV in the landing
directory. CFTC publishes one file per year that updates every Friday at directory. CFTC publishes one file per year that updates every Friday at
3:30 PM ET. On first run this backfills all years from 2006. On subsequent 3:30 PM ET. On first run this backfills all years from 2006. On subsequent
runs it skips files whose etag matches what is already on disk. runs it skips files whose etag matches what is already on disk.
Landing path: LANDING_DIR/cot/{year}/{etag}.csv.gzip Two report variants are supported:
- Futures-only: Landing path: LANDING_DIR/cot/{year}/{etag}.csv.gzip
- Combined (fut+options): Landing path: LANDING_DIR/cot_combined/{year}/{etag}.csv.gzip
""" """
import logging import logging
@@ -37,9 +39,10 @@ logger = logging.getLogger("CFTC COT Extractor")
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing")) LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
# CFTC publishes yearly ZIPs for the disaggregated futures-only report. # CFTC publishes yearly ZIPs for both variants of the disaggregated report.
# The file for the current year is updated each Friday at 3:30 PM ET. # The file for the current year is updated each Friday at 3:30 PM ET.
COT_URL_TEMPLATE = "https://www.cftc.gov/files/dea/history/fut_disagg_txt_{year}.zip" COT_URL_FUTURES_ONLY = "https://www.cftc.gov/files/dea/history/fut_disagg_txt_{year}.zip"
COT_URL_COMBINED = "https://www.cftc.gov/files/dea/history/com_disagg_txt_{year}.zip"
FIRST_YEAR = 2006 # Disaggregated report starts June 2006 FIRST_YEAR = 2006 # Disaggregated report starts June 2006
HTTP_TIMEOUT_SECONDS = 120 # COT ZIPs are up to ~30 MB HTTP_TIMEOUT_SECONDS = 120 # COT ZIPs are up to ~30 MB
@@ -60,12 +63,12 @@ def _synthetic_etag(year: int, headers: dict) -> str:
return etag return etag
def extract_cot_year(year: int, http_session: niquests.Session) -> int: def extract_cot_year(year: int, http_session: niquests.Session, url_template: str, landing_subdir: str) -> int:
"""Download and store COT data for a single year. """Download and store COT data for a single year.
Returns bytes_written (0 if skipped or unavailable). Returns bytes_written (0 if skipped or unavailable).
""" """
url = COT_URL_TEMPLATE.format(year=year) url = url_template.format(year=year)
logger.info(f"Checking COT data for {year}: {url}") logger.info(f"Checking COT data for {year}: {url}")
head = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS) head = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
@@ -79,7 +82,7 @@ def extract_cot_year(year: int, http_session: niquests.Session) -> int:
raw_etag = head.headers.get("etag", "") raw_etag = head.headers.get("etag", "")
etag = normalize_etag(raw_etag) if raw_etag else _synthetic_etag(year, head.headers) etag = normalize_etag(raw_etag) if raw_etag else _synthetic_etag(year, head.headers)
dest_dir = landing_path(LANDING_DIR, "cot", str(year)) dest_dir = landing_path(LANDING_DIR, landing_subdir, str(year))
local_file = dest_dir / f"{etag}.csv.gzip" local_file = dest_dir / f"{etag}.csv.gzip"
if local_file.exists(): if local_file.exists():
@@ -104,8 +107,8 @@ def extract_cot_year(year: int, http_session: niquests.Session) -> int:
return bytes_written return bytes_written
def extract_cot_dataset(): def _extract_cot(url_template: str, landing_subdir: str, extractor_name: str) -> None:
"""Extract all available CFTC COT disaggregated futures data. """Shared extraction loop for any COT report variant.
Downloads current year first (always re-checks for weekly Friday updates), Downloads current year first (always re-checks for weekly Friday updates),
then backfills historical years. Bounded to MAX_YEARS. Continues on then backfills historical years. Bounded to MAX_YEARS. Continues on
@@ -119,7 +122,7 @@ def extract_cot_dataset():
) )
conn = open_state_db(LANDING_DIR) conn = open_state_db(LANDING_DIR)
run_id = start_run(conn, "cftc_cot") run_id = start_run(conn, extractor_name)
files_written = 0 files_written = 0
files_skipped = 0 files_skipped = 0
bytes_written_total = 0 bytes_written_total = 0
@@ -127,7 +130,7 @@ def extract_cot_dataset():
with niquests.Session() as session: with niquests.Session() as session:
for year in years: for year in years:
try: try:
result = extract_cot_year(year, session) result = extract_cot_year(year, session, url_template, landing_subdir)
if result > 0: if result > 0:
files_written += 1 files_written += 1
bytes_written_total += result bytes_written_total += result
@@ -136,7 +139,7 @@ def extract_cot_dataset():
except Exception: except Exception:
logger.exception(f"Failed to extract COT data for {year}, continuing") logger.exception(f"Failed to extract COT data for {year}, continuing")
logger.info(f"COT extraction complete: {files_written} new file(s) downloaded") logger.info(f"COT extraction complete ({extractor_name}): {files_written} new file(s) downloaded")
end_run( end_run(
conn, run_id, status="success", conn, run_id, status="success",
files_written=files_written, files_skipped=files_skipped, files_written=files_written, files_skipped=files_skipped,
@@ -150,5 +153,15 @@ def extract_cot_dataset():
conn.close() conn.close()
def extract_cot_dataset():
    """Run the COT extraction for the futures-only disaggregated report.

    Delegates to the shared extraction loop, pointing it at the futures-only
    URL template, the ``cot`` landing subdirectory and the ``cftc_cot``
    extractor name.
    """
    _extract_cot(
        url_template=COT_URL_FUTURES_ONLY,
        landing_subdir="cot",
        extractor_name="cftc_cot",
    )
def extract_cot_combined():
    """Run the COT extraction for the combined (futures+options) report.

    Delegates to the shared extraction loop, pointing it at the combined
    URL template, the ``cot_combined`` landing subdirectory and the
    ``cftc_cot_combined`` extractor name.
    """
    _extract_cot(
        url_template=COT_URL_COMBINED,
        landing_subdir="cot_combined",
        extractor_name="cftc_cot_combined",
    )
if __name__ == "__main__": if __name__ == "__main__":
extract_cot_dataset() extract_cot_dataset()

View File

@@ -20,6 +20,10 @@ PIPELINES = {
"command": ["uv", "run", "--package", "cftc_cot", "extract_cot"], "command": ["uv", "run", "--package", "cftc_cot", "extract_cot"],
"timeout_seconds": 1800, "timeout_seconds": 1800,
}, },
"extract_cot_combined": {
"command": ["uv", "run", "--package", "cftc_cot", "extract_cot_combined"],
"timeout_seconds": 1800,
},
"extract_prices": { "extract_prices": {
"command": ["uv", "run", "--package", "coffee_prices", "extract_prices"], "command": ["uv", "run", "--package", "coffee_prices", "extract_prices"],
"timeout_seconds": 300, "timeout_seconds": 300,
@@ -49,7 +53,7 @@ PIPELINES = {
"timeout_seconds": 120, "timeout_seconds": 120,
}, },
"extract_all": { "extract_all": {
"command": ["meta", "extract", "extract_cot", "extract_prices", "extract_ice_all", "extract_weather"], "command": ["meta", "extract", "extract_cot", "extract_cot_combined", "extract_prices", "extract_ice_all", "extract_weather"],
"timeout_seconds": 6600, "timeout_seconds": 6600,
}, },
"transform": { "transform": {
@@ -68,7 +72,7 @@ PIPELINES = {
META_PIPELINES: dict[str, list[str]] = { META_PIPELINES: dict[str, list[str]] = {
"extract_all": ["extract", "extract_cot", "extract_prices", "extract_ice_all", "extract_weather"], "extract_all": ["extract", "extract_cot", "extract_cot_combined", "extract_prices", "extract_ice_all", "extract_weather"],
} }

View File

@@ -17,6 +17,13 @@ def cot_glob(evaluator) -> str:
return f"'{landing_dir}/cot/**/*.csv.gzip'" return f"'{landing_dir}/cot/**/*.csv.gzip'"
@macro()
def cot_combined_glob(evaluator) -> str:
    """Build the single-quoted glob matching every combined-report COT file.

    The landing directory comes from the ``LANDING_DIR`` evaluator variable;
    when that is unset or empty, the ``LANDING_DIR`` environment variable is
    used, defaulting to ``data/landing``.
    """
    base_dir = evaluator.var("LANDING_DIR")
    if not base_dir:
        # Evaluator variable missing/empty: fall back to the process environment.
        base_dir = os.environ.get("LANDING_DIR", "data/landing")
    pattern = f"{base_dir}/cot_combined/**/*.csv.gzip"
    return f"'{pattern}'"
@macro() @macro()
def prices_glob(evaluator) -> str: def prices_glob(evaluator) -> str:
"""Return a quoted glob path for all coffee price CSV gzip files under LANDING_DIR.""" """Return a quoted glob path for all coffee price CSV gzip files under LANDING_DIR."""

View File

@@ -4,7 +4,7 @@ MODEL (
kind INCREMENTAL_BY_TIME_RANGE ( kind INCREMENTAL_BY_TIME_RANGE (
time_column report_date time_column report_date
), ),
grain (cftc_commodity_code, report_date, cftc_contract_market_code, ingest_date), grain (cftc_commodity_code, report_date, cftc_contract_market_code, ingest_date, report_type),
start '2006-06-13', start '2006-06-13',
cron '@daily' cron '@daily'
); );
@@ -21,6 +21,18 @@ WITH src AS (
all_varchar = TRUE, all_varchar = TRUE,
max_line_size = 10000000 max_line_size = 10000000
) )
UNION ALL BY NAME
SELECT
*
FROM READ_CSV(
@cot_combined_glob(),
compression = 'gzip',
header = TRUE,
union_by_name = TRUE,
filename = TRUE,
all_varchar = TRUE,
max_line_size = 10000000
)
), cast_and_clean AS ( ), cast_and_clean AS (
SELECT SELECT
TRIM(market_and_exchange_names) AS market_and_exchange_name, /* Identifiers */ TRIM(market_and_exchange_names) AS market_and_exchange_name, /* Identifiers */
@@ -28,6 +40,7 @@ WITH src AS (
TRIM(cftc_commodity_code) AS cftc_commodity_code, TRIM(cftc_commodity_code) AS cftc_commodity_code,
TRIM(cftc_contract_market_code) AS cftc_contract_market_code, TRIM(cftc_contract_market_code) AS cftc_contract_market_code,
TRIM(contract_units) AS contract_units, TRIM(contract_units) AS contract_units,
TRIM("FutOnly_or_Combined") AS report_type, /* 'FutOnly' or 'Combined' — discriminates the two CFTC report variants */
TRY_CAST(open_interest_all AS INT) AS open_interest, /* Open interest */ /* CFTC uses '.' as null for any field — use TRY_CAST throughout */ TRY_CAST(open_interest_all AS INT) AS open_interest, /* Open interest */ /* CFTC uses '.' as null for any field — use TRY_CAST throughout */
TRY_CAST(prod_merc_positions_long_all AS INT) AS prod_merc_long, /* Producer / Merchant (commercial hedgers: exporters, processors) */ TRY_CAST(prod_merc_positions_long_all AS INT) AS prod_merc_long, /* Producer / Merchant (commercial hedgers: exporters, processors) */
TRY_CAST(prod_merc_positions_short_all AS INT) AS prod_merc_short, TRY_CAST(prod_merc_positions_short_all AS INT) AS prod_merc_short,
@@ -66,12 +79,13 @@ WITH src AS (
cftc_commodity_code, cftc_commodity_code,
report_date_as_yyyy_mm_dd, report_date_as_yyyy_mm_dd,
cftc_contract_market_code, cftc_contract_market_code,
"FutOnly_or_Combined",
open_interest_all, open_interest_all,
m_money_positions_long_all, m_money_positions_long_all,
m_money_positions_short_all, m_money_positions_short_all,
prod_merc_positions_long_all, prod_merc_positions_long_all,
prod_merc_positions_short_all prod_merc_positions_short_all
) AS hkey /* Dedup key: hash of business grain + key metrics */ ) AS hkey /* Dedup key: hash of business grain + key metrics; includes report variant so fut-only and combined rows get distinct keys */
FROM src FROM src
/* Reject rows with null commodity code or malformed date */ /* Reject rows with null commodity code or malformed date */
WHERE WHERE
@@ -119,6 +133,7 @@ WITH src AS (
ANY_VALUE(traders_managed_money_short) AS traders_managed_money_short, ANY_VALUE(traders_managed_money_short) AS traders_managed_money_short,
ANY_VALUE(traders_managed_money_spread) AS traders_managed_money_spread, ANY_VALUE(traders_managed_money_spread) AS traders_managed_money_spread,
ANY_VALUE(ingest_date) AS ingest_date, ANY_VALUE(ingest_date) AS ingest_date,
ANY_VALUE(report_type) AS report_type,
hkey hkey
FROM cast_and_clean FROM cast_and_clean
GROUP BY GROUP BY

View File

@@ -20,6 +20,7 @@ WITH latest_revision AS (
ON f.cftc_commodity_code = d.cftc_commodity_code ON f.cftc_commodity_code = d.cftc_commodity_code
WHERE WHERE
d.commodity_name = 'Coffee, Green' d.commodity_name = 'Coffee, Green'
AND f.report_type = 'FutOnly'
AND f.report_date BETWEEN @start_ds AND @end_ds AND f.report_date BETWEEN @start_ds AND @end_ds
QUALIFY QUALIFY
ROW_NUMBER() OVER ( ROW_NUMBER() OVER (

View File

@@ -0,0 +1,148 @@
/* Serving mart: COT positioning (combined futures+options) for Coffee C futures. */
/* Same analytics as serving.cot_positioning, but filtered to the combined */
/* report variant (report_type = 'Combined'). Positions include */
/* options delta-equivalent exposure, showing total directional market bet. */
/* Grain: one row per report_date for Coffee C futures. */
/* Latest revision per date: the row with the most recent ingest_date wins, */
/* which deduplicates CFTC corrections. */
/* Lookback: the trailing analytics (LAG, 26-row and 52-row windows) need rows */
/* from before the interval being processed. The source CTE therefore reads */
/* 420 days (60 weekly reports) ahead of @start_ds, and the final SELECT trims */
/* the output back to [@start_ds, @end_ds]. Without this buffer an incremental */
/* run would compute the windows over the interval's rows only, yielding a NULL */
/* week-over-week change and a COT index pinned at 50.0. */
MODEL (
  name serving.cot_positioning_combined,
  kind INCREMENTAL_BY_TIME_RANGE (
    time_column report_date
  ),
  grain (
    report_date
  ),
  start '2006-06-13',
  cron '@daily'
);

WITH latest_revision AS (
  /* Pick the most recently ingested row when CFTC issues corrections */
  SELECT
    f.*
  FROM foundation.fct_cot_positioning AS f
  INNER JOIN foundation.dim_commodity AS d
    ON f.cftc_commodity_code = d.cftc_commodity_code
  WHERE
    d.commodity_name = 'Coffee, Green'
    AND f.report_type = 'Combined'
    /* Read extra history so the trailing windows below are fully populated */
    AND f.report_date BETWEEN CAST(@start_ds AS DATE) - INTERVAL 420 DAY AND @end_ds
  QUALIFY
    ROW_NUMBER() OVER (
      PARTITION BY f.report_date, f.cftc_contract_market_code
      ORDER BY f.ingest_date DESC
    ) = 1
), with_derived AS (
  SELECT
    report_date,
    market_and_exchange_name,
    cftc_commodity_code,
    cftc_contract_market_code,
    contract_units,
    ingest_date,
    /* Absolute positions (contracts, delta-equivalent for options) */
    open_interest,
    managed_money_long,
    managed_money_short,
    managed_money_spread,
    managed_money_net,
    prod_merc_long,
    prod_merc_short,
    prod_merc_net,
    swap_long,
    swap_short,
    swap_spread,
    swap_net,
    other_reportable_long,
    other_reportable_short,
    other_reportable_spread,
    other_reportable_net,
    nonreportable_long,
    nonreportable_short,
    nonreportable_net,
    /* Normalized: managed money net as % of open interest */
    /* Removes size effects and makes cross-period comparison meaningful */
    ROUND(managed_money_net::REAL / NULLIF(open_interest, 0) * 100, 2) AS managed_money_net_pct_of_oi,
    /* Long/short ratio: >1 = more bulls than bears in managed money */
    ROUND(managed_money_long::REAL / NULLIF(managed_money_short, 0), 3) AS managed_money_long_short_ratio,
    /* Weekly changes */
    change_open_interest,
    change_managed_money_long,
    change_managed_money_short,
    change_managed_money_net,
    change_prod_merc_long,
    change_prod_merc_short,
    /* Week-over-week momentum in managed money net (via LAG) */
    managed_money_net - LAG(managed_money_net, 1) OVER (ORDER BY report_date) AS managed_money_net_wow,
    /* Concentration */
    concentration_top4_long_pct,
    concentration_top4_short_pct,
    concentration_top8_long_pct,
    concentration_top8_short_pct,
    /* Trader counts */
    traders_total,
    traders_managed_money_long,
    traders_managed_money_short,
    traders_managed_money_spread,
    /* COT Index (26-week): where is current net vs. trailing 26 weeks? */
    /* 0 = most bearish extreme, 100 = most bullish extreme */
    /* Includes options delta-equivalent exposure */
    /* A flat window (max = min) maps to the neutral midpoint 50.0 */
    CASE
      WHEN MAX(managed_money_net) OVER w26 = MIN(managed_money_net) OVER w26
      THEN 50.0
      ELSE ROUND(
        (
          managed_money_net - MIN(managed_money_net) OVER w26
        )::REAL / (
          MAX(managed_money_net) OVER w26 - MIN(managed_money_net) OVER w26
        ) * 100,
        1
      )
    END AS cot_index_26w,
    /* COT Index (52-week): longer-term positioning context */
    CASE
      WHEN MAX(managed_money_net) OVER w52 = MIN(managed_money_net) OVER w52
      THEN 50.0
      ELSE ROUND(
        (
          managed_money_net - MIN(managed_money_net) OVER w52
        )::REAL / (
          MAX(managed_money_net) OVER w52 - MIN(managed_money_net) OVER w52
        ) * 100,
        1
      )
    END AS cot_index_52w
  FROM latest_revision
  WINDOW
    w26 AS (ORDER BY report_date ROWS BETWEEN 25 PRECEDING AND CURRENT ROW),
    w52 AS (ORDER BY report_date ROWS BETWEEN 51 PRECEDING AND CURRENT ROW)
)
SELECT
  report_date,
  market_and_exchange_name,
  cftc_commodity_code,
  cftc_contract_market_code,
  contract_units,
  ingest_date,
  open_interest,
  managed_money_long,
  managed_money_short,
  managed_money_spread,
  managed_money_net,
  prod_merc_long,
  prod_merc_short,
  prod_merc_net,
  swap_long,
  swap_short,
  swap_spread,
  swap_net,
  other_reportable_long,
  other_reportable_short,
  other_reportable_spread,
  other_reportable_net,
  nonreportable_long,
  nonreportable_short,
  nonreportable_net,
  managed_money_net_pct_of_oi,
  managed_money_long_short_ratio,
  change_open_interest,
  change_managed_money_long,
  change_managed_money_short,
  change_managed_money_net,
  change_prod_merc_long,
  change_prod_merc_short,
  managed_money_net_wow,
  concentration_top4_long_pct,
  concentration_top4_short_pct,
  concentration_top8_long_pct,
  concentration_top8_short_pct,
  traders_total,
  traders_managed_money_long,
  traders_managed_money_short,
  traders_managed_money_spread,
  cot_index_26w,
  cot_index_52w
FROM with_derived
/* Drop the lookback rows: only the interval being processed is emitted */
WHERE
  report_date BETWEEN @start_ds AND @end_ds
ORDER BY
  report_date