Add CFTC COT data integration with foundation data model layer

- New extraction package (cftc_cot): downloads yearly Disaggregated Futures ZIPs
  from CFTC, etag-based dedup, dynamic inner filename discovery, gzip normalization
- SQLMesh 3-layer architecture: raw (technical) → foundation (business model) → serving (mart)
- dim_commodity seed: conformed dimension mapping USDA ↔ CFTC codes — the commodity ontology
- fct_cot_positioning: typed, deduplicated weekly positioning facts for all commodities
- obt_cot_positioning: Coffee C mart with COT Index (26w/52w), WoW delta, OI ratios
- Analytics functions + REST API endpoints: /commodities/<code>/positioning[/latest]
- Dashboard widget: Managed Money net, COT Index card, dual-axis Chart.js chart
- 23 passing tests (10 unit + 2 SQLMesh model + existing regression suite)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-20 21:57:04 +01:00
parent d09ba91023
commit 0a83b2cb74
19 changed files with 1111 additions and 3 deletions

View File

@@ -0,0 +1,18 @@
# Packaging metadata for the cftc_cot extraction package (PEP 621 / hatchling).
[project]
name = "cftc_cot"
version = "0.1.0"
description = "CFTC Commitment of Traders data extractor"
requires-python = ">=3.13"
dependencies = [
    # HTTP client with a requests-compatible API, used to download CFTC ZIPs.
    "niquests>=3.14.1",
]
# Console entry point: `extract_cot` runs the full dataset extraction.
[project.scripts]
extract_cot = "cftc_cot.execute:extract_cot_dataset"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
# Ship only the package sources in the wheel.
[tool.hatch.build.targets.wheel]
packages = ["src/cftc_cot"]

View File

@@ -0,0 +1,129 @@
"""CFTC COT Disaggregated Futures data extraction.
Downloads yearly ZIP files from CFTC and stores as gzip CSV in the landing
directory. CFTC publishes one file per year that updates every Friday at
3:30 PM ET. On first run this backfills all years from 2006. On subsequent
runs it skips files whose etag matches what is already on disk.
Landing path: LANDING_DIR/cot/{year}/{etag}.csv.gzip
"""
import hashlib
import logging
import os
import pathlib
import sys
from datetime import datetime
from io import BytesIO

import niquests

from .normalize import find_csv_inner_filename, normalize_zipped_csv
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger("CFTC COT Extractor")
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
# CFTC publishes yearly ZIPs for the disaggregated futures-only report.
# The file for the current year is updated each Friday at 3:30 PM ET.
COT_URL_TEMPLATE = "https://www.cftc.gov/files/dea/history/fut_disagg_txt_{year}.zip"
FIRST_YEAR = 2006 # Disaggregated report starts June 2006
HTTP_TIMEOUT_SECONDS = 120 # COT ZIPs are up to ~30 MB
MAX_YEARS = 25 # Safety bound on backfill range
def _synthetic_etag(year: int, headers: dict) -> str:
"""Build a dedup key when CFTC omits the etag header.
Uses content-length + last-modified. This is not as strong as a real etag
(a server clock change would trigger a re-download), but it is safe because
the staging layer deduplicates via hash key.
"""
last_modified = headers.get("last-modified", "")
content_length = headers.get("content-length", "0")
etag = f"{year}_{content_length}_{hash(last_modified) & 0xFFFFFFFF:08x}"
logger.info(f"No etag header for {year}, using synthetic etag: {etag}")
return etag
def extract_cot_year(year: int, http_session: niquests.Session) -> bool:
    """Download and store COT data for a single year.

    Issues a HEAD request first so an unchanged file (same etag) can be
    skipped without downloading the full ZIP payload.

    Args:
        year: Calendar year of the yearly disaggregated-futures ZIP.
        http_session: Reusable HTTP session (connection pooling).

    Returns:
        True if a new file was written, False if skipped or unavailable.

    Raises:
        RuntimeError: On an unexpected HTTP status, an empty download, or an
            empty file on disk after writing.
    """
    url = COT_URL_TEMPLATE.format(year=year)
    logger.info(f"Checking COT data for {year}: {url}")

    head = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
    if head.status_code == 404:
        logger.info(f"Year {year} not available (404) — skipping")
        return False
    # Raise instead of assert: asserts are stripped under `python -O`, and a
    # bad HTTP status is a runtime condition, not a developer invariant.
    if head.status_code != 200:
        raise RuntimeError(f"Unexpected HEAD status {head.status_code} for {url}")

    raw_etag = head.headers.get("etag", "")
    # Strip characters that are awkward in filenames; fall back to a
    # synthetic key when the server omits the etag header.
    etag = raw_etag.replace('"', "").replace(":", "_") if raw_etag else _synthetic_etag(year, head.headers)

    dest_dir = LANDING_DIR / "cot" / str(year)
    local_file = dest_dir / f"{etag}.csv.gzip"
    if local_file.exists():
        logger.info(f"Year {year}: {etag}.csv.gzip already exists, skipping")
        return False

    logger.info(f"Downloading COT data for {year}...")
    response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
    if response.status_code != 200:
        raise RuntimeError(f"GET failed with status {response.status_code} for {url}")
    if not response.content:
        raise RuntimeError(f"Downloaded empty file from {url}")

    # Reuse one buffer for both discovery and normalization instead of
    # materializing the (up to ~30 MB) payload twice; normalize_zipped_csv
    # rewinds the buffer to position 0 itself.
    zip_buffer = BytesIO(response.content)
    inner_filename = find_csv_inner_filename(zip_buffer)
    normalized = normalize_zipped_csv(zip_buffer, inner_filename)

    dest_dir.mkdir(parents=True, exist_ok=True)
    local_file.write_bytes(normalized.read())

    written_size = local_file.stat().st_size
    if written_size == 0:
        raise RuntimeError(f"Written file is empty: {local_file}")
    logger.info(f"Stored {local_file} ({written_size:,} bytes)")
    return True
def extract_cot_dataset():
    """Extract all available CFTC COT disaggregated futures data.

    Downloads the current year first (it is the file that changes each
    Friday), then backfills historical years down to FIRST_YEAR. Bounded to
    MAX_YEARS as a sanity guard. Continues on individual year failures so a
    single bad year does not abort the run.

    Raises:
        RuntimeError: If the computed year range exceeds MAX_YEARS (guards
            against a clock/config error triggering a runaway backfill).
    """
    LANDING_DIR.mkdir(parents=True, exist_ok=True)

    current_year = datetime.now().year
    # Newest year first: the current-year file is the weekly-updated one.
    years = list(range(current_year, FIRST_YEAR - 1, -1))
    # Raise instead of assert: asserts are stripped under `python -O`.
    if len(years) > MAX_YEARS:
        raise RuntimeError(f"Year range {len(years)} exceeds MAX_YEARS={MAX_YEARS}")

    new_count = 0
    with niquests.Session() as session:
        for year in years:
            try:
                if extract_cot_year(year, session):
                    new_count += 1
            except Exception:
                # Log with traceback and keep going; one bad year must not
                # abort the whole backfill.
                logger.exception(f"Failed to extract COT data for {year}, continuing")
    logger.info(f"COT extraction complete: {new_count} new file(s) downloaded")


if __name__ == "__main__":
    extract_cot_dataset()

View File

@@ -0,0 +1,43 @@
"""Normalize CFTC ZIP archives to gzip CSV."""
import gzip
import zipfile
from io import BytesIO
def find_csv_inner_filename(buffer: BytesIO) -> str:
    """Find the single .txt file inside a CFTC ZIP archive.

    CFTC uses the .txt extension for their comma-delimited CSV files. The
    filename varies across years (e.g. 'f_year.txt', 'FUT_DISAGG_2015.txt'),
    so it is discovered dynamically rather than hard-coded.

    Args:
        buffer: ZIP archive content.

    Returns:
        The name of the single .txt member (case of the extension ignored).

    Raises:
        ValueError: If the archive does not contain exactly one .txt file.
    """
    with zipfile.ZipFile(buffer, mode="r") as zf:
        txt_files = [name for name in zf.namelist() if name.lower().endswith(".txt")]
        # Raise instead of assert: asserts are stripped under `python -O`,
        # and a malformed upstream archive is a runtime condition.
        if len(txt_files) != 1:
            raise ValueError(
                f"Expected exactly 1 .txt file in CFTC ZIP, found: {zf.namelist()}"
            )
        return txt_files[0]
def normalize_zipped_csv(buffer: BytesIO, inner_filename: str) -> BytesIO:
    """Extract a single CSV from a ZIP and recompress it as gzip.

    Args:
        buffer: ZIP file content as BytesIO (rewound to position 0 here, so
            a previously-read buffer may be passed in).
        inner_filename: Name of the file inside the ZIP archive.

    Returns:
        BytesIO with gzip-compressed CSV content, seeked to position 0.

    Raises:
        ValueError: If inner_filename is not present in the archive.
    """
    buffer.seek(0)
    out = BytesIO()
    with zipfile.ZipFile(buffer, mode="r") as zf:
        # Raise instead of assert: asserts are stripped under `python -O`.
        if inner_filename not in zf.namelist():
            raise ValueError(
                f"Expected '{inner_filename}' in ZIP, found: {zf.namelist()}"
            )
        with zf.open(inner_filename, mode="r") as csv_file, gzip.open(out, "wb") as gz:
            # Files are at most a few tens of MB, so one full read is fine.
            gz.write(csv_file.read())
    out.seek(0)
    return out