Add CFTC COT data integration with foundation data model layer
- New extraction package (cftc_cot): downloads yearly Disaggregated Futures ZIPs from CFTC, etag-based dedup, dynamic inner filename discovery, gzip normalization
- SQLMesh 3-layer architecture: raw (technical) → foundation (business model) → serving (mart)
- dim_commodity seed: conformed dimension mapping USDA ↔ CFTC codes — the commodity ontology
- fct_cot_positioning: typed, deduplicated weekly positioning facts for all commodities
- obt_cot_positioning: Coffee C mart with COT Index (26w/52w), WoW delta, OI ratios
- Analytics functions + REST API endpoints: /commodities/<code>/positioning[/latest]
- Dashboard widget: Managed Money net, COT Index card, dual-axis Chart.js chart
- 23 passing tests (10 unit + 2 SQLMesh model + existing regression suite)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
18
extract/cftc_cot/pyproject.toml
Normal file
18
extract/cftc_cot/pyproject.toml
Normal file
@@ -0,0 +1,18 @@
[project]
name = "cftc_cot"
version = "0.1.0"
description = "CFTC Commitment of Traders data extractor"
requires-python = ">=3.13"
dependencies = [
    "niquests>=3.14.1",
]

[project.scripts]
extract_cot = "cftc_cot.execute:extract_cot_dataset"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/cftc_cot"]
0
extract/cftc_cot/src/cftc_cot/__init__.py
Normal file
0
extract/cftc_cot/src/cftc_cot/__init__.py
Normal file
129
extract/cftc_cot/src/cftc_cot/execute.py
Normal file
129
extract/cftc_cot/src/cftc_cot/execute.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""CFTC COT Disaggregated Futures data extraction.
|
||||
|
||||
Downloads yearly ZIP files from CFTC and stores as gzip CSV in the landing
|
||||
directory. CFTC publishes one file per year that updates every Friday at
|
||||
3:30 PM ET. On first run this backfills all years from 2006. On subsequent
|
||||
runs it skips files whose etag matches what is already on disk.
|
||||
|
||||
Landing path: LANDING_DIR/cot/{year}/{etag}.csv.gzip
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
|
||||
import niquests
|
||||
|
||||
from .normalize import find_csv_inner_filename, normalize_zipped_csv
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
handlers=[logging.StreamHandler(sys.stdout)],
|
||||
)
|
||||
logger = logging.getLogger("CFTC COT Extractor")
|
||||
|
||||
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
|
||||
|
||||
# CFTC publishes yearly ZIPs for the disaggregated futures-only report.
|
||||
# The file for the current year is updated each Friday at 3:30 PM ET.
|
||||
COT_URL_TEMPLATE = "https://www.cftc.gov/files/dea/history/fut_disagg_txt_{year}.zip"
|
||||
|
||||
FIRST_YEAR = 2006 # Disaggregated report starts June 2006
|
||||
HTTP_TIMEOUT_SECONDS = 120 # COT ZIPs are up to ~30 MB
|
||||
MAX_YEARS = 25 # Safety bound on backfill range
|
||||
|
||||
|
||||
def _synthetic_etag(year: int, headers: dict) -> str:
    """Build a dedup key when CFTC omits the etag header.

    Uses content-length + last-modified. This is not as strong as a real etag
    (a server clock change would trigger a re-download), but it is safe because
    the staging layer deduplicates via hash key.

    Args:
        year: Report year the key is being built for.
        headers: HEAD-response headers (presumably a case-insensitive mapping,
            as requests-style libraries provide — lowercase lookups used here).

    Returns:
        A filesystem-safe key of the form "{year}_{content_length}_{digest8}".
    """
    import hashlib  # local import: only needed on the rare no-etag path

    last_modified = headers.get("last-modified", "")
    content_length = headers.get("content-length", "0")
    # BUG FIX: the original used the builtin hash(), which is salted per
    # process (PYTHONHASHSEED), so the synthetic "etag" changed on every run
    # and the on-disk dedup check in extract_cot_year never matched — every
    # no-etag year was re-downloaded each run. A cryptographic digest is
    # stable across processes.
    digest = hashlib.sha1(last_modified.encode("utf-8")).hexdigest()[:8]
    etag = f"{year}_{content_length}_{digest}"
    logger.info(f"No etag header for {year}, using synthetic etag: {etag}")
    return etag
def extract_cot_year(year: int, http_session: niquests.Session) -> bool:
    """Download and store COT data for a single year.

    Issues a HEAD request first so an unchanged file (same etag, already on
    disk) is skipped without re-downloading up to ~30 MB.

    Args:
        year: Report year to fetch.
        http_session: Reusable HTTP session (connection pooling across years).

    Returns:
        True if a new file was written, False if skipped or unavailable.

    Raises:
        RuntimeError: On unexpected HTTP status or an empty download.
    """
    url = COT_URL_TEMPLATE.format(year=year)
    logger.info(f"Checking COT data for {year}: {url}")

    head = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
    if head.status_code == 404:
        logger.info(f"Year {year} not available (404) — skipping")
        return False
    # Raise (not assert) so the check survives `python -O`.
    if head.status_code != 200:
        raise RuntimeError(f"Unexpected HEAD status {head.status_code} for {url}")

    raw_etag = head.headers.get("etag", "")
    # Strip quote/colon characters that are unsafe in filenames; fall back to
    # a synthetic key when the server omits the header entirely.
    etag = (
        raw_etag.replace('"', "").replace(":", "_")
        if raw_etag
        else _synthetic_etag(year, head.headers)
    )

    dest_dir = LANDING_DIR / "cot" / str(year)
    local_file = dest_dir / f"{etag}.csv.gzip"

    if local_file.exists():
        logger.info(f"Year {year}: {etag}.csv.gzip already exists, skipping")
        return False

    logger.info(f"Downloading COT data for {year}...")
    response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
    if response.status_code != 200:
        raise RuntimeError(f"GET failed with status {response.status_code} for {url}")
    if not response.content:
        raise RuntimeError(f"Downloaded empty file from {url}")

    # One buffer is enough: ZipFile seeks within it, and normalize_zipped_csv
    # rewinds to position 0 itself. (The original copied the ~30 MB body into
    # a second BytesIO for the filename lookup.)
    zip_buffer = BytesIO(response.content)
    inner_filename = find_csv_inner_filename(zip_buffer)
    normalized = normalize_zipped_csv(zip_buffer, inner_filename)

    dest_dir.mkdir(parents=True, exist_ok=True)
    local_file.write_bytes(normalized.read())

    # write_bytes either succeeds or raises, so only the size needs checking.
    size = local_file.stat().st_size
    if size == 0:
        raise RuntimeError(f"Written file is empty: {local_file}")

    logger.info(f"Stored {local_file} ({size:,} bytes)")
    return True
def extract_cot_dataset():
    """Extract all available CFTC COT disaggregated futures data.

    Downloads current year first (always re-checks for weekly Friday updates),
    then backfills historical years. Bounded to MAX_YEARS. Continues on
    individual year failures so a single bad year does not abort the run.
    """
    LANDING_DIR.mkdir(parents=True, exist_ok=True)
    current_year = datetime.now().year
    # Newest first: the current-year file is the one that changes weekly.
    years = list(range(current_year, FIRST_YEAR - 1, -1))
    # Raise (not assert) so the bound survives `python -O`. NOTE(review):
    # 2006..current exceeds 25 years starting in 2031 — MAX_YEARS will need
    # bumping then.
    if len(years) > MAX_YEARS:
        raise RuntimeError(f"Year range {len(years)} exceeds MAX_YEARS={MAX_YEARS}")

    new_count = 0
    with niquests.Session() as session:
        for year in years:
            try:
                if extract_cot_year(year, session):
                    new_count += 1
            except Exception:
                # Log with traceback and keep going: one bad year must not
                # abort the whole backfill.
                logger.exception(f"Failed to extract COT data for {year}, continuing")

    logger.info(f"COT extraction complete: {new_count} new file(s) downloaded")


if __name__ == "__main__":
    extract_cot_dataset()
43
extract/cftc_cot/src/cftc_cot/normalize.py
Normal file
43
extract/cftc_cot/src/cftc_cot/normalize.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Normalize CFTC ZIP archives to gzip CSV."""
|
||||
|
||||
import gzip
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
def find_csv_inner_filename(buffer: BytesIO) -> str:
    """Find the single .txt file inside a CFTC ZIP archive.

    CFTC uses .txt extension for their comma-delimited CSV files. The filename
    varies across years (e.g. 'f_year.txt', 'FUT_DISAGG_2015.txt').

    Args:
        buffer: ZIP archive content.

    Returns:
        Name of the sole .txt member (case-insensitive extension match).

    Raises:
        ValueError: If the archive does not contain exactly one .txt file.
    """
    with zipfile.ZipFile(buffer, mode="r") as zf:
        txt_files = [n for n in zf.namelist() if n.lower().endswith(".txt")]
        # Raise (not assert): assert is stripped under `python -O`, and this
        # guards external data, not an internal invariant.
        if len(txt_files) != 1:
            raise ValueError(
                f"Expected exactly 1 .txt file in CFTC ZIP, found: {zf.namelist()}"
            )
        return txt_files[0]
def normalize_zipped_csv(buffer: BytesIO, inner_filename: str) -> BytesIO:
    """Extract a single CSV from a ZIP and recompress as gzip.

    Args:
        buffer: ZIP file content as BytesIO (will be read from position 0).
        inner_filename: Name of the file inside the ZIP archive.

    Returns:
        BytesIO with gzip-compressed CSV content, seeked to position 0.

    Raises:
        ValueError: If inner_filename is not present in the archive.
    """
    buffer.seek(0)
    out = BytesIO()
    with zipfile.ZipFile(buffer, mode="r") as zf:
        # Raise (not assert): assert is stripped under `python -O`, and this
        # guards external data, not an internal invariant.
        if inner_filename not in zf.namelist():
            raise ValueError(
                f"Expected '{inner_filename}' in ZIP, found: {zf.namelist()}"
            )
        # Yearly COT files are small enough (~30 MB) to hold in memory, so a
        # single read/write is simpler than chunked streaming here.
        with zf.open(inner_filename, mode="r") as csv_file, gzip.open(out, "wb") as gz:
            gz.write(csv_file.read())
    out.seek(0)
    return out
Reference in New Issue
Block a user