Add CFTC COT data integration with foundation data model layer
- New extraction package (cftc_cot): downloads yearly Disaggregated Futures ZIPs from CFTC, etag-based dedup, dynamic inner filename discovery, gzip normalization
- SQLMesh 3-layer architecture: raw (technical) → foundation (business model) → serving (mart)
- dim_commodity seed: conformed dimension mapping USDA ↔ CFTC codes — the commodity ontology
- fct_cot_positioning: typed, deduplicated weekly positioning facts for all commodities
- obt_cot_positioning: Coffee C mart with COT Index (26w/52w), WoW delta, OI ratios
- Analytics functions + REST API endpoints: /commodities/<code>/positioning[/latest]
- Dashboard widget: Managed Money net, COT Index card, dual-axis Chart.js chart
- 23 passing tests (10 unit + 2 SQLMesh model + existing regression suite)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
18
extract/cftc_cot/pyproject.toml
Normal file
18
extract/cftc_cot/pyproject.toml
Normal file
@@ -0,0 +1,18 @@
[project]
name = "cftc_cot"
version = "0.1.0"
description = "CFTC Commitment of Traders data extractor"
requires-python = ">=3.13"
dependencies = [
    "niquests>=3.14.1",
]

[project.scripts]
extract_cot = "cftc_cot.execute:extract_cot_dataset"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/cftc_cot"]
0
extract/cftc_cot/src/cftc_cot/__init__.py
Normal file
0
extract/cftc_cot/src/cftc_cot/__init__.py
Normal file
129
extract/cftc_cot/src/cftc_cot/execute.py
Normal file
129
extract/cftc_cot/src/cftc_cot/execute.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""CFTC COT Disaggregated Futures data extraction.
|
||||
|
||||
Downloads yearly ZIP files from CFTC and stores as gzip CSV in the landing
|
||||
directory. CFTC publishes one file per year that updates every Friday at
|
||||
3:30 PM ET. On first run this backfills all years from 2006. On subsequent
|
||||
runs it skips files whose etag matches what is already on disk.
|
||||
|
||||
Landing path: LANDING_DIR/cot/{year}/{etag}.csv.gzip
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
|
||||
import niquests
|
||||
|
||||
from .normalize import find_csv_inner_filename, normalize_zipped_csv
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
handlers=[logging.StreamHandler(sys.stdout)],
|
||||
)
|
||||
logger = logging.getLogger("CFTC COT Extractor")
|
||||
|
||||
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
|
||||
|
||||
# CFTC publishes yearly ZIPs for the disaggregated futures-only report.
|
||||
# The file for the current year is updated each Friday at 3:30 PM ET.
|
||||
COT_URL_TEMPLATE = "https://www.cftc.gov/files/dea/history/fut_disagg_txt_{year}.zip"
|
||||
|
||||
FIRST_YEAR = 2006 # Disaggregated report starts June 2006
|
||||
HTTP_TIMEOUT_SECONDS = 120 # COT ZIPs are up to ~30 MB
|
||||
MAX_YEARS = 25 # Safety bound on backfill range
|
||||
|
||||
|
||||
def _synthetic_etag(year: int, headers: dict) -> str:
    """Build a dedup key when CFTC omits the etag header.

    Uses content-length + last-modified. This is not as strong as a real etag
    (a server clock change would trigger a re-download), but it is safe because
    the staging layer deduplicates via hash key.

    Args:
        year: Report year the key is being built for.
        headers: HEAD-response headers (presumably a case-insensitive mapping,
            as requests-style libraries provide — lowercase lookups used here).

    Returns:
        A filesystem-safe key of the form "{year}_{content_length}_{digest8}".
    """
    import hashlib  # local import: only needed on the rare no-etag path

    last_modified = headers.get("last-modified", "")
    content_length = headers.get("content-length", "0")
    # BUG FIX: the original used the builtin hash(), which is salted per
    # process (PYTHONHASHSEED), so the synthetic "etag" changed on every run
    # and the on-disk dedup check in extract_cot_year never matched — every
    # no-etag year was re-downloaded each run. A cryptographic digest is
    # stable across processes.
    digest = hashlib.sha1(last_modified.encode("utf-8")).hexdigest()[:8]
    etag = f"{year}_{content_length}_{digest}"
    logger.info(f"No etag header for {year}, using synthetic etag: {etag}")
    return etag
def extract_cot_year(year: int, http_session: niquests.Session) -> bool:
    """Download and store COT data for a single year.

    Issues a HEAD request first so an unchanged file (same etag, already on
    disk) is skipped without re-downloading up to ~30 MB.

    Args:
        year: Report year to fetch.
        http_session: Reusable HTTP session (connection pooling across years).

    Returns:
        True if a new file was written, False if skipped or unavailable.

    Raises:
        RuntimeError: On unexpected HTTP status or an empty download.
    """
    url = COT_URL_TEMPLATE.format(year=year)
    logger.info(f"Checking COT data for {year}: {url}")

    head = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
    if head.status_code == 404:
        logger.info(f"Year {year} not available (404) — skipping")
        return False
    # Raise (not assert) so the check survives `python -O`.
    if head.status_code != 200:
        raise RuntimeError(f"Unexpected HEAD status {head.status_code} for {url}")

    raw_etag = head.headers.get("etag", "")
    # Strip quote/colon characters that are unsafe in filenames; fall back to
    # a synthetic key when the server omits the header entirely.
    etag = (
        raw_etag.replace('"', "").replace(":", "_")
        if raw_etag
        else _synthetic_etag(year, head.headers)
    )

    dest_dir = LANDING_DIR / "cot" / str(year)
    local_file = dest_dir / f"{etag}.csv.gzip"

    if local_file.exists():
        logger.info(f"Year {year}: {etag}.csv.gzip already exists, skipping")
        return False

    logger.info(f"Downloading COT data for {year}...")
    response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
    if response.status_code != 200:
        raise RuntimeError(f"GET failed with status {response.status_code} for {url}")
    if not response.content:
        raise RuntimeError(f"Downloaded empty file from {url}")

    # One buffer is enough: ZipFile seeks within it, and normalize_zipped_csv
    # rewinds to position 0 itself. (The original copied the ~30 MB body into
    # a second BytesIO for the filename lookup.)
    zip_buffer = BytesIO(response.content)
    inner_filename = find_csv_inner_filename(zip_buffer)
    normalized = normalize_zipped_csv(zip_buffer, inner_filename)

    dest_dir.mkdir(parents=True, exist_ok=True)
    local_file.write_bytes(normalized.read())

    # write_bytes either succeeds or raises, so only the size needs checking.
    size = local_file.stat().st_size
    if size == 0:
        raise RuntimeError(f"Written file is empty: {local_file}")

    logger.info(f"Stored {local_file} ({size:,} bytes)")
    return True
def extract_cot_dataset():
    """Extract all available CFTC COT disaggregated futures data.

    Downloads current year first (always re-checks for weekly Friday updates),
    then backfills historical years. Bounded to MAX_YEARS. Continues on
    individual year failures so a single bad year does not abort the run.
    """
    LANDING_DIR.mkdir(parents=True, exist_ok=True)
    current_year = datetime.now().year
    # Newest first: the current-year file is the one that changes weekly.
    years = list(range(current_year, FIRST_YEAR - 1, -1))
    # Raise (not assert) so the bound survives `python -O`. NOTE(review):
    # 2006..current exceeds 25 years starting in 2031 — MAX_YEARS will need
    # bumping then.
    if len(years) > MAX_YEARS:
        raise RuntimeError(f"Year range {len(years)} exceeds MAX_YEARS={MAX_YEARS}")

    new_count = 0
    with niquests.Session() as session:
        for year in years:
            try:
                if extract_cot_year(year, session):
                    new_count += 1
            except Exception:
                # Log with traceback and keep going: one bad year must not
                # abort the whole backfill.
                logger.exception(f"Failed to extract COT data for {year}, continuing")

    logger.info(f"COT extraction complete: {new_count} new file(s) downloaded")


if __name__ == "__main__":
    extract_cot_dataset()
43
extract/cftc_cot/src/cftc_cot/normalize.py
Normal file
43
extract/cftc_cot/src/cftc_cot/normalize.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Normalize CFTC ZIP archives to gzip CSV."""
|
||||
|
||||
import gzip
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
def find_csv_inner_filename(buffer: BytesIO) -> str:
    """Find the single .txt file inside a CFTC ZIP archive.

    CFTC uses .txt extension for their comma-delimited CSV files. The filename
    varies across years (e.g. 'f_year.txt', 'FUT_DISAGG_2015.txt').

    Args:
        buffer: ZIP archive content.

    Returns:
        Name of the sole .txt member (case-insensitive extension match).

    Raises:
        ValueError: If the archive does not contain exactly one .txt file.
    """
    with zipfile.ZipFile(buffer, mode="r") as zf:
        txt_files = [n for n in zf.namelist() if n.lower().endswith(".txt")]
        # Raise (not assert): assert is stripped under `python -O`, and this
        # guards external data, not an internal invariant.
        if len(txt_files) != 1:
            raise ValueError(
                f"Expected exactly 1 .txt file in CFTC ZIP, found: {zf.namelist()}"
            )
        return txt_files[0]
def normalize_zipped_csv(buffer: BytesIO, inner_filename: str) -> BytesIO:
    """Extract a single CSV from a ZIP and recompress as gzip.

    Args:
        buffer: ZIP file content as BytesIO (will be read from position 0).
        inner_filename: Name of the file inside the ZIP archive.

    Returns:
        BytesIO with gzip-compressed CSV content, seeked to position 0.

    Raises:
        ValueError: If inner_filename is not present in the archive.
    """
    buffer.seek(0)
    out = BytesIO()
    with zipfile.ZipFile(buffer, mode="r") as zf:
        # Raise (not assert): assert is stripped under `python -O`, and this
        # guards external data, not an internal invariant.
        if inner_filename not in zf.namelist():
            raise ValueError(
                f"Expected '{inner_filename}' in ZIP, found: {zf.namelist()}"
            )
        # Yearly COT files are small enough (~30 MB) to hold in memory, so a
        # single read/write is simpler than chunked streaming here.
        with zf.open(inner_filename, mode="r") as csv_file, gzip.open(out, "wb") as gz:
            gz.write(csv_file.read())
    out.seek(0)
    return out
Reference in New Issue
Block a user