Add Phase 1A-C + ICE warehouse stocks: prices, methodology, pipeline automation

Phase 1A — KC=F Coffee Futures Prices:
- New extract/coffee_prices/ package (yfinance): downloads KC=F daily OHLCV,
  stores as gzip CSV with SHA256-based idempotency
- SQLMesh models: raw/coffee_prices → foundation/fct_coffee_prices →
  serving/coffee_prices (with 20d/50d SMA, 52-week high/low, daily return %)
- Dashboard: 4 metric cards + dual-line chart (close, 20d MA, 50d MA)
- API: GET /commodities/<ticker>/prices

Phase 1B — Data Methodology Page:
- New /methodology route with full-page template (base.html)
- 6 anchored sections: USDA PSD, CFTC COT, KC=F price, ICE warehouse stocks,
  data quality model, update schedule table
- "Methodology" link added to marketing footer

Phase 1C — Automated Pipeline:
- supervisor.sh updated: runs extract_cot, extract_prices, extract_ice in
  sequence before transform
- Webhook failure alerting via ALERT_WEBHOOK_URL env var (ntfy/Slack/Telegram)

ICE Warehouse Stocks:
- New extract/ice_stocks/ package (niquests): normalizes ICE Report Center CSV
  to canonical schema, hash-based idempotency, soft-fail on 404 with guidance
- SQLMesh models: raw/ice_warehouse_stocks → foundation/fct_ice_warehouse_stocks
  → serving/ice_warehouse_stocks (30d avg, WoW change, 52w drawdown)
- Dashboard: 4 metric cards + line chart (certified bags + 30d avg)
- API: GET /commodities/<code>/stocks

Foundation:
- dim_commodity: added ticker (KC=F) and ice_stock_report_code (COFFEE-C) columns
- macros/__init__.py: added prices_glob() and ice_stocks_glob()
- pipelines.py: added extract_prices and extract_ice entries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-21 11:41:43 +01:00
parent 2962bf5e3b
commit 67c048485b
25 changed files with 1350 additions and 6 deletions

View File

@@ -0,0 +1,18 @@
# Packaging metadata (PEP 621) for the ICE certified warehouse stocks extractor.
[project]
name = "ice_stocks"
version = "0.1.0"
description = "ICE certified warehouse stocks extractor"
# 3.13+ required to match the rest of the pipeline's toolchain.
requires-python = ">=3.13"
dependencies = [
    "niquests>=3.14.1",
]

# Console entry point used by supervisor.sh: `extract_ice` runs the extractor.
[project.scripts]
extract_ice = "ice_stocks.execute:extract_ice_stocks"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# src-layout package: wheel ships src/ice_stocks as the importable package.
[tool.hatch.build.targets.wheel]
packages = ["src/ice_stocks"]

View File

@@ -0,0 +1,173 @@
"""ICE certified Coffee C warehouse stock extraction.
Downloads daily certified stock reports from the ICE Report Center and stores
as gzip CSV in the landing directory. Uses SHA256 of content as the
idempotency key — skips if a file with the same hash already exists.
Landing path: LANDING_DIR/ice_stocks/{year}/{date}_{hash8}.csv.gzip
CSV format produced (matching raw.ice_warehouse_stocks columns):
report_date,total_certified_bags,pending_grading_bags
ICE Report Center URL discovery:
Visit https://www.theice.com/report-center and locate the
"Coffee C Warehouse Stocks" report. The download URL has the pattern:
https://www.theice.com/report-center/commodities/COFFEE/reports/...
Set ICE_STOCKS_URL environment variable to the discovered URL.
"""
import csv
import gzip
import hashlib
import io
import logging
import os
import pathlib
import sys
from datetime import datetime
import niquests
# Log to stdout so the pipeline supervisor captures extractor output.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger("ICE Stocks Extractor")

# Root of the landing zone; output files go under LANDING_DIR/ice_stocks/{year}/.
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
DEST_SUBDIR = "ice_stocks"

# ICE Report Center URL for Coffee C certified warehouse stocks.
# Discover by visiting https://www.theice.com/report-center and locating
# the Coffee C warehouse stocks CSV export. Override via environment variable.
ICE_STOCKS_URL = os.getenv(
    "ICE_STOCKS_URL",
    "https://www.theice.com/publicdocs/futures_us/exchange_notices/coffee_certifiedstocks.csv",
)

# Per-request timeout (seconds) applied to the ICE download.
HTTP_TIMEOUT_SECONDS = 60

# Expected column names from ICE CSV (may vary — adapt to actual column names)
# The ICE report typically has: Date, Certified Stocks (bags), Pending Grading (bags)
# We normalize to our canonical names.
COLUMN_MAPPINGS = {
    # Possible ICE column name → our canonical name.
    # NOTE(review): _normalize_row also retries the lookup with a lower-cased
    # key, so the capitalized variants below are a redundant fallback kept for
    # robustness against header changes.
    "date": "report_date",
    "report date": "report_date",
    "Date": "report_date",
    "certified stocks": "total_certified_bags",
    "Certified Stocks": "total_certified_bags",
    "certified stocks (bags)": "total_certified_bags",
    "total certified": "total_certified_bags",
    "pending grading": "pending_grading_bags",
    "Pending Grading": "pending_grading_bags",
    "pending grading (bags)": "pending_grading_bags",
}
def _normalize_row(row: dict) -> dict | None:
    """Map raw ICE CSV columns to the canonical schema.

    Each header is looked up in COLUMN_MAPPINGS verbatim first, then
    lower-cased, so minor capitalization changes in the ICE report do not
    break extraction. Thousands separators are stripped from values
    ("1,234,567" -> "1234567").

    Returns None when the row carries no usable report_date.
    """
    normalized: dict[str, str] = {}
    for raw_key, value in row.items():
        # csv.DictReader stores overflow fields under a None key (restkey)
        # when a row has more columns than the header — skip it instead of
        # crashing on raw_key.strip().
        if raw_key is None:
            continue
        key = raw_key.strip()
        canonical = COLUMN_MAPPINGS.get(key) or COLUMN_MAPPINGS.get(key.lower())
        if canonical:
            # Strip commas from numeric strings (ICE uses "1,234,567" format);
            # a missing/None value (short row) normalizes to "".
            normalized[canonical] = value.strip().replace(",", "") if value else ""
    if not normalized.get("report_date"):
        return None
    # Fill missing optional columns with empty string
    normalized.setdefault("total_certified_bags", "")
    normalized.setdefault("pending_grading_bags", "")
    return normalized
def _build_canonical_csv(raw_content: bytes) -> bytes:
    """Re-emit the raw ICE CSV using the canonical column schema.

    Rows whose date cannot be mapped are dropped. Returns b"" when no row
    maps at all, signalling the caller that the column mapping is stale.
    """
    decoded = raw_content.decode("utf-8", errors="replace")
    records = [
        mapped
        for mapped in (
            _normalize_row(entry)
            for entry in csv.DictReader(io.StringIO(decoded))
        )
        if mapped
    ]
    if not records:
        return b""
    buffer = io.StringIO()
    writer = csv.DictWriter(
        buffer,
        fieldnames=["report_date", "total_certified_bags", "pending_grading_bags"],
    )
    writer.writeheader()
    writer.writerows(records)
    return buffer.getvalue().encode("utf-8")
def extract_ice_stocks() -> None:
    """Download ICE certified Coffee C warehouse stocks and store as gzip CSV.

    Idempotent: computes SHA256 of the canonical CSV bytes and skips the
    write when a file with the same hash already exists on disk. The ICE
    report is a rolling file (same URL, updated daily) — changes are
    detected via content hash.

    Every failure mode soft-fails (log + return) so a scheduled pipeline run
    is never aborted by a transient ICE outage or a moved report URL.
    """
    logger.info(f"Downloading ICE warehouse stocks from: {ICE_STOCKS_URL}")
    with niquests.Session() as session:
        try:
            response = session.get(ICE_STOCKS_URL, timeout=HTTP_TIMEOUT_SECONDS)
        except Exception as e:
            logger.error(
                f"Failed to connect to ICE Report Center: {e}\n"
                "If the URL has changed, set ICE_STOCKS_URL environment variable.\n"
                "Visit https://www.theice.com/report-center to find the current URL."
            )
            return

    if response.status_code == 404:
        logger.warning(
            "ICE stocks URL returned 404. The report URL may have changed.\n"
            "Visit https://www.theice.com/report-center to find the current URL,\n"
            "then set ICE_STOCKS_URL environment variable."
        )
        return
    # These were `assert` statements, which vanish under `python -O` and
    # abort the whole run when they fire; log-and-return matches the
    # soft-fail behavior of the other error paths above.
    if response.status_code != 200:
        logger.error(f"Unexpected status {response.status_code} from {ICE_STOCKS_URL}")
        return
    if not response.content:
        logger.error("Downloaded empty file from ICE")
        return

    canonical_csv = _build_canonical_csv(response.content)
    if not canonical_csv:
        logger.warning("ICE CSV parsed to 0 rows — column mapping may need updating")
        return

    # Hash-based idempotency: identical content hashes to the same filename,
    # so re-running on an unchanged report is a no-op.
    sha256 = hashlib.sha256(canonical_csv).hexdigest()
    etag = sha256[:8]
    # Capture the clock once so date and year cannot disagree if the run
    # straddles midnight (previously datetime.now() was called twice).
    now = datetime.now()
    dest_dir = LANDING_DIR / DEST_SUBDIR / now.strftime("%Y")
    local_file = dest_dir / f"{now.strftime('%Y-%m-%d')}_{etag}.csv.gzip"
    if local_file.exists():
        logger.info(f"File {local_file.name} already exists — content unchanged, skipping")
        return

    dest_dir.mkdir(parents=True, exist_ok=True)
    local_file.write_bytes(gzip.compress(canonical_csv))
    logger.info(f"Stored {local_file} ({local_file.stat().st_size:,} bytes)")


if __name__ == "__main__":
    extract_ice_stocks()