feat(extract): add OpenWeatherMap daily weather extractor

Adds extract/openweathermap package with daily weather extraction for 8
coffee-growing regions across 7 countries (Brazil ×2, Vietnam, Colombia,
Ethiopia, Honduras, Guatemala, Indonesia). Feeds the crop stress signal for
the commodity sentiment score.

Extractor:
- OWM One Call API 3.0 / Day Summary — one JSON.gz per (location, date)
- extract_weather: daily, fetches yesterday + today (16 calls max)
- extract_weather_backfill: fills 2020-01-01 to yesterday, capped at 500
  calls/run with resume cursor '{location_id}:{date}' for crash safety
- Full idempotency via file existence check; state tracking via extract_core

SQLMesh:
- seeds.weather_locations (8 regions with lat/lon/variety)
- foundation.fct_weather_daily: INCREMENTAL_BY_TIME_RANGE, grain
  (location_id, observation_date), dedup via hash key, crop stress flags:
  is_frost (<2°C), is_heat_stress (>35°C), is_drought (<1mm), in_growing_season

Landing path: LANDING_DIR/weather/{location_id}/{year}/{date}.json.gz

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-25 22:40:27 +01:00
parent c3c8333407
commit 08e74665bb
31 changed files with 1377 additions and 915 deletions

View File

@@ -0,0 +1,20 @@
# Packaging metadata (PEP 621) for the OpenWeatherMap extractor.
[project]
name = "openweathermap"
version = "0.1.0"
description = "OpenWeatherMap daily weather extractor for coffee-growing regions"
requires-python = ">=3.13"
dependencies = [
    # Shared extraction framework: state DB, atomic writes, landing paths.
    "extract_core",
    # HTTP client used by api.py / execute.py.
    "niquests>=3.14.1",
]

# Console entry points — one per extractor mode (daily vs. historical backfill).
[project.scripts]
extract_weather = "openweathermap.execute:extract_weather"
extract_weather_backfill = "openweathermap.execute:extract_weather_backfill"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# src-layout package root.
[tool.hatch.build.targets.wheel]
packages = ["src/openweathermap"]

View File

@@ -0,0 +1,76 @@
"""Thin client for the OpenWeatherMap One Call API 3.0 — Day Summary endpoint.
Endpoint: GET https://api.openweathermap.org/data/3.0/onecall/day_summary
Docs: https://openweathermap.org/api/one-call-3#history_daily_aggregation
Returns one JSON object per (lat, lon, date) with daily aggregates:
temperature.{min,max,morning,afternoon,evening,night}
precipitation.total
humidity.afternoon
cloud_cover.afternoon
wind.max.{speed,direction}
pressure.afternoon
This module contains only the HTTP call and basic response validation.
All business logic (file storage, rate limiting, cursor tracking) lives in execute.py.
"""
import niquests
# One Call API 3.0 — Day Summary endpoint (see module docstring for docs link).
OWM_BASE_URL: str = "https://api.openweathermap.org/data/3.0/onecall/day_summary"
# Connect + read timeout for each request, in seconds.
HTTP_TIMEOUT_SECONDS: int = 30
MAX_RESPONSE_BYTES: int = 10_000  # Day summary is ~500 bytes; 10 KB is a generous bound
class RateLimitError(Exception):
    """OWM answered HTTP 429 (rate limit exceeded); caller should back off and retry."""
def fetch_day_summary(
    session: niquests.Session,
    lat: float,
    lon: float,
    date_str: str,
    api_key: str,
) -> dict:
    """Fetch the OWM One Call 3.0 day summary for one (lat, lon, date).

    Args:
        session: shared HTTP session (connection reuse across calls).
        lat/lon: WGS84 coordinates, validated to be in range.
        date_str: target date in YYYY-MM-DD form.
        api_key: OWM API key; must be non-empty.

    Returns the parsed JSON dict on success.
    Raises RateLimitError on HTTP 429 — the caller handles sleeping/retrying.
    Raises AssertionError on any other non-200 status or a malformed response.
    """
    # Validate inputs up front so a bad coordinate or date never burns an API call.
    assert api_key, "api_key must not be empty"
    assert date_str and len(date_str) == 10, f"date_str must be YYYY-MM-DD, got {date_str!r}"
    assert -90.0 <= lat <= 90.0, f"lat out of range: {lat}"
    assert -180.0 <= lon <= 180.0, f"lon out of range: {lon}"
    query = {
        "lat": lat,
        "lon": lon,
        "date": date_str,
        "appid": api_key,
        "units": "metric",
    }
    response = session.get(OWM_BASE_URL, params=query, timeout=HTTP_TIMEOUT_SECONDS)
    # 429 is the one status with a dedicated recovery path; everything else
    # non-200 is treated as a hard failure.
    if response.status_code == 429:
        raise RateLimitError(f"OWM rate limit hit for lat={lat} lon={lon} date={date_str}")
    assert response.status_code == 200, (
        f"OWM API returned HTTP {response.status_code} for "
        f"lat={lat} lon={lon} date={date_str}: {response.text[:200]}"
    )
    # Size guard: a day summary should be tiny; anything big means a wrong/changed endpoint.
    assert len(response.content) <= MAX_RESPONSE_BYTES, (
        f"OWM response unexpectedly large ({len(response.content)} bytes) for {date_str}"
    )
    data = response.json()
    assert isinstance(data, dict), f"Expected dict response, got {type(data)}"
    assert "date" in data, f"OWM response missing 'date' field: {list(data.keys())}"
    return data

View File

@@ -0,0 +1,330 @@
"""OpenWeatherMap daily weather extraction for coffee-growing regions.
Two entry points:
extract_weather()
Daily run: fetches yesterday + today for all 8 locations (16 calls max).
Yesterday is included to cover the midnight edge case — if the daily job
fires just after midnight UTC, today's OWM data may still be partial.
Idempotent: skips if the landing file already exists.
extract_weather_backfill()
Historical fill: iterates (date, location) pairs from 2020-01-01 to
yesterday. Bounded to MAX_CALLS_PER_BACKFILL_RUN per run; re-run daily
to advance. Resumes from cursor on restart.
Landing path: LANDING_DIR/weather/{location_id}/{year}/{date}.json.gz
Idempotency: file existence check. Past weather is immutable — (location_id, date)
uniquely identifies a file that never changes once written.
Backfill cursor format: '{location_id}:{date}' (e.g. 'brazil_parana:2022-07-15').
Encodes both dimensions so a mid-run crash resumes at the exact (location, date) pair.
"""
import gzip
import json
import logging
import os
import sys
import time
from datetime import date, timedelta
from pathlib import Path
import niquests
from extract_core import end_run, get_last_cursor, landing_path, open_state_db, start_run, write_bytes_atomic
from openweathermap.api import RateLimitError, fetch_day_summary
from openweathermap.locations import LOCATIONS
# Log to stdout so output is captured wherever the process runs.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger("OWM Weather Extractor")
# Landing-zone root; overridable via env var for local runs/tests.
LANDING_DIR: Path = Path(os.getenv("LANDING_DIR", "data/landing"))
LANDING_SUBDIR: str = "weather"
# Extractor names recorded in the extract_core state DB (one per entry point).
EXTRACTOR_DAILY: str = "owm_weather_daily"
EXTRACTOR_BACKFILL: str = "owm_weather_backfill"
# Rate limiting: OWM free tier = 1000 calls/day (~0.7/s).
# 1.5s between calls stays comfortably below the limit for the daily run.
# 2.0s for backfill (more conservative, many sequential calls).
SLEEP_BETWEEN_CALLS_SECONDS: float = 1.5
SLEEP_BETWEEN_BACKFILL_CALLS_SECONDS: float = 2.0
# On 429: wait 60s, then one retry. If still 429, abort the run.
SLEEP_ON_RATE_LIMIT_SECONDS: int = 60
MAX_RATE_LIMIT_RETRIES: int = 1
# Cap backfill at 500 calls per run (~17 min at 2s/call).
# 5-year backfill = 14,600 calls → ~30 runs. Re-run daily until complete.
MAX_CALLS_PER_BACKFILL_RUN: int = 500
# ── helpers ──────────────────────────────────────────────────────────────────
def _write_weather_file(location_id: str, date_str: str, payload: dict) -> int:
    """Persist one day-summary payload as gzipped JSON in the landing zone.

    Path: LANDING_DIR/weather/{location_id}/{year}/{date}.json.gz
    Returns the number of bytes written, or 0 when the file is already on
    disk and the write is skipped (idempotent — past weather is immutable).
    """
    assert location_id, "location_id must not be empty"
    assert date_str and len(date_str) == 10, f"date_str must be YYYY-MM-DD, got {date_str!r}"
    assert isinstance(payload, dict) and payload, "payload must be a non-empty dict"
    target_dir = landing_path(LANDING_DIR, LANDING_SUBDIR, location_id, date_str[:4])
    target = target_dir / f"{date_str}.json.gz"
    if target.exists():
        logger.debug(f"Already exists, skipping: {target}")
        return 0
    # Compact JSON (no whitespace) before gzip — smallest landing files.
    body = json.dumps(payload, separators=(",", ":")).encode("utf-8")
    written = write_bytes_atomic(target, gzip.compress(body))
    logger.info(f"Stored {target} ({written:,} bytes)")
    return written
def _fetch_with_retry(session: niquests.Session, loc: dict, date_str: str, api_key: str) -> dict | None:
    """Fetch OWM day summary with one 429-retry.

    Args:
        session: shared HTTP session.
        loc: a LOCATIONS entry; reads 'id', 'lat', 'lon'.
        date_str: YYYY-MM-DD target date.
        api_key: OWM API key.

    Returns the JSON dict on success, or None if the rate limit persists
    after the retry. Any non-429 failure propagates (AssertionError from
    fetch_day_summary).
    """
    for attempt in range(MAX_RATE_LIMIT_RETRIES + 1):
        try:
            return fetch_day_summary(session, loc["lat"], loc["lon"], date_str, api_key)
        except RateLimitError:
            if attempt < MAX_RATE_LIMIT_RETRIES:
                # Fix: the two f-string fragments were concatenated without a
                # separator, logging "...{date_str}sleeping 60s...". Add one.
                logger.warning(
                    f"Rate limit hit for {loc['id']} {date_str} — "
                    f"sleeping {SLEEP_ON_RATE_LIMIT_SECONDS}s before retry"
                )
                time.sleep(SLEEP_ON_RATE_LIMIT_SECONDS)
            else:
                logger.error(f"Rate limit persisted after retry for {loc['id']} {date_str}")
                return None
    return None  # unreachable; satisfies type checker
def _file_exists(location_id: str, date_str: str) -> bool:
    """True when the landing file for (location_id, date_str) is already on disk."""
    candidate = LANDING_DIR / LANDING_SUBDIR / location_id / date_str[:4] / f"{date_str}.json.gz"
    return candidate.exists()
# ── daily extractor ───────────────────────────────────────────────────────────
def extract_weather() -> None:
    """Pull day-summary weather for every configured location.

    Covers yesterday and today — at most 2 × len(LOCATIONS) = 16 API calls.
    Yesterday is refetched to cover the just-after-midnight edge case.
    Files already in the landing zone are skipped, so a re-run that finds
    everything on disk makes zero API calls (fully idempotent).
    Raises AssertionError when OPENWEATHERMAP_API_KEY is unset.
    """
    api_key = os.environ.get("OPENWEATHERMAP_API_KEY", "")
    assert api_key, "OPENWEATHERMAP_API_KEY environment variable must be set"
    today = date.today()
    target_dates = [(today - timedelta(days=1)).isoformat(), today.isoformat()]
    conn = open_state_db(LANDING_DIR)
    run_id = start_run(conn, EXTRACTOR_DAILY)
    new_files = 0
    skipped = 0
    total_bytes = 0
    try:
        with niquests.Session() as session:
            for day in target_dates:
                for loc in LOCATIONS:
                    # Idempotency guard first — a skip costs no API call and no sleep.
                    if _file_exists(loc["id"], day):
                        logger.info(f"Already exists: {loc['id']} {day}")
                        skipped += 1
                        continue
                    payload = _fetch_with_retry(session, loc, day, api_key)
                    if payload is None:
                        logger.error(f"Skipping {loc['id']} {day} after persistent rate limit")
                        continue
                    written = _write_weather_file(loc["id"], day, payload)
                    if written > 0:
                        new_files += 1
                        total_bytes += written
                    else:
                        skipped += 1
                    # Pace requests to stay under the OWM free-tier rate limit.
                    time.sleep(SLEEP_BETWEEN_CALLS_SECONDS)
        end_run(
            conn, run_id,
            status="success",
            files_written=new_files,
            files_skipped=skipped,
            bytes_written=total_bytes,
            cursor_value=today.isoformat(),
        )
        logger.info(f"Daily weather complete: {new_files} new, {skipped} skipped")
    except Exception as e:
        # Record the failure in the state DB, then re-raise so the scheduler sees it.
        end_run(conn, run_id, status="failed", error_message=str(e))
        raise
    finally:
        conn.close()
# ── backfill extractor ────────────────────────────────────────────────────────
def extract_weather_backfill() -> None:
    """Fill historical weather data from 2020-01-01 to yesterday.

    Iterates (date, location) pairs in date-ascending, LOCATIONS-list order.
    Bounded to MAX_CALLS_PER_BACKFILL_RUN per run — re-run daily to advance.

    Cursor format: '{location_id}:{date}' (e.g. 'brazil_parana:2022-07-15').
    Encodes both dimensions: on resume, all pairs at or before the cursor are
    skipped (via cursor comparison first, then file-existence check).

    5-year backfill (2020–2025) = 14,600 calls. At 500/run = ~30 runs.

    429 handling: sleep 60s, one retry. If still 429, save cursor and exit
    with status='failed' so the cursor does not advance beyond the last
    successfully written pair. Safe to re-run the next day.
    """
    api_key = os.environ.get("OPENWEATHERMAP_API_KEY", "")
    assert api_key, "OPENWEATHERMAP_API_KEY environment variable must be set"
    start = date(2020, 1, 1)
    end = date.today() - timedelta(days=1)  # never fetch today in backfill
    conn = open_state_db(LANDING_DIR)
    run_id = start_run(conn, EXTRACTOR_BACKFILL)
    files_written = 0
    files_skipped = 0
    bytes_written_total = 0
    calls_made = 0
    # Last (location, date) pair processed THIS run; persisted as the resume
    # point whenever the run ends (cap reached, failure, or completion).
    last_cursor: str | None = None
    # Load resume cursor from last successful run
    resume_cursor = get_last_cursor(conn, EXTRACTOR_BACKFILL)
    if resume_cursor:
        logger.info(f"Resuming backfill from cursor: {resume_cursor}")
    else:
        logger.info(f"Starting fresh backfill from {start.isoformat()}")
    # Parse cursor into (location_id, date_str) for skip comparison
    resume_location_id: str | None = None
    resume_date_str: str | None = None
    if resume_cursor and ":" in resume_cursor:
        resume_location_id, resume_date_str = resume_cursor.split(":", 1)
    location_ids = [loc["id"] for loc in LOCATIONS]
    # Index of the cursor's location within LOCATIONS; -1 when the cursor is
    # absent or names an unknown location (then only the date check skips).
    resume_loc_idx = -1
    if resume_location_id and resume_location_id in location_ids:
        resume_loc_idx = location_ids.index(resume_location_id)
    try:
        with niquests.Session() as session:
            current = start
            while current <= end:
                date_str = current.isoformat()
                for loc in LOCATIONS:
                    loc_idx = location_ids.index(loc["id"])
                    # Cursor-based skip: (date, loc_idx) <= (resume_date, resume_loc_idx)
                    # This skips everything already processed in previous runs.
                    # ISO date strings compare correctly as plain strings.
                    if resume_date_str:
                        if date_str < resume_date_str:
                            files_skipped += 1
                            continue
                        if date_str == resume_date_str and loc_idx <= resume_loc_idx:
                            files_skipped += 1
                            continue
                    # File-existence check: idempotency guard for files already on disk
                    # (e.g. written by the daily extractor, or a previous partial run)
                    if _file_exists(loc["id"], date_str):
                        files_skipped += 1
                        last_cursor = f"{loc['id']}:{date_str}"
                        continue
                    # Per-run call cap: end as 'success' so the cursor advances
                    # and the next run picks up exactly where this one stopped.
                    if calls_made >= MAX_CALLS_PER_BACKFILL_RUN:
                        logger.info(
                            f"Reached cap of {MAX_CALLS_PER_BACKFILL_RUN} calls. "
                            f"Re-run to continue from {last_cursor or resume_cursor}"
                        )
                        end_run(
                            conn, run_id,
                            status="success",
                            files_written=files_written,
                            files_skipped=files_skipped,
                            bytes_written=bytes_written_total,
                            cursor_value=last_cursor or resume_cursor,
                        )
                        return
                    data = _fetch_with_retry(session, loc, date_str, api_key)
                    calls_made += 1
                    if data is None:
                        # Persistent 429: stop as 'failed' so the stored cursor
                        # stays at the last successfully written pair.
                        logger.warning(f"Persistent rate limit at {loc['id']} {date_str} — stopping run")
                        end_run(
                            conn, run_id,
                            status="failed",
                            files_written=files_written,
                            files_skipped=files_skipped,
                            bytes_written=bytes_written_total,
                            cursor_value=last_cursor or resume_cursor,
                            error_message="Persistent rate limit — resume from cursor",
                        )
                        return
                    bw = _write_weather_file(loc["id"], date_str, data)
                    if bw > 0:
                        files_written += 1
                        bytes_written_total += bw
                    else:
                        files_skipped += 1
                    # Advance cursor only after the file is safely on disk.
                    last_cursor = f"{loc['id']}:{date_str}"
                    time.sleep(SLEEP_BETWEEN_BACKFILL_CALLS_SECONDS)
                current += timedelta(days=1)
        # Full range covered without hitting the cap — backfill is complete.
        final_cursor = last_cursor or resume_cursor or end.isoformat()
        logger.info(
            f"Backfill complete: {files_written} written, "
            f"{files_skipped} skipped, {calls_made} API calls"
        )
        end_run(
            conn, run_id,
            status="success",
            files_written=files_written,
            files_skipped=files_skipped,
            bytes_written=bytes_written_total,
            cursor_value=final_cursor,
        )
    except Exception as e:
        # Unexpected failure: persist counters and the last good cursor, then re-raise.
        end_run(
            conn, run_id,
            status="failed",
            files_written=files_written,
            files_skipped=files_skipped,
            bytes_written=bytes_written_total,
            cursor_value=last_cursor or resume_cursor,
            error_message=str(e),
        )
        raise
    finally:
        conn.close()
# Manual entry point for local runs; deployments use the console scripts
# declared in pyproject's [project.scripts].
if __name__ == "__main__":
    extract_weather()

View File

@@ -0,0 +1,35 @@
"""Coffee-growing region coordinates for OpenWeatherMap extraction.
Each entry is a dict with:
id — filesystem-safe unique identifier (used as landing subdirectory name)
lat/lon — WGS84 coordinates
name — human-readable region name
country — ISO 3166-1 alpha-2 country code
variety — 'Arabica' or 'Robusta' (drives growing season logic in SQL)
Locations were chosen to represent the primary growing zones for the world's
major coffee-producing countries, weighted toward Arabica regions since KC=F
futures track Arabica.
"""
# Each entry: id (filesystem-safe landing subdir), lat/lon (WGS84),
# name (human-readable), country (ISO 3166-1 alpha-2),
# variety ('Arabica' or 'Robusta' — drives growing-season logic in SQL).
LOCATIONS: list[dict] = [
    # Brazil — largest Arabica producer; frost risk in highlands (Jun–Aug)
    {"id": "brazil_minas_gerais", "lat": -19.9167, "lon": -43.9345, "name": "Minas Gerais", "country": "BR", "variety": "Arabica"},
    {"id": "brazil_parana", "lat": -23.4205, "lon": -51.9330, "name": "Paraná", "country": "BR", "variety": "Arabica"},
    # Vietnam — largest Robusta producer; Central Highlands plateau
    {"id": "vietnam_highlands", "lat": 12.6667, "lon": 108.0500, "name": "Central Highlands", "country": "VN", "variety": "Robusta"},
    # Colombia — premium washed Arabica; Huila department
    {"id": "colombia_huila", "lat": 2.5359, "lon": -75.5277, "name": "Huila", "country": "CO", "variety": "Arabica"},
    # Ethiopia — birthplace of Arabica; Sidama zone (Yirgacheffe region)
    {"id": "ethiopia_sidama", "lat": 6.7612, "lon": 38.4721, "name": "Sidama", "country": "ET", "variety": "Arabica"},
    # Honduras — largest Central American producer; Copán department
    {"id": "honduras_copan", "lat": 14.8333, "lon": -89.1500, "name": "Copán", "country": "HN", "variety": "Arabica"},
    # Guatemala — benchmark Central American; Antigua valley
    {"id": "guatemala_antigua", "lat": 14.5586, "lon": -90.7295, "name": "Antigua", "country": "GT", "variety": "Arabica"},
    # Indonesia — Sumatra (Mandheling); significant Robusta production
    {"id": "indonesia_sumatra", "lat": 3.5952, "lon": 98.6722, "name": "Sumatra", "country": "ID", "variety": "Robusta"},
]
# Import-time sanity checks: downstream code assumes exactly 8 fully-specified
# locations (landing subdirs, backfill call budget, SQLMesh seed row count).
assert len(LOCATIONS) == 8, f"Expected 8 locations, got {len(LOCATIONS)}"
assert all("id" in loc and "lat" in loc and "lon" in loc for loc in LOCATIONS), \
    "Each location must have id, lat, lon"