feat(extract): add OpenWeatherMap daily weather extractor

Adds extract/openweathermap package with daily weather extraction for 8
coffee-growing regions across 7 countries (Brazil ×2, Vietnam, Colombia,
Ethiopia, Honduras, Guatemala, Indonesia). Feeds the crop stress signal for
the commodity sentiment score.

Extractor:
- OWM One Call API 3.0 / Day Summary — one JSON.gz per (location, date)
- extract_weather: daily, fetches yesterday + today (16 calls max)
- extract_weather_backfill: fills 2020-01-01 to yesterday, capped at 500
  calls/run with resume cursor '{location_id}:{date}' for crash safety
- Full idempotency via file existence check; state tracking via extract_core

SQLMesh:
- seeds.weather_locations (8 regions with lat/lon/variety)
- foundation.fct_weather_daily: INCREMENTAL_BY_TIME_RANGE, grain
  (location_id, observation_date), dedup via hash key, crop stress flags:
  is_frost (<2°C), is_heat_stress (>35°C), is_drought (<1mm), in_growing_season

Landing path: LANDING_DIR/weather/{location_id}/{year}/{date}.json.gz

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-25 22:40:27 +01:00
parent c3c8333407
commit 08e74665bb
31 changed files with 1377 additions and 915 deletions

View File

@@ -0,0 +1,20 @@
# Packaging metadata (PEP 621) for the OpenWeatherMap extractor.
[project]
name = "openweathermap"
version = "0.1.0"
description = "OpenWeatherMap daily weather extractor for coffee-growing regions"
requires-python = ">=3.13"
dependencies = [
    # Shared extraction framework: state DB, atomic writes, landing paths.
    "extract_core",
    # HTTP client used by api.py / execute.py.
    "niquests>=3.14.1",
]

# Console entry points — one per extractor mode (daily vs. historical backfill).
[project.scripts]
extract_weather = "openweathermap.execute:extract_weather"
extract_weather_backfill = "openweathermap.execute:extract_weather_backfill"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# src-layout package root.
[tool.hatch.build.targets.wheel]
packages = ["src/openweathermap"]

View File

@@ -0,0 +1,76 @@
"""Thin client for the OpenWeatherMap One Call API 3.0 — Day Summary endpoint.
Endpoint: GET https://api.openweathermap.org/data/3.0/onecall/day_summary
Docs: https://openweathermap.org/api/one-call-3#history_daily_aggregation
Returns one JSON object per (lat, lon, date) with daily aggregates:
temperature.{min,max,morning,afternoon,evening,night}
precipitation.total
humidity.afternoon
cloud_cover.afternoon
wind.max.{speed,direction}
pressure.afternoon
This module contains only the HTTP call and basic response validation.
All business logic (file storage, rate limiting, cursor tracking) lives in execute.py.
"""
import niquests
# One Call API 3.0 — Day Summary endpoint (see module docstring for docs link).
OWM_BASE_URL: str = "https://api.openweathermap.org/data/3.0/onecall/day_summary"
# Connect + read timeout for each request, in seconds.
HTTP_TIMEOUT_SECONDS: int = 30
MAX_RESPONSE_BYTES: int = 10_000  # Day summary is ~500 bytes; 10 KB is a generous bound
class RateLimitError(Exception):
    """OWM answered HTTP 429 (rate limit exceeded); caller should back off and retry."""
def fetch_day_summary(
    session: niquests.Session,
    lat: float,
    lon: float,
    date_str: str,
    api_key: str,
) -> dict:
    """Fetch the OWM One Call 3.0 day summary for one (lat, lon, date).

    Args:
        session: shared HTTP session (connection reuse across calls).
        lat/lon: WGS84 coordinates, validated to be in range.
        date_str: target date in YYYY-MM-DD form.
        api_key: OWM API key; must be non-empty.

    Returns the parsed JSON dict on success.
    Raises RateLimitError on HTTP 429 — the caller handles sleeping/retrying.
    Raises AssertionError on any other non-200 status or a malformed response.
    """
    # Validate inputs up front so a bad coordinate or date never burns an API call.
    assert api_key, "api_key must not be empty"
    assert date_str and len(date_str) == 10, f"date_str must be YYYY-MM-DD, got {date_str!r}"
    assert -90.0 <= lat <= 90.0, f"lat out of range: {lat}"
    assert -180.0 <= lon <= 180.0, f"lon out of range: {lon}"
    query = {
        "lat": lat,
        "lon": lon,
        "date": date_str,
        "appid": api_key,
        "units": "metric",
    }
    response = session.get(OWM_BASE_URL, params=query, timeout=HTTP_TIMEOUT_SECONDS)
    # 429 is the one status with a dedicated recovery path; everything else
    # non-200 is treated as a hard failure.
    if response.status_code == 429:
        raise RateLimitError(f"OWM rate limit hit for lat={lat} lon={lon} date={date_str}")
    assert response.status_code == 200, (
        f"OWM API returned HTTP {response.status_code} for "
        f"lat={lat} lon={lon} date={date_str}: {response.text[:200]}"
    )
    # Size guard: a day summary should be tiny; anything big means a wrong/changed endpoint.
    assert len(response.content) <= MAX_RESPONSE_BYTES, (
        f"OWM response unexpectedly large ({len(response.content)} bytes) for {date_str}"
    )
    data = response.json()
    assert isinstance(data, dict), f"Expected dict response, got {type(data)}"
    assert "date" in data, f"OWM response missing 'date' field: {list(data.keys())}"
    return data

View File

@@ -0,0 +1,330 @@
"""OpenWeatherMap daily weather extraction for coffee-growing regions.
Two entry points:
extract_weather()
Daily run: fetches yesterday + today for all 8 locations (16 calls max).
Yesterday is included to cover the midnight edge case — if the daily job
fires just after midnight UTC, today's OWM data may still be partial.
Idempotent: skips if the landing file already exists.
extract_weather_backfill()
Historical fill: iterates (date, location) pairs from 2020-01-01 to
yesterday. Bounded to MAX_CALLS_PER_BACKFILL_RUN per run; re-run daily
to advance. Resumes from cursor on restart.
Landing path: LANDING_DIR/weather/{location_id}/{year}/{date}.json.gz
Idempotency: file existence check. Past weather is immutable — (location_id, date)
uniquely identifies a file that never changes once written.
Backfill cursor format: '{location_id}:{date}' (e.g. 'brazil_parana:2022-07-15').
Encodes both dimensions so a mid-run crash resumes at the exact (location, date) pair.
"""
import gzip
import json
import logging
import os
import sys
import time
from datetime import date, timedelta
from pathlib import Path
import niquests
from extract_core import end_run, get_last_cursor, landing_path, open_state_db, start_run, write_bytes_atomic
from openweathermap.api import RateLimitError, fetch_day_summary
from openweathermap.locations import LOCATIONS
# Log to stdout so output is captured wherever the process runs.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger("OWM Weather Extractor")
# Landing-zone root; overridable via env var for local runs/tests.
LANDING_DIR: Path = Path(os.getenv("LANDING_DIR", "data/landing"))
LANDING_SUBDIR: str = "weather"
# Extractor names recorded in the extract_core state DB (one per entry point).
EXTRACTOR_DAILY: str = "owm_weather_daily"
EXTRACTOR_BACKFILL: str = "owm_weather_backfill"
# Rate limiting: OWM free tier = 1000 calls/day (~0.7/s).
# 1.5s between calls stays comfortably below the limit for the daily run.
# 2.0s for backfill (more conservative, many sequential calls).
SLEEP_BETWEEN_CALLS_SECONDS: float = 1.5
SLEEP_BETWEEN_BACKFILL_CALLS_SECONDS: float = 2.0
# On 429: wait 60s, then one retry. If still 429, abort the run.
SLEEP_ON_RATE_LIMIT_SECONDS: int = 60
MAX_RATE_LIMIT_RETRIES: int = 1
# Cap backfill at 500 calls per run (~17 min at 2s/call).
# 5-year backfill = 14,600 calls → ~30 runs. Re-run daily until complete.
MAX_CALLS_PER_BACKFILL_RUN: int = 500
# ── helpers ──────────────────────────────────────────────────────────────────
def _write_weather_file(location_id: str, date_str: str, payload: dict) -> int:
    """Persist one day-summary payload as gzipped JSON in the landing zone.

    Path: LANDING_DIR/weather/{location_id}/{year}/{date}.json.gz
    Returns the number of bytes written, or 0 when the file is already on
    disk and the write is skipped (idempotent — past weather is immutable).
    """
    assert location_id, "location_id must not be empty"
    assert date_str and len(date_str) == 10, f"date_str must be YYYY-MM-DD, got {date_str!r}"
    assert isinstance(payload, dict) and payload, "payload must be a non-empty dict"
    target_dir = landing_path(LANDING_DIR, LANDING_SUBDIR, location_id, date_str[:4])
    target = target_dir / f"{date_str}.json.gz"
    if target.exists():
        logger.debug(f"Already exists, skipping: {target}")
        return 0
    # Compact JSON (no whitespace) before gzip — smallest landing files.
    body = json.dumps(payload, separators=(",", ":")).encode("utf-8")
    written = write_bytes_atomic(target, gzip.compress(body))
    logger.info(f"Stored {target} ({written:,} bytes)")
    return written
def _fetch_with_retry(session: niquests.Session, loc: dict, date_str: str, api_key: str) -> dict | None:
    """Fetch OWM day summary with one 429-retry.

    Args:
        session: shared HTTP session.
        loc: a LOCATIONS entry; reads 'id', 'lat', 'lon'.
        date_str: YYYY-MM-DD target date.
        api_key: OWM API key.

    Returns the JSON dict on success, or None if the rate limit persists
    after the retry. Any non-429 failure propagates (AssertionError from
    fetch_day_summary).
    """
    for attempt in range(MAX_RATE_LIMIT_RETRIES + 1):
        try:
            return fetch_day_summary(session, loc["lat"], loc["lon"], date_str, api_key)
        except RateLimitError:
            if attempt < MAX_RATE_LIMIT_RETRIES:
                # Fix: the two f-string fragments were concatenated without a
                # separator, logging "...{date_str}sleeping 60s...". Add one.
                logger.warning(
                    f"Rate limit hit for {loc['id']} {date_str} — "
                    f"sleeping {SLEEP_ON_RATE_LIMIT_SECONDS}s before retry"
                )
                time.sleep(SLEEP_ON_RATE_LIMIT_SECONDS)
            else:
                logger.error(f"Rate limit persisted after retry for {loc['id']} {date_str}")
                return None
    return None  # unreachable; satisfies type checker
def _file_exists(location_id: str, date_str: str) -> bool:
    """True when the landing file for (location_id, date_str) is already on disk."""
    candidate = LANDING_DIR / LANDING_SUBDIR / location_id / date_str[:4] / f"{date_str}.json.gz"
    return candidate.exists()
# ── daily extractor ───────────────────────────────────────────────────────────
def extract_weather() -> None:
    """Pull day-summary weather for every configured location.

    Covers yesterday and today — at most 2 × len(LOCATIONS) = 16 API calls.
    Yesterday is refetched to cover the just-after-midnight edge case.
    Files already in the landing zone are skipped, so a re-run that finds
    everything on disk makes zero API calls (fully idempotent).
    Raises AssertionError when OPENWEATHERMAP_API_KEY is unset.
    """
    api_key = os.environ.get("OPENWEATHERMAP_API_KEY", "")
    assert api_key, "OPENWEATHERMAP_API_KEY environment variable must be set"
    today = date.today()
    target_dates = [(today - timedelta(days=1)).isoformat(), today.isoformat()]
    conn = open_state_db(LANDING_DIR)
    run_id = start_run(conn, EXTRACTOR_DAILY)
    new_files = 0
    skipped = 0
    total_bytes = 0
    try:
        with niquests.Session() as session:
            for day in target_dates:
                for loc in LOCATIONS:
                    # Idempotency guard first — a skip costs no API call and no sleep.
                    if _file_exists(loc["id"], day):
                        logger.info(f"Already exists: {loc['id']} {day}")
                        skipped += 1
                        continue
                    payload = _fetch_with_retry(session, loc, day, api_key)
                    if payload is None:
                        logger.error(f"Skipping {loc['id']} {day} after persistent rate limit")
                        continue
                    written = _write_weather_file(loc["id"], day, payload)
                    if written > 0:
                        new_files += 1
                        total_bytes += written
                    else:
                        skipped += 1
                    # Pace requests to stay under the OWM free-tier rate limit.
                    time.sleep(SLEEP_BETWEEN_CALLS_SECONDS)
        end_run(
            conn, run_id,
            status="success",
            files_written=new_files,
            files_skipped=skipped,
            bytes_written=total_bytes,
            cursor_value=today.isoformat(),
        )
        logger.info(f"Daily weather complete: {new_files} new, {skipped} skipped")
    except Exception as e:
        # Record the failure in the state DB, then re-raise so the scheduler sees it.
        end_run(conn, run_id, status="failed", error_message=str(e))
        raise
    finally:
        conn.close()
# ── backfill extractor ────────────────────────────────────────────────────────
def extract_weather_backfill() -> None:
    """Fill historical weather data from 2020-01-01 to yesterday.

    Iterates (date, location) pairs in date-ascending, LOCATIONS-list order.
    Bounded to MAX_CALLS_PER_BACKFILL_RUN per run — re-run daily to advance.

    Cursor format: '{location_id}:{date}' (e.g. 'brazil_parana:2022-07-15').
    Encodes both dimensions: on resume, all pairs at or before the cursor are
    skipped (via cursor comparison first, then file-existence check).

    5-year backfill (2020–2025) = 14,600 calls. At 500/run = ~30 runs.

    429 handling: sleep 60s, one retry. If still 429, save cursor and exit
    with status='failed' so the cursor does not advance beyond the last
    successfully written pair. Safe to re-run the next day.
    """
    api_key = os.environ.get("OPENWEATHERMAP_API_KEY", "")
    assert api_key, "OPENWEATHERMAP_API_KEY environment variable must be set"
    start = date(2020, 1, 1)
    end = date.today() - timedelta(days=1)  # never fetch today in backfill
    conn = open_state_db(LANDING_DIR)
    run_id = start_run(conn, EXTRACTOR_BACKFILL)
    files_written = 0
    files_skipped = 0
    bytes_written_total = 0
    calls_made = 0
    # Last (location, date) pair processed THIS run; persisted as the resume
    # point whenever the run ends (cap reached, failure, or completion).
    last_cursor: str | None = None
    # Load resume cursor from last successful run
    resume_cursor = get_last_cursor(conn, EXTRACTOR_BACKFILL)
    if resume_cursor:
        logger.info(f"Resuming backfill from cursor: {resume_cursor}")
    else:
        logger.info(f"Starting fresh backfill from {start.isoformat()}")
    # Parse cursor into (location_id, date_str) for skip comparison
    resume_location_id: str | None = None
    resume_date_str: str | None = None
    if resume_cursor and ":" in resume_cursor:
        resume_location_id, resume_date_str = resume_cursor.split(":", 1)
    location_ids = [loc["id"] for loc in LOCATIONS]
    # Index of the cursor's location within LOCATIONS; -1 when the cursor is
    # absent or names an unknown location (then only the date check skips).
    resume_loc_idx = -1
    if resume_location_id and resume_location_id in location_ids:
        resume_loc_idx = location_ids.index(resume_location_id)
    try:
        with niquests.Session() as session:
            current = start
            while current <= end:
                date_str = current.isoformat()
                for loc in LOCATIONS:
                    loc_idx = location_ids.index(loc["id"])
                    # Cursor-based skip: (date, loc_idx) <= (resume_date, resume_loc_idx)
                    # This skips everything already processed in previous runs.
                    # ISO date strings compare correctly as plain strings.
                    if resume_date_str:
                        if date_str < resume_date_str:
                            files_skipped += 1
                            continue
                        if date_str == resume_date_str and loc_idx <= resume_loc_idx:
                            files_skipped += 1
                            continue
                    # File-existence check: idempotency guard for files already on disk
                    # (e.g. written by the daily extractor, or a previous partial run)
                    if _file_exists(loc["id"], date_str):
                        files_skipped += 1
                        last_cursor = f"{loc['id']}:{date_str}"
                        continue
                    # Per-run call cap: end as 'success' so the cursor advances
                    # and the next run picks up exactly where this one stopped.
                    if calls_made >= MAX_CALLS_PER_BACKFILL_RUN:
                        logger.info(
                            f"Reached cap of {MAX_CALLS_PER_BACKFILL_RUN} calls. "
                            f"Re-run to continue from {last_cursor or resume_cursor}"
                        )
                        end_run(
                            conn, run_id,
                            status="success",
                            files_written=files_written,
                            files_skipped=files_skipped,
                            bytes_written=bytes_written_total,
                            cursor_value=last_cursor or resume_cursor,
                        )
                        return
                    data = _fetch_with_retry(session, loc, date_str, api_key)
                    calls_made += 1
                    if data is None:
                        # Persistent 429: stop as 'failed' so the stored cursor
                        # stays at the last successfully written pair.
                        logger.warning(f"Persistent rate limit at {loc['id']} {date_str} — stopping run")
                        end_run(
                            conn, run_id,
                            status="failed",
                            files_written=files_written,
                            files_skipped=files_skipped,
                            bytes_written=bytes_written_total,
                            cursor_value=last_cursor or resume_cursor,
                            error_message="Persistent rate limit — resume from cursor",
                        )
                        return
                    bw = _write_weather_file(loc["id"], date_str, data)
                    if bw > 0:
                        files_written += 1
                        bytes_written_total += bw
                    else:
                        files_skipped += 1
                    # Advance cursor only after the file is safely on disk.
                    last_cursor = f"{loc['id']}:{date_str}"
                    time.sleep(SLEEP_BETWEEN_BACKFILL_CALLS_SECONDS)
                current += timedelta(days=1)
        # Full range covered without hitting the cap — backfill is complete.
        final_cursor = last_cursor or resume_cursor or end.isoformat()
        logger.info(
            f"Backfill complete: {files_written} written, "
            f"{files_skipped} skipped, {calls_made} API calls"
        )
        end_run(
            conn, run_id,
            status="success",
            files_written=files_written,
            files_skipped=files_skipped,
            bytes_written=bytes_written_total,
            cursor_value=final_cursor,
        )
    except Exception as e:
        # Unexpected failure: persist counters and the last good cursor, then re-raise.
        end_run(
            conn, run_id,
            status="failed",
            files_written=files_written,
            files_skipped=files_skipped,
            bytes_written=bytes_written_total,
            cursor_value=last_cursor or resume_cursor,
            error_message=str(e),
        )
        raise
    finally:
        conn.close()
# Manual entry point for local runs; deployments use the console scripts
# declared in pyproject's [project.scripts].
if __name__ == "__main__":
    extract_weather()

View File

@@ -0,0 +1,35 @@
"""Coffee-growing region coordinates for OpenWeatherMap extraction.
Each entry is a dict with:
id — filesystem-safe unique identifier (used as landing subdirectory name)
lat/lon — WGS84 coordinates
name — human-readable region name
country — ISO 3166-1 alpha-2 country code
variety — 'Arabica' or 'Robusta' (drives growing season logic in SQL)
Locations were chosen to represent the primary growing zones for the world's
major coffee-producing countries, weighted toward Arabica regions since KC=F
futures track Arabica.
"""
# Each entry: id (filesystem-safe landing subdir), lat/lon (WGS84),
# name (human-readable), country (ISO 3166-1 alpha-2),
# variety ('Arabica' or 'Robusta' — drives growing-season logic in SQL).
LOCATIONS: list[dict] = [
    # Brazil — largest Arabica producer; frost risk in highlands (Jun–Aug)
    {"id": "brazil_minas_gerais", "lat": -19.9167, "lon": -43.9345, "name": "Minas Gerais", "country": "BR", "variety": "Arabica"},
    {"id": "brazil_parana", "lat": -23.4205, "lon": -51.9330, "name": "Paraná", "country": "BR", "variety": "Arabica"},
    # Vietnam — largest Robusta producer; Central Highlands plateau
    {"id": "vietnam_highlands", "lat": 12.6667, "lon": 108.0500, "name": "Central Highlands", "country": "VN", "variety": "Robusta"},
    # Colombia — premium washed Arabica; Huila department
    {"id": "colombia_huila", "lat": 2.5359, "lon": -75.5277, "name": "Huila", "country": "CO", "variety": "Arabica"},
    # Ethiopia — birthplace of Arabica; Sidama zone (Yirgacheffe region)
    {"id": "ethiopia_sidama", "lat": 6.7612, "lon": 38.4721, "name": "Sidama", "country": "ET", "variety": "Arabica"},
    # Honduras — largest Central American producer; Copán department
    {"id": "honduras_copan", "lat": 14.8333, "lon": -89.1500, "name": "Copán", "country": "HN", "variety": "Arabica"},
    # Guatemala — benchmark Central American; Antigua valley
    {"id": "guatemala_antigua", "lat": 14.5586, "lon": -90.7295, "name": "Antigua", "country": "GT", "variety": "Arabica"},
    # Indonesia — Sumatra (Mandheling); significant Robusta production
    {"id": "indonesia_sumatra", "lat": 3.5952, "lon": 98.6722, "name": "Sumatra", "country": "ID", "variety": "Robusta"},
]
# Import-time sanity checks: downstream code assumes exactly 8 fully-specified
# locations (landing subdirs, backfill call budget, SQLMesh seed row count).
assert len(LOCATIONS) == 8, f"Expected 8 locations, got {len(LOCATIONS)}"
assert all("id" in loc and "lat" in loc and "lon" in loc for loc in LOCATIONS), \
    "Each location must have id, lat, lon"