feat(extract): replace OpenWeatherMap with Open-Meteo weather extractor

Replaced the OWM extractor (8 locations, API key required, 14,600-call
backfill over 30+ days) with Open-Meteo (12 locations, no API key,
ERA5 reanalysis, full backfill in 12 API calls ~30 seconds).

- Rename extract/openweathermap → extract/openmeteo (git mv)
- Rewrite api.py: fetch_archive (ERA5, date-range) + fetch_recent (forecast,
  past_days=10 to cover ERA5 lag); 9 daily variables incl. et0 and VPD
- Rewrite execute.py: _split_and_write() unzips parallel arrays into per-day
  flat JSON; no cursor / rate limiting / call cap needed
- Update pipelines.py: --package openmeteo, timeout 120s (was 1200s)
- Update fct_weather_daily.sql: flat Open-Meteo field names (temperature_2m_*
  etc.), remove pressure_afternoon_hpa, add et0_mm + vpd_max_kpa + is_high_vpd
- Remove OPENWEATHERMAP_API_KEY from CLAUDE.md env vars table

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-26 00:59:54 +01:00
parent 4817f7de2f
commit 9de3a3ba01
13 changed files with 412 additions and 458 deletions

View File

@@ -0,0 +1,20 @@
# Packaging metadata for the Open-Meteo weather extractor package.
[project]
name = "openmeteo"
version = "0.1.0"
description = "Open-Meteo daily weather extractor for coffee-growing regions"
requires-python = ">=3.13"
dependencies = [
    "extract_core",
    "niquests>=3.14.1",
]

# Console entry points: recurring daily run and one-shot historical backfill.
[project.scripts]
extract_weather = "openmeteo.execute:extract_weather"
extract_weather_backfill = "openmeteo.execute:extract_weather_backfill"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/openmeteo"]

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,116 @@
"""Open-Meteo weather API client.
Two endpoints:
fetch_archive(session, lat, lon, start_date, end_date) -> dict
ERA5 reanalysis data — consistent, scientifically validated.
Available from 1940 to ~5 days ago (reanalysis processing lag).
Use for historical backfill.
fetch_recent(session, lat, lon, past_days) -> dict
Forecast model blended with recent observations.
Covers the last N days + today (fills the ERA5 lag window).
Use for daily updates.
Both return the same structure:
{
"daily": {
"time": ["2020-01-01", "2020-01-02", ...],
"temperature_2m_max": [28.5, 27.1, ...],
...
}
}
No API key required. No rate limits for reasonable usage (~12 calls/day).
"""
import niquests
# Endpoint URLs: the archive host serves ERA5 reanalysis (historical ranges),
# the forecast host serves model output blended with recent observations.
ARCHIVE_URL = "https://archive-api.open-meteo.com/v1/archive"
FORECAST_URL = "https://api.open-meteo.com/v1/forecast"

# Per-request socket timeout (seconds) passed to session.get().
HTTP_TIMEOUT_SECONDS = 60
MAX_RESPONSE_BYTES = 2_000_000  # multi-year response ~200 KB; 2 MB is generous

# Variables fetched for each location. All metric units.
# wind_speed_unit=ms ensures m/s (Open-Meteo default is km/h).
# Joined into one comma-separated string, as required by the `daily` query param.
DAILY_VARIABLES = ",".join([
    "temperature_2m_max",
    "temperature_2m_min",
    "temperature_2m_mean",
    "precipitation_sum",
    "wind_speed_10m_max",
    "relative_humidity_2m_max",
    "cloud_cover_mean",
    "et0_fao_evapotranspiration",  # FAO Penman-Monteith ET — direct crop water demand signal
    "vapour_pressure_deficit_max",  # VPD >1.5 kPa = significant plant water stress
])
def _get(session: niquests.Session, url: str, params: dict) -> dict:
    """Issue a GET against *url*, validate the reply, and return its JSON body.

    Validation layers, in order: HTTP 200, sane payload size, dict-shaped
    JSON, no in-body error flag, and presence of the 'daily' arrays.
    """
    response = session.get(url, params=params, timeout=HTTP_TIMEOUT_SECONDS)

    assert response.status_code == 200, (
        f"Open-Meteo returned HTTP {response.status_code}: {response.text[:300]}"
    )
    body_size = len(response.content)
    assert body_size <= MAX_RESPONSE_BYTES, (
        f"Open-Meteo response unexpectedly large: {body_size:,} bytes"
    )

    payload = response.json()
    assert isinstance(payload, dict), f"Expected dict, got {type(payload)}"
    # Open-Meteo signals some errors as HTTP 200 with error=true in body
    if payload.get("error"):
        raise ValueError(f"Open-Meteo API error: {payload.get('reason', payload)}")
    assert "daily" in payload, f"Open-Meteo response missing 'daily' key: {list(payload.keys())}"
    assert "time" in payload["daily"], "Open-Meteo 'daily' missing 'time' array"
    return payload
def fetch_archive(
    session: niquests.Session,
    lat: float,
    lon: float,
    start_date: str,
    end_date: str,
) -> dict:
    """Fetch ERA5 reanalysis daily data for a date range (YYYY-MM-DD strings)."""
    # Both bounds must look like YYYY-MM-DD; ISO strings compare chronologically.
    for label, value in (("start_date", start_date), ("end_date", end_date)):
        assert value and len(value) == 10, f"{label} must be YYYY-MM-DD, got {value!r}"
    assert start_date <= end_date, f"start_date {start_date} must be <= end_date {end_date}"

    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "daily": DAILY_VARIABLES,
        "wind_speed_unit": "ms",
        "timezone": "UTC",
    }
    return _get(session, ARCHIVE_URL, params)
def fetch_recent(
    session: niquests.Session,
    lat: float,
    lon: float,
    past_days: int = 10,
) -> dict:
    """Fetch recent daily weather via the forecast model (fills the ERA5 lag window).

    The default past_days=10 captures the ~5-day ERA5 lag plus buffer for
    missed daily runs.
    """
    # The forecast endpoint accepts at most 92 past days.
    assert 1 <= past_days <= 92, f"past_days must be 1-92, got {past_days}"

    params = {
        "latitude": lat,
        "longitude": lon,
        "daily": DAILY_VARIABLES,
        "wind_speed_unit": "ms",
        "timezone": "UTC",
        "past_days": past_days,
        "forecast_days": 1,
    }
    return _get(session, FORECAST_URL, params)

View File

@@ -0,0 +1,212 @@
"""Open-Meteo daily weather extraction for coffee-growing regions.
Two entry points:
extract_weather()
Daily run: fetches the last 10 days for all 12 locations.
10 days covers the ~5-day ERA5 reanalysis lag plus buffer for missed runs.
Uses the forecast API (fills the recent window not yet in ERA5 archive).
12 API calls total. Completes in ~10 seconds.
extract_weather_backfill()
Historical fill: fetches 2020-01-01 → yesterday for all 12 locations.
Uses the archive API. One date-range request per location = 12 total calls.
Completes in ~30 seconds. No cursor needed.
Landing path: LANDING_DIR/weather/{location_id}/{year}/{date}.json.gz
Each file is a flat JSON object with Open-Meteo variable names:
{"date": "2020-01-01", "temperature_2m_max": 28.5, "precipitation_sum": 12.5, ...}
No API key required. No rate limiting. Fully idempotent (file existence check).
"""
import gzip
import json
import logging
import os
import sys
import time
from datetime import date, timedelta
from pathlib import Path
import niquests
from extract_core import end_run, landing_path, open_state_db, start_run, write_bytes_atomic
from openmeteo.api import fetch_archive, fetch_recent
from openmeteo.locations import LOCATIONS
# Log to stdout so scheduler/container runners capture extractor progress.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger("Open-Meteo Extractor")

# Landing-zone root; overridable via the LANDING_DIR environment variable.
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
LANDING_SUBDIR = "weather"  # subtree under the landing root for this extractor

# Extractor identifiers recorded against runs in the state DB.
EXTRACTOR_DAILY = "openmeteo_daily"
EXTRACTOR_BACKFILL = "openmeteo_backfill"

# Earliest date fetched by the backfill entry point.
BACKFILL_START = date(2020, 1, 1)

# Small sleep between location calls — polite usage of the free community API.
SLEEP_BETWEEN_LOCATIONS_SECONDS = 0.5
# ── helpers ───────────────────────────────────────────────────────────────────
def _write_day_file(location_id: str, date_str: str, record: dict) -> int:
    """Write one day's weather record as gzipped JSON.

    Returns the number of bytes written, or 0 when the file already exists
    (which is what makes re-runs idempotent).
    """
    assert location_id and date_str and record
    # Files are partitioned by year: weather/{location}/{year}/{date}.json.gz
    target_dir = landing_path(LANDING_DIR, LANDING_SUBDIR, location_id, date_str[:4])
    target = target_dir / f"{date_str}.json.gz"
    if target.exists():
        return 0

    payload = json.dumps(record, separators=(",", ":")).encode("utf-8")
    size = write_bytes_atomic(target, gzip.compress(payload))
    logger.debug(f"Stored {target} ({size:,} bytes)")
    return size
def _split_and_write(location_id: str, response: dict) -> tuple[int, int, int]:
    """Fan an Open-Meteo array response out into per-day JSON.gz files.

    Open-Meteo returns parallel arrays per variable under response['daily']['time'].
    Each date index is zipped into one flat dict and written as its own file.
    Returns (files_written, files_skipped, bytes_written).
    """
    daily = response["daily"]
    dates = daily["time"]
    # Every key except 'time' is a value array aligned with `dates`.
    value_arrays = {name: series for name, series in daily.items() if name != "time"}

    written = skipped = 0
    total_bytes = 0
    for idx, day in enumerate(dates):
        if not day:
            continue
        record = {"date": day}
        # Guard against ragged arrays: pad with None past a series' end.
        record.update(
            (name, series[idx] if idx < len(series) else None)
            for name, series in value_arrays.items()
        )
        size = _write_day_file(location_id, day, record)
        if size > 0:
            written += 1
            total_bytes += size
        else:
            skipped += 1
    return written, skipped, total_bytes
# ── daily extractor ───────────────────────────────────────────────────────────
def extract_weather() -> None:
    """Fetch the last 10 days of weather for all 12 locations.

    Uses Open-Meteo forecast API (past_days=10). 12 API calls. ~10 seconds.
    Run outcome and counters are recorded in the extract-core state DB.
    """
    conn = open_state_db(LANDING_DIR)
    run_id = start_run(conn, EXTRACTOR_DAILY)
    totals = {"written": 0, "skipped": 0, "bytes": 0}
    try:
        with niquests.Session() as session:
            for loc in LOCATIONS:
                logger.info(f"Fetching recent: {loc['id']} ({loc['country']})")
                payload = fetch_recent(session, loc["lat"], loc["lon"], past_days=10)
                written, skipped, nbytes = _split_and_write(loc["id"], payload)
                totals["written"] += written
                totals["skipped"] += skipped
                totals["bytes"] += nbytes
                time.sleep(SLEEP_BETWEEN_LOCATIONS_SECONDS)
        end_run(
            conn, run_id,
            status="success",
            files_written=totals["written"],
            files_skipped=totals["skipped"],
            bytes_written=totals["bytes"],
            cursor_value=date.today().isoformat(),
        )
        logger.info(f"Daily weather complete: {totals['written']} new, {totals['skipped']} skipped")
    except Exception as e:
        # Record the failure before propagating so the run log stays accurate.
        end_run(conn, run_id, status="failed", error_message=str(e))
        raise
    finally:
        conn.close()
# ── backfill extractor ────────────────────────────────────────────────────────
def extract_weather_backfill() -> None:
    """Fetch full weather history (2020-01-01 → yesterday) for all 12 locations.

    Uses Open-Meteo archive API (ERA5 reanalysis). One date-range request per
    location = 12 calls total. Completes in ~30 seconds.

    Idempotent: per-day files already on disk are skipped when splitting the
    response. Safe to re-run at any time — will skip what already exists.
    """
    yesterday = (date.today() - timedelta(days=1)).isoformat()
    start_date = BACKFILL_START.isoformat()
    conn = open_state_db(LANDING_DIR)
    run_id = start_run(conn, EXTRACTOR_BACKFILL)
    files_written = 0
    files_skipped = 0
    bytes_written_total = 0
    try:
        with niquests.Session() as session:
            for loc in LOCATIONS:
                # Fix: the two dates were previously concatenated with no
                # separator ("2020-01-012026-02-25"); log the range explicitly.
                logger.info(
                    f"Backfill {loc['id']} ({loc['country']}) "
                    f"{start_date} → {yesterday}"
                )
                response = fetch_archive(
                    session, loc["lat"], loc["lon"],
                    start_date=start_date,
                    end_date=yesterday,
                )
                w, s, bw = _split_and_write(loc["id"], response)
                files_written += w
                files_skipped += s
                bytes_written_total += bw
                logger.info(f"  {loc['id']}: {w} new, {s} already existed")
                time.sleep(SLEEP_BETWEEN_LOCATIONS_SECONDS)
        end_run(
            conn, run_id,
            status="success",
            files_written=files_written,
            files_skipped=files_skipped,
            bytes_written=bytes_written_total,
            cursor_value=yesterday,
        )
        logger.info(
            f"Backfill complete: {files_written} new files, "
            f"{files_skipped} already existed"
        )
    except Exception as e:
        # Record the failure before propagating so the run log stays accurate.
        end_run(conn, run_id, status="failed", error_message=str(e))
        raise
    finally:
        conn.close()
# Direct invocation runs the daily (recent-window) extraction; the backfill
# is exposed separately via the extract_weather_backfill console script.
if __name__ == "__main__":
    extract_weather()

View File

@@ -0,0 +1,43 @@
"""Coffee-growing region coordinates for OpenWeatherMap extraction.
Each entry is a dict with:
id — filesystem-safe unique identifier (used as landing subdirectory name)
lat/lon — WGS84 coordinates
name — human-readable region name
country — ISO 3166-1 alpha-2 country code
variety — 'Arabica' or 'Robusta' (drives growing season logic in SQL)
Locations were chosen to represent the primary growing zones for the world's
major coffee-producing countries, weighted toward Arabica regions since KC=F
futures track Arabica.
"""
LOCATIONS: list[dict] = [
# Brazil — largest Arabica producer; frost risk in highlands (JunAug)
{"id": "brazil_minas_gerais", "lat": -19.9167, "lon": -43.9345, "name": "Minas Gerais", "country": "BR", "variety": "Arabica"},
{"id": "brazil_parana", "lat": -23.4205, "lon": -51.9330, "name": "Paraná", "country": "BR", "variety": "Arabica"},
# Vietnam — largest Robusta producer; Central Highlands plateau
{"id": "vietnam_highlands", "lat": 12.6667, "lon": 108.0500, "name": "Central Highlands", "country": "VN", "variety": "Robusta"},
# Colombia — premium washed Arabica; Huila department
{"id": "colombia_huila", "lat": 2.5359, "lon": -75.5277, "name": "Huila", "country": "CO", "variety": "Arabica"},
# Ethiopia — birthplace of Arabica; Sidama zone (Yirgacheffe region)
{"id": "ethiopia_sidama", "lat": 6.7612, "lon": 38.4721, "name": "Sidama", "country": "ET", "variety": "Arabica"},
# Honduras — largest Central American producer; Copán department
{"id": "honduras_copan", "lat": 14.8333, "lon": -89.1500, "name": "Copán", "country": "HN", "variety": "Arabica"},
# Guatemala — benchmark Central American; Antigua valley
{"id": "guatemala_antigua", "lat": 14.5586, "lon": -90.7295, "name": "Antigua", "country": "GT", "variety": "Arabica"},
# Indonesia — Sumatra (Mandheling); significant Robusta production
{"id": "indonesia_sumatra", "lat": 3.5952, "lon": 98.6722, "name": "Sumatra", "country": "ID", "variety": "Robusta"},
# Brazil — Espírito Santo; largest Conilon (Robusta) producing state in Brazil
{"id": "brazil_espirito_santo", "lat": -19.3908, "lon": -40.0668, "name": "Espírito Santo", "country": "BR", "variety": "Robusta"},
# Peru — Jaén/Cajamarca; fastest-growing Arabica origin, top-10 global producer
{"id": "peru_jaen", "lat": -5.7064, "lon": -78.8077, "name": "Jaén", "country": "PE", "variety": "Arabica"},
# Uganda — Mount Elgon/Bugisu; 4th largest African producer, significant Robusta
{"id": "uganda_elgon", "lat": 1.0826, "lon": 34.1751, "name": "Mount Elgon", "country": "UG", "variety": "Robusta"},
# Ivory Coast — Daloa region; historically significant West African Robusta
{"id": "ivory_coast_daloa", "lat": 6.8774, "lon": -6.4502, "name": "Daloa", "country": "CI", "variety": "Robusta"},
]
assert len(LOCATIONS) == 12, f"Expected 12 locations, got {len(LOCATIONS)}"
assert all("id" in loc and "lat" in loc and "lon" in loc for loc in LOCATIONS), \
"Each location must have id, lat, lon"