fix(extract,transform): fix COT/prices column name mismatches + OWM rate limit skip

- fct_cot_positioning: quote Swap__Positions_Short_All and Swap__Positions_Spread_All
  (CSV uses double underscore; DuckDB preserves header names exactly)
- fct_cot_positioning: quote Report_Date_as_YYYY-MM-DD (dashes preserved in header)
- fct_coffee_prices: quote "Adj Close" (space in CSV header)
- openmeteo/execute.py: skip API call in backfill when all daily files already exist
  (_count_existing_files pre-check prevents 429 rate limit on re-runs)
- dev_run.sh: open browser as admin@beanflows.coffee instead of pro@

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-26 09:46:34 +01:00
parent 611a4af966
commit 4fae358f97
5 changed files with 47 additions and 24 deletions

View File

@@ -75,6 +75,19 @@ def _write_day_file(location_id: str, date_str: str, record: dict) -> int:
return bytes_written
def _count_existing_files(location_id: str, start: date, end: date) -> int:
    """Count per-day files already on disk for a location between start and end (inclusive)."""
    # Enumerate every date in the closed interval [start, end] lazily.
    span = (end - start).days + 1
    days = (start + timedelta(days=offset) for offset in range(span))
    # A day is "existing" when its gzipped JSON file is present under the
    # year-partitioned landing directory for this location.
    return sum(
        1
        for day in days
        if (
            landing_path(LANDING_DIR, LANDING_SUBDIR, location_id, day.strftime("%Y"))
            / f"{day.isoformat()}.json.gz"
        ).exists()
    )
def _split_and_write(location_id: str, response: dict) -> tuple[int, int, int]:
"""Split an Open-Meteo array response into per-day JSON.gz files.
@@ -171,12 +184,22 @@ def extract_weather_backfill() -> None:
bytes_written_total = 0
try:
start = BACKFILL_START
end = date.fromisoformat(yesterday)
expected_days = (end - start).days + 1
with niquests.Session() as session:
for loc in LOCATIONS:
logger.info(
f"Backfill {loc['id']} ({loc['country']}) "
f"{start_date}{yesterday}"
)
existing = _count_existing_files(loc["id"], start, end)
if existing == expected_days:
logger.info(f" {loc['id']}: 0 new, {existing} already existed (skipped API call)")
files_skipped += existing
continue
response = fetch_archive(
session, loc["lat"], loc["lon"],
start_date=start_date,