fix(transform+content): unblock SQLMesh plan — three pipeline fixes

stg_playtomic_availability:
- Add maximum_object_size = 134217728 (128 MB) to both read_json calls;
  daily files exceed the 16 MB default as venue count grows
- Add seed recheck file (1970-01-01_recheck_00.json.gz, gitignored with data/)
  to avoid READ_JSON IOException when no recheck files exist
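A minimal sketch of the seed-file trick (the record shape is assumed from the `venues IS NOT NULL AND json_array_length(venues) > 0` filter in the staging model; the actual seed content in the repo may differ):

```python
import gzip
import json
import os


def write_seed_recheck(data_dir: str) -> str:
    """Write an empty-payload seed file so the recheck glob always matches.

    The record carries venues = [], so the staging model's
    json_array_length(venues) > 0 filter drops it and the seed never
    leaks into results; it only prevents the zero-match IOException.
    """
    path = os.path.join(data_dir, "1970-01-01_recheck_00.json.gz")
    record = {"captured_at_utc": "1970-01-01T00:00:00Z", "venues": []}
    with gzip.open(path, "wt", encoding="utf-8") as f:
        json.dump(record, f)
    return path
```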

pseo_city_costs_de + pseo_city_pricing:
- Add QUALIFY ROW_NUMBER() OVER (PARTITION BY city_slug ...) = 1 to
  deduplicate rows caused by dim_cities' loose LIKE join; reduces
  pseo_city_costs_de from 2.6M → 222 rows (one per unique city)

content/__init__.py:
- DuckDB lowercases all column names at rest ("ratePeak" → "ratepeak"),
  so the calc_overrides dict comprehension never matched DEFAULTS keys.
  Fix: build a case-insensitive reverse map {k.lower(): k} and normalise
  row keys before lookup. Applied in both generate_articles() and
  preview_article().
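The key-casing mismatch and the fix can be reproduced in isolation (the DEFAULTS entries and row values here are made-up stand-ins):

```python
# Hypothetical calculator defaults with camelCase keys.
DEFAULTS = {"ratePeak": 14.0, "rateOffPeak": 9.0, "courts": 4}

# A DuckDB row comes back with lowercased column names.
row = {"ratepeak": 18.0, "rateoffpeak": None, "city_slug": "berlin"}

# Naive comprehension: stored "ratepeak" never equals "ratePeak".
naive = {k: v for k, v in row.items() if k in DEFAULTS}

# Fix: a case-insensitive reverse map restores the canonical key casing,
# and None values are skipped so defaults apply.
defaults_ci = {k.lower(): k for k in DEFAULTS}
fixed = {
    defaults_ci[k.lower()]: v
    for k, v in row.items()
    if k.lower() in defaults_ci and v is not None
}
```

Here `naive` is empty while `fixed` recovers `{"ratePeak": 18.0}`.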

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Deeman
Date: 2026-02-23 18:51:53 +01:00
parent dd1daaad1e
commit e3a6b91bc0
4 changed files with 26 additions and 4 deletions

File: pseo_city_costs_de

@@ -47,3 +47,6 @@ LEFT JOIN serving.planner_defaults p
 -- Only cities with actual padel presence and at least some rate data
 WHERE c.padel_venue_count > 0
   AND (p.rate_peak IS NOT NULL OR c.median_peak_rate IS NOT NULL)
+-- dim_cities has a loose LIKE join that produces duplicates per city_slug;
+-- take the row with the highest market_score to get canonical city data.
+QUALIFY ROW_NUMBER() OVER (PARTITION BY c.city_slug ORDER BY c.market_score DESC NULLS LAST) = 1
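The same keep-one-row-per-slug logic, sketched in plain Python to show what the QUALIFY clause does (column values are invented):

```python
# Toy duplicates as the loose LIKE join might produce them.
rows = [
    {"city_slug": "berlin", "market_score": 0.91, "city_name": "Berlin"},
    {"city_slug": "berlin", "market_score": 0.44, "city_name": "Berlin-Mitte"},
    {"city_slug": "hamburg", "market_score": None, "city_name": "Hamburg"},
]


def dedupe_by_market_score(rows):
    """Keep one row per city_slug: highest market_score, NULLs last."""
    best = {}
    for r in rows:
        key = r["city_slug"]
        # Treat None like SQL's NULLS LAST by sorting it below any score.
        score = r["market_score"] if r["market_score"] is not None else float("-inf")
        if key not in best or score > best[key][0]:
            best[key] = (score, r)
    return [r for _, r in best.values()]
```

Applied to `rows`, this keeps the 0.91 Berlin row and the lone Hamburg row, mirroring the 2.6M → 222 collapse described above.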

File: pseo_city_pricing

@@ -42,3 +42,6 @@ INNER JOIN serving.city_market_profile c
   AND LOWER(TRIM(vpb.city)) = LOWER(TRIM(c.city_name))
 -- Only cities with enough venues for meaningful pricing statistics
 WHERE vpb.venue_count >= 2
+-- city_market_profile inherits duplicates from dim_cities' loose LIKE join;
+-- take the highest market_score row as the canonical city record.
+QUALIFY ROW_NUMBER() OVER (PARTITION BY c.city_slug ORDER BY c.market_score DESC NULLS LAST) = 1

File: stg_playtomic_availability

@@ -34,7 +34,8 @@ WITH morning_files AS (
         captured_at_utc: 'VARCHAR',
         venues: 'JSON[]'
     },
-    filename = true
+    filename = true,
+    maximum_object_size = 134217728 -- 128 MB; daily files grow with venue count
 )
 WHERE filename NOT LIKE '%_recheck_%'
   AND venues IS NOT NULL
@@ -58,7 +59,8 @@ recheck_files AS (
         captured_at_utc: 'VARCHAR',
         venues: 'JSON[]'
     },
-    filename = true
+    filename = true,
+    maximum_object_size = 134217728 -- 128 MB; matches morning snapshot limit
 )
 WHERE venues IS NOT NULL
   AND json_array_length(venues) > 0
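A quick sanity check on the constants (the 16 MB default figure comes from the commit message above):

```python
# 134217728 bytes is exactly 128 MB; the 16777216-byte (16 MB)
# read_json default is what the growing daily files exceeded.
NEW_LIMIT = 134_217_728
OLD_DEFAULT = 16_777_216

assert NEW_LIMIT == 128 * 1024**2
assert OLD_DEFAULT == 16 * 1024**2
assert NEW_LIMIT == OLD_DEFAULT * 8  # 8x headroom over the default
```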

File: content/__init__.py

@@ -306,7 +306,14 @@ async def generate_articles(
         # Calculator content type: create scenario
         scenario_slug = None
         if config["content_type"] == "calculator":
-            calc_overrides = {k: v for k, v in row.items() if k in DEFAULTS}
+            # DuckDB lowercases all column names; build a case-insensitive
+            # reverse map so "ratepeak" (stored) matches "ratePeak" (DEFAULTS).
+            _defaults_ci = {k.lower(): k for k in DEFAULTS}
+            calc_overrides = {
+                _defaults_ci[k.lower()]: v
+                for k, v in row.items()
+                if k.lower() in _defaults_ci and v is not None
+            }
             state = validate_state(calc_overrides)
             d = calc(state, lang=lang)
@@ -479,7 +486,14 @@ async def preview_article(
         # Calculator: compute scenario in-memory
         if config["content_type"] == "calculator":
-            calc_overrides = {k: v for k, v in row.items() if k in DEFAULTS}
+            # DuckDB lowercases all column names; build a case-insensitive
+            # reverse map so "ratepeak" (stored) matches "ratePeak" (DEFAULTS).
+            _defaults_ci = {k.lower(): k for k in DEFAULTS}
+            calc_overrides = {
+                _defaults_ci[k.lower()]: v
+                for k, v in row.items()
+                if k.lower() in _defaults_ci and v is not None
+            }
             state = validate_state(calc_overrides)
             calc(state, lang=lang)  # validate state produces valid output
             ctx["scenario_slug"] = slug + "-" + str(row[natural_key])