From e3a6b91bc014f5e5b92ee7898a05cc7d971df679 Mon Sep 17 00:00:00 2001 From: Deeman Date: Mon, 23 Feb 2026 18:51:53 +0100 Subject: [PATCH] =?UTF-8?q?fix(transform+content):=20unblock=20SQLMesh=20p?= =?UTF-8?q?lan=20=E2=80=94=20three=20pipeline=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stg_playtomic_availability: - Add maximum_object_size = 134217728 (128 MB) to both read_json calls; daily files exceed the 16 MB default as venue count grows - Add seed recheck file (1970-01-01_recheck_00.json.gz, gitignored with data/) to avoid READ_JSON IOException when no recheck files exist pseo_city_costs_de + pseo_city_pricing: - Add QUALIFY ROW_NUMBER() OVER (PARTITION BY city_slug ...) = 1 to deduplicate rows caused by dim_cities' loose LIKE join; reduces pseo_city_costs_de from 2.6M → 222 rows (one per unique city) content/__init__.py: - DuckDB lowercases all column names at rest ("ratePeak" → "ratepeak"), so calc_overrides dict comprehension never matched DEFAULTS keys. Fix: build case-insensitive reverse map {k.lower(): k} and normalise row keys before lookup. Applied in both generate_articles() and preview_article(). Co-Authored-By: Claude Opus 4.6 --- .../models/serving/pseo_city_costs_de.sql | 3 +++ .../models/serving/pseo_city_pricing.sql | 3 +++ .../staging/stg_playtomic_availability.sql | 6 ++++-- web/src/padelnomics/content/__init__.py | 18 ++++++++++++++++-- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/transform/sqlmesh_padelnomics/models/serving/pseo_city_costs_de.sql b/transform/sqlmesh_padelnomics/models/serving/pseo_city_costs_de.sql index b3044ef..9776418 100644 --- a/transform/sqlmesh_padelnomics/models/serving/pseo_city_costs_de.sql +++ b/transform/sqlmesh_padelnomics/models/serving/pseo_city_costs_de.sql @@ -47,3 +47,6 @@ LEFT JOIN serving.planner_defaults p -- Only cities with actual padel presence and at least some rate data WHERE c.padel_venue_count > 0 AND (p.rate_peak IS NOT NULL OR c.median_peak_rate IS NOT NULL) +-- dim_cities has a loose LIKE join that produces duplicates per city_slug; +-- take the row with the highest market_score to get canonical city data. +QUALIFY ROW_NUMBER() OVER (PARTITION BY c.city_slug ORDER BY c.market_score DESC NULLS LAST) = 1 diff --git a/transform/sqlmesh_padelnomics/models/serving/pseo_city_pricing.sql b/transform/sqlmesh_padelnomics/models/serving/pseo_city_pricing.sql index a9f63d8..49420a3 100644 --- a/transform/sqlmesh_padelnomics/models/serving/pseo_city_pricing.sql +++ b/transform/sqlmesh_padelnomics/models/serving/pseo_city_pricing.sql @@ -42,3 +42,6 @@ INNER JOIN serving.city_market_profile c AND LOWER(TRIM(vpb.city)) = LOWER(TRIM(c.city_name)) -- Only cities with enough venues for meaningful pricing statistics WHERE vpb.venue_count >= 2 +-- city_market_profile inherits duplicates from dim_cities' loose LIKE join; +-- take the highest market_score row as the canonical city record. +QUALIFY ROW_NUMBER() OVER (PARTITION BY c.city_slug ORDER BY c.market_score DESC NULLS LAST) = 1 diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_availability.sql b/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_availability.sql index 65ecd85..bf0b3f2 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_availability.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_availability.sql @@ -34,7 +34,8 @@ WITH morning_files AS ( captured_at_utc: 'VARCHAR', venues: 'JSON[]' }, - filename = true + filename = true, + maximum_object_size = 134217728 -- 128 MB; daily files grow with venue count ) WHERE filename NOT LIKE '%_recheck_%' AND venues IS NOT NULL @@ -58,7 +59,8 @@ recheck_files AS ( captured_at_utc: 'VARCHAR', venues: 'JSON[]' }, - filename = true + filename = true, + maximum_object_size = 134217728 -- 128 MB; matches morning snapshot limit ) WHERE venues IS NOT NULL AND json_array_length(venues) > 0 diff --git a/web/src/padelnomics/content/__init__.py b/web/src/padelnomics/content/__init__.py index 8cbc714..b975302 100644 --- a/web/src/padelnomics/content/__init__.py +++ b/web/src/padelnomics/content/__init__.py @@ -306,7 +306,14 @@ async def generate_articles( # Calculator content type: create scenario scenario_slug = None if config["content_type"] == "calculator": - calc_overrides = {k: v for k, v in row.items() if k in DEFAULTS} + # DuckDB lowercases all column names; build a case-insensitive + # reverse map so "ratepeak" (stored) matches "ratePeak" (DEFAULTS). + _defaults_ci = {k.lower(): k for k in DEFAULTS} + calc_overrides = { + _defaults_ci[k.lower()]: v + for k, v in row.items() + if k.lower() in _defaults_ci and v is not None + } state = validate_state(calc_overrides) d = calc(state, lang=lang) @@ -479,7 +486,14 @@ async def preview_article( # Calculator: compute scenario in-memory if config["content_type"] == "calculator": - calc_overrides = {k: v for k, v in row.items() if k in DEFAULTS} + # DuckDB lowercases all column names; build a case-insensitive + # reverse map so "ratepeak" (stored) matches "ratePeak" (DEFAULTS). + _defaults_ci = {k.lower(): k for k in DEFAULTS} + calc_overrides = { + _defaults_ci[k.lower()]: v + for k, v in row.items() + if k.lower() in _defaults_ci and v is not None + } state = validate_state(calc_overrides) calc(state, lang=lang) # validate state produces valid output ctx["scenario_slug"] = slug + "-" + str(row[natural_key])