From 8e0dd6af63dd93eaad85ce5568c3fdde3d370b13 Mon Sep 17 00:00:00 2001 From: Deeman Date: Mon, 9 Mar 2026 12:23:50 +0100 Subject: [PATCH] fix(data): filter non-Latin city names + score range clamp (Phase F) - stg_population_geonames: reject CJK/Cyrillic/Arabic city names via regex (fixes "Seelow" showing Japanese characters on map) - dim_locations: filter empty location names after trim - location_profiles: defensive LEAST/GREATEST clamp on both scores (0-100) Co-Authored-By: Claude Opus 4.6 --- .../sqlmesh_padelnomics/models/foundation/dim_locations.sql | 1 + .../sqlmesh_padelnomics/models/serving/location_profiles.sql | 4 ++-- .../models/staging/stg_population_geonames.sql | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql index b0ddbe3..98d0fd0 100644 --- a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql +++ b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql @@ -48,6 +48,7 @@ locations AS ( ref_year FROM staging.stg_population_geonames WHERE lat IS NOT NULL AND lon IS NOT NULL + AND LENGTH(TRIM(city_name)) > 0 ), -- ── EU NUTS-2 income via spatial join ────────────────────────────────────── -- Each EU location's (lon, lat) is matched against NUTS-2 boundary polygons. 
diff --git a/transform/sqlmesh_padelnomics/models/serving/location_profiles.sql b/transform/sqlmesh_padelnomics/models/serving/location_profiles.sql index 9e5483b..d7645bd 100644 --- a/transform/sqlmesh_padelnomics/models/serving/location_profiles.sql +++ b/transform/sqlmesh_padelnomics/models/serving/location_profiles.sql @@ -278,8 +278,8 @@ SELECT THEN ROUND(s.catchment_padel_courts::DOUBLE / s.catchment_population * 100000, 2) ELSE NULL END AS catchment_venues_per_100k, - s.market_score, - s.opportunity_score, + CASE WHEN s.market_score IS NULL THEN NULL ELSE LEAST(GREATEST(s.market_score, 0), 100) END AS market_score, -- clamp to 0-100; NULL stays NULL (LEAST/GREATEST skip NULLs) + CASE WHEN s.opportunity_score IS NULL THEN NULL ELSE LEAST(GREATEST(s.opportunity_score, 0), 100) END AS opportunity_score, -- clamp to 0-100; NULL stays NULL s.median_hourly_rate, s.median_peak_rate, s.median_offpeak_rate, diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql b/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql index 6c6404b..c7032ba 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql @@ -38,3 +38,6 @@ WHERE geoname_id IS NOT NULL AND city_name IS NOT NULL AND lat IS NOT NULL AND lon IS NOT NULL + -- Reject names with non-Latin characters (CJK, Cyrillic, Arabic, Thai, etc.) + -- Allows printable ASCII + Latin letters with diacritics (ÄÖÜ, àéî, ñ, ø, etc.); RE2 needs \x{HHHH} escapes, not \uHHHH + AND regexp_matches(city_name, '^[\x20-\x7E\x{00C0}-\x{024F}\x{1E00}-\x{1EFF}]+$')