diff --git a/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql b/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql index 49ea369..b1b1067 100644 --- a/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql +++ b/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql @@ -12,7 +12,9 @@ -- stg_population_uk → ONS LAD population -- stg_population_geonames → GeoNames global fallback -- --- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames > 0. +-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0. +-- GeoNames spatial fallback: finds nearest location within ~15km when string name match fails. +-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich (~29% of cities). -- City name matching is case/whitespace-insensitive within each country. -- -- Grain: (country_code, city_slug) — two cities in different countries can share a @@ -75,9 +77,33 @@ uk_pop AS ( ), -- GeoNames global fallback (all cities ≥50K) geonames_pop AS ( - SELECT geoname_id, city_name, country_code, population, ref_year + SELECT geoname_id, city_name, country_code, lat, lon, population, ref_year FROM staging.stg_population_geonames QUALIFY ROW_NUMBER() OVER (PARTITION BY geoname_id ORDER BY ref_year DESC) = 1 +), +-- GeoNames spatial fallback: for cities where string name match fails, +-- find the nearest GeoNames location within ~15km. +-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich. +-- Uses bbox pre-filter (ABS < 0.14°) then exact sphere distance, picks nearest. +geonames_spatial AS ( + SELECT + vc.country_code, + vc.city_slug, + gn.geoname_id AS spatial_geoname_id, + gn.population AS spatial_population, + gn.ref_year AS spatial_ref_year + FROM venue_cities vc + JOIN geonames_pop gn + ON vc.country_code = gn.country_code + AND ABS(vc.centroid_lat - gn.lat) < 0.14 -- ~15km bbox pre-filter + AND ABS(vc.centroid_lon - gn.lon) < 0.14 + QUALIFY ROW_NUMBER() OVER ( + PARTITION BY vc.country_code, vc.city_slug + ORDER BY ST_Distance_Sphere( + ST_Point(vc.centroid_lon, vc.centroid_lat), + ST_Point(gn.lon, gn.lat) + ) + ) = 1 ) SELECT vc.country_code, @@ -135,13 +161,14 @@ SELECT )) AS country_slug, vc.centroid_lat AS lat, vc.centroid_lon AS lon, - -- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames > 0. - -- City name match is case/whitespace-insensitive within each country. + -- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0. + -- Spatial fallback activates only when all string matches fail (~29% of cities). COALESCE( ep.population, usa.population, uk.population, gn.population, + gs.spatial_population, 0 )::BIGINT AS population, COALESCE( @@ -149,14 +176,15 @@ SELECT usa.ref_year, uk.ref_year, gn.ref_year, + gs.spatial_ref_year, 0 )::INTEGER AS population_year, vc.padel_venue_count, ci.median_income_pps, ci.income_year, -- GeoNames ID: FK to dim_locations / location_opportunity_profile. - -- NULL when city name doesn't match any GeoNames entry. - gn.geoname_id + -- String match preferred; spatial fallback used when name doesn't match (Milano→Milan, etc.) + COALESCE(gn.geoname_id, gs.spatial_geoname_id) AS geoname_id FROM venue_cities vc LEFT JOIN country_income ci ON vc.country_code = ci.country_code -- Eurostat EU population (via city code→name lookup) @@ -171,10 +199,14 @@ LEFT JOIN us_pop usa LEFT JOIN uk_pop uk ON vc.country_code = uk.country_code AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(uk.city_name)) --- GeoNames global fallback +-- GeoNames string match (primary) LEFT JOIN geonames_pop gn ON vc.country_code = gn.country_code AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(gn.city_name)) +-- GeoNames spatial fallback (nearest within ~15km, for when name match fails) +LEFT JOIN geonames_spatial gs + ON vc.country_code = gs.country_code + AND vc.city_slug = gs.city_slug -- Enforce grain: if two cities in the same country have the same slug -- (e.g. 'São Paulo' and 'Sao Paulo'), keep the one with more venues QUALIFY ROW_NUMBER() OVER (