feat(sql): dim_cities — GeoNames spatial population fallback

Adds a coordinate-based population lookup as a fallback when string name
matching fails (~29% of cities). Uses bbox pre-filter (0.14° ≈ 15 km) then
ST_Distance_Sphere to find the nearest GeoNames location in the same country.

Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich.

Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string >
GeoNames spatial > 0.

Coverage: 70.5% → 98.5% (5,401 / 5,481 cities with population > 0).
Key cities before/after:
  Wien:   0 → 1,691,468
  Milano: 0 → 1,371,498
  München: already matched by string; verified still correct at 1,488,719

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-27 08:47:26 +01:00
parent 6586eca921
commit 3aa30ab419

View File

@@ -12,7 +12,9 @@
-- stg_population_uk → ONS LAD population -- stg_population_uk → ONS LAD population
-- stg_population_geonames → GeoNames global fallback -- stg_population_geonames → GeoNames global fallback
-- --
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames > 0. -- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0.
-- GeoNames spatial fallback: finds nearest location within ~15km when string name match fails.
-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich (~29% of cities).
-- City name matching is case/whitespace-insensitive within each country. -- City name matching is case/whitespace-insensitive within each country.
-- --
-- Grain: (country_code, city_slug) — two cities in different countries can share a -- Grain: (country_code, city_slug) — two cities in different countries can share a
@@ -75,9 +77,33 @@ uk_pop AS (
), ),
-- GeoNames global fallback (all cities ≥50K) -- GeoNames global fallback (all cities ≥50K)
geonames_pop AS ( geonames_pop AS (
SELECT geoname_id, city_name, country_code, population, ref_year SELECT geoname_id, city_name, country_code, lat, lon, population, ref_year
FROM staging.stg_population_geonames FROM staging.stg_population_geonames
QUALIFY ROW_NUMBER() OVER (PARTITION BY geoname_id ORDER BY ref_year DESC) = 1 QUALIFY ROW_NUMBER() OVER (PARTITION BY geoname_id ORDER BY ref_year DESC) = 1
),
-- GeoNames spatial fallback: for cities where string name match fails,
-- find the nearest GeoNames location within ~15km.
-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich.
-- Uses bbox pre-filter (ABS < 0.14°) then exact sphere distance, picks nearest.
geonames_spatial AS (
SELECT
vc.country_code,
vc.city_slug,
gn.geoname_id AS spatial_geoname_id,
gn.population AS spatial_population,
gn.ref_year AS spatial_ref_year
FROM venue_cities vc
JOIN geonames_pop gn
ON vc.country_code = gn.country_code
AND ABS(vc.centroid_lat - gn.lat) < 0.14 -- ~15km bbox pre-filter
AND ABS(vc.centroid_lon - gn.lon) < 0.14
QUALIFY ROW_NUMBER() OVER (
PARTITION BY vc.country_code, vc.city_slug
ORDER BY ST_Distance_Sphere(
ST_Point(vc.centroid_lon, vc.centroid_lat),
ST_Point(gn.lon, gn.lat)
)
) = 1
) )
SELECT SELECT
vc.country_code, vc.country_code,
@@ -135,13 +161,14 @@ SELECT
)) AS country_slug, )) AS country_slug,
vc.centroid_lat AS lat, vc.centroid_lat AS lat,
vc.centroid_lon AS lon, vc.centroid_lon AS lon,
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames > 0. -- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0.
-- City name match is case/whitespace-insensitive within each country. -- Spatial fallback activates only when all string matches fail (~29% of cities).
COALESCE( COALESCE(
ep.population, ep.population,
usa.population, usa.population,
uk.population, uk.population,
gn.population, gn.population,
gs.spatial_population,
0 0
)::BIGINT AS population, )::BIGINT AS population,
COALESCE( COALESCE(
@@ -149,14 +176,15 @@ SELECT
usa.ref_year, usa.ref_year,
uk.ref_year, uk.ref_year,
gn.ref_year, gn.ref_year,
gs.spatial_ref_year,
0 0
)::INTEGER AS population_year, )::INTEGER AS population_year,
vc.padel_venue_count, vc.padel_venue_count,
ci.median_income_pps, ci.median_income_pps,
ci.income_year, ci.income_year,
-- GeoNames ID: FK to dim_locations / location_opportunity_profile. -- GeoNames ID: FK to dim_locations / location_opportunity_profile.
-- NULL when city name doesn't match any GeoNames entry. -- String match preferred; spatial fallback used when name doesn't match (Milano→Milan, etc.)
gn.geoname_id COALESCE(gn.geoname_id, gs.spatial_geoname_id) AS geoname_id
FROM venue_cities vc FROM venue_cities vc
LEFT JOIN country_income ci ON vc.country_code = ci.country_code LEFT JOIN country_income ci ON vc.country_code = ci.country_code
-- Eurostat EU population (via city code→name lookup) -- Eurostat EU population (via city code→name lookup)
@@ -171,10 +199,14 @@ LEFT JOIN us_pop usa
LEFT JOIN uk_pop uk LEFT JOIN uk_pop uk
ON vc.country_code = uk.country_code ON vc.country_code = uk.country_code
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(uk.city_name)) AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(uk.city_name))
-- GeoNames global fallback -- GeoNames string match (primary)
LEFT JOIN geonames_pop gn LEFT JOIN geonames_pop gn
ON vc.country_code = gn.country_code ON vc.country_code = gn.country_code
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(gn.city_name)) AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(gn.city_name))
-- GeoNames spatial fallback (nearest within ~15km, for when name match fails)
LEFT JOIN geonames_spatial gs
ON vc.country_code = gs.country_code
AND vc.city_slug = gs.city_slug
-- Enforce grain: if two cities in the same country have the same slug -- Enforce grain: if two cities in the same country have the same slug
-- (e.g. 'São Paulo' and 'Sao Paulo'), keep the one with more venues -- (e.g. 'São Paulo' and 'Sao Paulo'), keep the one with more venues
QUALIFY ROW_NUMBER() OVER ( QUALIFY ROW_NUMBER() OVER (