feat(sql): dim_cities — GeoNames spatial population fallback
Adds a coordinate-based population lookup as a fallback when string name matching fails (~29% of cities). Uses bbox pre-filter (0.14° ≈ 15 km) then ST_Distance_Sphere to find the nearest GeoNames location in the same country. Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich. Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0. Coverage: 70.5% → 98.5% (5,401 / 5,481 cities with population > 0). Key cities before/after: Wien: 0 → 1,691,468 Milano: 0 → 1,371,498 München: already matched by string; verified still correct at 1,488,719 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,7 +12,9 @@
|
||||
-- stg_population_uk → ONS LAD population
|
||||
-- stg_population_geonames → GeoNames global fallback
|
||||
--
|
||||
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames > 0.
|
||||
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0.
|
||||
-- GeoNames spatial fallback: finds nearest location within ~15km when string name match fails.
|
||||
-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich (~29% of cities).
|
||||
-- City name matching is case/whitespace-insensitive within each country.
|
||||
--
|
||||
-- Grain: (country_code, city_slug) — two cities in different countries can share a
|
||||
@@ -75,9 +77,33 @@ uk_pop AS (
|
||||
),
|
||||
-- GeoNames global fallback (all cities ≥50K)
|
||||
geonames_pop AS (
|
||||
SELECT geoname_id, city_name, country_code, population, ref_year
|
||||
SELECT geoname_id, city_name, country_code, lat, lon, population, ref_year
|
||||
FROM staging.stg_population_geonames
|
||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY geoname_id ORDER BY ref_year DESC) = 1
|
||||
),
|
||||
-- GeoNames spatial fallback: for cities where string name match fails,
|
||||
-- find the nearest GeoNames location within ~15km.
|
||||
-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich.
|
||||
-- Uses bbox pre-filter (ABS < 0.14°) then exact sphere distance, picks nearest.
|
||||
geonames_spatial AS (
|
||||
SELECT
|
||||
vc.country_code,
|
||||
vc.city_slug,
|
||||
gn.geoname_id AS spatial_geoname_id,
|
||||
gn.population AS spatial_population,
|
||||
gn.ref_year AS spatial_ref_year
|
||||
FROM venue_cities vc
|
||||
JOIN geonames_pop gn
|
||||
ON vc.country_code = gn.country_code
|
||||
AND ABS(vc.centroid_lat - gn.lat) < 0.14 -- ~15km bbox pre-filter
|
||||
AND ABS(vc.centroid_lon - gn.lon) < 0.14
|
||||
QUALIFY ROW_NUMBER() OVER (
|
||||
PARTITION BY vc.country_code, vc.city_slug
|
||||
ORDER BY ST_Distance_Sphere(
|
||||
ST_Point(vc.centroid_lon, vc.centroid_lat),
|
||||
ST_Point(gn.lon, gn.lat)
|
||||
)
|
||||
) = 1
|
||||
)
|
||||
SELECT
|
||||
vc.country_code,
|
||||
@@ -135,13 +161,14 @@ SELECT
|
||||
)) AS country_slug,
|
||||
vc.centroid_lat AS lat,
|
||||
vc.centroid_lon AS lon,
|
||||
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames > 0.
|
||||
-- City name match is case/whitespace-insensitive within each country.
|
||||
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0.
|
||||
-- Spatial fallback activates only when all string matches fail (~29% of cities).
|
||||
COALESCE(
|
||||
ep.population,
|
||||
usa.population,
|
||||
uk.population,
|
||||
gn.population,
|
||||
gs.spatial_population,
|
||||
0
|
||||
)::BIGINT AS population,
|
||||
COALESCE(
|
||||
@@ -149,14 +176,15 @@ SELECT
|
||||
usa.ref_year,
|
||||
uk.ref_year,
|
||||
gn.ref_year,
|
||||
gs.spatial_ref_year,
|
||||
0
|
||||
)::INTEGER AS population_year,
|
||||
vc.padel_venue_count,
|
||||
ci.median_income_pps,
|
||||
ci.income_year,
|
||||
-- GeoNames ID: FK to dim_locations / location_opportunity_profile.
|
||||
-- NULL when city name doesn't match any GeoNames entry.
|
||||
gn.geoname_id
|
||||
-- String match preferred; spatial fallback used when name doesn't match (Milano→Milan, etc.)
|
||||
COALESCE(gn.geoname_id, gs.spatial_geoname_id) AS geoname_id
|
||||
FROM venue_cities vc
|
||||
LEFT JOIN country_income ci ON vc.country_code = ci.country_code
|
||||
-- Eurostat EU population (via city code→name lookup)
|
||||
@@ -171,10 +199,14 @@ LEFT JOIN us_pop usa
|
||||
LEFT JOIN uk_pop uk
|
||||
ON vc.country_code = uk.country_code
|
||||
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(uk.city_name))
|
||||
-- GeoNames global fallback
|
||||
-- GeoNames string match (primary)
|
||||
LEFT JOIN geonames_pop gn
|
||||
ON vc.country_code = gn.country_code
|
||||
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(gn.city_name))
|
||||
-- GeoNames spatial fallback (nearest within ~15km, for when name match fails)
|
||||
LEFT JOIN geonames_spatial gs
|
||||
ON vc.country_code = gs.country_code
|
||||
AND vc.city_slug = gs.city_slug
|
||||
-- Enforce grain: if two cities in the same country have the same slug
|
||||
-- (e.g. 'São Paulo' and 'Sao Paulo'), keep the one with more venues
|
||||
QUALIFY ROW_NUMBER() OVER (
|
||||
|
||||
Reference in New Issue
Block a user