Files
padelnomics/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql
Deeman 3aa30ab419 feat(sql): dim_cities — GeoNames spatial population fallback
Adds a coordinate-based population lookup as a fallback when string name
matching fails (~29% of cities). Uses bbox pre-filter (0.14° ≈ 15 km) then
ST_Distance_Sphere to find the nearest GeoNames location in the same country.

Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich.

Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string >
GeoNames spatial > 0.

Coverage: 70.5% → 98.5% (5,401 / 5,481 cities with population > 0).
Key cities before/after:
  Wien:   0 → 1,691,468
  Milano: 0 → 1,371,498
  München: already matched by string; verified still correct at 1,488,719

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-27 08:47:26 +01:00

216 lines
8.3 KiB
SQL

-- City dimension: canonical city records with venue count and country metadata.
-- Built from venue locations (dim_venues) as the primary source — padelnomics
-- tracks cities where padel venues actually exist, not an administrative city list.
--
-- Conformed dimension: used by city_market_profile and all pSEO serving models.
-- Integrates four sources:
-- dim_venues → city list, venue count, coordinates (Playtomic + OSM)
-- stg_income → country-level median income (Eurostat)
-- stg_city_labels → Eurostat city_code → city_name mapping (EU cities)
-- stg_population → Eurostat city-level population (EU, joined via city code)
-- stg_population_usa → US Census ACS place population
-- stg_population_uk → ONS LAD population
-- stg_population_geonames → GeoNames global fallback
--
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0.
-- GeoNames spatial fallback: finds nearest location within ~15km when string name match fails.
-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich (~29% of cities).
-- City name matching is case/whitespace-insensitive within each country.
--
-- Grain: (country_code, city_slug) — two cities in different countries can share a
-- city name. QUALIFY enforces no duplicate (country_code, city_slug) pairs.
MODEL (
name foundation.dim_cities,
kind FULL,
cron '@daily',
grain (country_code, city_slug)
);
WITH
-- Primary: distinct cities from dim_venues (canonical padel city list)
venue_cities AS (
SELECT
country_code,
city AS city_name,
-- Lowercase before regex so uppercase letters aren't stripped to '-'
LOWER(REGEXP_REPLACE(LOWER(city), '[^a-z0-9]+', '-')) AS city_slug,
COUNT(*) AS padel_venue_count,
AVG(lat) AS centroid_lat,
AVG(lon) AS centroid_lon
FROM foundation.dim_venues
WHERE city IS NOT NULL AND LENGTH(city) > 0
GROUP BY country_code, city
),
-- Latest country income per country
country_income AS (
SELECT country_code, median_income_pps, ref_year AS income_year
FROM staging.stg_income
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
),
-- Eurostat EU population: join city labels (code→name) with population values.
-- QUALIFY keeps only the most recent year per (country, city name).
eurostat_pop AS (
SELECT
cl.country_code,
cl.city_name,
p.population,
p.ref_year
FROM staging.stg_city_labels cl
JOIN staging.stg_population p ON cl.city_code = p.city_code
QUALIFY ROW_NUMBER() OVER (
PARTITION BY cl.country_code, cl.city_name
ORDER BY p.ref_year DESC
) = 1
),
-- US Census ACS population (place-level, filtered to ≥50K)
us_pop AS (
SELECT city_name, country_code, population, ref_year
FROM staging.stg_population_usa
QUALIFY ROW_NUMBER() OVER (PARTITION BY place_fips ORDER BY ref_year DESC) = 1
),
-- ONS UK Local Authority District population
uk_pop AS (
SELECT lad_name AS city_name, country_code, population, ref_year
FROM staging.stg_population_uk
QUALIFY ROW_NUMBER() OVER (PARTITION BY lad_code ORDER BY ref_year DESC) = 1
),
-- GeoNames global fallback (all cities ≥50K)
geonames_pop AS (
SELECT geoname_id, city_name, country_code, lat, lon, population, ref_year
FROM staging.stg_population_geonames
QUALIFY ROW_NUMBER() OVER (PARTITION BY geoname_id ORDER BY ref_year DESC) = 1
),
-- GeoNames spatial fallback: for cities where string name match fails,
-- find the nearest GeoNames location within ~15km.
-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich.
-- Uses bbox pre-filter (ABS < 0.14°) then exact sphere distance, picks nearest.
geonames_spatial AS (
SELECT
vc.country_code,
vc.city_slug,
gn.geoname_id AS spatial_geoname_id,
gn.population AS spatial_population,
gn.ref_year AS spatial_ref_year
FROM venue_cities vc
JOIN geonames_pop gn
ON vc.country_code = gn.country_code
AND ABS(vc.centroid_lat - gn.lat) < 0.14 -- ~15km bbox pre-filter
AND ABS(vc.centroid_lon - gn.lon) < 0.14
QUALIFY ROW_NUMBER() OVER (
PARTITION BY vc.country_code, vc.city_slug
ORDER BY ST_Distance_Sphere(
ST_Point(vc.centroid_lon, vc.centroid_lat),
ST_Point(gn.lon, gn.lat)
)
) = 1
)
SELECT
vc.country_code,
vc.city_slug,
vc.city_name,
-- Human-readable country name for pSEO templates and internal linking
CASE vc.country_code
WHEN 'DE' THEN 'Germany'
WHEN 'ES' THEN 'Spain'
WHEN 'GB' THEN 'United Kingdom'
WHEN 'FR' THEN 'France'
WHEN 'IT' THEN 'Italy'
WHEN 'PT' THEN 'Portugal'
WHEN 'AT' THEN 'Austria'
WHEN 'CH' THEN 'Switzerland'
WHEN 'NL' THEN 'Netherlands'
WHEN 'BE' THEN 'Belgium'
WHEN 'SE' THEN 'Sweden'
WHEN 'NO' THEN 'Norway'
WHEN 'DK' THEN 'Denmark'
WHEN 'FI' THEN 'Finland'
WHEN 'US' THEN 'United States'
WHEN 'AR' THEN 'Argentina'
WHEN 'MX' THEN 'Mexico'
WHEN 'AE' THEN 'UAE'
WHEN 'AU' THEN 'Australia'
WHEN 'IE' THEN 'Ireland'
ELSE vc.country_code
END AS country_name_en,
-- URL-safe country slug
LOWER(REGEXP_REPLACE(
CASE vc.country_code
WHEN 'DE' THEN 'Germany'
WHEN 'ES' THEN 'Spain'
WHEN 'GB' THEN 'United Kingdom'
WHEN 'FR' THEN 'France'
WHEN 'IT' THEN 'Italy'
WHEN 'PT' THEN 'Portugal'
WHEN 'AT' THEN 'Austria'
WHEN 'CH' THEN 'Switzerland'
WHEN 'NL' THEN 'Netherlands'
WHEN 'BE' THEN 'Belgium'
WHEN 'SE' THEN 'Sweden'
WHEN 'NO' THEN 'Norway'
WHEN 'DK' THEN 'Denmark'
WHEN 'FI' THEN 'Finland'
WHEN 'US' THEN 'United States'
WHEN 'AR' THEN 'Argentina'
WHEN 'MX' THEN 'Mexico'
WHEN 'AE' THEN 'UAE'
WHEN 'AU' THEN 'Australia'
WHEN 'IE' THEN 'Ireland'
ELSE vc.country_code
END, '[^a-zA-Z0-9]+', '-'
)) AS country_slug,
vc.centroid_lat AS lat,
vc.centroid_lon AS lon,
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0.
-- Spatial fallback activates only when all string matches fail (~29% of cities).
COALESCE(
ep.population,
usa.population,
uk.population,
gn.population,
gs.spatial_population,
0
)::BIGINT AS population,
COALESCE(
ep.ref_year,
usa.ref_year,
uk.ref_year,
gn.ref_year,
gs.spatial_ref_year,
0
)::INTEGER AS population_year,
vc.padel_venue_count,
ci.median_income_pps,
ci.income_year,
-- GeoNames ID: FK to dim_locations / location_opportunity_profile.
-- String match preferred; spatial fallback used when name doesn't match (Milano→Milan, etc.)
COALESCE(gn.geoname_id, gs.spatial_geoname_id) AS geoname_id
FROM venue_cities vc
LEFT JOIN country_income ci ON vc.country_code = ci.country_code
-- Eurostat EU population (via city code→name lookup)
LEFT JOIN eurostat_pop ep
ON vc.country_code = ep.country_code
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(ep.city_name))
-- US Census population
LEFT JOIN us_pop usa
ON vc.country_code = usa.country_code
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(usa.city_name))
-- ONS UK population
LEFT JOIN uk_pop uk
ON vc.country_code = uk.country_code
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(uk.city_name))
-- GeoNames string match (primary)
LEFT JOIN geonames_pop gn
ON vc.country_code = gn.country_code
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(gn.city_name))
-- GeoNames spatial fallback (nearest within ~15km, for when name match fails)
LEFT JOIN geonames_spatial gs
ON vc.country_code = gs.country_code
AND vc.city_slug = gs.city_slug
-- Enforce grain: if two cities in the same country have the same slug
-- (e.g. 'São Paulo' and 'Sao Paulo'), keep the one with more venues
QUALIFY ROW_NUMBER() OVER (
PARTITION BY vc.country_code, vc.city_slug
ORDER BY vc.padel_venue_count DESC NULLS LAST
) = 1