-- City dimension: canonical city records with venue count and country metadata.
-- Built from venue locations (dim_venues) as the primary source — padelnomics
-- tracks cities where padel venues actually exist, not an administrative city list.
--
-- Conformed dimension: used by city_market_profile and all pSEO serving models.
-- Integrates seven sources:
--   dim_venues              → city list, venue count, coordinates (Playtomic + OSM)
--   stg_income              → country-level median income (Eurostat)
--   stg_city_labels         → Eurostat city_code → city_name mapping (EU cities)
--   stg_population          → Eurostat city-level population (EU, joined via city code)
--   stg_population_usa      → US Census ACS place population
--   stg_population_uk       → ONS LAD population
--   stg_population_geonames → GeoNames global fallback
--
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0.
-- GeoNames spatial fallback: finds nearest location within ~15km when string name match fails.
-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich (~29% of cities).
-- City name matching is case/whitespace-insensitive within each country.
--
-- Grain: (country_code, city_slug) — two cities in different countries can share a
-- city name. QUALIFY enforces no duplicate (country_code, city_slug) pairs.
MODEL (
  name foundation.dim_cities,
  kind FULL,
  cron '@daily',
  grain (country_code, city_slug)
);

-- Builds one row per (country_code, city_slug) from dim_venues and enriches it
-- with country income, population and a GeoNames id.
--
-- Every name-based lookup CTE below is reduced to exactly ONE row per
-- (country_code, LOWER(TRIM(city_name))) — the same key the final joins use.
-- Without that, same-named places inside one country (several US places or
-- GeoNames entries sharing a name) fan the city row out, and the final grain
-- QUALIFY would keep an arbitrary candidate, so population / geoname_id could
-- change from run to run. When several candidates share a name, the most
-- populous one is kept (heuristic; deterministic tie-breakers follow).
WITH
-- Primary: distinct cities from dim_venues (canonical padel city list).
-- Centroid is the plain average of the venue coordinates in that city.
venue_cities AS (
  SELECT
    country_code,
    city AS city_name,
    @slugify(city) AS city_slug,
    COUNT(*) AS padel_venue_count,
    AVG(lat) AS centroid_lat,
    AVG(lon) AS centroid_lon
  FROM foundation.dim_venues
  WHERE city IS NOT NULL
    AND LENGTH(city) > 0
  GROUP BY
    country_code,
    city
),

-- Latest country income per country.
country_income AS (
  SELECT
    country_code,
    median_income_pps,
    ref_year AS income_year
  FROM staging.stg_income
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY country_code
    ORDER BY ref_year DESC
  ) = 1
),

-- Eurostat EU population: join city labels (code→name) with population values.
-- Keeps the most recent year per (country, normalized city name); if several
-- city codes share a name and year, the larger population wins.
eurostat_pop AS (
  SELECT
    cl.country_code,
    LOWER(TRIM(cl.city_name)) AS city_key,
    p.population,
    p.ref_year
  FROM staging.stg_city_labels AS cl
  INNER JOIN staging.stg_population AS p
    ON cl.city_code = p.city_code
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY cl.country_code, LOWER(TRIM(cl.city_name))
    ORDER BY p.ref_year DESC, p.population DESC NULLS LAST, cl.city_code
  ) = 1
),

-- US Census ACS population (place-level, filtered to ≥50K): latest year per place.
us_places_latest AS (
  SELECT
    place_fips,
    city_name,
    country_code,
    population,
    ref_year
  FROM staging.stg_population_usa
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY place_fips
    ORDER BY ref_year DESC
  ) = 1
),

-- One US candidate per normalized place name (most populous place wins).
us_pop AS (
  SELECT
    country_code,
    LOWER(TRIM(city_name)) AS city_key,
    population,
    ref_year
  FROM us_places_latest
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY country_code, LOWER(TRIM(city_name))
    ORDER BY population DESC NULLS LAST, ref_year DESC, place_fips
  ) = 1
),

-- ONS UK Local Authority District population: latest year per LAD.
uk_lads_latest AS (
  SELECT
    lad_code,
    lad_name AS city_name,
    country_code,
    population,
    ref_year
  FROM staging.stg_population_uk
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY lad_code
    ORDER BY ref_year DESC
  ) = 1
),

-- One UK candidate per normalized LAD name (most populous LAD wins).
uk_pop AS (
  SELECT
    country_code,
    LOWER(TRIM(city_name)) AS city_key,
    population,
    ref_year
  FROM uk_lads_latest
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY country_code, LOWER(TRIM(city_name))
    ORDER BY population DESC NULLS LAST, ref_year DESC, lad_code
  ) = 1
),

-- GeoNames global fallback (all cities ≥50K): latest year per geoname_id.
-- Kept at geoname_id grain because the spatial fallback needs every location.
geonames_pop AS (
  SELECT
    geoname_id,
    city_name,
    country_code,
    lat,
    lon,
    population,
    ref_year
  FROM staging.stg_population_geonames
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY geoname_id
    ORDER BY ref_year DESC
  ) = 1
),

-- GeoNames string-match lookup: one candidate per normalized name and country
-- (most populous location wins).
geonames_by_name AS (
  SELECT
    geoname_id,
    country_code,
    LOWER(TRIM(city_name)) AS city_key,
    population,
    ref_year
  FROM geonames_pop
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY country_code, LOWER(TRIM(city_name))
    ORDER BY population DESC NULLS LAST, ref_year DESC, geoname_id
  ) = 1
),

-- GeoNames spatial fallback: for cities where string name match fails,
-- find the nearest GeoNames location within ~15km.
-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich.
-- Uses bbox pre-filter (ABS < 0.14°), then ranks candidates by sphere distance
-- and keeps the nearest; geoname_id breaks exact distance ties.
-- NOTE(review): the DuckDB spatial docs describe ST_Distance_Sphere as taking
-- points in [latitude, longitude] axis order, while ST_Point is called here as
-- (lon, lat) — confirm against the installed extension version before changing.
geonames_spatial AS (
  SELECT
    vc.country_code,
    vc.city_slug,
    gn.geoname_id AS spatial_geoname_id,
    gn.population AS spatial_population,
    gn.ref_year AS spatial_ref_year
  FROM venue_cities AS vc
  INNER JOIN geonames_pop AS gn
    ON vc.country_code = gn.country_code
    AND ABS(vc.centroid_lat - gn.lat) < 0.14 -- ~15km bbox pre-filter
    AND ABS(vc.centroid_lon - gn.lon) < 0.14
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY vc.country_code, vc.city_slug
    ORDER BY
      ST_Distance_Sphere(
        ST_Point(vc.centroid_lon, vc.centroid_lat),
        ST_Point(gn.lon, gn.lat)
      ),
      gn.geoname_id
  ) = 1
)

SELECT
  vc.country_code,
  vc.city_slug,
  vc.city_name,
  -- Human-readable country name for pSEO templates and internal linking
  @country_name(vc.country_code) AS country_name_en,
  -- URL-safe country slug
  @country_slug(vc.country_code) AS country_slug,
  vc.centroid_lat AS lat,
  vc.centroid_lon AS lon,
  -- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0.
  -- Spatial fallback activates only when all string matches fail (~29% of cities).
  COALESCE(
    ep.population,
    usa.population,
    uk.population,
    gn.population,
    gs.spatial_population,
    0
  )::BIGINT AS population,
  COALESCE(
    ep.ref_year,
    usa.ref_year,
    uk.ref_year,
    gn.ref_year,
    gs.spatial_ref_year,
    0
  )::INTEGER AS population_year,
  vc.padel_venue_count,
  ci.median_income_pps,
  ci.income_year,
  -- GeoNames ID: FK to dim_locations / location_opportunity_profile.
  -- String match preferred; spatial fallback used when name doesn't match (Milano→Milan, etc.)
  COALESCE(gn.geoname_id, gs.spatial_geoname_id) AS geoname_id
FROM venue_cities AS vc
LEFT JOIN country_income AS ci
  ON vc.country_code = ci.country_code
-- Eurostat EU population (via city code→name lookup)
LEFT JOIN eurostat_pop AS ep
  ON vc.country_code = ep.country_code
  AND LOWER(TRIM(vc.city_name)) = ep.city_key
-- US Census population
LEFT JOIN us_pop AS usa
  ON vc.country_code = usa.country_code
  AND LOWER(TRIM(vc.city_name)) = usa.city_key
-- ONS UK population
LEFT JOIN uk_pop AS uk
  ON vc.country_code = uk.country_code
  AND LOWER(TRIM(vc.city_name)) = uk.city_key
-- GeoNames string match (primary)
LEFT JOIN geonames_by_name AS gn
  ON vc.country_code = gn.country_code
  AND LOWER(TRIM(vc.city_name)) = gn.city_key
-- GeoNames spatial fallback (nearest within ~15km, for when name match fails)
LEFT JOIN geonames_spatial AS gs
  ON vc.country_code = gs.country_code
  AND vc.city_slug = gs.city_slug
-- Enforce grain: if two cities in the same country have the same slug
-- (e.g. 'São Paulo' and 'Sao Paulo'), keep the one with more venues;
-- city_name breaks venue-count ties so the surviving row is stable.
QUALIFY ROW_NUMBER() OVER (
  PARTITION BY vc.country_code, vc.city_slug
  ORDER BY vc.padel_venue_count DESC NULLS LAST, vc.city_name
) = 1