-- City dimension: canonical city records with venue count and country metadata. -- Built from venue locations (dim_venues) as the primary source — padelnomics -- tracks cities where padel venues actually exist, not an administrative city list. -- -- Conformed dimension: used by city_market_profile and all pSEO serving models. -- Integrates four sources: -- dim_venues → city list, venue count, coordinates (Playtomic + OSM) -- stg_income → country-level median income (Eurostat) -- stg_city_labels → Eurostat city_code → city_name mapping (EU cities) -- stg_population → Eurostat city-level population (EU, joined via city code) -- stg_population_usa → US Census ACS place population -- stg_population_uk → ONS LAD population -- stg_population_geonames → GeoNames global fallback -- -- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0. -- GeoNames spatial fallback: finds nearest location within ~15km when string name match fails. -- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich (~29% of cities). -- City name matching is case/whitespace-insensitive within each country. -- -- Grain: (country_code, city_slug) — two cities in different countries can share a -- city name. QUALIFY enforces no duplicate (country_code, city_slug) pairs. MODEL ( name foundation.dim_cities, kind FULL, cron '@daily', grain (country_code, city_slug) ); WITH -- Primary: distinct cities from dim_venues (canonical padel city list) venue_cities AS ( SELECT country_code, city AS city_name, -- Lowercase before regex so uppercase letters aren't stripped to '-' LOWER(REGEXP_REPLACE(LOWER(city), '[^a-z0-9]+', '-')) AS city_slug, COUNT(*) AS padel_venue_count, AVG(lat) AS centroid_lat, AVG(lon) AS centroid_lon FROM foundation.dim_venues WHERE city IS NOT NULL AND LENGTH(city) > 0 GROUP BY country_code, city ), -- Latest country income per country country_income AS ( SELECT country_code, median_income_pps, ref_year AS income_year FROM staging.stg_income QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1 ), -- Eurostat EU population: join city labels (code→name) with population values. -- QUALIFY keeps only the most recent year per (country, city name). eurostat_pop AS ( SELECT cl.country_code, cl.city_name, p.population, p.ref_year FROM staging.stg_city_labels cl JOIN staging.stg_population p ON cl.city_code = p.city_code QUALIFY ROW_NUMBER() OVER ( PARTITION BY cl.country_code, cl.city_name ORDER BY p.ref_year DESC ) = 1 ), -- US Census ACS population (place-level, filtered to ≥50K) us_pop AS ( SELECT city_name, country_code, population, ref_year FROM staging.stg_population_usa QUALIFY ROW_NUMBER() OVER (PARTITION BY place_fips ORDER BY ref_year DESC) = 1 ), -- ONS UK Local Authority District population uk_pop AS ( SELECT lad_name AS city_name, country_code, population, ref_year FROM staging.stg_population_uk QUALIFY ROW_NUMBER() OVER (PARTITION BY lad_code ORDER BY ref_year DESC) = 1 ), -- GeoNames global fallback (all cities ≥50K) geonames_pop AS ( SELECT geoname_id, city_name, country_code, lat, lon, population, ref_year FROM staging.stg_population_geonames QUALIFY ROW_NUMBER() OVER (PARTITION BY geoname_id ORDER BY ref_year DESC) = 1 ), -- GeoNames spatial fallback: for cities where string name match fails, -- find the nearest GeoNames location within ~15km. -- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich. -- Uses bbox pre-filter (ABS < 0.14°) then exact sphere distance, picks nearest. geonames_spatial AS ( SELECT vc.country_code, vc.city_slug, gn.geoname_id AS spatial_geoname_id, gn.population AS spatial_population, gn.ref_year AS spatial_ref_year FROM venue_cities vc JOIN geonames_pop gn ON vc.country_code = gn.country_code AND ABS(vc.centroid_lat - gn.lat) < 0.14 -- ~15km bbox pre-filter AND ABS(vc.centroid_lon - gn.lon) < 0.14 QUALIFY ROW_NUMBER() OVER ( PARTITION BY vc.country_code, vc.city_slug ORDER BY ST_Distance_Sphere( ST_Point(vc.centroid_lon, vc.centroid_lat), ST_Point(gn.lon, gn.lat) ) ) = 1 ) SELECT vc.country_code, vc.city_slug, vc.city_name, -- Human-readable country name for pSEO templates and internal linking CASE vc.country_code WHEN 'DE' THEN 'Germany' WHEN 'ES' THEN 'Spain' WHEN 'GB' THEN 'United Kingdom' WHEN 'FR' THEN 'France' WHEN 'IT' THEN 'Italy' WHEN 'PT' THEN 'Portugal' WHEN 'AT' THEN 'Austria' WHEN 'CH' THEN 'Switzerland' WHEN 'NL' THEN 'Netherlands' WHEN 'BE' THEN 'Belgium' WHEN 'SE' THEN 'Sweden' WHEN 'NO' THEN 'Norway' WHEN 'DK' THEN 'Denmark' WHEN 'FI' THEN 'Finland' WHEN 'US' THEN 'United States' WHEN 'AR' THEN 'Argentina' WHEN 'MX' THEN 'Mexico' WHEN 'AE' THEN 'UAE' WHEN 'AU' THEN 'Australia' WHEN 'IE' THEN 'Ireland' ELSE vc.country_code END AS country_name_en, -- URL-safe country slug LOWER(REGEXP_REPLACE( CASE vc.country_code WHEN 'DE' THEN 'Germany' WHEN 'ES' THEN 'Spain' WHEN 'GB' THEN 'United Kingdom' WHEN 'FR' THEN 'France' WHEN 'IT' THEN 'Italy' WHEN 'PT' THEN 'Portugal' WHEN 'AT' THEN 'Austria' WHEN 'CH' THEN 'Switzerland' WHEN 'NL' THEN 'Netherlands' WHEN 'BE' THEN 'Belgium' WHEN 'SE' THEN 'Sweden' WHEN 'NO' THEN 'Norway' WHEN 'DK' THEN 'Denmark' WHEN 'FI' THEN 'Finland' WHEN 'US' THEN 'United States' WHEN 'AR' THEN 'Argentina' WHEN 'MX' THEN 'Mexico' WHEN 'AE' THEN 'UAE' WHEN 'AU' THEN 'Australia' WHEN 'IE' THEN 'Ireland' ELSE vc.country_code END, '[^a-zA-Z0-9]+', '-' )) AS country_slug, vc.centroid_lat AS lat, vc.centroid_lon AS lon, -- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0. -- Spatial fallback activates only when all string matches fail (~29% of cities). COALESCE( ep.population, usa.population, uk.population, gn.population, gs.spatial_population, 0 )::BIGINT AS population, COALESCE( ep.ref_year, usa.ref_year, uk.ref_year, gn.ref_year, gs.spatial_ref_year, 0 )::INTEGER AS population_year, vc.padel_venue_count, ci.median_income_pps, ci.income_year, -- GeoNames ID: FK to dim_locations / location_opportunity_profile. -- String match preferred; spatial fallback used when name doesn't match (Milano→Milan, etc.) COALESCE(gn.geoname_id, gs.spatial_geoname_id) AS geoname_id FROM venue_cities vc LEFT JOIN country_income ci ON vc.country_code = ci.country_code -- Eurostat EU population (via city code→name lookup) LEFT JOIN eurostat_pop ep ON vc.country_code = ep.country_code AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(ep.city_name)) -- US Census population LEFT JOIN us_pop usa ON vc.country_code = usa.country_code AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(usa.city_name)) -- ONS UK population LEFT JOIN uk_pop uk ON vc.country_code = uk.country_code AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(uk.city_name)) -- GeoNames string match (primary) LEFT JOIN geonames_pop gn ON vc.country_code = gn.country_code AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(gn.city_name)) -- GeoNames spatial fallback (nearest within ~15km, for when name match fails) LEFT JOIN geonames_spatial gs ON vc.country_code = gs.country_code AND vc.city_slug = gs.city_slug -- Enforce grain: if two cities in the same country have the same slug -- (e.g. 'São Paulo' and 'Sao Paulo'), keep the one with more venues QUALIFY ROW_NUMBER() OVER ( PARTITION BY vc.country_code, vc.city_slug ORDER BY vc.padel_venue_count DESC NULLS LAST ) = 1