Files
padelnomics/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql
Deeman a00c8727d7 fix(content): slugify transliteration + article links + country overview ranking
- Add @slugify SQLMesh macro (STRIP_ACCENTS + ß→ss) replacing broken
  inline REGEXP_REPLACE that dropped non-ASCII chars (Düsseldorf → d-sseldorf)
- Apply @slugify to dim_venues, dim_cities, dim_locations
- Fix Python slugify() to pre-replace ß→ss before NFKD normalization
- Add language prefix to B2B article market links (/markets/germany → /de/markets/germany)
- Change country overview top-5 ranking: venue count (not raw market_score)
  for top cities, population for top opportunity cities

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-03 10:46:30 +01:00

169 lines
6.9 KiB
SQL

-- City dimension: canonical city records with venue count and country metadata.
-- Built from venue locations (dim_venues) as the primary source — padelnomics
-- tracks cities where padel venues actually exist, not an administrative city list.
--
-- Conformed dimension: used by city_market_profile and all pSEO serving models.
-- Integrates seven sources:
-- dim_venues → city list, venue count, coordinates (Playtomic + OSM)
-- stg_income → country-level median income (Eurostat)
-- stg_city_labels → Eurostat city_code → city_name mapping (EU cities)
-- stg_population → Eurostat city-level population (EU, joined via city code)
-- stg_population_usa → US Census ACS place population
-- stg_population_uk → ONS LAD population
-- stg_population_geonames → GeoNames global fallback
--
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0.
-- GeoNames spatial fallback: finds nearest location within ~15km when string name match fails.
-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich (~29% of cities).
-- City name matching is case/whitespace-insensitive within each country.
--
-- Grain: (country_code, city_slug) — two cities in different countries can share a
-- city name. QUALIFY enforces no duplicate (country_code, city_slug) pairs.
-- SQLMesh model header: FULL kind on the @daily cron, with a declared grain of
-- (country_code, city_slug).
MODEL (
    name foundation.dim_cities,
    kind FULL,
    cron '@daily',
    grain (country_code, city_slug)
);
WITH
-- Venue rows that carry a usable city name, tagged with the slug that defines the
-- model grain. The slug is computed before aggregating so the next CTE can group on it.
slugged_venues AS (
    SELECT
        country_code,
        city,
        @slugify(city) AS city_slug,
        lat,
        lon
    FROM foundation.dim_venues
    WHERE city IS NOT NULL AND LENGTH(city) > 0
),
-- Primary: distinct cities from dim_venues (canonical padel city list).
-- Aggregated per (country_code, city_slug) rather than per raw spelling, so
-- spellings that slugify identically (e.g. 'São Paulo' / 'Sao Paulo') pool their
-- venues into a single row. Grouping on the raw spelling would emit one row per
-- variant, and the grain de-duplication at the end of the model would then discard
-- the smaller variants together with their venues.
--   city_name         : most frequent spelling among the pooled venues
--   padel_venue_count : every venue sharing the slug
--   centroid_lat/lon  : mean venue coordinates (AVG skips NULLs)
venue_cities AS (
    SELECT
        country_code,
        MODE(city) AS city_name,
        city_slug,
        COUNT(*) AS padel_venue_count,
        AVG(lat) AS centroid_lat,
        AVG(lon) AS centroid_lon
    FROM slugged_venues
    GROUP BY country_code, city_slug
),
-- Country-level income: the row with the greatest ref_year for each country_code.
country_income AS (
    SELECT
        ranked.country_code,
        ranked.median_income_pps,
        ranked.income_year
    FROM (
        SELECT
            country_code,
            median_income_pps,
            ref_year AS income_year,
            ROW_NUMBER() OVER (
                PARTITION BY country_code
                ORDER BY ref_year DESC
            ) AS recency_rank
        FROM staging.stg_income
    ) AS ranked
    WHERE ranked.recency_rank = 1
),
-- Eurostat EU population. stg_city_labels supplies the city_code -> city_name
-- mapping and stg_population supplies the values; the two are matched on city_code.
-- Only the most recent ref_year survives for each (country_code, city_name).
eurostat_pop AS (
    SELECT
        labels.country_code,
        labels.city_name,
        pop.population,
        pop.ref_year
    FROM staging.stg_population AS pop
    INNER JOIN staging.stg_city_labels AS labels
        ON pop.city_code = labels.city_code
    QUALIFY 1 = ROW_NUMBER() OVER (
        PARTITION BY labels.country_code, labels.city_name
        ORDER BY pop.ref_year DESC
    )
),
-- US Census ACS population, place-level (the source notes a >=50K filter,
-- presumably applied upstream in staging). Keeps the newest ref_year per place_fips.
us_pop AS (
    SELECT
        ranked.city_name,
        ranked.country_code,
        ranked.population,
        ranked.ref_year
    FROM (
        SELECT
            city_name,
            country_code,
            population,
            ref_year,
            ROW_NUMBER() OVER (
                PARTITION BY place_fips
                ORDER BY ref_year DESC
            ) AS recency_rank
        FROM staging.stg_population_usa
    ) AS ranked
    WHERE ranked.recency_rank = 1
),
-- ONS UK population by Local Authority District. lad_name is exposed as city_name
-- so it lines up with the other population sources; newest ref_year per lad_code wins.
uk_pop AS (
    SELECT
        lad.lad_name AS city_name,
        lad.country_code,
        lad.population,
        lad.ref_year
    FROM staging.stg_population_uk AS lad
    QUALIFY 1 = ROW_NUMBER() OVER (
        PARTITION BY lad.lad_code
        ORDER BY lad.ref_year DESC
    )
),
-- GeoNames global fallback (the source describes it as all cities >=50K).
-- One row per geoname_id, taken from the newest ref_year. Coordinates are carried
-- along because the spatial fallback needs them.
geonames_pop AS (
    SELECT
        g.geoname_id,
        g.city_name,
        g.country_code,
        g.lat,
        g.lon,
        g.population,
        g.ref_year
    FROM staging.stg_population_geonames AS g
    QUALIFY 1 = ROW_NUMBER() OVER (
        PARTITION BY g.geoname_id
        ORDER BY g.ref_year DESC
    )
),
-- GeoNames spatial fallback: for cities where the string name match fails,
-- pick the nearest GeoNames location inside a ~15km window around the venue centroid.
-- Fixes localization mismatches: Milano≠Milan, Wien≠Vienna, München≠Munich.
--
-- Candidate window: 0.14° of latitude is roughly 15km everywhere, but a degree of
-- longitude shrinks with cos(latitude). The longitude difference is therefore scaled
-- by cos(latitude) so the east-west half-width stays ~15km at high latitudes too
-- (an unscaled 0.14° is only about 8km wide at 60°N).
--
-- Ranking: the haversine term below grows monotonically with great-circle distance,
-- so ordering by it selects the nearest candidate without converting to metres.
-- It is written out explicitly so the result does not depend on a geometry
-- function's axis-order convention.
-- NOTE(review): DuckDB's spatial docs describe ST_Distance_Sphere as expecting
-- [latitude, longitude] order, which ST_Point(lon, lat) would not satisfy — confirm
-- against the deployed engine version.
-- Equal distances fall back to larger population, then lower geoname_id, so the
-- chosen row is repeatable between runs.
geonames_spatial AS (
    SELECT
        vc.country_code,
        vc.city_slug,
        gn.geoname_id AS spatial_geoname_id,
        gn.population AS spatial_population,
        gn.ref_year AS spatial_ref_year
    FROM venue_cities AS vc
    INNER JOIN geonames_pop AS gn
        ON vc.country_code = gn.country_code
        AND ABS(vc.centroid_lat - gn.lat) < 0.14  -- ~15km north-south
        AND ABS(vc.centroid_lon - gn.lon) * COS(RADIANS(vc.centroid_lat)) < 0.14  -- ~15km east-west
    QUALIFY ROW_NUMBER() OVER (
        PARTITION BY vc.country_code, vc.city_slug
        ORDER BY
            POW(SIN(RADIANS(gn.lat - vc.centroid_lat) / 2), 2)
            + COS(RADIANS(vc.centroid_lat)) * COS(RADIANS(gn.lat))
            * POW(SIN(RADIANS(gn.lon - vc.centroid_lon) / 2), 2) ASC,
            gn.population DESC NULLS LAST,
            gn.geoname_id ASC
    ) = 1
)
-- Final projection: one row per (country_code, city_slug).
-- The inner query attaches country income and every population source to each
-- venue city; the outer query enforces the grain.
SELECT
    matched.country_code,
    matched.city_slug,
    matched.city_name,
    matched.country_name_en,
    matched.country_slug,
    matched.lat,
    matched.lon,
    matched.population,
    matched.population_year,
    matched.padel_venue_count,
    matched.median_income_pps,
    matched.income_year,
    matched.geoname_id
FROM (
    SELECT
        vc.country_code,
        vc.city_slug,
        vc.city_name,
        -- Human-readable country name for pSEO templates and internal linking
        @country_name(vc.country_code) AS country_name_en,
        -- URL-safe country slug
        @country_slug(vc.country_code) AS country_slug,
        vc.centroid_lat AS lat,
        vc.centroid_lon AS lon,
        -- Population cascade:
        -- Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0.
        COALESCE(
            ep.population,
            usa.population,
            uk.population,
            gn.population,
            gs.spatial_population,
            0
        )::BIGINT AS population,
        -- Reference year of whichever source actually supplied the population above.
        -- Keyed on the population being non-NULL so a matched row with a NULL
        -- population cannot contribute its year to another source's figure.
        COALESCE(
            CASE
                WHEN ep.population IS NOT NULL THEN ep.ref_year
                WHEN usa.population IS NOT NULL THEN usa.ref_year
                WHEN uk.population IS NOT NULL THEN uk.ref_year
                WHEN gn.population IS NOT NULL THEN gn.ref_year
                WHEN gs.spatial_population IS NOT NULL THEN gs.spatial_ref_year
            END,
            0
        )::INTEGER AS population_year,
        vc.padel_venue_count,
        ci.median_income_pps,
        ci.income_year,
        -- GeoNames ID: FK to dim_locations / location_opportunity_profile.
        -- String match preferred; spatial fallback used when the name doesn't match
        -- (Milano→Milan, etc.)
        COALESCE(gn.geoname_id, gs.spatial_geoname_id) AS geoname_id,
        -- Helper for tie-breaking only; not part of the model output.
        gn.population AS geonames_name_match_population
    FROM venue_cities AS vc
    LEFT JOIN country_income AS ci
        ON vc.country_code = ci.country_code
    -- Eurostat EU population (via city code→name lookup)
    LEFT JOIN eurostat_pop AS ep
        ON vc.country_code = ep.country_code
        AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(ep.city_name))
    -- US Census population
    LEFT JOIN us_pop AS usa
        ON vc.country_code = usa.country_code
        AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(usa.city_name))
    -- ONS UK population
    LEFT JOIN uk_pop AS uk
        ON vc.country_code = uk.country_code
        AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(uk.city_name))
    -- GeoNames string match (primary)
    LEFT JOIN geonames_pop AS gn
        ON vc.country_code = gn.country_code
        AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(gn.city_name))
    -- GeoNames spatial fallback (nearest within ~15km, for when the name match fails)
    LEFT JOIN geonames_spatial AS gs
        ON vc.country_code = gs.country_code
        AND vc.city_slug = gs.city_slug
) AS matched
-- Enforce grain. Duplicates on (country_code, city_slug) arise two ways:
--   1. two spellings share a slug (e.g. 'São Paulo' / 'Sao Paulo'): keep the one
--      with more venues;
--   2. a name-based join matches several source rows (same-named places within a
--      country): these duplicates have identical venue counts, so prefer the
--      largest matched population as a heuristic, then fall through to the
--      remaining keys so the surviving row is repeatable between runs.
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY matched.country_code, matched.city_slug
    ORDER BY
        matched.padel_venue_count DESC NULLS LAST,
        matched.population DESC,
        matched.population_year DESC,
        matched.geonames_name_match_population DESC NULLS LAST,
        matched.geoname_id ASC NULLS LAST,
        matched.city_name ASC
) = 1