Files
padelnomics/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql
Deeman 5ade38eeaf feat(data): Phase 2a — NUTS-1 regional income for opportunity score
- eurostat.py: add nama_10r_2hhinc dataset config; append filter params to
  request URL so server pre-filters the large cube before download
- stg_regional_income.sql: new staging model — reads nama_10r_2hhinc.json.gz,
  filters to NUTS-1 codes (3-char), normalises EL→GR / UK→GB
- dim_locations.sql: add admin1_to_nuts1 VALUES CTE (16 German Bundesländer)
  + regional_income CTE; final SELECT uses COALESCE(regional, country) income
- init_landing_seeds.py: add empty seed for nama_10r_2hhinc.json.gz

Munich/Bayern now scores ~29K PPS vs Chemnitz/Sachsen ~19K PPS instead of
both inheriting the same national average (~25.5K PPS).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-27 10:26:15 +01:00

223 lines
7.9 KiB
SQL

-- Location dimension: all known populated places globally (GeoNames cities1000).
-- This is the opportunity-scoring root — NOT filtered to places with padel courts.
-- Grain: (country_code, geoname_id) — stable GeoNames numeric ID per location.
--
-- Unlike dim_cities (seeded from dim_venues / existing padel markets), dim_locations
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
--
-- Enriched with:
-- stg_income → country-level median income PPS (fallback)
-- stg_regional_income → NUTS-1 regional income PPS (preferred where mapped)
-- stg_padel_courts → padel venue count + nearest court distance (km)
-- stg_tennis_courts → tennis court count within 25km radius
--
-- Income resolution: COALESCE(regional_income, country_income) AS median_income_pps.
-- Germany: all 16 Bundesländer mapped via admin1_code → NUTS-1.
-- All other countries fall back to country-level income (stg_income).
--
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
-- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the
-- exact sphere distance is computed.
MODEL (
name foundation.dim_locations,
kind FULL,
cron '@daily',
grain (country_code, geoname_id)
);
WITH
-- Base: all GeoNames locations with valid coordinates
locations AS (
SELECT
geoname_id,
city_name AS location_name,
-- URL-safe location slug
LOWER(REGEXP_REPLACE(LOWER(city_name), '[^a-z0-9]+', '-')) AS location_slug,
country_code,
lat,
lon,
admin1_code,
admin2_code,
population,
ref_year
FROM staging.stg_population_geonames
WHERE lat IS NOT NULL AND lon IS NOT NULL
),
-- Country income (same source and pattern as dim_cities) — fallback for unmapped regions
country_income AS (
SELECT country_code, median_income_pps, ref_year AS income_year
FROM staging.stg_income
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
),
-- Static mapping: GeoNames admin1_code → NUTS-1 code (Germany only; extend by adding rows)
admin1_to_nuts1 (country_code, admin1_code, nuts1_code) AS (
VALUES
('DE', '01', 'DE1'), -- Baden-Württemberg
('DE', '02', 'DE2'), -- Bayern
('DE', '03', 'DE5'), -- Bremen
('DE', '04', 'DE6'), -- Hamburg
('DE', '05', 'DE7'), -- Hessen
('DE', '06', 'DE9'), -- Niedersachsen
('DE', '07', 'DEA'), -- Nordrhein-Westfalen
('DE', '08', 'DEB'), -- Rheinland-Pfalz
('DE', '09', 'DEC'), -- Saarland
('DE', '10', 'DEF'), -- Schleswig-Holstein
('DE', '11', 'DE4'), -- Brandenburg
('DE', '12', 'DE8'), -- Mecklenburg-Vorpommern
('DE', '13', 'DED'), -- Sachsen
('DE', '14', 'DEE'), -- Sachsen-Anhalt
('DE', '15', 'DEG'), -- Thüringen
('DE', '16', 'DE3') -- Berlin
),
-- Latest NUTS-1 regional income per region
regional_income AS (
SELECT
m.country_code,
m.admin1_code,
r.regional_income_pps,
r.ref_year AS regional_income_year
FROM admin1_to_nuts1 m
JOIN staging.stg_regional_income r ON r.nuts1_code = m.nuts1_code
QUALIFY ROW_NUMBER() OVER (
PARTITION BY m.country_code, m.admin1_code ORDER BY r.ref_year DESC
) = 1
),
-- Padel court lat/lon for distance and density calculations
padel_courts AS (
SELECT lat, lon, country_code
FROM staging.stg_padel_courts
WHERE lat IS NOT NULL AND lon IS NOT NULL
),
-- Nearest padel court distance per location (bbox pre-filter → exact sphere distance)
nearest_padel AS (
SELECT
l.geoname_id,
MIN(
ST_Distance_Sphere(
ST_Point(l.lon, l.lat),
ST_Point(p.lon, p.lat)
) / 1000.0
) AS nearest_padel_court_km
FROM locations l
JOIN padel_courts p
-- ~55km bounding box pre-filter to limit cross-join before sphere calc
ON ABS(l.lat - p.lat) < 0.5
AND ABS(l.lon - p.lon) < 0.5
GROUP BY l.geoname_id
),
-- Padel venues within 5km of each location (counts as "local padel supply")
padel_local AS (
SELECT
l.geoname_id,
COUNT(*) AS padel_venue_count
FROM locations l
JOIN padel_courts p
ON ABS(l.lat - p.lat) < 0.05 -- ~5km bbox pre-filter
AND ABS(l.lon - p.lon) < 0.05
WHERE ST_Distance_Sphere(
ST_Point(l.lon, l.lat),
ST_Point(p.lon, p.lat)
) / 1000.0 <= 5.0
GROUP BY l.geoname_id
),
-- Tennis courts within 25km of each location (sports culture proxy)
tennis_nearby AS (
SELECT
l.geoname_id,
COUNT(*) AS tennis_courts_within_25km
FROM locations l
JOIN staging.stg_tennis_courts t
ON ABS(l.lat - t.lat) < 0.23 -- ~25km bbox pre-filter
AND ABS(l.lon - t.lon) < 0.23
WHERE ST_Distance_Sphere(
ST_Point(l.lon, l.lat),
ST_Point(t.lon, t.lat)
) / 1000.0 <= 25.0
GROUP BY l.geoname_id
)
SELECT
l.geoname_id,
l.country_code,
-- Human-readable country name (consistent with dim_cities)
CASE l.country_code
WHEN 'DE' THEN 'Germany'
WHEN 'ES' THEN 'Spain'
WHEN 'GB' THEN 'United Kingdom'
WHEN 'FR' THEN 'France'
WHEN 'IT' THEN 'Italy'
WHEN 'PT' THEN 'Portugal'
WHEN 'AT' THEN 'Austria'
WHEN 'CH' THEN 'Switzerland'
WHEN 'NL' THEN 'Netherlands'
WHEN 'BE' THEN 'Belgium'
WHEN 'SE' THEN 'Sweden'
WHEN 'NO' THEN 'Norway'
WHEN 'DK' THEN 'Denmark'
WHEN 'FI' THEN 'Finland'
WHEN 'US' THEN 'United States'
WHEN 'AR' THEN 'Argentina'
WHEN 'MX' THEN 'Mexico'
WHEN 'AE' THEN 'UAE'
WHEN 'AU' THEN 'Australia'
WHEN 'IE' THEN 'Ireland'
ELSE l.country_code
END AS country_name_en,
-- URL-safe country slug
LOWER(REGEXP_REPLACE(
CASE l.country_code
WHEN 'DE' THEN 'Germany'
WHEN 'ES' THEN 'Spain'
WHEN 'GB' THEN 'United Kingdom'
WHEN 'FR' THEN 'France'
WHEN 'IT' THEN 'Italy'
WHEN 'PT' THEN 'Portugal'
WHEN 'AT' THEN 'Austria'
WHEN 'CH' THEN 'Switzerland'
WHEN 'NL' THEN 'Netherlands'
WHEN 'BE' THEN 'Belgium'
WHEN 'SE' THEN 'Sweden'
WHEN 'NO' THEN 'Norway'
WHEN 'DK' THEN 'Denmark'
WHEN 'FI' THEN 'Finland'
WHEN 'US' THEN 'United States'
WHEN 'AR' THEN 'Argentina'
WHEN 'MX' THEN 'Mexico'
WHEN 'AE' THEN 'UAE'
WHEN 'AU' THEN 'Australia'
WHEN 'IE' THEN 'Ireland'
ELSE l.country_code
END, '[^a-zA-Z0-9]+', '-'
)) AS country_slug,
l.location_name,
l.location_slug,
l.lat,
l.lon,
l.admin1_code,
l.admin2_code,
l.population,
l.ref_year AS population_year,
COALESCE(ri.regional_income_pps, ci.median_income_pps) AS median_income_pps,
COALESCE(ri.regional_income_year, ci.income_year) AS income_year,
COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count,
-- Venues per 100K residents (NULL if population = 0)
CASE WHEN l.population > 0
THEN ROUND(COALESCE(pl.padel_venue_count, 0)::DOUBLE / l.population * 100000, 2)
ELSE NULL
END AS padel_venues_per_100k,
np.nearest_padel_court_km,
COALESCE(tn.tennis_courts_within_25km, 0)::INTEGER AS tennis_courts_within_25km,
CURRENT_DATE AS refreshed_date
FROM locations l
LEFT JOIN country_income ci ON l.country_code = ci.country_code
LEFT JOIN regional_income ri ON l.country_code = ri.country_code
AND l.admin1_code = ri.admin1_code
LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id
LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id
LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id
-- Enforce grain: deduplicate if city slug collides within same country
QUALIFY ROW_NUMBER() OVER (
PARTITION BY l.country_code, l.geoname_id
ORDER BY l.population DESC NULLS LAST
) = 1