- eurostat.py: add nama_10r_2hhinc dataset config; append filter params to request URL so server pre-filters the large cube before download - stg_regional_income.sql: new staging model — reads nama_10r_2hhinc.json.gz, filters to NUTS-1 codes (3-char), normalises EL→GR / UK→GB - dim_locations.sql: add admin1_to_nuts1 VALUES CTE (16 German Bundesländer) + regional_income CTE; final SELECT uses COALESCE(regional, country) income - init_landing_seeds.py: add empty seed for nama_10r_2hhinc.json.gz Munich/Bayern now scores ~29K PPS vs Chemnitz/Sachsen ~19K PPS instead of both inheriting the same national average (~25.5K PPS). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
223 lines
7.9 KiB
SQL
223 lines
7.9 KiB
SQL
-- Location dimension: all known populated places globally (GeoNames cities1000).
|
|
-- This is the opportunity-scoring root — NOT filtered to places with padel courts.
|
|
-- Grain: (country_code, geoname_id) — stable GeoNames numeric ID per location.
|
|
--
|
|
-- Unlike dim_cities (seeded from dim_venues / existing padel markets), dim_locations
|
|
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
|
|
--
|
|
-- Enriched with:
|
|
-- stg_income → country-level median income PPS (fallback)
|
|
-- stg_regional_income → NUTS-1 regional income PPS (preferred where mapped)
|
|
-- stg_padel_courts → padel venue count + nearest court distance (km)
|
|
-- stg_tennis_courts → tennis court count within 25km radius
|
|
--
|
|
-- Income resolution: COALESCE(regional_income, country_income) AS median_income_pps.
|
|
-- Germany: all 16 Bundesländer mapped via admin1_code → NUTS-1.
|
|
-- All other countries fall back to country-level income (stg_income).
|
|
--
|
|
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
|
|
-- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the
|
|
-- exact sphere distance is computed.
|
|
|
|
MODEL (
|
|
name foundation.dim_locations,
|
|
kind FULL,
|
|
cron '@daily',
|
|
grain (country_code, geoname_id)
|
|
);
|
|
|
|
WITH
|
|
-- Base: all GeoNames locations with valid coordinates
|
|
locations AS (
|
|
SELECT
|
|
geoname_id,
|
|
city_name AS location_name,
|
|
-- URL-safe location slug
|
|
LOWER(REGEXP_REPLACE(LOWER(city_name), '[^a-z0-9]+', '-')) AS location_slug,
|
|
country_code,
|
|
lat,
|
|
lon,
|
|
admin1_code,
|
|
admin2_code,
|
|
population,
|
|
ref_year
|
|
FROM staging.stg_population_geonames
|
|
WHERE lat IS NOT NULL AND lon IS NOT NULL
|
|
),
|
|
-- Country income (same source and pattern as dim_cities) — fallback for unmapped regions
|
|
country_income AS (
|
|
SELECT country_code, median_income_pps, ref_year AS income_year
|
|
FROM staging.stg_income
|
|
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
|
|
),
|
|
-- Static mapping: GeoNames admin1_code → NUTS-1 code (Germany only; extend by adding rows)
|
|
admin1_to_nuts1 (country_code, admin1_code, nuts1_code) AS (
|
|
VALUES
|
|
('DE', '01', 'DE1'), -- Baden-Württemberg
|
|
('DE', '02', 'DE2'), -- Bayern
|
|
('DE', '03', 'DE5'), -- Bremen
|
|
('DE', '04', 'DE6'), -- Hamburg
|
|
('DE', '05', 'DE7'), -- Hessen
|
|
('DE', '06', 'DE9'), -- Niedersachsen
|
|
('DE', '07', 'DEA'), -- Nordrhein-Westfalen
|
|
('DE', '08', 'DEB'), -- Rheinland-Pfalz
|
|
('DE', '09', 'DEC'), -- Saarland
|
|
('DE', '10', 'DEF'), -- Schleswig-Holstein
|
|
('DE', '11', 'DE4'), -- Brandenburg
|
|
('DE', '12', 'DE8'), -- Mecklenburg-Vorpommern
|
|
('DE', '13', 'DED'), -- Sachsen
|
|
('DE', '14', 'DEE'), -- Sachsen-Anhalt
|
|
('DE', '15', 'DEG'), -- Thüringen
|
|
('DE', '16', 'DE3') -- Berlin
|
|
),
|
|
-- Latest NUTS-1 regional income per region
|
|
regional_income AS (
|
|
SELECT
|
|
m.country_code,
|
|
m.admin1_code,
|
|
r.regional_income_pps,
|
|
r.ref_year AS regional_income_year
|
|
FROM admin1_to_nuts1 m
|
|
JOIN staging.stg_regional_income r ON r.nuts1_code = m.nuts1_code
|
|
QUALIFY ROW_NUMBER() OVER (
|
|
PARTITION BY m.country_code, m.admin1_code ORDER BY r.ref_year DESC
|
|
) = 1
|
|
),
|
|
-- Padel court lat/lon for distance and density calculations
|
|
padel_courts AS (
|
|
SELECT lat, lon, country_code
|
|
FROM staging.stg_padel_courts
|
|
WHERE lat IS NOT NULL AND lon IS NOT NULL
|
|
),
|
|
-- Nearest padel court distance per location (bbox pre-filter → exact sphere distance)
|
|
nearest_padel AS (
|
|
SELECT
|
|
l.geoname_id,
|
|
MIN(
|
|
ST_Distance_Sphere(
|
|
ST_Point(l.lon, l.lat),
|
|
ST_Point(p.lon, p.lat)
|
|
) / 1000.0
|
|
) AS nearest_padel_court_km
|
|
FROM locations l
|
|
JOIN padel_courts p
|
|
-- ~55km bounding box pre-filter to limit cross-join before sphere calc
|
|
ON ABS(l.lat - p.lat) < 0.5
|
|
AND ABS(l.lon - p.lon) < 0.5
|
|
GROUP BY l.geoname_id
|
|
),
|
|
-- Padel venues within 5km of each location (counts as "local padel supply")
|
|
padel_local AS (
|
|
SELECT
|
|
l.geoname_id,
|
|
COUNT(*) AS padel_venue_count
|
|
FROM locations l
|
|
JOIN padel_courts p
|
|
ON ABS(l.lat - p.lat) < 0.05 -- ~5km bbox pre-filter
|
|
AND ABS(l.lon - p.lon) < 0.05
|
|
WHERE ST_Distance_Sphere(
|
|
ST_Point(l.lon, l.lat),
|
|
ST_Point(p.lon, p.lat)
|
|
) / 1000.0 <= 5.0
|
|
GROUP BY l.geoname_id
|
|
),
|
|
-- Tennis courts within 25km of each location (sports culture proxy)
|
|
tennis_nearby AS (
|
|
SELECT
|
|
l.geoname_id,
|
|
COUNT(*) AS tennis_courts_within_25km
|
|
FROM locations l
|
|
JOIN staging.stg_tennis_courts t
|
|
ON ABS(l.lat - t.lat) < 0.23 -- ~25km bbox pre-filter
|
|
AND ABS(l.lon - t.lon) < 0.23
|
|
WHERE ST_Distance_Sphere(
|
|
ST_Point(l.lon, l.lat),
|
|
ST_Point(t.lon, t.lat)
|
|
) / 1000.0 <= 25.0
|
|
GROUP BY l.geoname_id
|
|
)
|
|
SELECT
|
|
l.geoname_id,
|
|
l.country_code,
|
|
-- Human-readable country name (consistent with dim_cities)
|
|
CASE l.country_code
|
|
WHEN 'DE' THEN 'Germany'
|
|
WHEN 'ES' THEN 'Spain'
|
|
WHEN 'GB' THEN 'United Kingdom'
|
|
WHEN 'FR' THEN 'France'
|
|
WHEN 'IT' THEN 'Italy'
|
|
WHEN 'PT' THEN 'Portugal'
|
|
WHEN 'AT' THEN 'Austria'
|
|
WHEN 'CH' THEN 'Switzerland'
|
|
WHEN 'NL' THEN 'Netherlands'
|
|
WHEN 'BE' THEN 'Belgium'
|
|
WHEN 'SE' THEN 'Sweden'
|
|
WHEN 'NO' THEN 'Norway'
|
|
WHEN 'DK' THEN 'Denmark'
|
|
WHEN 'FI' THEN 'Finland'
|
|
WHEN 'US' THEN 'United States'
|
|
WHEN 'AR' THEN 'Argentina'
|
|
WHEN 'MX' THEN 'Mexico'
|
|
WHEN 'AE' THEN 'UAE'
|
|
WHEN 'AU' THEN 'Australia'
|
|
WHEN 'IE' THEN 'Ireland'
|
|
ELSE l.country_code
|
|
END AS country_name_en,
|
|
-- URL-safe country slug
|
|
LOWER(REGEXP_REPLACE(
|
|
CASE l.country_code
|
|
WHEN 'DE' THEN 'Germany'
|
|
WHEN 'ES' THEN 'Spain'
|
|
WHEN 'GB' THEN 'United Kingdom'
|
|
WHEN 'FR' THEN 'France'
|
|
WHEN 'IT' THEN 'Italy'
|
|
WHEN 'PT' THEN 'Portugal'
|
|
WHEN 'AT' THEN 'Austria'
|
|
WHEN 'CH' THEN 'Switzerland'
|
|
WHEN 'NL' THEN 'Netherlands'
|
|
WHEN 'BE' THEN 'Belgium'
|
|
WHEN 'SE' THEN 'Sweden'
|
|
WHEN 'NO' THEN 'Norway'
|
|
WHEN 'DK' THEN 'Denmark'
|
|
WHEN 'FI' THEN 'Finland'
|
|
WHEN 'US' THEN 'United States'
|
|
WHEN 'AR' THEN 'Argentina'
|
|
WHEN 'MX' THEN 'Mexico'
|
|
WHEN 'AE' THEN 'UAE'
|
|
WHEN 'AU' THEN 'Australia'
|
|
WHEN 'IE' THEN 'Ireland'
|
|
ELSE l.country_code
|
|
END, '[^a-zA-Z0-9]+', '-'
|
|
)) AS country_slug,
|
|
l.location_name,
|
|
l.location_slug,
|
|
l.lat,
|
|
l.lon,
|
|
l.admin1_code,
|
|
l.admin2_code,
|
|
l.population,
|
|
l.ref_year AS population_year,
|
|
COALESCE(ri.regional_income_pps, ci.median_income_pps) AS median_income_pps,
|
|
COALESCE(ri.regional_income_year, ci.income_year) AS income_year,
|
|
COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count,
|
|
-- Venues per 100K residents (NULL if population = 0)
|
|
CASE WHEN l.population > 0
|
|
THEN ROUND(COALESCE(pl.padel_venue_count, 0)::DOUBLE / l.population * 100000, 2)
|
|
ELSE NULL
|
|
END AS padel_venues_per_100k,
|
|
np.nearest_padel_court_km,
|
|
COALESCE(tn.tennis_courts_within_25km, 0)::INTEGER AS tennis_courts_within_25km,
|
|
CURRENT_DATE AS refreshed_date
|
|
FROM locations l
|
|
LEFT JOIN country_income ci ON l.country_code = ci.country_code
|
|
LEFT JOIN regional_income ri ON l.country_code = ri.country_code
|
|
AND l.admin1_code = ri.admin1_code
|
|
LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id
|
|
LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id
|
|
LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id
|
|
-- Enforce grain: deduplicate if city slug collides within same country
|
|
QUALIFY ROW_NUMBER() OVER (
|
|
PARTITION BY l.country_code, l.geoname_id
|
|
ORDER BY l.population DESC NULLS LAST
|
|
) = 1
|