feat(transform): add dim_locations + dual market scoring models

dim_locations (foundation):
- Seeded from stg_population_geonames (all locations, not venue-dependent)
- Grain: (country_code, geoname_id)
- Enriched with: padel venues within 5km, nearest court distance (ST_Distance_Sphere),
  tennis courts within 25km, country income
- Covers zero-court Gemeinden for opportunity scoring

location_opportunity_profile (serving) — Padelnomics Marktpotenzial-Score:
- Answers "Where should I build?" — no padel_venue_count filter
- Formula: population (25) + income (20) + supply gap inverted (30) +
           catchment gap (15) + tennis culture (10) = 100pts
- Sorted by opportunity_score DESC

city_market_profile (serving) — Padelnomics Marktreife-Score:
- Add saturation discount (×0.85 when venues_per_100k > 8)
- Update header comment to reference Marktreife-Score branding
- Kept WHERE padel_venue_count > 0 (established markets only)
- column name market_score unchanged (avoids downstream breakage)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-24 16:28:16 +01:00
parent c109488d9d
commit ebfdc84a94
3 changed files with 264 additions and 3 deletions

View File

@@ -0,0 +1,183 @@
-- Location dimension: all known populated places globally (GeoNames cities1000).
-- This is the opportunity-scoring root — NOT filtered to places with padel courts.
-- Grain: (country_code, geoname_id) — stable GeoNames numeric ID per location.
--
-- Unlike dim_cities (seeded from dim_venues / existing padel markets), dim_locations
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
--
-- Enriched with:
-- stg_income → country-level median income PPS
-- stg_padel_courts → padel venue count within 5km + nearest court distance (km)
-- stg_tennis_courts → tennis court count within 25km radius
--
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
-- A bounding-box pre-filter (0.5° ≈ 55km for the nearest court, 0.05° for the 5km
-- venue count, 0.23° for the 25km tennis count) reduces the cross-join before the
-- exact sphere distance is computed.
-- Model metadata: fully rebuilt (kind FULL) on a daily schedule, one row per
-- (country_code, geoname_id).
MODEL (
  name foundation.dim_locations,
  kind FULL,
  cron '@daily',
  grain (country_code, geoname_id)
);
WITH
-- Base: one row per (country_code, geoname_id) with valid coordinates.
-- Rows are deduplicated HERE, before the spatial joins, so that a duplicated
-- staging row cannot inflate the COUNT(*) aggregates computed further down.
locations AS (
    SELECT
        geoname_id,
        city_name AS location_name,
        -- URL-safe location slug. The 'g' option is required: without it DuckDB's
        -- REGEXP_REPLACE rewrites only the FIRST run of non [a-z0-9] characters,
        -- leaving spaces/punctuation in multi-word names.
        REGEXP_REPLACE(LOWER(city_name), '[^a-z0-9]+', '-', 'g') AS location_slug,
        country_code,
        -- Human-readable country name; unmapped codes fall back to the raw code.
        -- Intended to match the mapping used by dim_cities — keep both in sync.
        CASE country_code
            WHEN 'DE' THEN 'Germany'
            WHEN 'ES' THEN 'Spain'
            WHEN 'GB' THEN 'United Kingdom'
            WHEN 'FR' THEN 'France'
            WHEN 'IT' THEN 'Italy'
            WHEN 'PT' THEN 'Portugal'
            WHEN 'AT' THEN 'Austria'
            WHEN 'CH' THEN 'Switzerland'
            WHEN 'NL' THEN 'Netherlands'
            WHEN 'BE' THEN 'Belgium'
            WHEN 'SE' THEN 'Sweden'
            WHEN 'NO' THEN 'Norway'
            WHEN 'DK' THEN 'Denmark'
            WHEN 'FI' THEN 'Finland'
            WHEN 'US' THEN 'United States'
            WHEN 'AR' THEN 'Argentina'
            WHEN 'MX' THEN 'Mexico'
            WHEN 'AE' THEN 'UAE'
            WHEN 'AU' THEN 'Australia'
            WHEN 'IE' THEN 'Ireland'
            ELSE country_code
        END AS country_name_en,
        lat,
        lon,
        -- cos(latitude), used to widen the longitude tolerance of the bounding-box
        -- pre-filters: one degree of longitude spans cos(lat) times the distance of
        -- one degree of latitude. Floored at 0.01 to avoid dividing by ~0 at the poles.
        GREATEST(COS(RADIANS(lat)), 0.01) AS cos_lat,
        admin1_code,
        admin2_code,
        population,
        -- NOTE(review): this column is not used below — the final SELECT publishes
        -- ref_year under the name population_year. Confirm which one is intended.
        population_year,
        ref_year
    FROM staging.stg_population_geonames
    WHERE lat IS NOT NULL
      AND lon IS NOT NULL
    -- Enforce the model grain; on duplicates keep the most populous row.
    QUALIFY ROW_NUMBER() OVER (
        PARTITION BY country_code, geoname_id
        ORDER BY population DESC NULLS LAST
    ) = 1
),
-- Country income: latest available year per country.
country_income AS (
    SELECT
        country_code,
        median_income_pps,
        ref_year AS income_year
    FROM staging.stg_income
    QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
),
-- Padel court coordinates for the distance and density calculations.
padel_courts AS (
    SELECT
        lat,
        lon,
        country_code
    FROM staging.stg_padel_courts
    WHERE lat IS NOT NULL
      AND lon IS NOT NULL
),
-- Nearest padel court per location, in km.
-- Only courts inside a ~55km box (0.5° of latitude, longitude widened by
-- 1 / cos(lat)) are considered, so:
--   * locations with no court in the box get no row here (NULL after the LEFT JOIN);
--   * results up to ~55km are exact; larger values come from the box corners and
--     are only an upper bound on the true nearest distance.
-- ST_Distance_Sphere returns metres and expects points in [latitude, longitude]
-- axis order, hence ST_Point(lat, lon).
nearest_padel AS (
    SELECT
        l.geoname_id,
        MIN(
            ST_Distance_Sphere(
                ST_Point(l.lat, l.lon),
                ST_Point(p.lat, p.lon)
            ) / 1000.0
        ) AS nearest_padel_court_km
    FROM locations AS l
    INNER JOIN padel_courts AS p
        ON ABS(l.lat - p.lat) < 0.5
        AND ABS(l.lon - p.lon) < 0.5 / l.cos_lat
    GROUP BY l.geoname_id
),
-- Padel venues within 5km of each location ("local padel supply").
-- Box pre-filter (0.05° ≈ 5.5km, longitude widened by 1 / cos(lat)) followed by the
-- exact great-circle test.
padel_local AS (
    SELECT
        l.geoname_id,
        COUNT(*) AS padel_venue_count
    FROM locations AS l
    INNER JOIN padel_courts AS p
        ON ABS(l.lat - p.lat) < 0.05
        AND ABS(l.lon - p.lon) < 0.05 / l.cos_lat
    WHERE ST_Distance_Sphere(
              ST_Point(l.lat, l.lon),
              ST_Point(p.lat, p.lon)
          ) / 1000.0 <= 5.0
    GROUP BY l.geoname_id
),
-- Tennis courts within 25km of each location (sports-culture proxy).
-- Box pre-filter (0.23° ≈ 25.5km, longitude widened by 1 / cos(lat)) followed by the
-- exact great-circle test. Courts with NULL coordinates never satisfy the join.
tennis_nearby AS (
    SELECT
        l.geoname_id,
        COUNT(*) AS tennis_courts_within_25km
    FROM locations AS l
    INNER JOIN staging.stg_tennis_courts AS t
        ON ABS(l.lat - t.lat) < 0.23
        AND ABS(l.lon - t.lon) < 0.23 / l.cos_lat
    WHERE ST_Distance_Sphere(
              ST_Point(l.lat, l.lon),
              ST_Point(t.lat, t.lon)
          ) / 1000.0 <= 25.0
    GROUP BY l.geoname_id
)
-- Final projection. Every joined CTE has at most one row per join key and
-- `locations` is already unique on the grain, so no further deduplication is needed.
SELECT
    l.geoname_id,
    l.country_code,
    l.country_name_en,
    -- URL-safe country slug derived from the English name
    LOWER(REGEXP_REPLACE(l.country_name_en, '[^a-zA-Z0-9]+', '-', 'g')) AS country_slug,
    l.location_name,
    l.location_slug,
    l.lat,
    l.lon,
    l.admin1_code,
    l.admin2_code,
    l.population,
    l.ref_year AS population_year,
    ci.median_income_pps,
    ci.income_year,
    -- Locations with no court within 5km have no padel_local row → 0
    COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count,
    -- Venues per 100K residents (NULL when population is 0 or NULL)
    CASE
        WHEN l.population > 0
            THEN ROUND(COALESCE(pl.padel_venue_count, 0)::DOUBLE / l.population * 100000, 2)
        ELSE NULL
    END AS padel_venues_per_100k,
    np.nearest_padel_court_km,
    COALESCE(tn.tennis_courts_within_25km, 0)::INTEGER AS tennis_courts_within_25km,
    CURRENT_DATE AS refreshed_date
FROM locations AS l
LEFT JOIN country_income AS ci ON l.country_code = ci.country_code
LEFT JOIN nearest_padel AS np ON l.geoname_id = np.geoname_id
LEFT JOIN padel_local AS pl ON l.geoname_id = pl.geoname_id
LEFT JOIN tennis_nearby AS tn ON l.geoname_id = tn.geoname_id