feat(transform): add dim_locations + dual market scoring models

dim_locations (foundation):
- Seeded from stg_population_geonames (all locations, not venue-dependent)
- Grain: (country_code, geoname_id)
- Enriched with: padel venues within 5km, nearest court distance (ST_Distance_Sphere),
  tennis courts within 25km, country income
- Covers zero-court Gemeinden for opportunity scoring

location_opportunity_profile (serving) — Padelnomics Marktpotenzial-Score:
- Answers "Where should I build?" — no padel_venue_count filter
- Formula: population (25) + income (20) + supply gap inverted (30) +
           catchment gap (15) + tennis culture (10) = 100pts
- Sorted by opportunity_score DESC

city_market_profile (serving) — Padelnomics Marktreife-Score:
- Add saturation discount (×0.85 when venues_per_100k > 8)
- Update header comment to reference Marktreife-Score branding
- Kept WHERE padel_venue_count > 0 (established markets only)
- column name market_score unchanged (avoids downstream breakage)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-24 16:28:16 +01:00
parent c109488d9d
commit ebfdc84a94
3 changed files with 264 additions and 3 deletions

View File

@@ -0,0 +1,183 @@
-- Location dimension: all known populated places globally (GeoNames cities1000).
-- This is the opportunity-scoring root — NOT filtered to places with padel courts.
-- Grain: (country_code, geoname_id) — stable GeoNames numeric ID per location.
--
-- Unlike dim_cities (seeded from dim_venues / existing padel markets), dim_locations
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
--
-- Enriched with:
--   stg_income        → country-level median income PPS (latest ref_year per country)
--   stg_padel_courts  → padel venue count within 5km + nearest court distance (km)
--   stg_tennis_courts → tennis court count within 25km radius
--
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension), which
-- returns metres. The DuckDB spatial docs state that it expects WGS84 points in
-- (latitude, longitude) axis order, so every point is built as ST_Point(lat, lon).
-- NOTE(review): confirm the axis order against the pinned spatial extension version.
--
-- A bounding-box pre-filter reduces the cross-join before the exact sphere distance
-- is computed. One degree of longitude shrinks with COS(latitude), so the longitude
-- half-width is divided by COS(lat); otherwise the box is narrower than the target
-- radius away from the equator and valid matches are dropped. The GREATEST(..., 0.01)
-- guard avoids a division by ~0 near the poles.
MODEL (
  name foundation.dim_locations,
  kind FULL,
  cron '@daily',
  grain (country_code, geoname_id)
);

WITH
-- Base: all GeoNames locations with valid coordinates
locations AS (
  SELECT
    geoname_id,
    city_name AS location_name,
    -- URL-safe location slug. The 'g' option is required: without it DuckDB's
    -- REGEXP_REPLACE only replaces the first match ("new york city" → "new-york city").
    LOWER(REGEXP_REPLACE(LOWER(city_name), '[^a-z0-9]+', '-', 'g')) AS location_slug,
    country_code,
    -- Human-readable country name (consistent with dim_cities).
    -- Unmapped countries fall back to the raw country code.
    CASE country_code
      WHEN 'DE' THEN 'Germany'
      WHEN 'ES' THEN 'Spain'
      WHEN 'GB' THEN 'United Kingdom'
      WHEN 'FR' THEN 'France'
      WHEN 'IT' THEN 'Italy'
      WHEN 'PT' THEN 'Portugal'
      WHEN 'AT' THEN 'Austria'
      WHEN 'CH' THEN 'Switzerland'
      WHEN 'NL' THEN 'Netherlands'
      WHEN 'BE' THEN 'Belgium'
      WHEN 'SE' THEN 'Sweden'
      WHEN 'NO' THEN 'Norway'
      WHEN 'DK' THEN 'Denmark'
      WHEN 'FI' THEN 'Finland'
      WHEN 'US' THEN 'United States'
      WHEN 'AR' THEN 'Argentina'
      WHEN 'MX' THEN 'Mexico'
      WHEN 'AE' THEN 'UAE'
      WHEN 'AU' THEN 'Australia'
      WHEN 'IE' THEN 'Ireland'
      ELSE country_code
    END AS country_name_en,
    lat,
    lon,
    -- Longitude widening factor for bbox pre-filters (1 at the equator, grows with latitude)
    1.0 / GREATEST(COS(RADIANS(lat)), 0.01) AS lon_scale,
    admin1_code,
    admin2_code,
    population,
    -- NOTE(review): population_year is selected here but the final SELECT publishes
    -- ref_year AS population_year — confirm which column is the intended source.
    population_year AS population_year,
    ref_year
  FROM staging.stg_population_geonames
  WHERE lat IS NOT NULL AND lon IS NOT NULL
),
-- Country income (same source and pattern as dim_cities): latest ref_year per country
country_income AS (
  SELECT country_code, median_income_pps, ref_year AS income_year
  FROM staging.stg_income
  QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
),
-- Padel court lat/lon for distance and density calculations
padel_courts AS (
  SELECT lat, lon, country_code
  FROM staging.stg_padel_courts
  WHERE lat IS NOT NULL AND lon IS NOT NULL
),
-- Nearest padel court distance per location (bbox pre-filter → exact sphere distance).
-- Locations with no court inside the ~55km box get no row here, so
-- nearest_padel_court_km is NULL downstream (meaning "farther than the search box").
nearest_padel AS (
  SELECT
    l.geoname_id,
    MIN(
      ST_Distance_Sphere(
        ST_Point(l.lat, l.lon),
        ST_Point(p.lat, p.lon)
      ) / 1000.0
    ) AS nearest_padel_court_km
  FROM locations l
  JOIN padel_courts p
    -- ~55km bounding box pre-filter to limit cross-join before sphere calc
    ON ABS(l.lat - p.lat) < 0.5
    AND ABS(l.lon - p.lon) < 0.5 * l.lon_scale
  GROUP BY l.geoname_id
),
-- Padel venues within 5km of each location (counts as "local padel supply")
padel_local AS (
  SELECT
    l.geoname_id,
    COUNT(*) AS padel_venue_count
  FROM locations l
  JOIN padel_courts p
    ON ABS(l.lat - p.lat) < 0.05                 -- ~5.5km bbox pre-filter
    AND ABS(l.lon - p.lon) < 0.05 * l.lon_scale
  WHERE ST_Distance_Sphere(
          ST_Point(l.lat, l.lon),
          ST_Point(p.lat, p.lon)
        ) / 1000.0 <= 5.0
  GROUP BY l.geoname_id
),
-- Tennis courts within 25km of each location (sports culture proxy).
-- Rows with NULL coordinates never satisfy the bbox predicate and drop out of the join.
tennis_nearby AS (
  SELECT
    l.geoname_id,
    COUNT(*) AS tennis_courts_within_25km
  FROM locations l
  JOIN staging.stg_tennis_courts t
    ON ABS(l.lat - t.lat) < 0.23                 -- ~25.5km bbox pre-filter
    AND ABS(l.lon - t.lon) < 0.23 * l.lon_scale
  WHERE ST_Distance_Sphere(
          ST_Point(l.lat, l.lon),
          ST_Point(t.lat, t.lon)
        ) / 1000.0 <= 25.0
  GROUP BY l.geoname_id
)
SELECT
  l.geoname_id,
  l.country_code,
  l.country_name_en,
  -- URL-safe country slug derived from the English country name
  LOWER(REGEXP_REPLACE(l.country_name_en, '[^a-zA-Z0-9]+', '-', 'g')) AS country_slug,
  l.location_name,
  l.location_slug,
  l.lat,
  l.lon,
  l.admin1_code,
  l.admin2_code,
  l.population,
  l.ref_year AS population_year,
  ci.median_income_pps,
  ci.income_year,
  -- Locations without any court inside 5km have no padel_local row → 0
  COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count,
  -- Venues per 100K residents (NULL if population is 0 or NULL)
  CASE WHEN l.population > 0
    THEN ROUND(COALESCE(pl.padel_venue_count, 0)::DOUBLE / l.population * 100000, 2)
    ELSE NULL
  END AS padel_venues_per_100k,
  -- NULL when no padel court lies inside the ~55km search box
  np.nearest_padel_court_km,
  COALESCE(tn.tennis_courts_within_25km, 0)::INTEGER AS tennis_courts_within_25km,
  CURRENT_DATE AS refreshed_date
FROM locations l
LEFT JOIN country_income ci ON l.country_code = ci.country_code
LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id
LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id
LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id
-- Enforce grain: keep a single row per (country_code, geoname_id), preferring the
-- highest population, in case staging yields duplicate rows for a location.
QUALIFY ROW_NUMBER() OVER (
  PARTITION BY l.country_code, l.geoname_id
  ORDER BY l.population DESC NULLS LAST
) = 1

View File

@@ -1,11 +1,16 @@
-- One Big Table: per-city padel market intelligence.
-- Consumed by: SEO article generation, planner city-select pre-fill, API endpoints.
--
-- Market score v2 (0100):
-- 30 pts population — log-scaled to 1M+ city ceiling (was 40pts/500K)
-- Padelnomics Marktreife-Score v2 (0–100):
-- Answers "How mature/established is this padel market?"
-- Only computed for cities with ≥1 padel venue (padel_venue_count > 0).
-- For white-space opportunity scoring, see serving.location_opportunity_profile.
--
-- 30 pts population — log-scaled to 1M+ city ceiling
-- 25 pts income PPS — normalised to 200 ceiling (covers CH/NO/LU outliers)
-- 30 pts demand — observed occupancy if available, else venue density
-- 15 pts data quality — completeness discount, not a market signal
-- ×0.85 saturation — discount when venues_per_100k > 8 (oversupplied market)
MODEL (
name serving.city_market_profile,
@@ -73,7 +78,11 @@ scored AS (
-- Data quality (15 pts): measures completeness, not market quality.
-- Reduced from 20pts — kept as confidence discount, not market signal.
+ 15.0 * data_confidence
, 1) AS market_score
, 1)
-- Saturation discount: venues_per_100k > 8 signals oversupply.
-- ~8/100K ≈ Spain-tier density; above this marginal return decreases.
* CASE WHEN venues_per_100k > 8 THEN 0.85 ELSE 1.0 END
AS market_score
FROM base
)
SELECT

View File

@@ -0,0 +1,69 @@
-- Per-location padel investment opportunity intelligence.
-- Consumed by: Gemeinde-level pSEO pages, opportunity map, "top markets" lists.
--
-- Padelnomics Marktpotenzial-Score (0–100):
-- Answers "Where should I build a padel court?"
-- Covers ALL GeoNames locations (pop ≥ 1K) — NOT filtered to existing padel markets.
-- Zero-court locations score highest on supply gap component (white space = opportunity).
--
--   25 pts addressable market — log-scaled population, ceiling 500K
--                               (opportunity peaks in mid-size cities; megacities already served)
--   20 pts economic power     — country income PPS, normalised to 200
--   30 pts supply gap         — INVERTED venue density; 0 courts/100K = full marks
--   15 pts catchment gap      — distance to nearest padel court (≥30km or none found = full marks)
--   10 pts sports culture     — tennis courts within 25km (≥10 = full marks)
MODEL (
  name serving.location_opportunity_profile,
  kind FULL,
  cron '@daily',
  grain (country_code, geoname_id)
);

SELECT
  l.geoname_id,
  l.country_code,
  l.country_name_en,
  l.country_slug,
  l.location_name,
  l.location_slug,
  l.lat,
  l.lon,
  l.admin1_code,
  l.admin2_code,
  l.population,
  l.population_year,
  l.median_income_pps,
  l.income_year,
  l.padel_venue_count,
  l.padel_venues_per_100k,
  l.nearest_padel_court_km,
  l.tennis_courts_within_25km,
  ROUND(
    -- Addressable market (25 pts): log-scaled to 500K ceiling.
    -- Lower ceiling than Marktreife (1M) — opportunity peaks in mid-size cities
    -- that can support a court but aren't already saturated by large-city operators.
    -- GREATEST(..., 1) keeps LN defined for zero population (→ 0 pts).
    25.0 * LEAST(1.0, LN(GREATEST(l.population, 1)) / LN(500000))
    -- Economic power (20 pts): country-level income PPS normalised to 200.
    -- Drives willingness-to-pay for court fees (€20-35/hr target range).
    -- Countries without income data fall back to 100 (half marks).
    + 20.0 * LEAST(1.0, COALESCE(l.median_income_pps, 100) / 200.0)
    -- Supply gap (30 pts): INVERTED venue density.
    -- 0 courts/100K = full 30 pts (white space); ≥4/100K = 0 pts (served market).
    -- This is the key signal that separates Marktpotenzial from Marktreife.
    + 30.0 * GREATEST(0.0, 1.0 - COALESCE(l.padel_venues_per_100k, 0) / 4.0)
    -- Catchment gap (15 pts): distance to nearest existing padel court.
    -- ≥30km = full 15 pts (underserved catchment area).
    -- NULL means foundation.dim_locations found no padel court inside its
    -- ~0.5° (≈55km) search box, i.e. the nearest court is beyond the 30km ceiling,
    -- so NULL must score full marks rather than a neutral 0.5 — otherwise the most
    -- underserved locations rank below places that sit 30km from a court.
    + 15.0 * COALESCE(LEAST(1.0, l.nearest_padel_court_km / 30.0), 1.0)
    -- Sports culture proxy (10 pts): tennis courts within 25km.
    -- ≥10 courts = full 10 pts (proven racket sport market = faster padel adoption).
    -- 0 courts = 0 pts. Many new padel courts open inside existing tennis clubs.
    + 10.0 * LEAST(1.0, l.tennis_courts_within_25km / 10.0)
  , 1) AS opportunity_score,
  CURRENT_DATE AS refreshed_date
FROM foundation.dim_locations l
ORDER BY opportunity_score DESC