feat(transform): add dim_locations + dual market scoring models
dim_locations (foundation):
- Seeded from stg_population_geonames (all locations, not venue-dependent)
- Grain: (country_code, geoname_id)
- Enriched with: padel venues within 5km, nearest court distance (ST_Distance_Sphere),
tennis courts within 25km, country income
- Covers zero-court Gemeinden for opportunity scoring
location_opportunity_profile (serving) — Padelnomics Marktpotenzial-Score:
- Answers "Where should I build?" — no padel_venue_count filter
- Formula: population (25) + income (20) + supply gap inverted (30) +
catchment gap (15) + tennis culture (10) = 100pts
- Sorted by opportunity_score DESC
city_market_profile (serving) — Padelnomics Marktreife-Score:
- Add saturation discount (×0.85 when venues_per_100k > 8)
- Update header comment to reference Marktreife-Score branding
- Kept WHERE padel_venue_count > 0 (established markets only)
- column name market_score unchanged (avoids downstream breakage)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,183 @@
|
|||||||
|
-- Location dimension: all known populated places globally (GeoNames cities1000).
-- This is the opportunity-scoring root — NOT filtered to places with padel courts.
-- Grain: (country_code, geoname_id) — stable GeoNames numeric ID per location.
--
-- Unlike dim_cities (seeded from dim_venues / existing padel markets), dim_locations
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
--
-- Enriched with:
--   stg_income        → country-level median income PPS
--   stg_padel_courts  → padel venue count + nearest court distance (km)
--   stg_tennis_courts → tennis court count within 25km radius
--
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension), which
-- returns metres and, per the DuckDB spatial docs, expects points in
-- [latitude, longitude] axis order — hence ST_Point(lat, lon) below.
-- NOTE(review): confirm the axis order against the installed spatial extension version.
--
-- A bounding-box pre-filter limits the cross-join before the exact sphere distance
-- is computed. One degree of longitude shrinks by cos(latitude), so the longitude
-- half-width is widened by 1 / cos(lat); otherwise the box would be narrower than
-- the target radius away from the equator and silently drop valid matches.
-- NOTE(review): the box does not wrap across the ±180° antimeridian.

MODEL (
  name foundation.dim_locations,
  kind FULL,
  cron '@daily',
  grain (country_code, geoname_id)
);

WITH
-- Base: all GeoNames locations with valid coordinates, one row per grain key.
-- Deduplicated here (not after the joins) so a duplicated staging row cannot
-- multiply the COUNT(*) aggregates in the spatial CTEs below.
locations AS (
  SELECT
    geoname_id,
    city_name AS location_name,
    -- URL-safe location slug. The 'g' option replaces every run of non-alphanumerics;
    -- without it DuckDB's regexp_replace only replaces the first match.
    -- NOTE(review): non-ASCII letters (ü, é, ñ, …) are also collapsed to '-'.
    REGEXP_REPLACE(LOWER(city_name), '[^a-z0-9]+', '-', 'g') AS location_slug,
    country_code,
    -- Human-readable country name (consistent with dim_cities);
    -- unmapped countries fall back to the raw country code.
    CASE country_code
      WHEN 'DE' THEN 'Germany'
      WHEN 'ES' THEN 'Spain'
      WHEN 'GB' THEN 'United Kingdom'
      WHEN 'FR' THEN 'France'
      WHEN 'IT' THEN 'Italy'
      WHEN 'PT' THEN 'Portugal'
      WHEN 'AT' THEN 'Austria'
      WHEN 'CH' THEN 'Switzerland'
      WHEN 'NL' THEN 'Netherlands'
      WHEN 'BE' THEN 'Belgium'
      WHEN 'SE' THEN 'Sweden'
      WHEN 'NO' THEN 'Norway'
      WHEN 'DK' THEN 'Denmark'
      WHEN 'FI' THEN 'Finland'
      WHEN 'US' THEN 'United States'
      WHEN 'AR' THEN 'Argentina'
      WHEN 'MX' THEN 'Mexico'
      WHEN 'AE' THEN 'UAE'
      WHEN 'AU' THEN 'Australia'
      WHEN 'IE' THEN 'Ireland'
      ELSE country_code
    END AS country_name_en,
    lat,
    lon,
    -- Multiplier for longitude half-widths of the bounding boxes (1 / cos(lat)).
    -- The 0.01 floor avoids division by ~0 for points near the poles.
    1.0 / GREATEST(COS(RADIANS(lat)), 0.01) AS lon_deg_scale,
    admin1_code,
    admin2_code,
    population,
    ref_year
  FROM staging.stg_population_geonames
  WHERE lat IS NOT NULL AND lon IS NOT NULL
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY country_code, geoname_id
    ORDER BY population DESC NULLS LAST
  ) = 1
),

-- Country income (same source and pattern as dim_cities): latest ref_year per country
country_income AS (
  SELECT country_code, median_income_pps, ref_year AS income_year
  FROM staging.stg_income
  QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
),

-- Padel court lat/lon for distance and density calculations
padel_courts AS (
  SELECT lat, lon
  FROM staging.stg_padel_courts
  WHERE lat IS NOT NULL AND lon IS NOT NULL
),

-- Nearest padel court distance per location, searched inside a ~55km box only.
-- Locations with no court inside the box get no row here, so
-- nearest_padel_court_km is NULL for them after the LEFT JOIN.
nearest_padel AS (
  SELECT
    l.country_code,
    l.geoname_id,
    MIN(
      ST_Distance_Sphere(
        ST_Point(l.lat, l.lon),
        ST_Point(p.lat, p.lon)
      ) / 1000.0
    ) AS nearest_padel_court_km
  FROM locations l
  JOIN padel_courts p
    -- ~55km bounding box pre-filter to limit cross-join before sphere calc
    ON ABS(l.lat - p.lat) < 0.5
    AND ABS(l.lon - p.lon) < 0.5 * l.lon_deg_scale
  GROUP BY l.country_code, l.geoname_id
),

-- Padel venues within 5km of each location (counts as "local padel supply")
padel_local AS (
  SELECT
    l.country_code,
    l.geoname_id,
    COUNT(*) AS padel_venue_count
  FROM locations l
  JOIN padel_courts p
    ON ABS(l.lat - p.lat) < 0.05                       -- ~5km bbox pre-filter
    AND ABS(l.lon - p.lon) < 0.05 * l.lon_deg_scale
  WHERE ST_Distance_Sphere(
    ST_Point(l.lat, l.lon),
    ST_Point(p.lat, p.lon)
  ) / 1000.0 <= 5.0
  GROUP BY l.country_code, l.geoname_id
),

-- Tennis courts within 25km of each location (sports culture proxy).
-- Rows with NULL coordinates fail the bbox comparison and are excluded.
tennis_nearby AS (
  SELECT
    l.country_code,
    l.geoname_id,
    COUNT(*) AS tennis_courts_within_25km
  FROM locations l
  JOIN staging.stg_tennis_courts t
    ON ABS(l.lat - t.lat) < 0.23                       -- ~25km bbox pre-filter
    AND ABS(l.lon - t.lon) < 0.23 * l.lon_deg_scale
  WHERE ST_Distance_Sphere(
    ST_Point(l.lat, l.lon),
    ST_Point(t.lat, t.lon)
  ) / 1000.0 <= 25.0
  GROUP BY l.country_code, l.geoname_id
)

-- Every joined CTE is unique on its join key (locations and the spatial CTEs on the
-- grain, country_income on country_code), so the joins cannot fan out and the
-- result keeps the (country_code, geoname_id) grain without a final dedup pass.
SELECT
  l.geoname_id,
  l.country_code,
  l.country_name_en,
  -- URL-safe country slug
  LOWER(REGEXP_REPLACE(l.country_name_en, '[^a-zA-Z0-9]+', '-', 'g')) AS country_slug,
  l.location_name,
  l.location_slug,
  l.lat,
  l.lon,
  l.admin1_code,
  l.admin2_code,
  l.population,
  l.ref_year AS population_year,
  ci.median_income_pps,
  ci.income_year,
  COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count,
  -- Venues per 100K residents (NULL if population is 0 or NULL)
  CASE WHEN l.population > 0
    THEN ROUND(COALESCE(pl.padel_venue_count, 0)::DOUBLE / l.population * 100000, 2)
    ELSE NULL
  END AS padel_venues_per_100k,
  np.nearest_padel_court_km,
  COALESCE(tn.tennis_courts_within_25km, 0)::INTEGER AS tennis_courts_within_25km,
  CURRENT_DATE AS refreshed_date
FROM locations l
LEFT JOIN country_income ci
  ON l.country_code = ci.country_code
LEFT JOIN nearest_padel np
  ON l.country_code = np.country_code AND l.geoname_id = np.geoname_id
LEFT JOIN padel_local pl
  ON l.country_code = pl.country_code AND l.geoname_id = pl.geoname_id
LEFT JOIN tennis_nearby tn
  ON l.country_code = tn.country_code AND l.geoname_id = tn.geoname_id
||||||
@@ -1,11 +1,16 @@
|
|||||||
-- One Big Table: per-city padel market intelligence.
|
-- One Big Table: per-city padel market intelligence.
|
||||||
-- Consumed by: SEO article generation, planner city-select pre-fill, API endpoints.
|
-- Consumed by: SEO article generation, planner city-select pre-fill, API endpoints.
|
||||||
--
|
--
|
||||||
-- Market score v2 (0–100):
|
-- Padelnomics Marktreife-Score v2 (0–100):
|
||||||
-- 30 pts population — log-scaled to 1M+ city ceiling (was 40pts/500K)
|
-- Answers "How mature/established is this padel market?"
|
||||||
|
-- Only computed for cities with ≥1 padel venue (padel_venue_count > 0).
|
||||||
|
-- For white-space opportunity scoring, see serving.location_opportunity_profile.
|
||||||
|
--
|
||||||
|
-- 30 pts population — log-scaled to 1M+ city ceiling
|
||||||
-- 25 pts income PPS — normalised to 200 ceiling (covers CH/NO/LU outliers)
|
-- 25 pts income PPS — normalised to 200 ceiling (covers CH/NO/LU outliers)
|
||||||
-- 30 pts demand — observed occupancy if available, else venue density
|
-- 30 pts demand — observed occupancy if available, else venue density
|
||||||
-- 15 pts data quality — completeness discount, not a market signal
|
-- 15 pts data quality — completeness discount, not a market signal
|
||||||
|
-- ×0.85 saturation — discount when venues_per_100k > 8 (oversupplied market)
|
||||||
|
|
||||||
MODEL (
|
MODEL (
|
||||||
name serving.city_market_profile,
|
name serving.city_market_profile,
|
||||||
@@ -73,7 +78,11 @@ scored AS (
|
|||||||
-- Data quality (15 pts): measures completeness, not market quality.
|
-- Data quality (15 pts): measures completeness, not market quality.
|
||||||
-- Reduced from 20pts — kept as confidence discount, not market signal.
|
-- Reduced from 20pts — kept as confidence discount, not market signal.
|
||||||
+ 15.0 * data_confidence
|
+ 15.0 * data_confidence
|
||||||
, 1) AS market_score
|
, 1)
|
||||||
|
-- Saturation discount: venues_per_100k > 8 signals oversupply.
|
||||||
|
-- ~8/100K ≈ Spain-tier density; above this marginal return decreases.
|
||||||
|
* CASE WHEN venues_per_100k > 8 THEN 0.85 ELSE 1.0 END
|
||||||
|
AS market_score
|
||||||
FROM base
|
FROM base
|
||||||
)
|
)
|
||||||
SELECT
|
SELECT
|
||||||
|
|||||||
@@ -0,0 +1,69 @@
|
|||||||
|
-- Per-location padel investment opportunity intelligence.
-- Consumed by: Gemeinde-level pSEO pages, opportunity map, "top markets" lists.
--
-- Padelnomics Marktpotenzial-Score (0–100):
--   Answers "Where should I build a padel court?"
--   Covers ALL GeoNames locations (pop ≥ 1K) — NOT filtered to existing padel markets.
--   Zero-court locations score highest on supply gap component (white space = opportunity).
--
--   25 pts addressable market — log-scaled population, ceiling 500K
--                               (opportunity peaks in mid-size cities; megacities already served)
--   20 pts economic power     — country income PPS, normalised to 200
--   30 pts supply gap         — INVERTED venue density; 0 courts/100K = full marks
--   15 pts catchment gap      — distance to nearest padel court (>30km = full marks;
--                               no court found in the search box = full marks)
--   10 pts sports culture     — tennis courts within 25km (≥10 = full marks)

MODEL (
  name serving.location_opportunity_profile,
  kind FULL,
  cron '@daily',
  grain (country_code, geoname_id)
);

SELECT
  l.geoname_id,
  l.country_code,
  l.country_name_en,
  l.country_slug,
  l.location_name,
  l.location_slug,
  l.lat,
  l.lon,
  l.admin1_code,
  l.admin2_code,
  l.population,
  l.population_year,
  l.median_income_pps,
  l.income_year,
  l.padel_venue_count,
  l.padel_venues_per_100k,
  l.nearest_padel_court_km,
  l.tennis_courts_within_25km,
  ROUND(
    -- Addressable market (25 pts): log-scaled to 500K ceiling.
    -- Lower ceiling than Marktreife (1M) — opportunity peaks in mid-size cities
    -- that can support a court but aren't already saturated by large-city operators.
    -- GREATEST(.., 1) keeps LN() defined for a population of 0.
    25.0 * LEAST(1.0, LN(GREATEST(l.population, 1)) / LN(500000))

    -- Economic power (20 pts): country-level income PPS normalised to 200.
    -- Drives willingness-to-pay for court fees (€20-35/hr target range).
    -- Countries without income data fall back to 100 PPS (half marks).
    + 20.0 * LEAST(1.0, COALESCE(l.median_income_pps, 100) / 200.0)

    -- Supply gap (30 pts): INVERTED venue density.
    -- 0 courts/100K = full 30 pts (white space); ≥4/100K = 0 pts (served market).
    -- This is the key signal that separates Marktpotenzial from Marktreife.
    + 30.0 * GREATEST(0.0, 1.0 - COALESCE(l.padel_venues_per_100k, 0) / 4.0)

    -- Catchment gap (15 pts): distance to nearest existing padel court.
    -- >30km = full 15 pts (underserved catchment area).
    -- nearest_padel_court_km is NULL when dim_locations found no court inside its
    -- ~0.5° (~55km) search box. That is the most underserved case, not an unknown
    -- one, so it earns full marks rather than a neutral 0.5.
    + 15.0 * COALESCE(LEAST(1.0, l.nearest_padel_court_km / 30.0), 1.0)

    -- Sports culture proxy (10 pts): tennis courts within 25km.
    -- ≥10 courts = full 10 pts (proven racket sport market = faster padel adoption).
    -- 0 courts = 0 pts. Many new padel courts open inside existing tennis clubs.
    + 10.0 * LEAST(1.0, l.tennis_courts_within_25km / 10.0)
  , 1) AS opportunity_score,
  CURRENT_DATE AS refreshed_date
FROM foundation.dim_locations l
ORDER BY opportunity_score DESC
|
||||||
Reference in New Issue
Block a user