From ebfdc84a9464c6060b60d5c1f6f7ab12455b22a3 Mon Sep 17 00:00:00 2001 From: Deeman Date: Tue, 24 Feb 2026 16:28:16 +0100 Subject: [PATCH] feat(transform): add dim_locations + dual market scoring models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dim_locations (foundation): - Seeded from stg_population_geonames (all locations, not venue-dependent) - Grain: (country_code, geoname_id) - Enriched with: padel venues within 5km, nearest court distance (ST_Distance_Sphere), tennis courts within 25km, country income - Covers zero-court Gemeinden for opportunity scoring location_opportunity_profile (serving) — Padelnomics Marktpotenzial-Score: - Answers "Where should I build?" — no padel_venue_count filter - Formula: population (25) + income (20) + supply gap inverted (30) + catchment gap (15) + tennis culture (10) = 100pts - Sorted by opportunity_score DESC city_market_profile (serving) — Padelnomics Marktreife-Score: - Add saturation discount (×0.85 when venues_per_100k > 8) - Update header comment to reference Marktreife-Score branding - Kept WHERE padel_venue_count > 0 (established markets only) - column name market_score unchanged (avoids downstream breakage) Co-Authored-By: Claude Sonnet 4.6 --- .../models/foundation/dim_locations.sql | 183 ++++++++++++++++++ .../models/serving/city_market_profile.sql | 15 +- .../serving/location_opportunity_profile.sql | 69 +++++++ 3 files changed, 264 insertions(+), 3 deletions(-) create mode 100644 transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql create mode 100644 transform/sqlmesh_padelnomics/models/serving/location_opportunity_profile.sql diff --git a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql new file mode 100644 index 0000000..c6177d3 --- /dev/null +++ b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql @@ -0,0 +1,183 @@ +-- Location dimension: all known populated places globally (GeoNames cities1000). +-- This is the opportunity-scoring root — NOT filtered to places with padel courts. +-- Grain: (country_code, geoname_id) — stable GeoNames numeric ID per location. +-- +-- Unlike dim_cities (seeded from dim_venues / existing padel markets), dim_locations +-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully. +-- +-- Enriched with: +-- stg_income → country-level median income PPS +-- stg_padel_courts → padel venue count + nearest court distance (km) +-- stg_tennis_courts → tennis court count within 25km radius +-- +-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension). +-- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the +-- exact sphere distance is computed. + +MODEL ( + name foundation.dim_locations, + kind FULL, + cron '@daily', + grain (country_code, geoname_id) +); + +WITH +-- Base: all GeoNames locations with valid coordinates +locations AS ( + SELECT + geoname_id, + city_name AS location_name, + -- URL-safe location slug + LOWER(REGEXP_REPLACE(LOWER(city_name), '[^a-z0-9]+', '-')) AS location_slug, + country_code, + lat, + lon, + admin1_code, + admin2_code, + population, + population_year AS population_year, + ref_year + FROM staging.stg_population_geonames + WHERE lat IS NOT NULL AND lon IS NOT NULL +), +-- Country income (same source and pattern as dim_cities) +country_income AS ( + SELECT country_code, median_income_pps, ref_year AS income_year + FROM staging.stg_income + QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1 +), +-- Padel court lat/lon for distance and density calculations +padel_courts AS ( + SELECT lat, lon, country_code + FROM staging.stg_padel_courts + WHERE lat IS NOT NULL AND lon IS NOT NULL +), +-- Nearest padel court distance per location (bbox pre-filter → exact sphere distance) +nearest_padel AS ( + SELECT + l.geoname_id, + MIN( + ST_Distance_Sphere( + ST_Point(l.lon, l.lat), + ST_Point(p.lon, p.lat) + ) / 1000.0 + ) AS nearest_padel_court_km + FROM locations l + JOIN padel_courts p + -- ~55km bounding box pre-filter to limit cross-join before sphere calc + ON ABS(l.lat - p.lat) < 0.5 + AND ABS(l.lon - p.lon) < 0.5 + GROUP BY l.geoname_id +), +-- Padel venues within 5km of each location (counts as "local padel supply") +padel_local AS ( + SELECT + l.geoname_id, + COUNT(*) AS padel_venue_count + FROM locations l + JOIN padel_courts p + ON ABS(l.lat - p.lat) < 0.05 -- ~5km bbox pre-filter + AND ABS(l.lon - p.lon) < 0.05 + WHERE ST_Distance_Sphere( + ST_Point(l.lon, l.lat), + ST_Point(p.lon, p.lat) + ) / 1000.0 <= 5.0 + GROUP BY l.geoname_id +), +-- Tennis courts within 25km of each location (sports culture proxy) +tennis_nearby AS ( + SELECT + l.geoname_id, + COUNT(*) AS tennis_courts_within_25km + FROM locations l + JOIN staging.stg_tennis_courts t + ON ABS(l.lat - t.lat) < 0.23 -- ~25km bbox pre-filter + AND ABS(l.lon - t.lon) < 0.23 + WHERE ST_Distance_Sphere( + ST_Point(l.lon, l.lat), + ST_Point(t.lon, t.lat) + ) / 1000.0 <= 25.0 + GROUP BY l.geoname_id +) +SELECT + l.geoname_id, + l.country_code, + -- Human-readable country name (consistent with dim_cities) + CASE l.country_code + WHEN 'DE' THEN 'Germany' + WHEN 'ES' THEN 'Spain' + WHEN 'GB' THEN 'United Kingdom' + WHEN 'FR' THEN 'France' + WHEN 'IT' THEN 'Italy' + WHEN 'PT' THEN 'Portugal' + WHEN 'AT' THEN 'Austria' + WHEN 'CH' THEN 'Switzerland' + WHEN 'NL' THEN 'Netherlands' + WHEN 'BE' THEN 'Belgium' + WHEN 'SE' THEN 'Sweden' + WHEN 'NO' THEN 'Norway' + WHEN 'DK' THEN 'Denmark' + WHEN 'FI' THEN 'Finland' + WHEN 'US' THEN 'United States' + WHEN 'AR' THEN 'Argentina' + WHEN 'MX' THEN 'Mexico' + WHEN 'AE' THEN 'UAE' + WHEN 'AU' THEN 'Australia' + WHEN 'IE' THEN 'Ireland' + ELSE l.country_code + END AS country_name_en, + -- URL-safe country slug + LOWER(REGEXP_REPLACE( + CASE l.country_code + WHEN 'DE' THEN 'Germany' + WHEN 'ES' THEN 'Spain' + WHEN 'GB' THEN 'United Kingdom' + WHEN 'FR' THEN 'France' + WHEN 'IT' THEN 'Italy' + WHEN 'PT' THEN 'Portugal' + WHEN 'AT' THEN 'Austria' + WHEN 'CH' THEN 'Switzerland' + WHEN 'NL' THEN 'Netherlands' + WHEN 'BE' THEN 'Belgium' + WHEN 'SE' THEN 'Sweden' + WHEN 'NO' THEN 'Norway' + WHEN 'DK' THEN 'Denmark' + WHEN 'FI' THEN 'Finland' + WHEN 'US' THEN 'United States' + WHEN 'AR' THEN 'Argentina' + WHEN 'MX' THEN 'Mexico' + WHEN 'AE' THEN 'UAE' + WHEN 'AU' THEN 'Australia' + WHEN 'IE' THEN 'Ireland' + ELSE l.country_code + END, '[^a-zA-Z0-9]+', '-' + )) AS country_slug, + l.location_name, + l.location_slug, + l.lat, + l.lon, + l.admin1_code, + l.admin2_code, + l.population, + l.ref_year AS population_year, + ci.median_income_pps, + ci.income_year, + COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count, + -- Venues per 100K residents (NULL if population = 0) + CASE WHEN l.population > 0 + THEN ROUND(COALESCE(pl.padel_venue_count, 0)::DOUBLE / l.population * 100000, 2) + ELSE NULL + END AS padel_venues_per_100k, + np.nearest_padel_court_km, + COALESCE(tn.tennis_courts_within_25km, 0)::INTEGER AS tennis_courts_within_25km, + CURRENT_DATE AS refreshed_date +FROM locations l +LEFT JOIN country_income ci ON l.country_code = ci.country_code +LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id +LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id +LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id +-- Enforce grain: deduplicate if city slug collides within same country +QUALIFY ROW_NUMBER() OVER ( + PARTITION BY l.country_code, l.geoname_id + ORDER BY l.population DESC NULLS LAST +) = 1 diff --git a/transform/sqlmesh_padelnomics/models/serving/city_market_profile.sql b/transform/sqlmesh_padelnomics/models/serving/city_market_profile.sql index 338d18c..47ad2f7 100644 --- a/transform/sqlmesh_padelnomics/models/serving/city_market_profile.sql +++ b/transform/sqlmesh_padelnomics/models/serving/city_market_profile.sql @@ -1,11 +1,16 @@ -- One Big Table: per-city padel market intelligence. -- Consumed by: SEO article generation, planner city-select pre-fill, API endpoints. -- --- Market score v2 (0–100): --- 30 pts population — log-scaled to 1M+ city ceiling (was 40pts/500K) +-- Padelnomics Marktreife-Score v2 (0–100): +-- Answers "How mature/established is this padel market?" +-- Only computed for cities with ≥1 padel venue (padel_venue_count > 0). +-- For white-space opportunity scoring, see serving.location_opportunity_profile. +-- +-- 30 pts population — log-scaled to 1M+ city ceiling -- 25 pts income PPS — normalised to 200 ceiling (covers CH/NO/LU outliers) -- 30 pts demand — observed occupancy if available, else venue density -- 15 pts data quality — completeness discount, not a market signal +-- ×0.85 saturation — discount when venues_per_100k > 8 (oversupplied market) MODEL ( name serving.city_market_profile, @@ -73,7 +78,11 @@ scored AS ( -- Data quality (15 pts): measures completeness, not market quality. -- Reduced from 20pts — kept as confidence discount, not market signal. + 15.0 * data_confidence - , 1) AS market_score + , 1) + -- Saturation discount: venues_per_100k > 8 signals oversupply. + -- ~8/100K ≈ Spain-tier density; above this marginal return decreases. + * CASE WHEN venues_per_100k > 8 THEN 0.85 ELSE 1.0 END + AS market_score FROM base ) SELECT diff --git a/transform/sqlmesh_padelnomics/models/serving/location_opportunity_profile.sql b/transform/sqlmesh_padelnomics/models/serving/location_opportunity_profile.sql new file mode 100644 index 0000000..e848db0 --- /dev/null +++ b/transform/sqlmesh_padelnomics/models/serving/location_opportunity_profile.sql @@ -0,0 +1,69 @@ +-- Per-location padel investment opportunity intelligence. +-- Consumed by: Gemeinde-level pSEO pages, opportunity map, "top markets" lists. +-- +-- Padelnomics Marktpotenzial-Score (0–100): +-- Answers "Where should I build a padel court?" +-- Covers ALL GeoNames locations (pop ≥ 1K) — NOT filtered to existing padel markets. +-- Zero-court locations score highest on supply gap component (white space = opportunity). +-- +-- 25 pts addressable market — log-scaled population, ceiling 500K +-- (opportunity peaks in mid-size cities; megacities already served) +-- 20 pts economic power — country income PPS, normalised to 200 +-- 30 pts supply gap — INVERTED venue density; 0 courts/100K = full marks +-- 15 pts catchment gap — distance to nearest padel court (>30km = full marks) +-- 10 pts sports culture — tennis courts within 25km (≥10 = full marks) + +MODEL ( + name serving.location_opportunity_profile, + kind FULL, + cron '@daily', + grain (country_code, geoname_id) +); + +SELECT + l.geoname_id, + l.country_code, + l.country_name_en, + l.country_slug, + l.location_name, + l.location_slug, + l.lat, + l.lon, + l.admin1_code, + l.admin2_code, + l.population, + l.population_year, + l.median_income_pps, + l.income_year, + l.padel_venue_count, + l.padel_venues_per_100k, + l.nearest_padel_court_km, + l.tennis_courts_within_25km, + ROUND( + -- Addressable market (25 pts): log-scaled to 500K ceiling. + -- Lower ceiling than Marktreife (1M) — opportunity peaks in mid-size cities + -- that can support a court but aren't already saturated by large-city operators. + 25.0 * LEAST(1.0, LN(GREATEST(l.population, 1)) / LN(500000)) + + -- Economic power (20 pts): country-level income PPS normalised to 200. + -- Drives willingness-to-pay for court fees (€20-35/hr target range). + + 20.0 * LEAST(1.0, COALESCE(l.median_income_pps, 100) / 200.0) + + -- Supply gap (30 pts): INVERTED venue density. + -- 0 courts/100K = full 30 pts (white space); ≥4/100K = 0 pts (served market). + -- This is the key signal that separates Marktpotenzial from Marktreife. + + 30.0 * GREATEST(0.0, 1.0 - COALESCE(l.padel_venues_per_100k, 0) / 4.0) + + -- Catchment gap (15 pts): distance to nearest existing padel court. + -- >30km = full 15 pts (underserved catchment area). + -- NULL = no courts found anywhere (rare edge case) → neutral 0.5. + + 15.0 * COALESCE(LEAST(1.0, l.nearest_padel_court_km / 30.0), 0.5) + + -- Sports culture proxy (10 pts): tennis courts within 25km. + -- ≥10 courts = full 10 pts (proven racket sport market = faster padel adoption). + -- 0 courts = 0 pts. Many new padel courts open inside existing tennis clubs. + + 10.0 * LEAST(1.0, l.tennis_courts_within_25km / 10.0) + , 1) AS opportunity_score, + CURRENT_DATE AS refreshed_date +FROM foundation.dim_locations l +ORDER BY opportunity_score DESC