merge: H3 catchment index for Marktpotenzial-Score v3

2026-03-06 10:19:51 +01:00
parent 3ad2885c84 4e4ff61699
commit dec4f07fbb
3 changed files with 67 additions and 17 deletions
--- a/transform/sqlmesh_padelnomics/models/serving/location_opportunity_profile.sql
+++ b/transform/sqlmesh_padelnomics/models/serving/location_opportunity_profile.sql
@@ -1,21 +1,30 @@
 -- Per-location padel investment opportunity intelligence.
 -- Consumed by: Gemeinde-level pSEO pages, opportunity map, "top markets" lists.
 --
-- Padelnomics Marktpotenzial-Score v2 (0–100):
+-- Padelnomics Marktpotenzial-Score v3 (0–100):
 -- Answers "Where should I build a padel court?"
 -- Covers ALL GeoNames locations (pop ≥ 1K) — NOT filtered to existing padel markets.
 -- Zero-court locations score highest on supply gap component (white space = opportunity).
 --
--   25 pts  addressable market — log-scaled population, ceiling 500K
--           (opportunity peaks in mid-size cities; megacities already served)
+-- H3 catchment methodology (v3):
+--   Addressable market and supply gap now use a regional catchment lens rather than
+--   the location's own population/court count. Each location is assigned an H3 cell
+--   at resolution 4; catchment = cell + 6 neighbours (k_ring=1). Intended reach is a
+--   ~15-18km radius (~462km²), matching realistic driving distance. NOTE(review): H3's
+--   published table puts res 4 at ~1,770km² per cell, so verify resolution vs. radius.
+--   Population and court counts are aggregated per cell (hex_stats CTE), then summed
+--   over the 7-cell ring (catchment CTE) to avoid scanning all 140K locations per location.
+--
+--   25 pts  addressable market — log-scaled catchment population, ceiling 500K
+--           (opportunity peaks in mid-size catchments; megacities already served)
 --   20 pts  economic power     — country income PPS, normalised to 35,000
 --                               EU PPS values range 18k-37k; /35k gives real spread.
 --                               DE ≈ 13.2pts, ES ≈ 10.7pts, SE ≈ 14.3pts.
 --                               Previously /200 caused all countries to saturate at 20/20.
--   30 pts  supply gap         — INVERTED venue density; 0 courts/100K = full marks.
--                               Ceiling raised to 8/100K (was 4) for a gentler gradient
--                               and to account for ~87% data undercount vs FIP totals.
--                               Linear: GREATEST(0, 1 - density/8)
+--   30 pts  supply gap         — INVERTED catchment venue density; 0 courts/100K = full marks.
+--                               Ceiling 8/100K for a gentler gradient and to account for
+--                               ~87% data undercount vs FIP totals.
+--                               Linear: GREATEST(0, 1 - catchment_density/8)
 --   15 pts  catchment gap      — distance to nearest padel court.
 --                               DuckDB LEAST ignores NULLs: LEAST(1.0, NULL/30) = 1.0,
 --                               so NULL nearest_km = full marks (no court in bounding box
@@ -31,6 +40,30 @@ MODEL (
  grain (country_code, geoname_id)
 );

+WITH
+-- Per-cell totals (one row per h3_cell_res4) so the ring join below probes cells, not
+-- locations. NOTE(review): if padel_venue_count is a radius count per location, this
+-- SUM can count one venue several times — confirm against foundation.dim_locations.
+hex_stats AS (
+  SELECT
+    loc.h3_cell_res4,
+    SUM(loc.population)        AS hex_population,
+    SUM(loc.padel_venue_count) AS hex_padel_courts
+  FROM foundation.dim_locations AS loc
+  GROUP BY loc.h3_cell_res4
+),
+-- Per-location catchment: hex_stats totals summed over the location's own H3 cell and
+-- its ring-1 neighbours (h3_grid_disk(cell, 1)); cells missing from hex_stats add nothing.
+catchment AS (
+  SELECT
+    loc.geoname_id,
+    SUM(hs.hex_population)   AS catchment_population,
+    SUM(hs.hex_padel_courts) AS catchment_padel_courts
+  FROM foundation.dim_locations AS loc
+  CROSS JOIN LATERAL (SELECT UNNEST(h3_grid_disk(loc.h3_cell_res4, 1)) AS cell) AS ring
+  INNER JOIN hex_stats AS hs ON hs.h3_cell_res4 = ring.cell
+  GROUP BY loc.geoname_id
+)
 SELECT
  l.geoname_id,
  l.country_code,
@@ -50,11 +83,21 @@ SELECT
  l.padel_venues_per_100k,
  l.nearest_padel_court_km,
  l.tennis_courts_within_25km,
+  -- Catchment metrics (H3 res-4 cell + 6 neighbours, ~15-18km radius)
+  COALESCE(c.catchment_population, l.population)::BIGINT               AS catchment_population,
+  COALESCE(c.catchment_padel_courts, l.padel_venue_count)::INTEGER     AS catchment_padel_courts,
+  CASE WHEN COALESCE(c.catchment_population, l.population) > 0
+    THEN ROUND(
+      COALESCE(c.catchment_padel_courts, l.padel_venue_count)::DOUBLE
+      / COALESCE(c.catchment_population, l.population) * 100000, 2)
+    ELSE NULL
+  END                                                                    AS catchment_venues_per_100k,
  ROUND(
-    -- Addressable market (25 pts): log-scaled to 500K ceiling.
-    -- Lower ceiling than Marktreife (1M) — opportunity peaks in mid-size cities
-    -- that can support a court but aren't already saturated by large-city operators.
-    25.0 * LEAST(1.0, LN(GREATEST(l.population, 1)) / LN(500000))
+    -- Addressable market (25 pts): log-scaled catchment population, ceiling 500K.
+    -- v3: uses H3 catchment population (cell + 6 neighbours, ~15-18km radius) instead
+    -- of local city population, so mid-size cities surrounded by dense Gemeinden score
+    -- correctly (e.g. Oldenburg pulls in Ammerland, Wesermarsch, etc.).
+    25.0 * LEAST(1.0, LN(GREATEST(COALESCE(c.catchment_population, l.population), 1)) / LN(500000))

    -- Economic power (20 pts): country-level income PPS normalised to 35,000.
    -- Drives willingness-to-pay for court fees (€20-35/hr target range).
@@ -64,12 +107,16 @@ SELECT
    -- Default 15000 for missing data = reasonable developing-market assumption (~0.43).
    + 20.0 * LEAST(1.0, COALESCE(l.median_income_pps, 15000) / 35000.0)

-    -- Supply gap (30 pts): INVERTED venue density.
-    -- 0 courts/100K = full 30 pts (white space); ≥8/100K = 0 pts (served market).
-    -- Ceiling raised from 4→8/100K for a gentler gradient and to account for data
-    -- undercount (~87% of real courts not in our data).
-    -- This is the key signal that separates Marktpotenzial from Marktreife.
-    + 30.0 * GREATEST(0.0, 1.0 - COALESCE(l.padel_venues_per_100k, 0) / 8.0)
+    -- Supply gap (30 pts): INVERTED catchment venue density.
+    -- v3: uses catchment courts / catchment population instead of local 5km count / city pop.
+    -- 0 courts/100K across the ~15-18km ring = full 30 pts (genuine white space).
+    -- ≥8/100K = 0 pts (well-served regional market).
+    + 30.0 * GREATEST(0.0, 1.0 - COALESCE(
+        CASE WHEN COALESCE(c.catchment_population, l.population) > 0
+          THEN COALESCE(c.catchment_padel_courts, l.padel_venue_count)::DOUBLE
+               / COALESCE(c.catchment_population, l.population) * 100000
+          ELSE 0.0
+        END, 0.0) / 8.0)

    -- Catchment gap (15 pts): distance to nearest existing padel court.
    -- >30km = full 15 pts (underserved catchment area).
@@ -83,4 +130,5 @@ SELECT
  , 1)                                                                   AS opportunity_score,
  CURRENT_DATE                                                           AS refreshed_date
 FROM foundation.dim_locations l
+LEFT JOIN catchment c ON c.geoname_id = l.geoname_id
 ORDER BY opportunity_score DESC