diff --git a/transform/sqlmesh_padelnomics/config.yaml b/transform/sqlmesh_padelnomics/config.yaml
index 7e5da9a..1d2b6e8 100644
--- a/transform/sqlmesh_padelnomics/config.yaml
+++ b/transform/sqlmesh_padelnomics/config.yaml
@@ -6,6 +6,9 @@ gateways:
         local: "{{ env_var('DUCKDB_PATH', 'data/lakehouse.duckdb') }}"
       extensions:
         - spatial
+        # NOTE(review): h3 is a DuckDB community extension; a plain install may fail on a
+        # fresh environment unless it is installed from the community repository — confirm.
+        - h3
 
 default_gateway: duckdb
 
diff --git a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql
index 38a54a7..015bb28 100644
--- a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql
+++ b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql
@@ -215,6 +215,7 @@ SELECT
     l.location_slug,
     l.lat,
     l.lon,
+    h3_latlng_to_cell(l.lat, l.lon, 4) AS h3_cell_res4,
     l.admin1_code,
     l.admin2_code,
     l.population,
diff --git a/transform/sqlmesh_padelnomics/models/serving/location_opportunity_profile.sql b/transform/sqlmesh_padelnomics/models/serving/location_opportunity_profile.sql
index b746cab..bbb5b41 100644
--- a/transform/sqlmesh_padelnomics/models/serving/location_opportunity_profile.sql
+++ b/transform/sqlmesh_padelnomics/models/serving/location_opportunity_profile.sql
@@ -1,21 +1,35 @@
 -- Per-location padel investment opportunity intelligence.
 -- Consumed by: Gemeinde-level pSEO pages, opportunity map, "top markets" lists.
 --
--- Padelnomics Marktpotenzial-Score v2 (0–100):
+-- Padelnomics Marktpotenzial-Score v3 (0–100):
 -- Answers "Where should I build a padel court?"
 -- Covers ALL GeoNames locations (pop ≥ 1K) — NOT filtered to existing padel markets.
 -- Zero-court locations score highest on supply gap component (white space = opportunity).
 --
--- 25 pts addressable market — log-scaled population, ceiling 500K
--- (opportunity peaks in mid-size cities; megacities already served)
+-- H3 catchment methodology (v3):
+-- Addressable market and supply gap now use a regional catchment lens rather than
+-- the location's own population/court count. Each location is assigned an H3 cell
+-- at resolution 4 (average cell ≈ 1,770 km², ≈ 45 km centre-to-centre).
+-- Catchment = cell + 6 neighbours (h3_grid_disk with k = 1): ≈ 12,400 km²,
+-- i.e. an equivalent radius of ≈ 60 km.
+-- NOTE(review): the stated design target is a ~15-18 km driving-distance catchment,
+-- which res 4 does not deliver. Res 6 with k = 2 (≈ 690 km², ≈ 15 km) or res 5 with
+-- k = 1 (≈ 1,770 km², ≈ 24 km) would be closer — confirm before changing, because
+-- it shifts every score and invalidates the h3_cell_res4 column name.
+-- Population and court counts are first aggregated per H3 cell (hex_stats CTE), then
+-- summed across the 7-cell ring (catchment CTE) to avoid scanning all 140K locations
+-- per location.
+--
+-- 25 pts addressable market — log-scaled catchment population, ceiling 500K
+-- (opportunity peaks in mid-size catchments; megacities already served)
 -- 20 pts economic power — country income PPS, normalised to 35,000
 -- EU PPS values range 18k-37k; /35k gives real spread.
 -- DE ≈ 13.2pts, ES ≈ 10.7pts, SE ≈ 14.3pts.
 -- Previously /200 caused all countries to saturate at 20/20.
--- 30 pts supply gap — INVERTED venue density; 0 courts/100K = full marks.
--- Ceiling raised to 8/100K (was 4) for a gentler gradient
--- and to account for ~87% data undercount vs FIP totals.
--- Linear: GREATEST(0, 1 - density/8)
+-- 30 pts supply gap — INVERTED catchment venue density; 0 courts/100K = full marks.
+-- Ceiling 8/100K for a gentler gradient and to account for
+-- ~87% data undercount vs FIP totals.
+-- Linear: GREATEST(0, 1 - catchment_density/8)
 -- 15 pts catchment gap — distance to nearest padel court.
 -- DuckDB LEAST ignores NULLs: LEAST(1.0, NULL/30) = 1.0,
 -- so NULL nearest_km = full marks (no court in bounding box
@@ -31,6 +45,44 @@ MODEL (
     grain (country_code, geoname_id)
 );
 
+WITH
+-- Aggregate population and venue counts per occupied H3 cell (res 4, average cell
+-- ≈ 1,770 km²). Grouping by cell first (~30-50K distinct cells vs 140K locations)
+-- keeps the subsequent ring join small.
+-- NOTE(review): the supply-gap comment below calls padel_venue_count a "local 5km
+-- count"; if that is a radius count, summing it over nearby locations counts one
+-- venue several times — confirm against dim_locations.
+hex_stats AS (
+    SELECT
+        h3_cell_res4,
+        SUM(population) AS hex_population,
+        -- SUM of only-NULL inputs is NULL; report such cells as 0 venues.
+        COALESCE(SUM(padel_venue_count), 0) AS hex_padel_courts
+    FROM foundation.dim_locations
+    -- A NULL cell can never equal a ring cell, so skip those rows up front.
+    WHERE h3_cell_res4 IS NOT NULL
+    GROUP BY h3_cell_res4
+),
+-- One row per (location, ring cell): the location's own cell plus its neighbours
+-- (h3_grid_disk with k = 1).
+location_ring AS (
+    SELECT
+        geoname_id,
+        UNNEST(h3_grid_disk(h3_cell_res4, 1)) AS ring_cell
+    FROM foundation.dim_locations
+),
+-- For each location, sum hex_stats across its ring (≈ 12,400 km² at res 4).
+-- Ring cells that contain no location have no hex_stats row and add nothing.
+catchment AS (
+    SELECT
+        r.geoname_id,
+        SUM(hs.hex_population) AS catchment_population,
+        SUM(hs.hex_padel_courts) AS catchment_padel_courts
+    FROM location_ring AS r
+    INNER JOIN hex_stats AS hs
+        ON hs.h3_cell_res4 = r.ring_cell
+    GROUP BY r.geoname_id
+)
 SELECT
     l.geoname_id,
     l.country_code,
@@ -50,11 +102,22 @@ SELECT
     l.padel_venues_per_100k,
     l.nearest_padel_court_km,
     l.tennis_courts_within_25km,
+    -- Catchment metrics (H3 res-4 cell + 6 neighbours). Without a catchment row the
+    -- location's own population / venue count is used instead.
+    COALESCE(c.catchment_population, l.population)::BIGINT AS catchment_population,
+    COALESCE(c.catchment_padel_courts, l.padel_venue_count)::INTEGER AS catchment_padel_courts,
+    CASE WHEN COALESCE(c.catchment_population, l.population) > 0
+        THEN ROUND(
+            COALESCE(c.catchment_padel_courts, l.padel_venue_count)::DOUBLE
+            / COALESCE(c.catchment_population, l.population) * 100000, 2)
+        ELSE NULL
+    END AS catchment_venues_per_100k,
     ROUND(
-        -- Addressable market (25 pts): log-scaled to 500K ceiling.
-        -- Lower ceiling than Marktreife (1M) — opportunity peaks in mid-size cities
-        -- that can support a court but aren't already saturated by large-city operators.
-        25.0 * LEAST(1.0, LN(GREATEST(l.population, 1)) / LN(500000))
+        -- Addressable market (25 pts): log-scaled catchment population, ceiling 500K.
+        -- v3: uses H3 catchment population (cell + 6 neighbours) instead of local city
+        -- population, so mid-size cities surrounded by dense Gemeinden score
+        -- correctly (e.g. Oldenburg pulls in Ammerland, Wesermarsch, etc.).
+        25.0 * LEAST(1.0, LN(GREATEST(COALESCE(c.catchment_population, l.population), 1)) / LN(500000))
 
         -- Economic power (20 pts): country-level income PPS normalised to 35,000.
         -- Drives willingness-to-pay for court fees (€20-35/hr target range).
@@ -64,12 +127,17 @@ SELECT
         -- Default 15000 for missing data = reasonable developing-market assumption (~0.43).
         + 20.0 * LEAST(1.0, COALESCE(l.median_income_pps, 15000) / 35000.0)
 
-        -- Supply gap (30 pts): INVERTED venue density.
-        -- 0 courts/100K = full 30 pts (white space); ≥8/100K = 0 pts (served market).
-        -- Ceiling raised from 4→8/100K for a gentler gradient and to account for data
-        -- undercount (~87% of real courts not in our data).
-        -- This is the key signal that separates Marktpotenzial from Marktreife.
-        + 30.0 * GREATEST(0.0, 1.0 - COALESCE(l.padel_venues_per_100k, 0) / 8.0)
+        -- Supply gap (30 pts): INVERTED catchment venue density.
+        -- v3: uses catchment courts / catchment population instead of local 5km count / city pop.
+        -- 0 courts/100K across the 7-cell ring = full 30 pts (genuine white space).
+        -- ≥8/100K = 0 pts (well-served regional market).
+        -- Missing or non-positive population is treated as density 0 (full marks).
+        + 30.0 * GREATEST(0.0, 1.0 - COALESCE(
+            CASE WHEN COALESCE(c.catchment_population, l.population) > 0
+                THEN COALESCE(c.catchment_padel_courts, l.padel_venue_count)::DOUBLE
+                    / COALESCE(c.catchment_population, l.population) * 100000
+                ELSE 0.0
+            END, 0.0) / 8.0)
 
         -- Catchment gap (15 pts): distance to nearest existing padel court.
         -- >30km = full 15 pts (underserved catchment area).
@@ -83,4 +151,5 @@ SELECT
     , 1) AS opportunity_score,
     CURRENT_DATE AS refreshed_date
 FROM foundation.dim_locations l
+LEFT JOIN catchment c ON c.geoname_id = l.geoname_id
 ORDER BY opportunity_score DESC