Files
padelnomics/transform/sqlmesh_padelnomics/models/serving/location_profiles.sql
Deeman 118c2c0fc7 feat(scoring): Opportunity Score v4 → v5 — fix correlated components
- Merge supply gap (30pts) + catchment gap (15pts) → supply deficit (35pts, GREATEST)
  Eliminates ~80% correlated double-count on a single signal.
- Add sports culture signal (10pts): tennis court density as racquet-sport adoption proxy.
  Ceiling 50 courts/25km. Harmless when tennis data is zero (contributes 0).
- Add construction affordability (5pts): income relative to PLI construction costs.
  Joins dim_countries.pli_construction. High income + low build cost = high score.
- Reduce economic power from 20 → 15pts to make room.

New weights: addressable market 25, economic power 15, supply deficit 35,
sports culture 10, construction affordability 5, market validation 10.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 15:30:04 +01:00

291 lines
12 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- Unified location profile: both scores at (country_code, geoname_id) grain.
-- Base: dim_locations (ALL GeoNames locations, pop ≥ 1K, ~140K rows).
-- Enriched with dim_cities (city_slug, city_name, exact venue count) and
-- venue_pricing_benchmarks (Playtomic pricing/occupancy).
--
-- Two scores per location:
--
-- Padelnomics Market Score (Marktreife-Score v4, 0100):
-- "How mature/established is this padel market?"
-- Only meaningful for locations matched to a dim_cities row (city_slug IS NOT NULL)
-- with padel venues. 0 for all other locations.
--
-- v4 changes: lower count gate (5→3), lower density ceiling (LN(21)→LN(11)),
-- better demand fallback (0.4→0.65 with 0.3 floor), economic context discrimination (200→25K).
--
-- 40 pts supply development — log-scaled density (LN ceiling 10/100k) × count gate (3)
-- 25 pts demand evidence — occupancy when available; 65% density proxy + 0.3 floor otherwise
-- 15 pts addressable market — log-scaled population, ceiling 1M
-- 10 pts economic context — income PPS normalised to 25,000 ceiling
-- 10 pts data quality — completeness discount
--
-- Padelnomics Opportunity Score (Marktpotenzial-Score v5, 0100):
-- "Where should I build a padel court?"
-- Computed for ALL locations — zero-court locations score highest on supply deficit.
-- H3 catchment methodology: addressable market and supply deficit use a regional
-- H3 catchment (res-5 cell + 6 neighbours, ~24km radius).
--
-- v5 changes: merge supply gap + catchment gap → single supply deficit (35 pts),
-- add sports culture proxy (10 pts, tennis density), add construction affordability (5 pts),
-- reduce economic power from 20 → 15 pts.
--
-- 25 pts addressable market — log-scaled catchment population, ceiling 500K
-- 15 pts economic power — income PPS, normalised to 35,000
-- 35 pts supply deficit — max(density gap, distance gap); eliminates double-count
-- 10 pts sports culture — tennis court density as racquet-sport adoption proxy
-- 5 pts construction affordability — income relative to construction costs (PLI)
-- 10 pts market validation — country-level avg market maturity (from market_scored CTE)
--
-- Consumers query directly with WHERE filters:
-- cities API: WHERE country_slug = ? AND city_slug IS NOT NULL
-- opportunity API: WHERE country_slug = ? AND opportunity_score > 0
-- planner_defaults: WHERE city_slug IS NOT NULL
-- pseo_*: WHERE city_slug IS NOT NULL AND city_padel_venue_count > 0
MODEL (
name serving.location_profiles,
kind FULL,
cron '@daily',
grain (country_code, geoname_id)
);
WITH
-- All locations from dim_locations (superset)
base AS (
SELECT
l.geoname_id,
l.country_code,
l.country_name_en,
l.country_slug,
l.location_name,
l.location_slug,
l.lat,
l.lon,
l.admin1_code,
l.admin2_code,
l.population,
l.population_year,
l.median_income_pps,
l.income_year,
l.padel_venue_count,
l.padel_venues_per_100k,
l.nearest_padel_court_km,
l.tennis_courts_within_25km,
l.h3_cell_res5
FROM foundation.dim_locations l
),
-- Aggregate population and court counts per H3 cell (res 5, ~8.5km edge).
-- Grouping by cell first (~50-80K distinct cells vs 140K locations) keeps the
-- subsequent lateral join small.
hex_stats AS (
SELECT
h3_cell_res5,
SUM(population) AS hex_population,
SUM(padel_venue_count) AS hex_padel_courts
FROM foundation.dim_locations
GROUP BY h3_cell_res5
),
-- For each location, sum hex_stats across the cell + 6 neighbours (k_ring=1).
-- Effective catchment: ~24km radius — realistic driving distance.
catchment AS (
SELECT
l.geoname_id,
SUM(hs.hex_population) AS catchment_population,
SUM(hs.hex_padel_courts) AS catchment_padel_courts
FROM base l,
LATERAL (SELECT UNNEST(h3_grid_disk(l.h3_cell_res5, 1)) AS cell) ring
JOIN hex_stats hs ON hs.h3_cell_res5 = ring.cell
GROUP BY l.geoname_id
),
-- Match dim_cities via (country_code, geoname_id) to get city_slug + exact venue count.
-- QUALIFY handles rare multi-city-per-geoname collisions (keep highest venue count).
city_match AS (
SELECT
c.country_code,
c.geoname_id,
c.city_slug,
c.city_name,
c.padel_venue_count AS city_padel_venue_count
FROM foundation.dim_cities c
WHERE c.geoname_id IS NOT NULL
QUALIFY ROW_NUMBER() OVER (
PARTITION BY c.country_code, c.geoname_id
ORDER BY c.padel_venue_count DESC
) = 1
),
-- Pricing / occupancy from Playtomic (via city_slug) + H3 catchment + country PLI
with_pricing AS (
SELECT
b.*,
cm.city_slug,
cm.city_name,
cm.city_padel_venue_count,
vpb.median_hourly_rate,
vpb.median_peak_rate,
vpb.median_offpeak_rate,
vpb.median_occupancy_rate,
vpb.median_daily_revenue_per_venue,
vpb.price_currency,
dc.pli_construction,
COALESCE(ct.catchment_population, b.population)::BIGINT AS catchment_population,
COALESCE(ct.catchment_padel_courts, b.padel_venue_count)::INTEGER AS catchment_padel_courts
FROM base b
LEFT JOIN city_match cm
ON b.country_code = cm.country_code
AND b.geoname_id = cm.geoname_id
LEFT JOIN serving.venue_pricing_benchmarks vpb
ON cm.country_code = vpb.country_code
AND cm.city_slug = vpb.city_slug
LEFT JOIN catchment ct
ON b.geoname_id = ct.geoname_id
LEFT JOIN foundation.dim_countries dc
ON b.country_code = dc.country_code
),
-- Step 1: market score only — needed first so we can aggregate country averages.
market_scored AS (
SELECT *,
-- City-level venue density (from dim_cities exact count, not dim_locations spatial 5km)
CASE WHEN population > 0
THEN ROUND(COALESCE(city_padel_venue_count, 0)::DOUBLE / population * 100000, 2)
ELSE NULL
END AS city_venues_per_100k,
-- Data confidence (for market_score)
CASE
WHEN population > 0 AND COALESCE(city_padel_venue_count, 0) > 0 THEN 1.0
WHEN population > 0 OR COALESCE(city_padel_venue_count, 0) > 0 THEN 0.5
ELSE 0.0
END AS data_confidence,
-- ── Market Score (Marktreife-Score v4) ──────────────────────────────────
-- 0 when no city match or no venues (city_padel_venue_count NULL or 0)
CASE WHEN COALESCE(city_padel_venue_count, 0) > 0 THEN
ROUND(
-- Supply development (40 pts)
-- density ceiling 10/100k (LN(11)), count gate 3 venues
40.0 * LEAST(1.0, LN(
COALESCE(
CASE WHEN population > 0
THEN COALESCE(city_padel_venue_count, 0)::DOUBLE / population * 100000
ELSE 0 END
, 0) + 1) / LN(11))
* LEAST(1.0, COALESCE(city_padel_venue_count, 0) / 3.0)
-- Demand evidence (25 pts)
-- with occupancy: scale to 65% target. Without: 65% of supply proxy + 0.3 floor
-- (existence of venues IS evidence of demand)
+ 25.0 * CASE
WHEN median_occupancy_rate IS NOT NULL
THEN LEAST(1.0, median_occupancy_rate / 0.65)
ELSE GREATEST(0.3, 0.65 * LEAST(1.0, LN(
COALESCE(
CASE WHEN population > 0
THEN COALESCE(city_padel_venue_count, 0)::DOUBLE / population * 100000
ELSE 0 END
, 0) + 1) / LN(11))
* LEAST(1.0, COALESCE(city_padel_venue_count, 0) / 3.0))
END
-- Addressable market (15 pts)
+ 15.0 * LEAST(1.0, LN(GREATEST(population, 1)) / LN(1000000))
-- Economic context (10 pts)
-- ceiling 25,000 PPS discriminates between wealthy and poorer markets
+ 10.0 * LEAST(1.0, COALESCE(median_income_pps, 15000) / 25000.0)
-- Data quality (10 pts)
+ 10.0 * CASE
WHEN population > 0 AND COALESCE(city_padel_venue_count, 0) > 0 THEN 1.0
WHEN population > 0 OR COALESCE(city_padel_venue_count, 0) > 0 THEN 0.5
ELSE 0.0
END
, 1)
ELSE 0
END AS market_score
FROM with_pricing
),
-- Step 2: country-level avg market maturity — used as market validation signal (10 pts).
-- Filter to market_score > 0 (cities with padel courts only) so zero-court locations
-- don't dilute the country signal. ES proven demand → ~60, SE struggling → ~35.
country_market AS (
SELECT
country_code,
ROUND(AVG(market_score), 1) AS country_avg_market_score
FROM market_scored
WHERE market_score > 0
GROUP BY country_code
),
-- Step 3: add opportunity_score using country market validation signal.
scored AS (
SELECT ms.*,
-- ── Opportunity Score (Marktpotenzial-Score v5, H3 catchment) ──────────
ROUND(
-- Addressable market (25 pts): log-scaled catchment population, ceiling 500K
25.0 * LEAST(1.0, LN(GREATEST(catchment_population, 1)) / LN(500000))
-- Economic power (15 pts): income PPS normalised to 35,000
+ 15.0 * LEAST(1.0, COALESCE(median_income_pps, 15000) / 35000.0)
-- Supply deficit (35 pts): max of density gap and distance gap.
-- Merges old supply gap (30) + catchment gap (15) which were ~80% correlated.
+ 35.0 * GREATEST(
-- density-based gap (H3 catchment): 0 courts = 1.0, 8/100k = 0.0
GREATEST(0.0, 1.0 - COALESCE(
CASE WHEN catchment_population > 0
THEN GREATEST(catchment_padel_courts, COALESCE(city_padel_venue_count, 0))::DOUBLE / catchment_population * 100000
ELSE 0.0
END, 0.0) / 8.0),
-- distance-based gap: 30km+ = 1.0, 0km = 0.0; NULL = 0.5
COALESCE(LEAST(1.0, nearest_padel_court_km / 30.0), 0.5)
)
-- Sports culture (10 pts): tennis density as racquet-sport adoption proxy.
-- Ceiling 50 courts within 25km. Harmless when tennis data is zero (contributes 0).
+ 10.0 * LEAST(1.0, COALESCE(tennis_courts_within_25km, 0) / 50.0)
-- Construction affordability (5 pts): income purchasing power relative to build costs.
-- PLI construction is EU27=100 index. High income + low construction cost = high score.
+ 5.0 * LEAST(1.0,
COALESCE(median_income_pps, 15000) / 35000.0
/ GREATEST(0.5, COALESCE(pli_construction, 100.0) / 100.0)
)
-- Market validation (10 pts): country-level avg market maturity.
-- ES (~70/100): proven demand → ~7 pts. SE (~35/100): emerging → ~3.5 pts.
-- NULL (no courts in country yet): 0.5 neutral → 5 pts (untested, not penalised).
+ 10.0 * COALESCE(cm.country_avg_market_score / 100.0, 0.5)
, 1) AS opportunity_score
FROM market_scored ms
LEFT JOIN country_market cm ON ms.country_code = cm.country_code
)
SELECT
s.geoname_id,
s.country_code,
s.country_name_en,
s.country_slug,
s.location_name,
s.location_slug,
s.city_slug,
s.city_name,
s.lat,
s.lon,
s.admin1_code,
s.admin2_code,
s.population,
s.population_year,
s.median_income_pps,
s.income_year,
s.padel_venue_count,
s.padel_venues_per_100k,
s.nearest_padel_court_km,
s.tennis_courts_within_25km,
s.city_padel_venue_count,
s.city_venues_per_100k,
s.data_confidence,
s.catchment_population,
s.catchment_padel_courts,
CASE WHEN s.catchment_population > 0
THEN ROUND(s.catchment_padel_courts::DOUBLE / s.catchment_population * 100000, 2)
ELSE NULL
END AS catchment_venues_per_100k,
s.market_score,
s.opportunity_score,
s.median_hourly_rate,
s.median_peak_rate,
s.median_offpeak_rate,
s.median_occupancy_rate,
s.median_daily_revenue_per_venue,
s.price_currency,
CURRENT_DATE AS refreshed_date
FROM scored s
ORDER BY s.market_score DESC, s.opportunity_score DESC