Files
padelnomics/transform/sqlmesh_padelnomics/models/serving/location_profiles.sql
Deeman ff6401254a feat(score): Opportunity Score v8 — better spread/discrimination
Reweight: addressable market 20→15, economic power 15→10, supply deficit 40→50.
Supply deficit existence dampener (country_venues/50, floor 0.1): zero-venue
countries drop from ~80 to ~17. Steeper addressable market curve (LN/500K →
SQRT/1M). NULL distance gap → 0.0 (was 0.5). Added country_percentile output
column (PERCENT_RANK within country, 0–100).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 22:14:30 +01:00

320 lines
13 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- Unified location profile: both scores at (country_code, geoname_id) grain.
-- Base: dim_locations (ALL GeoNames locations, pop ≥ 1K, ~140K rows).
-- Enriched with dim_cities (city_slug, city_name, exact venue count) and
-- venue_pricing_benchmarks (Playtomic pricing/occupancy).
--
-- Two scores per location:
--
-- Padelnomics Market Score (Marktreife-Score v4, 0100):
-- "How mature/established is this padel market?"
-- Only meaningful for locations matched to a dim_cities row (city_slug IS NOT NULL)
-- with padel venues. 0 for all other locations.
--
-- v4 changes: lower count gate (5→3), lower density ceiling (LN(21)→LN(11)),
-- better demand fallback (0.4→0.65 with 0.3 floor), economic context discrimination (200→25K).
--
-- 40 pts supply development — log-scaled density (LN ceiling 10/100k) × count gate (3)
-- 25 pts demand evidence — occupancy when available; 65% density proxy + 0.3 floor otherwise
-- 15 pts addressable market — log-scaled population, ceiling 1M
-- 10 pts economic context — income PPS normalised to 25,000 ceiling
-- 10 pts data quality — completeness discount
--
-- Padelnomics Opportunity Score (Marktpotenzial-Score v8, 0100):
-- "Where should I build a padel court?"
-- Computed for ALL locations — zero-court locations score highest on supply deficit.
-- H3 catchment methodology: addressable market and supply deficit use a regional
-- H3 catchment (res-5 cell + 6 neighbours, ~24km radius).
--
-- v8 changes: better spread/discrimination.
-- - Reweight: addressable market 20→15, economic power 15→10, supply deficit 40→50.
-- - Supply deficit existence dampener: country_venues/50 factor (0.11.0).
-- Zero-venue countries get max 5 pts supply deficit (was 50).
-- - Steeper addressable market curve: LN/500K → SQRT/1M.
-- - NULL distance gap → 0.0 (was 0.5). Unknown = assume nearby.
-- - Added country_percentile output column (PERCENT_RANK within country).
--
-- 15 pts addressable market — sqrt-scaled catchment population, ceiling 1M
-- 10 pts economic power — income PPS, normalised to 35,000
-- 50 pts supply deficit — max(density gap, distance gap) × existence dampener
-- 10 pts sports culture — tennis court density as racquet-sport adoption proxy
-- 5 pts construction affordability — income relative to construction costs (PLI)
-- 10 pts market headroom — inverse country-level avg market maturity
--
-- Consumers query directly with WHERE filters:
-- cities API: WHERE country_slug = ? AND city_slug IS NOT NULL
-- opportunity API: WHERE country_slug = ? AND opportunity_score > 0
-- planner_defaults: WHERE city_slug IS NOT NULL
-- pseo_*: WHERE city_slug IS NOT NULL AND city_padel_venue_count > 0
MODEL (
name serving.location_profiles,
kind FULL,
cron '@daily',
grain (country_code, geoname_id)
);
WITH
-- All locations from dim_locations (superset)
base AS (
SELECT
l.geoname_id,
l.country_code,
l.country_name_en,
l.country_slug,
l.location_name,
l.location_slug,
l.lat,
l.lon,
l.admin1_code,
l.admin2_code,
l.population,
l.population_year,
l.median_income_pps,
l.income_year,
l.padel_venue_count,
l.padel_venues_per_100k,
l.nearest_padel_court_km,
l.tennis_courts_within_25km,
l.h3_cell_res5
FROM foundation.dim_locations l
),
-- Aggregate population and court counts per H3 cell (res 5, ~8.5km edge).
-- Grouping by cell first (~50-80K distinct cells vs 140K locations) keeps the
-- subsequent lateral join small.
hex_stats AS (
SELECT
h3_cell_res5,
SUM(population) AS hex_population,
SUM(padel_venue_count) AS hex_padel_courts
FROM foundation.dim_locations
GROUP BY h3_cell_res5
),
-- For each location, sum hex_stats across the cell + 6 neighbours (k_ring=1).
-- Effective catchment: ~24km radius — realistic driving distance.
catchment AS (
SELECT
l.geoname_id,
SUM(hs.hex_population) AS catchment_population,
SUM(hs.hex_padel_courts) AS catchment_padel_courts
FROM base l,
LATERAL (SELECT UNNEST(h3_grid_disk(l.h3_cell_res5, 1)) AS cell) ring
JOIN hex_stats hs ON hs.h3_cell_res5 = ring.cell
GROUP BY l.geoname_id
),
-- Match dim_cities via (country_code, geoname_id) to get city_slug + exact venue count.
-- QUALIFY handles rare multi-city-per-geoname collisions (keep highest venue count).
city_match AS (
SELECT
c.country_code,
c.geoname_id,
c.city_slug,
c.city_name,
c.padel_venue_count AS city_padel_venue_count
FROM foundation.dim_cities c
WHERE c.geoname_id IS NOT NULL
QUALIFY ROW_NUMBER() OVER (
PARTITION BY c.country_code, c.geoname_id
ORDER BY c.padel_venue_count DESC
) = 1
),
-- Pricing / occupancy from Playtomic (via city_slug) + H3 catchment + country PLI
with_pricing AS (
SELECT
b.*,
cm.city_slug,
cm.city_name,
cm.city_padel_venue_count,
vpb.median_hourly_rate,
vpb.median_peak_rate,
vpb.median_offpeak_rate,
vpb.median_occupancy_rate,
vpb.median_daily_revenue_per_venue,
vpb.price_currency,
dc.pli_construction,
COALESCE(ct.catchment_population, b.population)::BIGINT AS catchment_population,
COALESCE(ct.catchment_padel_courts, b.padel_venue_count)::INTEGER AS catchment_padel_courts
FROM base b
LEFT JOIN city_match cm
ON b.country_code = cm.country_code
AND b.geoname_id = cm.geoname_id
LEFT JOIN serving.venue_pricing_benchmarks vpb
ON cm.country_code = vpb.country_code
AND cm.city_slug = vpb.city_slug
LEFT JOIN catchment ct
ON b.geoname_id = ct.geoname_id
LEFT JOIN foundation.dim_countries dc
ON b.country_code = dc.country_code
),
-- Step 1: market score only — needed first so we can aggregate country averages.
market_scored AS (
SELECT *,
-- City-level venue density (from dim_cities exact count, not dim_locations spatial 5km)
CASE WHEN population > 0
THEN ROUND(COALESCE(city_padel_venue_count, 0)::DOUBLE / population * 100000, 2)
ELSE NULL
END AS city_venues_per_100k,
-- Data confidence (for market_score)
CASE
WHEN population > 0 AND COALESCE(city_padel_venue_count, 0) > 0 THEN 1.0
WHEN population > 0 OR COALESCE(city_padel_venue_count, 0) > 0 THEN 0.5
ELSE 0.0
END AS data_confidence,
-- ── Market Score (Marktreife-Score v4) ──────────────────────────────────
-- 0 when no city match or no venues (city_padel_venue_count NULL or 0)
CASE WHEN COALESCE(city_padel_venue_count, 0) > 0 THEN
ROUND(
-- Supply development (40 pts)
-- density ceiling 10/100k (LN(11)), count gate 3 venues
40.0 * LEAST(1.0, LN(
COALESCE(
CASE WHEN population > 0
THEN COALESCE(city_padel_venue_count, 0)::DOUBLE / population * 100000
ELSE 0 END
, 0) + 1) / LN(11))
* LEAST(1.0, COALESCE(city_padel_venue_count, 0) / 3.0)
-- Demand evidence (25 pts)
-- with occupancy: scale to 65% target. Without: 65% of supply proxy + 0.3 floor
-- (existence of venues IS evidence of demand)
+ 25.0 * CASE
WHEN median_occupancy_rate IS NOT NULL
THEN LEAST(1.0, median_occupancy_rate / 0.65)
ELSE GREATEST(0.3, 0.65 * LEAST(1.0, LN(
COALESCE(
CASE WHEN population > 0
THEN COALESCE(city_padel_venue_count, 0)::DOUBLE / population * 100000
ELSE 0 END
, 0) + 1) / LN(11))
* LEAST(1.0, COALESCE(city_padel_venue_count, 0) / 3.0))
END
-- Addressable market (15 pts)
+ 15.0 * LEAST(1.0, LN(GREATEST(population, 1)) / LN(1000000))
-- Economic context (10 pts)
-- ceiling 25,000 PPS discriminates between wealthy and poorer markets
+ 10.0 * LEAST(1.0, COALESCE(median_income_pps, 15000) / 25000.0)
-- Data quality (10 pts)
+ 10.0 * CASE
WHEN population > 0 AND COALESCE(city_padel_venue_count, 0) > 0 THEN 1.0
WHEN population > 0 OR COALESCE(city_padel_venue_count, 0) > 0 THEN 0.5
ELSE 0.0
END
, 1)
ELSE 0
END AS market_score
FROM with_pricing
),
-- Step 2: country-level avg market maturity — used as market headroom signal (10 pts).
-- Filter to market_score > 0 (cities with padel courts only) so zero-court locations
-- don't dilute the country signal. Higher avg = more saturated = less headroom.
country_market AS (
SELECT
country_code,
ROUND(AVG(market_score), 1) AS country_avg_market_score
FROM market_scored
WHERE market_score > 0
GROUP BY country_code
),
-- Step 3: country-level supply saturation — venues per 100K at the country level.
-- Used to dampen supply deficit in saturated markets (Spain, Sweden).
country_supply AS (
SELECT
country_code,
SUM(city_padel_venue_count) AS country_venues,
SUM(population) AS country_pop,
CASE WHEN SUM(population) > 0
THEN SUM(city_padel_venue_count) * 100000.0 / SUM(population)
ELSE 0
END AS venues_per_100k
FROM foundation.dim_cities
WHERE population > 0
GROUP BY country_code
),
-- Step 4: add opportunity_score using country market validation + supply saturation.
scored AS (
SELECT ms.*,
-- ── Opportunity Score (Marktpotenzial-Score v8, H3 catchment) ──────────
ROUND(
-- Addressable market (15 pts): sqrt-scaled catchment population, ceiling 1M
15.0 * LEAST(1.0, SQRT(GREATEST(catchment_population, 1) / 1000000.0))
-- Economic power (10 pts): income PPS normalised to 35,000
+ 10.0 * LEAST(1.0, COALESCE(median_income_pps, 15000) / 35000.0)
-- Supply deficit (50 pts): max of density gap and distance gap.
-- Dampened by market existence: country_venues/50 (0.11.0).
-- 0 venues in country → factor 0.1 → max 5 pts supply deficit
-- 10 venues → 0.2 → max 10 pts
-- 50+ venues → 1.0 → full credit
+ 50.0 * GREATEST(
-- density-based gap (H3 catchment): 0 courts = 1.0, 5/100k = 0.0
GREATEST(0.0, 1.0 - COALESCE(
CASE WHEN catchment_population > 0
THEN GREATEST(catchment_padel_courts, COALESCE(city_padel_venue_count, 0))::DOUBLE / catchment_population * 100000
ELSE 0.0
END, 0.0) / 5.0),
-- distance-based gap: 30km+ = 1.0, 0km = 0.0; NULL = 0.0 (assume nearby)
COALESCE(LEAST(1.0, nearest_padel_court_km / 30.0), 0.0)
)
-- Market existence dampener: zero-venue countries get 0.1, 50+ venues = 1.0
* GREATEST(0.1, LEAST(1.0, COALESCE(cs.country_venues, 0) / 50.0))
-- Sports culture (10 pts): tennis density as racquet-sport adoption proxy.
-- Ceiling 50 courts within 25km. Harmless when tennis data is zero (contributes 0).
+ 10.0 * LEAST(1.0, COALESCE(tennis_courts_within_25km, 0) / 50.0)
-- Construction affordability (5 pts): income purchasing power relative to build costs.
-- PLI construction is EU27=100 index. High income + low construction cost = high score.
+ 5.0 * LEAST(1.0,
COALESCE(median_income_pps, 15000) / 35000.0
/ GREATEST(0.5, COALESCE(pli_construction, 100.0) / 100.0)
)
-- Market headroom (10 pts): INVERSE country-level avg market maturity.
-- High avg market score = saturated market = LESS opportunity for new entrants.
-- ES (~46/100): proven demand, less headroom → ~5.4 pts.
-- SE (~40/100): emerging → ~6 pts. NULL: 0.5 neutral → 5 pts.
+ 10.0 * (1.0 - COALESCE(cm.country_avg_market_score / 100.0, 0.5))
, 1) AS opportunity_score
FROM market_scored ms
LEFT JOIN country_market cm ON ms.country_code = cm.country_code
LEFT JOIN country_supply cs ON ms.country_code = cs.country_code
)
SELECT
s.geoname_id,
s.country_code,
s.country_name_en,
s.country_slug,
s.location_name,
s.location_slug,
s.city_slug,
s.city_name,
s.lat,
s.lon,
s.admin1_code,
s.admin2_code,
s.population,
s.population_year,
s.median_income_pps,
s.income_year,
s.padel_venue_count,
s.padel_venues_per_100k,
s.nearest_padel_court_km,
s.tennis_courts_within_25km,
s.city_padel_venue_count,
s.city_venues_per_100k,
s.data_confidence,
s.catchment_population,
s.catchment_padel_courts,
CASE WHEN s.catchment_population > 0
THEN ROUND(s.catchment_padel_courts::DOUBLE / s.catchment_population * 100000, 2)
ELSE NULL
END AS catchment_venues_per_100k,
LEAST(GREATEST(s.market_score, 0), 100) AS market_score,
LEAST(GREATEST(s.opportunity_score, 0), 100) AS opportunity_score,
ROUND(PERCENT_RANK() OVER (
PARTITION BY s.country_code ORDER BY s.opportunity_score
) * 100, 0) AS country_percentile,
s.median_hourly_rate,
s.median_peak_rate,
s.median_offpeak_rate,
s.median_occupancy_rate,
s.median_daily_revenue_per_venue,
s.price_currency,
CURRENT_DATE AS refreshed_date
FROM scored s
ORDER BY s.market_score DESC, s.opportunity_score DESC