-- padelnomics/transform/sqlmesh_padelnomics/models/serving/location_profiles.sql

-- Unified location profile: both scores at (country_code, geoname_id) grain.
-- Base: dim_locations (ALL GeoNames locations, pop ≥ 1K, ~140K rows).
-- Enriched with dim_cities (city_slug, city_name, exact venue count) and
-- venue_pricing_benchmarks (Playtomic pricing/occupancy).
--
-- Two scores per location:
--
-- Padelnomics Market Score (Marktreife-Score v4, 0–100):
--   "How mature/established is this padel market?"
--   Only meaningful for locations matched to a dim_cities row (city_slug IS NOT NULL)
--   with padel venues. 0 for all other locations.
--
--   v4 changes: lower count gate (5→3), lower density ceiling (LN(21)→LN(11)),
--   better demand fallback (0.4→0.65 with 0.3 floor), economic context discrimination (200→25K).
--
--   40 pts  supply development — log-scaled density (LN ceiling 10/100k) × count gate (3)
--   25 pts  demand evidence   — occupancy when available; 65% density proxy + 0.3 floor otherwise
--   15 pts  addressable market — log-scaled population, ceiling 1M
--   10 pts  economic context  — income PPS normalised to 25,000 ceiling
--   10 pts  data quality      — completeness discount
--
-- Padelnomics Opportunity Score (Marktpotenzial-Score v8, 0–100):
--   "Where should I build a padel court?"
--   Computed for ALL locations — zero-court locations score highest on supply deficit.
--   H3 catchment methodology: addressable market and supply deficit use a regional
--   H3 catchment (res-5 cell + 6 neighbours, ~24km radius).
--
--   v8 changes: better spread/discrimination.
--   - Reweight: addressable market 20→15, economic power 15→10, supply deficit 40→50.
--   - Supply deficit existence dampener: country_venues/50 factor (0.1–1.0).
--     Zero-venue countries get max 5 pts supply deficit (was 50).
--   - Steeper addressable market curve: LN/500K → SQRT/1M.
--   - NULL distance gap → 0.0 (was 0.5). Unknown = assume nearby.
--   - Added country_percentile output column (PERCENT_RANK within country).
--
--   15 pts  addressable market        — sqrt-scaled catchment population, ceiling 1M
--   10 pts  economic power            — income PPS, normalised to 35,000
--   50 pts  supply deficit            — max(density gap, distance gap) × existence dampener
--   10 pts  sports culture            — tennis court density as racquet-sport adoption proxy
--    5 pts  construction affordability — income relative to construction costs (PLI)
--   10 pts  market headroom           — inverse country-level avg market maturity
--
-- Consumers query directly with WHERE filters:
--   cities API:       WHERE country_slug = ? AND city_slug IS NOT NULL
--   opportunity API:  WHERE country_slug = ? AND opportunity_score > 0
--   planner_defaults: WHERE city_slug IS NOT NULL
--   pseo_*:           WHERE city_slug IS NOT NULL AND city_padel_venue_count > 0

-- SQLMesh model declaration for serving.location_profiles.
-- Declared as kind FULL with a '@daily' cron. The grain is one row per
-- (country_code, geoname_id) — the same key pair the query below uses to
-- match locations against dim_cities.
MODEL (
  name serving.location_profiles,
  kind FULL,
  cron '@daily',
  grain (country_code, geoname_id)
);

WITH
-- Starting row set: every row of foundation.dim_locations, unfiltered.
-- Columns are listed explicitly; later CTEs and the final projection read
-- from this list (h3_cell_res5 drives the catchment lookup).
base AS (
  SELECT
    geoname_id,
    country_code,
    country_name_en,
    country_slug,
    location_name,
    location_slug,
    lat,
    lon,
    admin1_code,
    admin2_code,
    population,
    population_year,
    median_income_pps,
    income_year,
    padel_venue_count,
    padel_venues_per_100k,
    nearest_padel_court_km,
    tennis_courts_within_25km,
    h3_cell_res5
  FROM foundation.dim_locations
),
-- Per-cell totals: population and padel venue counts summed over all
-- dim_locations rows sharing the same H3 resolution-5 cell (~8.5km edge).
-- Collapsing to one row per cell first (~50-80K cells vs ~140K locations)
-- keeps the lateral ring join in the catchment step small.
-- NOTE(review): padel_venue_count in dim_locations is described further down
-- in this file as a spatial (5km) count; if neighbouring locations can see the
-- same venue, this per-cell SUM may count it more than once — confirm.
hex_stats AS (
  SELECT
    loc.h3_cell_res5,
    SUM(loc.population)        AS hex_population,
    SUM(loc.padel_venue_count) AS hex_padel_courts
  FROM foundation.dim_locations AS loc
  GROUP BY loc.h3_cell_res5
),
-- Regional catchment per location: hex_stats totals summed over every cell
-- returned by h3_grid_disk(cell, 1), i.e. the location's own res-5 cell plus
-- its ring of neighbours (~24km effective radius — realistic driving distance).
-- A location whose ring matches no hex_stats row produces no row here; the
-- consumer falls back to the location's own figures in that case.
catchment AS (
  SELECT
    loc.geoname_id,
    SUM(cell_totals.hex_population)   AS catchment_population,
    SUM(cell_totals.hex_padel_courts) AS catchment_padel_courts
  FROM base AS loc
  CROSS JOIN LATERAL (
    SELECT UNNEST(h3_grid_disk(loc.h3_cell_res5, 1)) AS cell
  ) AS ring
  INNER JOIN hex_stats AS cell_totals
    ON cell_totals.h3_cell_res5 = ring.cell
  GROUP BY loc.geoname_id
),
-- Match dim_cities via (country_code, geoname_id) to get city_slug + exact venue count.
-- Rows without a geoname_id cannot be matched and are dropped.
-- QUALIFY handles rare multi-city-per-geoname collisions: exactly one city is
-- kept per (country_code, geoname_id), preferring the highest venue count.
-- The ordering is made fully deterministic so the model returns the same city
-- on every run:
--   * NULLS LAST — a city with an unknown venue count never outranks one with
--     a known count, independent of the engine's default NULL ordering;
--   * city_slug  — stable tie-break when venue counts are equal.
city_match AS (
  SELECT
    c.country_code,
    c.geoname_id,
    c.city_slug,
    c.city_name,
    c.padel_venue_count AS city_padel_venue_count
  FROM foundation.dim_cities c
  WHERE c.geoname_id IS NOT NULL
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY c.country_code, c.geoname_id
    ORDER BY c.padel_venue_count DESC NULLS LAST, c.city_slug
  ) = 1
),
-- Enrichment step: every base location, left-joined to
--   * its matched city (slug, name, exact venue count),
--   * the pricing/occupancy benchmark row for that city (keyed by country_code + city_slug),
--   * its H3 catchment totals,
--   * the country's construction price-level index (pli_construction).
-- All joins are LEFT joins, so unmatched locations keep NULLs in the joined
-- columns. Catchment figures fall back to the location's own population and
-- venue count when no catchment row exists.
with_pricing AS (
  SELECT
    loc.*,
    city.city_slug,
    city.city_name,
    city.city_padel_venue_count,
    bench.median_hourly_rate,
    bench.median_peak_rate,
    bench.median_offpeak_rate,
    bench.median_occupancy_rate,
    bench.median_daily_revenue_per_venue,
    bench.price_currency,
    ctry.pli_construction,
    CAST(COALESCE(reach.catchment_population, loc.population) AS BIGINT)            AS catchment_population,
    CAST(COALESCE(reach.catchment_padel_courts, loc.padel_venue_count) AS INTEGER) AS catchment_padel_courts
  FROM base AS loc
  LEFT JOIN city_match AS city
    ON  loc.country_code = city.country_code
    AND loc.geoname_id   = city.geoname_id
  LEFT JOIN serving.venue_pricing_benchmarks AS bench
    ON  city.country_code = bench.country_code
    AND city.city_slug    = bench.city_slug
  LEFT JOIN catchment AS reach
    ON loc.geoname_id = reach.geoname_id
  LEFT JOIN foundation.dim_countries AS ctry
    ON loc.country_code = ctry.country_code
),
-- Step 1: derive city-level density, data confidence and the market score.
-- This runs before the opportunity score because the country-level average of
-- market_score is aggregated from this CTE.
market_scored AS (
  SELECT
    wp.*,
    -- Venues per 100k residents based on the dim_cities venue count (a missing
    -- count is treated as 0). NULL when population is missing or not positive.
    CASE
      WHEN wp.population > 0
        THEN ROUND(CAST(COALESCE(wp.city_padel_venue_count, 0) AS DOUBLE) / wp.population * 100000, 2)
      ELSE NULL
    END AS city_venues_per_100k,
    -- Data confidence: 1.0 when both population and venues are present,
    -- 0.5 when only one of them is, 0.0 when neither.
    CASE
      WHEN wp.population > 0 AND COALESCE(wp.city_padel_venue_count, 0) > 0 THEN 1.0
      WHEN wp.population > 0 OR  COALESCE(wp.city_padel_venue_count, 0) > 0 THEN 0.5
      ELSE 0.0
    END AS data_confidence,
    -- ── Market Score (Marktreife-Score v4) ──────────────────────────────────
    -- 0 unless the location has a matched city with at least one venue.
    -- Inside the scoring branch city_padel_venue_count is therefore known to be
    -- non-NULL and positive, so it is used directly without a NULL default.
    CASE
      WHEN COALESCE(wp.city_padel_venue_count, 0) > 0 THEN
        ROUND(
          -- Supply development (40 pts):
          -- log-scaled venues per 100k (saturates at 10/100k via LN(11)),
          -- multiplied by a count gate that reaches 1.0 at 3 venues.
          40.0
            * LEAST(1.0, LN(
                CASE
                  WHEN wp.population > 0
                    THEN CAST(wp.city_padel_venue_count AS DOUBLE) / wp.population * 100000
                  ELSE 0
                END + 1) / LN(11))
            * LEAST(1.0, wp.city_padel_venue_count / 3.0)
          -- Demand evidence (25 pts):
          -- measured occupancy scaled against a 65% target when available;
          -- otherwise 65% of the supply proxy above, never below 0.3
          -- (the existence of venues is itself evidence of demand).
          + 25.0 * CASE
              WHEN wp.median_occupancy_rate IS NOT NULL
                THEN LEAST(1.0, wp.median_occupancy_rate / 0.65)
              ELSE GREATEST(0.3,
                0.65
                  * LEAST(1.0, LN(
                      CASE
                        WHEN wp.population > 0
                          THEN CAST(wp.city_padel_venue_count AS DOUBLE) / wp.population * 100000
                        ELSE 0
                      END + 1) / LN(11))
                  * LEAST(1.0, wp.city_padel_venue_count / 3.0))
            END
          -- Addressable market (15 pts): log-scaled population, ceiling 1M.
          + 15.0 * LEAST(1.0, LN(GREATEST(wp.population, 1)) / LN(1000000))
          -- Economic context (10 pts): income PPS against a 25,000 ceiling;
          -- a missing income is treated as 15,000.
          + 10.0 * LEAST(1.0, COALESCE(wp.median_income_pps, 15000) / 25000.0)
          -- Data quality (10 pts): venues are present in this branch, so the
          -- factor is 1.0 with a positive population and 0.5 without.
          + 10.0 * CASE WHEN wp.population > 0 THEN 1.0 ELSE 0.5 END
        , 1)
      ELSE 0
    END AS market_score
  FROM with_pricing AS wp
),
-- Step 2: average market maturity per country, rounded to one decimal.
-- Feeds the 10-point market headroom term of the opportunity score.
-- Only rows with market_score > 0 take part, so locations without padel
-- venues do not pull the average down. A higher average means a more
-- saturated country and therefore less headroom.
country_market AS (
  SELECT
    scored_city.country_code,
    ROUND(AVG(scored_city.market_score), 1) AS country_avg_market_score
  FROM market_scored AS scored_city
  WHERE scored_city.market_score > 0
  GROUP BY scored_city.country_code
),
-- Step 3: country-level supply totals from foundation.dim_cities.
--   country_venues  — total padel venues across the country's cities
--   country_pop     — total population across the same cities
--   venues_per_100k — country_venues per 100k of country_pop (0 when the
--                     population total is not positive)
-- Cities with a NULL or non-positive population are excluded from all three
-- aggregates, including country_venues.
-- The venue count is read from dim_cities.padel_venue_count — the same column
-- city_match uses as the exact city venue count — so both CTEs agree on the
-- source column.
-- NOTE(review): within this file only country_venues is consumed (by the
-- supply-deficit existence dampener in the scored CTE); country_pop and
-- venues_per_100k are not referenced downstream here.
country_supply AS (
  SELECT
    dc.country_code,
    SUM(dc.padel_venue_count)                                               AS country_venues,
    SUM(dc.population)                                                      AS country_pop,
    CASE WHEN SUM(dc.population) > 0
         THEN SUM(dc.padel_venue_count) * 100000.0 / SUM(dc.population)
         ELSE 0
    END                                                                     AS venues_per_100k
  FROM foundation.dim_cities AS dc
  WHERE dc.population > 0
  GROUP BY dc.country_code
),
-- Step 4: add opportunity_score to every market_scored row, using the
-- country-level market average (headroom) and country venue total (dampener).
-- Both country joins are LEFT joins; a missing country row falls back to the
-- neutral defaults coded in the COALESCE calls below.
scored AS (
  SELECT ms.*,
    -- ── Opportunity Score (Marktpotenzial-Score v8, H3 catchment) ──────────
    ROUND(
      -- Addressable market (15 pts): sqrt-scaled catchment population, ceiling 1M
      15.0 * LEAST(1.0, SQRT(GREATEST(catchment_population, 1) / 1000000.0))
      -- Economic power (10 pts): income PPS normalised to 35,000
      -- (missing income treated as 15,000)
      + 10.0 * LEAST(1.0, COALESCE(median_income_pps, 15000) / 35000.0)
      -- Supply deficit (50 pts): max of density gap and distance gap.
      -- Dampened by market existence: country_venues/50 (0.1–1.0).
      --   0 venues in country → factor 0.1 → max 5 pts supply deficit
      --   10 venues → 0.2 → max 10 pts
      --   50+ venues → 1.0 → full credit
      + 50.0 * GREATEST(
          -- density-based gap (H3 catchment): 0 courts = 1.0, 5/100k = 0.0
          GREATEST(0.0, 1.0 - COALESCE(
            CASE WHEN catchment_population > 0
              THEN GREATEST(catchment_padel_courts, COALESCE(city_padel_venue_count, 0))::DOUBLE / catchment_population * 100000
              ELSE 0.0
            END, 0.0) / 5.0),
          -- distance-based gap: 30km+ = 1.0, 0km = 0.0; NULL = 0.0 (assume nearby).
          -- The NULL default is applied to the distance itself, before LEAST:
          -- an engine whose LEAST skips NULL arguments (DuckDB does) would
          -- otherwise return 1.0 for an unknown distance and award the full gap.
          LEAST(1.0, COALESCE(nearest_padel_court_km, 0.0) / 30.0)
        )
        -- Market existence dampener: zero-venue countries get 0.1, 50+ venues = 1.0
        * GREATEST(0.1, LEAST(1.0, COALESCE(cs.country_venues, 0) / 50.0))
      -- Sports culture (10 pts): tennis density as racquet-sport adoption proxy.
      -- Ceiling 50 courts within 25km. Harmless when tennis data is zero (contributes 0).
      + 10.0 * LEAST(1.0, COALESCE(tennis_courts_within_25km, 0) / 50.0)
      -- Construction affordability (5 pts): income purchasing power relative to build costs.
      -- PLI construction is EU27=100 index. High income + low construction cost = high score.
      -- The cost divisor is floored at 0.5; a missing PLI is treated as 100.
      + 5.0 * LEAST(1.0,
          COALESCE(median_income_pps, 15000) / 35000.0
          / GREATEST(0.5, COALESCE(pli_construction, 100.0) / 100.0)
        )
      -- Market headroom (10 pts): INVERSE country-level avg market maturity.
      -- High avg market score = saturated market = LESS opportunity for new entrants.
      -- ES (~46/100): proven demand, less headroom → ~5.4 pts.
      -- SE (~40/100): emerging → ~6 pts. NULL: 0.5 neutral → 5 pts.
      + 10.0 * (1.0 - COALESCE(cm.country_avg_market_score / 100.0, 0.5))
    , 1) AS opportunity_score
  FROM market_scored ms
  LEFT JOIN country_market cm ON ms.country_code = cm.country_code
  LEFT JOIN country_supply cs ON ms.country_code = cs.country_code
)
-- Final projection: one output row per scored location.
--   * catchment_venues_per_100k is derived here from the catchment totals
--     (NULL when the catchment population is not positive).
--   * Both scores are clamped to the 0–100 range.
--   * country_percentile is the PERCENT_RANK of the (unclamped) opportunity
--     score within the country, expressed as a whole number 0–100.
--   * refreshed_date records the build date.
-- Rows are emitted ordered by the unclamped market score, then opportunity score.
SELECT
  prof.geoname_id,
  prof.country_code,
  prof.country_name_en,
  prof.country_slug,
  prof.location_name,
  prof.location_slug,
  prof.city_slug,
  prof.city_name,
  prof.lat,
  prof.lon,
  prof.admin1_code,
  prof.admin2_code,
  prof.population,
  prof.population_year,
  prof.median_income_pps,
  prof.income_year,
  prof.padel_venue_count,
  prof.padel_venues_per_100k,
  prof.nearest_padel_court_km,
  prof.tennis_courts_within_25km,
  prof.city_padel_venue_count,
  prof.city_venues_per_100k,
  prof.data_confidence,
  prof.catchment_population,
  prof.catchment_padel_courts,
  CASE
    WHEN prof.catchment_population > 0
      THEN ROUND(CAST(prof.catchment_padel_courts AS DOUBLE) / prof.catchment_population * 100000, 2)
    ELSE NULL
  END AS catchment_venues_per_100k,
  LEAST(GREATEST(prof.market_score, 0), 100)      AS market_score,
  LEAST(GREATEST(prof.opportunity_score, 0), 100) AS opportunity_score,
  ROUND(
    PERCENT_RANK() OVER (
      PARTITION BY prof.country_code
      ORDER BY prof.opportunity_score
    ) * 100,
    0
  ) AS country_percentile,
  prof.median_hourly_rate,
  prof.median_peak_rate,
  prof.median_offpeak_rate,
  prof.median_occupancy_rate,
  prof.median_daily_revenue_per_venue,
  prof.price_currency,
  CURRENT_DATE AS refreshed_date
FROM scored AS prof
ORDER BY
  prof.market_score DESC,
  prof.opportunity_score DESC