padelnomics/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql

-- Location dimension: all known populated places globally (GeoNames cities1000).
-- This is the opportunity-scoring root — NOT filtered to places with padel courts.
-- Grain: (country_code, geoname_id) — stable GeoNames numeric ID per location.
--
-- Unlike dim_cities (seeded from dim_venues / existing padel markets), dim_locations
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
--
-- Enriched with:
--   stg_nuts2_boundaries + stg_regional_income → EU NUTS-2/NUTS-1 income (spatial join)
--   stg_income_usa       → US state-level income (PPS-normalised)
--   stg_income           → country-level income (fallback for all countries)
--   stg_padel_courts     → padel venue count + nearest court distance (km)
--   stg_tennis_courts    → tennis court count within 25km radius
--
-- Income resolution cascade:
--   1. EU NUTS-2 regional income  (finest; spatial join via ST_Contains)
--   2. EU NUTS-1 regional income  (fallback when NUTS-2 income missing from dataset)
--   3. US state income            (ratio-normalised to PPS scale; see us_income CTE)
--   4. Country-level income       (global fallback from stg_income / ilc_di03)
--
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
-- Spatial joins use BETWEEN predicates (not ABS()) to enable DuckDB's IEJoin
-- (interval join) optimization: O((N+M) log M) vs O(N×M) nested-loop.
-- Country pre-filters restrict the left side to ~20K rows for padel/tennis CTEs
-- (~8 countries each), down from ~140K global locations.

MODEL (
  name foundation.dim_locations,
  kind FULL,
  cron '@daily',
  grain (country_code, geoname_id)
);

WITH
-- Base: all GeoNames locations with valid coordinates
locations AS (
  SELECT
    geoname_id,
    city_name                                                            AS location_name,
    -- URL-safe location slug
    LOWER(REGEXP_REPLACE(LOWER(city_name), '[^a-z0-9]+', '-'))          AS location_slug,
    country_code,
    lat,
    lon,
    admin1_code,
    admin2_code,
    population,
    ref_year
  FROM staging.stg_population_geonames
  WHERE lat IS NOT NULL AND lon IS NOT NULL
),
-- Country income (ilc_di03) — global fallback for all countries
country_income AS (
  SELECT country_code, median_income_pps, ref_year AS income_year
  FROM staging.stg_income
  QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
),
-- ── EU NUTS-2 income via spatial join ──────────────────────────────────────
-- Each EU location's (lon, lat) is matched against NUTS-2 boundary polygons.
-- The bounding box pre-filter (bbox_lat/lon_min/max) eliminates most candidates
-- before the exact ST_Contains test runs.
nuts2_match AS (
  SELECT
    l.geoname_id,
    b.nuts2_code,
    -- Derive parent NUTS-1 code (first 3 chars of NUTS-2)
    SUBSTR(b.nuts2_code, 1, 3)                   AS nuts1_code
  FROM locations l
  JOIN staging.stg_nuts2_boundaries b
    -- Bounding-box pre-filter: only test ST_Contains for polygons whose bbox
    -- overlaps the point, reducing 140K x 242 to ~140K x 3-5 actual tests.
    ON l.lat BETWEEN b.bbox_lat_min AND b.bbox_lat_max
    AND l.lon BETWEEN b.bbox_lon_min AND b.bbox_lon_max
  WHERE ST_Contains(b.geometry, ST_Point(l.lon, l.lat))
  -- A point should fall in exactly one polygon; QUALIFY guards against rare
  -- boundary-precision duplicates.
  QUALIFY ROW_NUMBER() OVER (PARTITION BY l.geoname_id ORDER BY b.nuts2_code) = 1
),
-- NUTS-2 income: latest year per NUTS-2 code
nuts2_income AS (
  SELECT nuts_code, regional_income_pps, ref_year AS regional_income_year
  FROM staging.stg_regional_income
  WHERE nuts_level = 2
  QUALIFY ROW_NUMBER() OVER (PARTITION BY nuts_code ORDER BY ref_year DESC) = 1
),
-- NUTS-1 income: fallback when NUTS-2 income not in dataset for this region
nuts1_income AS (
  SELECT nuts_code, regional_income_pps, ref_year AS regional_income_year
  FROM staging.stg_regional_income
  WHERE nuts_level = 1
  QUALIFY ROW_NUMBER() OVER (PARTITION BY nuts_code ORDER BY ref_year DESC) = 1
),
-- Combined EU regional income: NUTS-2 preferred, NUTS-1 fallback
regional_income AS (
  SELECT
    nm.geoname_id,
    COALESCE(n2.regional_income_pps, n1.regional_income_pps)           AS regional_income_pps,
    COALESCE(n2.regional_income_year, n1.regional_income_year)         AS regional_income_year
  FROM nuts2_match nm
  LEFT JOIN nuts2_income n2  ON nm.nuts2_code = n2.nuts_code
  LEFT JOIN nuts1_income n1  ON nm.nuts1_code = n1.nuts_code
),
-- ── US state-level income ──────────────────────────────────────────────────
-- GeoNames admin1_code for US = 2-letter state abbreviation (ISO 3166-2:US).
-- Census ACS uses 2-digit FIPS codes. This 51-row VALUES CTE bridges them.
us_state_fips (admin1_code, state_fips) AS (
  VALUES
    ('AL', '01'), ('AK', '02'), ('AZ', '04'), ('AR', '05'), ('CA', '06'),
    ('CO', '08'), ('CT', '09'), ('DE', '10'), ('FL', '12'), ('GA', '13'),
    ('HI', '15'), ('ID', '16'), ('IL', '17'), ('IN', '18'), ('IA', '19'),
    ('KS', '20'), ('KY', '21'), ('LA', '22'), ('ME', '23'), ('MD', '24'),
    ('MA', '25'), ('MI', '26'), ('MN', '27'), ('MS', '28'), ('MO', '29'),
    ('MT', '30'), ('NE', '31'), ('NV', '32'), ('NH', '33'), ('NJ', '34'),
    ('NM', '35'), ('NY', '36'), ('NC', '37'), ('ND', '38'), ('OH', '39'),
    ('OK', '40'), ('OR', '41'), ('PA', '42'), ('RI', '44'), ('SC', '45'),
    ('SD', '46'), ('TN', '47'), ('TX', '48'), ('UT', '49'), ('VT', '50'),
    ('VA', '51'), ('WA', '53'), ('WV', '54'), ('WI', '55'), ('WY', '56'),
    ('DC', '11')
),
-- US state income normalised to PPS-equivalent scale.
-- US Census reports median household income in nominal USD. To compare with EU
-- PPS per inhabitant, we use a ratio normalisation:
--   state_pps ≈ US_PPS_CONSTANT × (state_income / national_median)
--
-- Constants (update when ACS vintage changes):
--   national_median = 80,610  — 2023 ACS national median household income (USD)
--   US_PPS_CONSTANT = 30,000  — approximate US per-capita income in PPS terms
--                              (derived from OECD PPP tables, 2022)
--
-- This gives a realistic spread across states:
--   California ($91,905) → ~34,200 PPS   [≈ wealthy EU region]
--   Texas      ($67,321) → ~25,100 PPS   [≈ Germany average]
--   Mississippi($52,985) → ~19,700 PPS   [≈ lower-income EU region]
us_income AS (
  SELECT
    m.admin1_code,
    ROUND(s.median_income_usd / 80610.0 * 30000.0, 0)  AS median_income_pps,
    s.ref_year                                           AS income_year
  FROM us_state_fips m
  JOIN staging.stg_income_usa s ON m.state_fips = s.state_fips
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY m.admin1_code ORDER BY s.ref_year DESC
  ) = 1
),
-- Padel court lat/lon for distance and density calculations
padel_courts AS (
  SELECT lat, lon, country_code
  FROM staging.stg_padel_courts
  WHERE lat IS NOT NULL AND lon IS NOT NULL
),
-- Nearest padel court distance per location (bbox pre-filter → exact sphere distance)
-- BETWEEN enables DuckDB IEJoin (O((N+M) log M)) vs ABS() nested-loop (O(N×M)).
-- Country pre-filter reduces left side from ~140K to ~20K rows (padel is ~8 countries).
nearest_padel AS (
  SELECT
    l.geoname_id,
    MIN(
      ST_Distance_Sphere(
        ST_Point(l.lon, l.lat),
        ST_Point(p.lon, p.lat)
      ) / 1000.0
    ) AS nearest_padel_court_km
  FROM locations l
  JOIN padel_courts p
    -- ~55km bounding box pre-filter; BETWEEN triggers IEJoin optimization
    ON l.lat BETWEEN p.lat - 0.5 AND p.lat + 0.5
    AND l.lon BETWEEN p.lon - 0.5 AND p.lon + 0.5
  WHERE l.country_code IN (
    SELECT DISTINCT country_code FROM padel_courts WHERE country_code IS NOT NULL
  )
  GROUP BY l.geoname_id
),
-- Padel venues within 5km of each location (counts as "local padel supply")
padel_local AS (
  SELECT
    l.geoname_id,
    COUNT(*) AS padel_venue_count
  FROM locations l
  JOIN padel_courts p
    -- ~5km bbox pre-filter; BETWEEN triggers IEJoin optimization
    ON l.lat BETWEEN p.lat - 0.05 AND p.lat + 0.05
    AND l.lon BETWEEN p.lon - 0.05 AND p.lon + 0.05
  WHERE l.country_code IN (
    SELECT DISTINCT country_code FROM padel_courts WHERE country_code IS NOT NULL
  )
    AND ST_Distance_Sphere(
      ST_Point(l.lon, l.lat),
      ST_Point(p.lon, p.lat)
    ) / 1000.0 <= 5.0
  GROUP BY l.geoname_id
),
-- Tennis courts within 25km of each location (sports culture proxy)
-- Country pre-filter reduces left side from ~140K to ~20K rows (tennis courts are European only).
tennis_nearby AS (
  SELECT
    l.geoname_id,
    COUNT(*) AS tennis_courts_within_25km
  FROM locations l
  JOIN staging.stg_tennis_courts t
    -- ~25km bbox pre-filter; BETWEEN triggers IEJoin optimization
    ON l.lat BETWEEN t.lat - 0.23 AND t.lat + 0.23
    AND l.lon BETWEEN t.lon - 0.23 AND t.lon + 0.23
  WHERE l.country_code IN (
    SELECT DISTINCT country_code
    FROM staging.stg_tennis_courts
    WHERE country_code IS NOT NULL
  )
    AND ST_Distance_Sphere(
      ST_Point(l.lon, l.lat),
      ST_Point(t.lon, t.lat)
    ) / 1000.0 <= 25.0
  GROUP BY l.geoname_id
)
SELECT
  l.geoname_id,
  l.country_code,
  -- Human-readable country name (consistent with dim_cities)
  CASE l.country_code
    WHEN 'DE' THEN 'Germany'
    WHEN 'ES' THEN 'Spain'
    WHEN 'GB' THEN 'United Kingdom'
    WHEN 'FR' THEN 'France'
    WHEN 'IT' THEN 'Italy'
    WHEN 'PT' THEN 'Portugal'
    WHEN 'AT' THEN 'Austria'
    WHEN 'CH' THEN 'Switzerland'
    WHEN 'NL' THEN 'Netherlands'
    WHEN 'BE' THEN 'Belgium'
    WHEN 'SE' THEN 'Sweden'
    WHEN 'NO' THEN 'Norway'
    WHEN 'DK' THEN 'Denmark'
    WHEN 'FI' THEN 'Finland'
    WHEN 'US' THEN 'United States'
    WHEN 'AR' THEN 'Argentina'
    WHEN 'MX' THEN 'Mexico'
    WHEN 'AE' THEN 'UAE'
    WHEN 'AU' THEN 'Australia'
    WHEN 'IE' THEN 'Ireland'
    ELSE l.country_code
  END                                                                    AS country_name_en,
  -- URL-safe country slug
  LOWER(REGEXP_REPLACE(
    CASE l.country_code
      WHEN 'DE' THEN 'Germany'
      WHEN 'ES' THEN 'Spain'
      WHEN 'GB' THEN 'United Kingdom'
      WHEN 'FR' THEN 'France'
      WHEN 'IT' THEN 'Italy'
      WHEN 'PT' THEN 'Portugal'
      WHEN 'AT' THEN 'Austria'
      WHEN 'CH' THEN 'Switzerland'
      WHEN 'NL' THEN 'Netherlands'
      WHEN 'BE' THEN 'Belgium'
      WHEN 'SE' THEN 'Sweden'
      WHEN 'NO' THEN 'Norway'
      WHEN 'DK' THEN 'Denmark'
      WHEN 'FI' THEN 'Finland'
      WHEN 'US' THEN 'United States'
      WHEN 'AR' THEN 'Argentina'
      WHEN 'MX' THEN 'Mexico'
      WHEN 'AE' THEN 'UAE'
      WHEN 'AU' THEN 'Australia'
      WHEN 'IE' THEN 'Ireland'
      ELSE l.country_code
    END, '[^a-zA-Z0-9]+', '-'
  ))                                                                     AS country_slug,
  l.location_name,
  l.location_slug,
  l.lat,
  l.lon,
  l.admin1_code,
  l.admin2_code,
  l.population,
  l.ref_year                                                             AS population_year,
  -- Income cascade: EU NUTS-2 → EU NUTS-1 → US state → country-level
  COALESCE(
    ri.regional_income_pps,    -- EU: NUTS-2 (finest) or NUTS-1 (fallback)
    us.median_income_pps,      -- US: state-level PPS-equivalent
    ci.median_income_pps       -- Global: country-level from ilc_di03
  )                                                                      AS median_income_pps,
  COALESCE(
    ri.regional_income_year,
    us.income_year,
    ci.income_year
  )                                                                      AS income_year,
  COALESCE(pl.padel_venue_count, 0)::INTEGER                            AS padel_venue_count,
  -- Venues per 100K residents (NULL if population = 0)
  CASE WHEN l.population > 0
    THEN ROUND(COALESCE(pl.padel_venue_count, 0)::DOUBLE / l.population * 100000, 2)
    ELSE NULL
  END                                                                    AS padel_venues_per_100k,
  np.nearest_padel_court_km,
  COALESCE(tn.tennis_courts_within_25km, 0)::INTEGER                   AS tennis_courts_within_25km,
  CURRENT_DATE                                                          AS refreshed_date
FROM locations l
LEFT JOIN country_income ci   ON l.country_code = ci.country_code
LEFT JOIN regional_income ri  ON l.geoname_id   = ri.geoname_id
LEFT JOIN us_income us        ON l.country_code  = 'US'
                              AND l.admin1_code  = us.admin1_code
LEFT JOIN nearest_padel np    ON l.geoname_id   = np.geoname_id
LEFT JOIN padel_local pl      ON l.geoname_id   = pl.geoname_id
LEFT JOIN tennis_nearby tn    ON l.geoname_id   = tn.geoname_id
-- Enforce grain: deduplicate if location slug collides within same country
QUALIFY ROW_NUMBER() OVER (
  PARTITION BY l.country_code, l.geoname_id
  ORDER BY l.population DESC NULLS LAST
) = 1