- stg_regional_income: expanded NUTS-1+2 (LENGTH IN 3,4), nuts_code rename, nuts_level - stg_nuts2_boundaries: new — ST_Read GISCO GeoJSON, bbox columns for spatial pre-filter - stg_income_usa: new — Census ACS state-level income staging model - dim_locations: spatial join replaces admin1_to_nuts1 VALUES CTE; us_income CTE with PPS normalisation (income/80610×30000); income cascade: NUTS-2→NUTS-1→US state→country - init_landing_seeds: compress=False for ST_Read files; gisco GeoJSON + census income seeds - CHANGELOG + PROJECT.md updated Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
290 lines
12 KiB
SQL
290 lines
12 KiB
SQL
-- Location dimension: all known populated places globally (GeoNames cities1000).
|
||
-- This is the opportunity-scoring root — NOT filtered to places with padel courts.
|
||
-- Grain: (country_code, geoname_id) — stable GeoNames numeric ID per location.
|
||
--
|
||
-- Unlike dim_cities (seeded from dim_venues / existing padel markets), dim_locations
|
||
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
|
||
--
|
||
-- Enriched with:
|
||
-- stg_nuts2_boundaries + stg_regional_income → EU NUTS-2/NUTS-1 income (spatial join)
|
||
-- stg_income_usa → US state-level income (PPS-normalised)
|
||
-- stg_income → country-level income (fallback for all countries)
|
||
-- stg_padel_courts → padel venue count + nearest court distance (km)
|
||
-- stg_tennis_courts → tennis court count within 25km radius
|
||
--
|
||
-- Income resolution cascade:
|
||
-- 1. EU NUTS-2 regional income (finest; spatial join via ST_Contains)
|
||
-- 2. EU NUTS-1 regional income (fallback when NUTS-2 income missing from dataset)
|
||
-- 3. US state income (ratio-normalised to PPS scale; see us_income CTE)
|
||
-- 4. Country-level income (global fallback from stg_income / ilc_di03)
|
||
--
|
||
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
|
||
-- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the
|
||
-- exact sphere distance is computed.
|
||
|
||
MODEL (
|
||
name foundation.dim_locations,
|
||
kind FULL,
|
||
cron '@daily',
|
||
grain (country_code, geoname_id)
|
||
);
|
||
|
||
WITH
|
||
-- Base: all GeoNames locations with valid coordinates
|
||
locations AS (
|
||
SELECT
|
||
geoname_id,
|
||
city_name AS location_name,
|
||
-- URL-safe location slug
|
||
LOWER(REGEXP_REPLACE(LOWER(city_name), '[^a-z0-9]+', '-')) AS location_slug,
|
||
country_code,
|
||
lat,
|
||
lon,
|
||
admin1_code,
|
||
admin2_code,
|
||
population,
|
||
ref_year
|
||
FROM staging.stg_population_geonames
|
||
WHERE lat IS NOT NULL AND lon IS NOT NULL
|
||
),
|
||
-- Country income (ilc_di03) — global fallback for all countries
|
||
country_income AS (
|
||
SELECT country_code, median_income_pps, ref_year AS income_year
|
||
FROM staging.stg_income
|
||
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
|
||
),
|
||
-- ── EU NUTS-2 income via spatial join ──────────────────────────────────────
|
||
-- Each EU location's (lon, lat) is matched against NUTS-2 boundary polygons.
|
||
-- The bounding box pre-filter (bbox_lat/lon_min/max) eliminates most candidates
|
||
-- before the exact ST_Contains test runs.
|
||
nuts2_match AS (
|
||
SELECT
|
||
l.geoname_id,
|
||
b.nuts2_code,
|
||
-- Derive parent NUTS-1 code (first 3 chars of NUTS-2)
|
||
SUBSTR(b.nuts2_code, 1, 3) AS nuts1_code
|
||
FROM locations l
|
||
JOIN staging.stg_nuts2_boundaries b
|
||
-- Bounding-box pre-filter: only test ST_Contains for polygons whose bbox
|
||
-- overlaps the point, reducing 140K x 242 to ~140K x 3-5 actual tests.
|
||
ON l.lat BETWEEN b.bbox_lat_min AND b.bbox_lat_max
|
||
AND l.lon BETWEEN b.bbox_lon_min AND b.bbox_lon_max
|
||
WHERE ST_Contains(b.geometry, ST_Point(l.lon, l.lat))
|
||
-- A point should fall in exactly one polygon; QUALIFY guards against rare
|
||
-- boundary-precision duplicates.
|
||
QUALIFY ROW_NUMBER() OVER (PARTITION BY l.geoname_id ORDER BY b.nuts2_code) = 1
|
||
),
|
||
-- NUTS-2 income: latest year per NUTS-2 code
|
||
nuts2_income AS (
|
||
SELECT nuts_code, regional_income_pps, ref_year AS regional_income_year
|
||
FROM staging.stg_regional_income
|
||
WHERE nuts_level = 2
|
||
QUALIFY ROW_NUMBER() OVER (PARTITION BY nuts_code ORDER BY ref_year DESC) = 1
|
||
),
|
||
-- NUTS-1 income: fallback when NUTS-2 income not in dataset for this region
|
||
nuts1_income AS (
|
||
SELECT nuts_code, regional_income_pps, ref_year AS regional_income_year
|
||
FROM staging.stg_regional_income
|
||
WHERE nuts_level = 1
|
||
QUALIFY ROW_NUMBER() OVER (PARTITION BY nuts_code ORDER BY ref_year DESC) = 1
|
||
),
|
||
-- Combined EU regional income: NUTS-2 preferred, NUTS-1 fallback
|
||
regional_income AS (
|
||
SELECT
|
||
nm.geoname_id,
|
||
COALESCE(n2.regional_income_pps, n1.regional_income_pps) AS regional_income_pps,
|
||
COALESCE(n2.regional_income_year, n1.regional_income_year) AS regional_income_year
|
||
FROM nuts2_match nm
|
||
LEFT JOIN nuts2_income n2 ON nm.nuts2_code = n2.nuts_code
|
||
LEFT JOIN nuts1_income n1 ON nm.nuts1_code = n1.nuts_code
|
||
),
|
||
-- ── US state-level income ──────────────────────────────────────────────────
|
||
-- GeoNames admin1_code for US = 2-letter state abbreviation (ISO 3166-2:US).
|
||
-- Census ACS uses 2-digit FIPS codes. This 51-row VALUES CTE bridges them.
|
||
us_state_fips (admin1_code, state_fips) AS (
|
||
VALUES
|
||
('AL', '01'), ('AK', '02'), ('AZ', '04'), ('AR', '05'), ('CA', '06'),
|
||
('CO', '08'), ('CT', '09'), ('DE', '10'), ('FL', '12'), ('GA', '13'),
|
||
('HI', '15'), ('ID', '16'), ('IL', '17'), ('IN', '18'), ('IA', '19'),
|
||
('KS', '20'), ('KY', '21'), ('LA', '22'), ('ME', '23'), ('MD', '24'),
|
||
('MA', '25'), ('MI', '26'), ('MN', '27'), ('MS', '28'), ('MO', '29'),
|
||
('MT', '30'), ('NE', '31'), ('NV', '32'), ('NH', '33'), ('NJ', '34'),
|
||
('NM', '35'), ('NY', '36'), ('NC', '37'), ('ND', '38'), ('OH', '39'),
|
||
('OK', '40'), ('OR', '41'), ('PA', '42'), ('RI', '44'), ('SC', '45'),
|
||
('SD', '46'), ('TN', '47'), ('TX', '48'), ('UT', '49'), ('VT', '50'),
|
||
('VA', '51'), ('WA', '53'), ('WV', '54'), ('WI', '55'), ('WY', '56'),
|
||
('DC', '11')
|
||
),
|
||
-- US state income normalised to PPS-equivalent scale.
|
||
-- US Census reports median household income in nominal USD. To compare with EU
|
||
-- PPS per inhabitant, we use a ratio normalisation:
|
||
-- state_pps ≈ US_PPS_CONSTANT × (state_income / national_median)
|
||
--
|
||
-- Constants (update when ACS vintage changes):
|
||
-- national_median = 80,610 — 2023 ACS national median household income (USD)
|
||
-- US_PPS_CONSTANT = 30,000 — approximate US per-capita income in PPS terms
|
||
-- (derived from OECD PPP tables, 2022)
|
||
--
|
||
-- This gives a realistic spread across states:
|
||
-- California ($91,905) → ~34,200 PPS [≈ wealthy EU region]
|
||
-- Texas ($67,321) → ~25,100 PPS [≈ Germany average]
|
||
-- Mississippi($52,985) → ~19,700 PPS [≈ lower-income EU region]
|
||
us_income AS (
|
||
SELECT
|
||
m.admin1_code,
|
||
ROUND(s.median_income_usd / 80610.0 * 30000.0, 0) AS median_income_pps,
|
||
s.ref_year AS income_year
|
||
FROM us_state_fips m
|
||
JOIN staging.stg_income_usa s ON m.state_fips = s.state_fips
|
||
QUALIFY ROW_NUMBER() OVER (
|
||
PARTITION BY m.admin1_code ORDER BY s.ref_year DESC
|
||
) = 1
|
||
),
|
||
-- Padel court lat/lon for distance and density calculations
|
||
padel_courts AS (
|
||
SELECT lat, lon, country_code
|
||
FROM staging.stg_padel_courts
|
||
WHERE lat IS NOT NULL AND lon IS NOT NULL
|
||
),
|
||
-- Nearest padel court distance per location (bbox pre-filter → exact sphere distance)
|
||
nearest_padel AS (
|
||
SELECT
|
||
l.geoname_id,
|
||
MIN(
|
||
ST_Distance_Sphere(
|
||
ST_Point(l.lon, l.lat),
|
||
ST_Point(p.lon, p.lat)
|
||
) / 1000.0
|
||
) AS nearest_padel_court_km
|
||
FROM locations l
|
||
JOIN padel_courts p
|
||
-- ~55km bounding box pre-filter to limit cross-join before sphere calc
|
||
ON ABS(l.lat - p.lat) < 0.5
|
||
AND ABS(l.lon - p.lon) < 0.5
|
||
GROUP BY l.geoname_id
|
||
),
|
||
-- Padel venues within 5km of each location (counts as "local padel supply")
|
||
padel_local AS (
|
||
SELECT
|
||
l.geoname_id,
|
||
COUNT(*) AS padel_venue_count
|
||
FROM locations l
|
||
JOIN padel_courts p
|
||
ON ABS(l.lat - p.lat) < 0.05 -- ~5km bbox pre-filter
|
||
AND ABS(l.lon - p.lon) < 0.05
|
||
WHERE ST_Distance_Sphere(
|
||
ST_Point(l.lon, l.lat),
|
||
ST_Point(p.lon, p.lat)
|
||
) / 1000.0 <= 5.0
|
||
GROUP BY l.geoname_id
|
||
),
|
||
-- Tennis courts within 25km of each location (sports culture proxy)
|
||
tennis_nearby AS (
|
||
SELECT
|
||
l.geoname_id,
|
||
COUNT(*) AS tennis_courts_within_25km
|
||
FROM locations l
|
||
JOIN staging.stg_tennis_courts t
|
||
ON ABS(l.lat - t.lat) < 0.23 -- ~25km bbox pre-filter
|
||
AND ABS(l.lon - t.lon) < 0.23
|
||
WHERE ST_Distance_Sphere(
|
||
ST_Point(l.lon, l.lat),
|
||
ST_Point(t.lon, t.lat)
|
||
) / 1000.0 <= 25.0
|
||
GROUP BY l.geoname_id
|
||
)
|
||
SELECT
|
||
l.geoname_id,
|
||
l.country_code,
|
||
-- Human-readable country name (consistent with dim_cities)
|
||
CASE l.country_code
|
||
WHEN 'DE' THEN 'Germany'
|
||
WHEN 'ES' THEN 'Spain'
|
||
WHEN 'GB' THEN 'United Kingdom'
|
||
WHEN 'FR' THEN 'France'
|
||
WHEN 'IT' THEN 'Italy'
|
||
WHEN 'PT' THEN 'Portugal'
|
||
WHEN 'AT' THEN 'Austria'
|
||
WHEN 'CH' THEN 'Switzerland'
|
||
WHEN 'NL' THEN 'Netherlands'
|
||
WHEN 'BE' THEN 'Belgium'
|
||
WHEN 'SE' THEN 'Sweden'
|
||
WHEN 'NO' THEN 'Norway'
|
||
WHEN 'DK' THEN 'Denmark'
|
||
WHEN 'FI' THEN 'Finland'
|
||
WHEN 'US' THEN 'United States'
|
||
WHEN 'AR' THEN 'Argentina'
|
||
WHEN 'MX' THEN 'Mexico'
|
||
WHEN 'AE' THEN 'UAE'
|
||
WHEN 'AU' THEN 'Australia'
|
||
WHEN 'IE' THEN 'Ireland'
|
||
ELSE l.country_code
|
||
END AS country_name_en,
|
||
-- URL-safe country slug
|
||
LOWER(REGEXP_REPLACE(
|
||
CASE l.country_code
|
||
WHEN 'DE' THEN 'Germany'
|
||
WHEN 'ES' THEN 'Spain'
|
||
WHEN 'GB' THEN 'United Kingdom'
|
||
WHEN 'FR' THEN 'France'
|
||
WHEN 'IT' THEN 'Italy'
|
||
WHEN 'PT' THEN 'Portugal'
|
||
WHEN 'AT' THEN 'Austria'
|
||
WHEN 'CH' THEN 'Switzerland'
|
||
WHEN 'NL' THEN 'Netherlands'
|
||
WHEN 'BE' THEN 'Belgium'
|
||
WHEN 'SE' THEN 'Sweden'
|
||
WHEN 'NO' THEN 'Norway'
|
||
WHEN 'DK' THEN 'Denmark'
|
||
WHEN 'FI' THEN 'Finland'
|
||
WHEN 'US' THEN 'United States'
|
||
WHEN 'AR' THEN 'Argentina'
|
||
WHEN 'MX' THEN 'Mexico'
|
||
WHEN 'AE' THEN 'UAE'
|
||
WHEN 'AU' THEN 'Australia'
|
||
WHEN 'IE' THEN 'Ireland'
|
||
ELSE l.country_code
|
||
END, '[^a-zA-Z0-9]+', '-'
|
||
)) AS country_slug,
|
||
l.location_name,
|
||
l.location_slug,
|
||
l.lat,
|
||
l.lon,
|
||
l.admin1_code,
|
||
l.admin2_code,
|
||
l.population,
|
||
l.ref_year AS population_year,
|
||
-- Income cascade: EU NUTS-2 → EU NUTS-1 → US state → country-level
|
||
COALESCE(
|
||
ri.regional_income_pps, -- EU: NUTS-2 (finest) or NUTS-1 (fallback)
|
||
us.median_income_pps, -- US: state-level PPS-equivalent
|
||
ci.median_income_pps -- Global: country-level from ilc_di03
|
||
) AS median_income_pps,
|
||
COALESCE(
|
||
ri.regional_income_year,
|
||
us.income_year,
|
||
ci.income_year
|
||
) AS income_year,
|
||
COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count,
|
||
-- Venues per 100K residents (NULL if population = 0)
|
||
CASE WHEN l.population > 0
|
||
THEN ROUND(COALESCE(pl.padel_venue_count, 0)::DOUBLE / l.population * 100000, 2)
|
||
ELSE NULL
|
||
END AS padel_venues_per_100k,
|
||
np.nearest_padel_court_km,
|
||
COALESCE(tn.tennis_courts_within_25km, 0)::INTEGER AS tennis_courts_within_25km,
|
||
CURRENT_DATE AS refreshed_date
|
||
FROM locations l
|
||
LEFT JOIN country_income ci ON l.country_code = ci.country_code
|
||
LEFT JOIN regional_income ri ON l.geoname_id = ri.geoname_id
|
||
LEFT JOIN us_income us ON l.country_code = 'US'
|
||
AND l.admin1_code = us.admin1_code
|
||
LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id
|
||
LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id
|
||
LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id
|
||
-- Enforce grain: deduplicate if location slug collides within same country
|
||
QUALIFY ROW_NUMBER() OVER (
|
||
PARTITION BY l.country_code, l.geoname_id
|
||
ORDER BY l.population DESC NULLS LAST
|
||
) = 1
|