merge(worktree): individualise article costs with per-country Eurostat data + tiered proxy tenant work

# Conflicts:
#	CHANGELOG.md
#	transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql
#	transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql
This commit is contained in:
Deeman
2026-03-04 12:44:56 +01:00
12 changed files with 679 additions and 36 deletions

View File

@@ -6,9 +6,9 @@
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
--
-- Enriched with:
-- foundation.dim_countries → country_name_en, country_slug, median_income_pps
-- stg_nuts2_boundaries + stg_regional_income → EU NUTS-2/NUTS-1 income (spatial join)
-- stg_income_usa → US state-level income (PPS-normalised)
-- stg_income → country-level income (fallback for all countries)
-- stg_padel_courts → padel venue count + nearest court distance (km)
-- stg_tennis_courts → tennis court count within 25km radius
--
@@ -16,7 +16,7 @@
-- 1. EU NUTS-2 regional income (finest; spatial join via ST_Contains)
-- 2. EU NUTS-1 regional income (fallback when NUTS-2 income missing from dataset)
-- 3. US state income (ratio-normalised to PPS scale; see us_income CTE)
-- 4. Country-level income (global fallback from stg_income / ilc_di03)
-- 4. Country-level income (global fallback from dim_countries / ilc_di03)
--
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
-- Spatial joins use BETWEEN predicates (not ABS()) to enable DuckDB's IEJoin
@@ -49,12 +49,6 @@ locations AS (
FROM staging.stg_population_geonames
WHERE lat IS NOT NULL AND lon IS NOT NULL
),
-- Country income (ilc_di03) — global fallback for all countries
country_income AS (
SELECT country_code, median_income_pps, ref_year AS income_year
FROM staging.stg_income
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
),
-- ── EU NUTS-2 income via spatial join ──────────────────────────────────────
-- Each EU location's (lon, lat) is matched against NUTS-2 boundary polygons.
-- The bounding box pre-filter (bbox_lat/lon_min/max) eliminates most candidates
@@ -214,10 +208,9 @@ tennis_nearby AS (
SELECT
l.geoname_id,
l.country_code,
-- Human-readable country name (consistent with dim_cities)
@country_name(l.country_code) AS country_name_en,
-- URL-safe country slug
@country_slug(l.country_code) AS country_slug,
-- Human-readable country name and slug — from dim_countries (single source of truth)
c.country_name_en,
c.country_slug,
l.location_name,
l.location_slug,
l.lat,
@@ -230,12 +223,12 @@ SELECT
COALESCE(
ri.regional_income_pps, -- EU: NUTS-2 (finest) or NUTS-1 (fallback)
us.median_income_pps, -- US: state-level PPS-equivalent
ci.median_income_pps -- Global: country-level from ilc_di03
c.median_income_pps -- Global: country-level from dim_countries / ilc_di03
) AS median_income_pps,
COALESCE(
ri.regional_income_year,
us.income_year,
ci.income_year
c.income_year
) AS income_year,
COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count,
-- Venues per 100K residents (NULL if population = 0)
@@ -247,8 +240,8 @@ SELECT
COALESCE(tn.tennis_courts_within_25km, 0)::INTEGER AS tennis_courts_within_25km,
CURRENT_DATE AS refreshed_date
FROM locations l
LEFT JOIN country_income ci ON l.country_code = ci.country_code
LEFT JOIN regional_income ri ON l.geoname_id = ri.geoname_id
LEFT JOIN foundation.dim_countries c ON l.country_code = c.country_code
LEFT JOIN regional_income ri ON l.geoname_id = ri.geoname_id
LEFT JOIN us_income us ON l.country_code = 'US'
AND l.admin1_code = us.admin1_code
LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id