feat(data): Phase 2a — NUTS-1 regional income for opportunity score

- eurostat.py: add nama_10r_2hhinc dataset config; append filter params to
  request URL so server pre-filters the large cube before download
- stg_regional_income.sql: new staging model — reads nama_10r_2hhinc.json.gz,
  filters to NUTS-1 codes (3-char), normalises EL→GR / UK→GB
- dim_locations.sql: add admin1_to_nuts1 VALUES CTE (16 German Bundesländer)
  + regional_income CTE; final SELECT uses COALESCE(regional, country) income
- init_landing_seeds.py: add empty seed for nama_10r_2hhinc.json.gz

Munich/Bayern now scores ~29K PPS vs Chemnitz/Sachsen ~19K PPS instead of
both inheriting the same national average (~25.5K PPS).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-27 10:26:15 +01:00
parent e32f7ba4b8
commit 5ade38eeaf
4 changed files with 101 additions and 4 deletions

View File

@@ -6,10 +6,15 @@
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
--
-- Enriched with:
-- stg_income → country-level median income PPS
-- stg_income → country-level median income PPS (fallback)
-- stg_regional_income → NUTS-1 regional income PPS (preferred where mapped)
-- stg_padel_courts → padel venue count + nearest court distance (km)
-- stg_tennis_courts → tennis court count within 25km radius
--
-- Income resolution: COALESCE(regional_income, country_income) AS median_income_pps.
-- Germany: all 16 Bundesländer mapped via admin1_code → NUTS-1.
-- All other countries fall back to country-level income (stg_income).
--
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
-- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the
-- exact sphere distance is computed.
@@ -39,12 +44,45 @@ locations AS (
FROM staging.stg_population_geonames
WHERE lat IS NOT NULL AND lon IS NOT NULL
),
-- Country income (same source and pattern as dim_cities)
-- Country income (same source and pattern as dim_cities) — fallback for unmapped regions
country_income AS (
SELECT country_code, median_income_pps, ref_year AS income_year
FROM staging.stg_income
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
),
-- Static mapping: GeoNames admin1_code → NUTS-1 code (Germany only; extend by adding rows)
admin1_to_nuts1 (country_code, admin1_code, nuts1_code) AS (
VALUES
('DE', '01', 'DE1'), -- Baden-Württemberg
('DE', '02', 'DE2'), -- Bayern
('DE', '03', 'DE5'), -- Bremen
('DE', '04', 'DE6'), -- Hamburg
('DE', '05', 'DE7'), -- Hessen
('DE', '06', 'DE9'), -- Niedersachsen
('DE', '07', 'DEA'), -- Nordrhein-Westfalen
('DE', '08', 'DEB'), -- Rheinland-Pfalz
('DE', '09', 'DEC'), -- Saarland
('DE', '10', 'DEF'), -- Schleswig-Holstein
('DE', '11', 'DE4'), -- Brandenburg
('DE', '12', 'DE8'), -- Mecklenburg-Vorpommern
('DE', '13', 'DED'), -- Sachsen
('DE', '14', 'DEE'), -- Sachsen-Anhalt
('DE', '15', 'DEG'), -- Thüringen
('DE', '16', 'DE3') -- Berlin
),
-- Latest NUTS-1 regional income per region
regional_income AS (
SELECT
m.country_code,
m.admin1_code,
r.regional_income_pps,
r.ref_year AS regional_income_year
FROM admin1_to_nuts1 m
JOIN staging.stg_regional_income r ON r.nuts1_code = m.nuts1_code
QUALIFY ROW_NUMBER() OVER (
PARTITION BY m.country_code, m.admin1_code ORDER BY r.ref_year DESC
) = 1
),
-- Padel court lat/lon for distance and density calculations
padel_courts AS (
SELECT lat, lon, country_code
@@ -159,8 +197,8 @@ SELECT
l.admin2_code,
l.population,
l.ref_year AS population_year,
ci.median_income_pps,
ci.income_year,
COALESCE(ri.regional_income_pps, ci.median_income_pps) AS median_income_pps,
COALESCE(ri.regional_income_year, ci.income_year) AS income_year,
COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count,
-- Venues per 100K residents (NULL if population = 0)
CASE WHEN l.population > 0
@@ -172,6 +210,8 @@ SELECT
CURRENT_DATE AS refreshed_date
FROM locations l
LEFT JOIN country_income ci ON l.country_code = ci.country_code
LEFT JOIN regional_income ri ON l.country_code = ri.country_code
AND l.admin1_code = ri.admin1_code
LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id
LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id
LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id

View File

@@ -0,0 +1,44 @@
-- Eurostat NUTS-1 regional household income in PPS (dataset: nama_10r_2hhinc).
-- Filters to NUTS-1 codes (exactly 3 characters, e.g. DE1, DE2, …).
-- One row per (nuts1_code, ref_year).
--
-- Source: data/landing/eurostat/{year}/{month}/nama_10r_2hhinc.json.gz
-- Format: {"rows": [{"geo_code": "DE1", "ref_year": "2022", "value": 29400}, ...]}
MODEL (
name staging.stg_regional_income,
kind FULL,
cron '@daily',
grain (nuts1_code, ref_year)
);
WITH source AS (
SELECT unnest(rows) AS r
FROM read_json(
@LANDING_DIR || '/eurostat/*/*/nama_10r_2hhinc.json.gz',
auto_detect = true
)
),
parsed AS (
SELECT
UPPER(TRIM(r.geo_code)) AS geo_code,
CAST(r.ref_year AS INTEGER) AS ref_year,
CAST(r.value AS DOUBLE) AS regional_income_pps,
CURRENT_DATE AS extracted_date
FROM source
WHERE r.value IS NOT NULL
)
SELECT
-- Normalise to ISO 3166-1 alpha-2 prefix: EL→GR, UK→GB
CASE
WHEN geo_code LIKE 'EL%' THEN 'GR' || SUBSTR(geo_code, 3)
WHEN geo_code LIKE 'UK%' THEN 'GB' || SUBSTR(geo_code, 3)
ELSE geo_code
END AS nuts1_code,
ref_year,
regional_income_pps,
extracted_date
FROM parsed
-- NUTS-1 codes are exactly 3 characters (country 2 + region 1)
WHERE LENGTH(geo_code) = 3
AND regional_income_pps > 0