feat(data): Phase 2a — NUTS-1 regional income for opportunity score

- eurostat.py: add nama_10r_2hhinc dataset config; append filter params to request URL so server pre-filters the large cube before download - stg_regional_income.sql: new staging model — reads nama_10r_2hhinc.json.gz, filters to NUTS-1 codes (3-char), normalises EL→GR / UK→GB - dim_locations.sql: add admin1_to_nuts1 VALUES CTE (16 German Bundesländer) + regional_income CTE; final SELECT uses COALESCE(regional, country) income - init_landing_seeds.py: add empty seed for nama_10r_2hhinc.json.gz Munich/Bayern now scores ~29K PPS vs Chemnitz/Sachsen ~19K PPS instead of both inheriting the same national average (~25.5K PPS). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-27 10:26:15 +01:00
parent e32f7ba4b8
commit 5ade38eeaf
4 changed files with 101 additions and 4 deletions
--- a/extract/padelnomics_extract/src/padelnomics_extract/eurostat.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/eurostat.py
@@ -42,6 +42,15 @@ DATASETS: dict[str, dict] = {
        "geo_dim": "geo",
        "time_dim": "time",
    },
+    "nama_10r_2hhinc": {
+        "filters": {  # Net household income per inhabitant in PPS (NUTS-2 grain, contains NUTS-1)
+            "unit": "PPS_EU27_2020_HAB",
+            "na_item": "B6N",
+            "direct": "BAL",
+        },
+        "geo_dim": "geo",
+        "time_dim": "time",
+    },
 }


@@ -189,6 +198,8 @@ def extract(

    for dataset_code, config in DATASETS.items():
        url = f"{EUROSTAT_BASE_URL}/{dataset_code}?format=JSON&lang=EN"
+        for key, val in config.get("filters", {}).items():
+            url += f"&{key}={val}"
        dest_dir = landing_path(landing_dir, "eurostat", year, month)
        dest = dest_dir / f"{dataset_code}.json.gz"

--- a/scripts/init_landing_seeds.py
+++ b/scripts/init_landing_seeds.py
@@ -85,6 +85,8 @@ def main() -> None:
            json.dumps({"rows": [], "count": 0}).encode(),
        "eurostat/1970/01/ilc_di03.json.gz":
            json.dumps({"rows": [], "count": 0}).encode(),
+        "eurostat/1970/01/nama_10r_2hhinc.json.gz":
+            json.dumps({"rows": [], "count": 0}).encode(),
        "eurostat_city_labels/1970/01/cities_codelist.json.gz":
            json.dumps({"rows": [], "count": 0}).encode(),

--- a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql
+++ b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql
@@ -6,10 +6,15 @@
 -- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
 --
 -- Enriched with:
--   stg_income           → country-level median income PPS
+--   stg_income           → country-level median income PPS (fallback)
+--   stg_regional_income  → NUTS-1 regional income PPS (preferred where mapped)
 --   stg_padel_courts     → padel venue count + nearest court distance (km)
 --   stg_tennis_courts    → tennis court count within 25km radius
 --
+-- Income resolution: COALESCE(regional_income, country_income) AS median_income_pps.
+-- Germany: all 16 Bundesländer mapped via admin1_code → NUTS-1.
+-- All other countries fall back to country-level income (stg_income).
+--
 -- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
 -- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the
 -- exact sphere distance is computed.
@@ -39,12 +44,45 @@ locations AS (
  FROM staging.stg_population_geonames
  WHERE lat IS NOT NULL AND lon IS NOT NULL
 ),
-- Country income (same source and pattern as dim_cities)
+-- Country income (same source and pattern as dim_cities) — fallback for unmapped regions
 country_income AS (
  SELECT country_code, median_income_pps, ref_year AS income_year
  FROM staging.stg_income
  QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
 ),
+-- Static mapping: GeoNames admin1_code → NUTS-1 code (Germany only; extend by adding rows)
+admin1_to_nuts1 (country_code, admin1_code, nuts1_code) AS (
+  VALUES
+    ('DE', '01', 'DE1'),  -- Baden-Württemberg
+    ('DE', '02', 'DE2'),  -- Bayern
+    ('DE', '03', 'DE5'),  -- Bremen
+    ('DE', '04', 'DE6'),  -- Hamburg
+    ('DE', '05', 'DE7'),  -- Hessen
+    ('DE', '06', 'DE9'),  -- Niedersachsen
+    ('DE', '07', 'DEA'),  -- Nordrhein-Westfalen
+    ('DE', '08', 'DEB'),  -- Rheinland-Pfalz
+    ('DE', '09', 'DEC'),  -- Saarland
+    ('DE', '10', 'DEF'),  -- Schleswig-Holstein
+    ('DE', '11', 'DE4'),  -- Brandenburg
+    ('DE', '12', 'DE8'),  -- Mecklenburg-Vorpommern
+    ('DE', '13', 'DED'),  -- Sachsen
+    ('DE', '14', 'DEE'),  -- Sachsen-Anhalt
+    ('DE', '15', 'DEG'),  -- Thüringen
+    ('DE', '16', 'DE3')   -- Berlin
+),
+-- Latest NUTS-1 regional income per region
+regional_income AS (
+  SELECT
+    m.country_code,
+    m.admin1_code,
+    r.regional_income_pps,
+    r.ref_year AS regional_income_year
+  FROM admin1_to_nuts1 m
+  JOIN staging.stg_regional_income r ON r.nuts1_code = m.nuts1_code
+  QUALIFY ROW_NUMBER() OVER (
+    PARTITION BY m.country_code, m.admin1_code ORDER BY r.ref_year DESC
+  ) = 1
+),
 -- Padel court lat/lon for distance and density calculations
 padel_courts AS (
  SELECT lat, lon, country_code
@@ -159,8 +197,8 @@ SELECT
  l.admin2_code,
  l.population,
  l.ref_year                                                             AS population_year,
-  ci.median_income_pps,
-  ci.income_year,
+  COALESCE(ri.regional_income_pps, ci.median_income_pps)               AS median_income_pps,
+  COALESCE(ri.regional_income_year, ci.income_year)                    AS income_year,
  COALESCE(pl.padel_venue_count, 0)::INTEGER                            AS padel_venue_count,
  -- Venues per 100K residents (NULL if population = 0)
  CASE WHEN l.population > 0
@@ -172,6 +210,8 @@ SELECT
  CURRENT_DATE                                                          AS refreshed_date
 FROM locations l
 LEFT JOIN country_income ci   ON l.country_code = ci.country_code
+LEFT JOIN regional_income ri  ON l.country_code = ri.country_code
+                              AND l.admin1_code  = ri.admin1_code
 LEFT JOIN nearest_padel np    ON l.geoname_id   = np.geoname_id
 LEFT JOIN padel_local pl      ON l.geoname_id   = pl.geoname_id
 LEFT JOIN tennis_nearby tn    ON l.geoname_id   = tn.geoname_id
--- a/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql
+++ b/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql
@@ -0,0 +1,44 @@
+-- Eurostat NUTS-1 regional household income in PPS (dataset: nama_10r_2hhinc).
+-- Filters to NUTS-1 codes (exactly 3 characters, e.g. DE1, DE2, …).
+-- One row per (nuts1_code, ref_year).
+--
+-- Source: data/landing/eurostat/{year}/{month}/nama_10r_2hhinc.json.gz
+-- Format: {"rows": [{"geo_code": "DE1", "ref_year": "2022", "value": 29400}, ...]}
+
+MODEL (
+  name staging.stg_regional_income,
+  kind FULL,
+  cron '@daily',
+  grain (nuts1_code, ref_year)
+);
+
+WITH source AS (
+  SELECT unnest(rows) AS r
+  FROM read_json(
+    @LANDING_DIR || '/eurostat/*/*/nama_10r_2hhinc.json.gz',
+    auto_detect = true
+  )
+),
+parsed AS (
+  SELECT
+    UPPER(TRIM(r.geo_code))        AS geo_code,
+    CAST(r.ref_year AS INTEGER)    AS ref_year,
+    CAST(r.value AS DOUBLE)        AS regional_income_pps,
+    CURRENT_DATE                   AS extracted_date
+  FROM source
+  WHERE r.value IS NOT NULL
+)
+SELECT
+  -- Normalise to ISO 3166-1 alpha-2 prefix: EL→GR, UK→GB
+  CASE
+    WHEN geo_code LIKE 'EL%' THEN 'GR' || SUBSTR(geo_code, 3)
+    WHEN geo_code LIKE 'UK%' THEN 'GB' || SUBSTR(geo_code, 3)
+    ELSE geo_code
+  END                              AS nuts1_code,
+  ref_year,
+  regional_income_pps,
+  extracted_date
+FROM parsed
+-- NUTS-1 codes are exactly 3 characters (country 2 + region 1)
+WHERE LENGTH(geo_code) = 3
+  AND regional_income_pps > 0