feat(data): Phase 2a — NUTS-1 regional income for opportunity score
- eurostat.py: add nama_10r_2hhinc dataset config; append filter params to request URL so server pre-filters the large cube before download - stg_regional_income.sql: new staging model — reads nama_10r_2hhinc.json.gz, filters to NUTS-1 codes (3-char), normalises EL→GR / UK→GB - dim_locations.sql: add admin1_to_nuts1 VALUES CTE (16 German Bundesländer) + regional_income CTE; final SELECT uses COALESCE(regional, country) income - init_landing_seeds.py: add empty seed for nama_10r_2hhinc.json.gz Munich/Bayern now scores ~29K PPS vs Chemnitz/Sachsen ~19K PPS instead of both inheriting the same national average (~25.5K PPS). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -42,6 +42,15 @@ DATASETS: dict[str, dict] = {
|
|||||||
"geo_dim": "geo",
|
"geo_dim": "geo",
|
||||||
"time_dim": "time",
|
"time_dim": "time",
|
||||||
},
|
},
|
||||||
|
"nama_10r_2hhinc": {
|
||||||
|
"filters": { # Net household income per inhabitant in PPS (NUTS-2 grain, contains NUTS-1)
|
||||||
|
"unit": "PPS_EU27_2020_HAB",
|
||||||
|
"na_item": "B6N",
|
||||||
|
"direct": "BAL",
|
||||||
|
},
|
||||||
|
"geo_dim": "geo",
|
||||||
|
"time_dim": "time",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -189,6 +198,8 @@ def extract(
|
|||||||
|
|
||||||
for dataset_code, config in DATASETS.items():
|
for dataset_code, config in DATASETS.items():
|
||||||
url = f"{EUROSTAT_BASE_URL}/{dataset_code}?format=JSON&lang=EN"
|
url = f"{EUROSTAT_BASE_URL}/{dataset_code}?format=JSON&lang=EN"
|
||||||
|
for key, val in config.get("filters", {}).items():
|
||||||
|
url += f"&{key}={val}"
|
||||||
dest_dir = landing_path(landing_dir, "eurostat", year, month)
|
dest_dir = landing_path(landing_dir, "eurostat", year, month)
|
||||||
dest = dest_dir / f"{dataset_code}.json.gz"
|
dest = dest_dir / f"{dataset_code}.json.gz"
|
||||||
|
|
||||||
|
|||||||
@@ -85,6 +85,8 @@ def main() -> None:
|
|||||||
json.dumps({"rows": [], "count": 0}).encode(),
|
json.dumps({"rows": [], "count": 0}).encode(),
|
||||||
"eurostat/1970/01/ilc_di03.json.gz":
|
"eurostat/1970/01/ilc_di03.json.gz":
|
||||||
json.dumps({"rows": [], "count": 0}).encode(),
|
json.dumps({"rows": [], "count": 0}).encode(),
|
||||||
|
"eurostat/1970/01/nama_10r_2hhinc.json.gz":
|
||||||
|
json.dumps({"rows": [], "count": 0}).encode(),
|
||||||
"eurostat_city_labels/1970/01/cities_codelist.json.gz":
|
"eurostat_city_labels/1970/01/cities_codelist.json.gz":
|
||||||
json.dumps({"rows": [], "count": 0}).encode(),
|
json.dumps({"rows": [], "count": 0}).encode(),
|
||||||
|
|
||||||
|
|||||||
@@ -6,10 +6,15 @@
|
|||||||
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
|
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
|
||||||
--
|
--
|
||||||
-- Enriched with:
|
-- Enriched with:
|
||||||
-- stg_income → country-level median income PPS
|
-- stg_income → country-level median income PPS (fallback)
|
||||||
|
-- stg_regional_income → NUTS-1 regional income PPS (preferred where mapped)
|
||||||
-- stg_padel_courts → padel venue count + nearest court distance (km)
|
-- stg_padel_courts → padel venue count + nearest court distance (km)
|
||||||
-- stg_tennis_courts → tennis court count within 25km radius
|
-- stg_tennis_courts → tennis court count within 25km radius
|
||||||
--
|
--
|
||||||
|
-- Income resolution: COALESCE(regional_income, country_income) AS median_income_pps.
|
||||||
|
-- Germany: all 16 Bundesländer mapped via admin1_code → NUTS-1.
|
||||||
|
-- All other countries fall back to country-level income (stg_income).
|
||||||
|
--
|
||||||
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
|
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
|
||||||
-- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the
|
-- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the
|
||||||
-- exact sphere distance is computed.
|
-- exact sphere distance is computed.
|
||||||
@@ -39,12 +44,45 @@ locations AS (
|
|||||||
FROM staging.stg_population_geonames
|
FROM staging.stg_population_geonames
|
||||||
WHERE lat IS NOT NULL AND lon IS NOT NULL
|
WHERE lat IS NOT NULL AND lon IS NOT NULL
|
||||||
),
|
),
|
||||||
-- Country income (same source and pattern as dim_cities)
|
-- Country income (same source and pattern as dim_cities) — fallback for unmapped regions
|
||||||
country_income AS (
|
country_income AS (
|
||||||
SELECT country_code, median_income_pps, ref_year AS income_year
|
SELECT country_code, median_income_pps, ref_year AS income_year
|
||||||
FROM staging.stg_income
|
FROM staging.stg_income
|
||||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
|
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
|
||||||
),
|
),
|
||||||
|
-- Static mapping: GeoNames admin1_code → NUTS-1 code (Germany only; extend by adding rows)
|
||||||
|
admin1_to_nuts1 (country_code, admin1_code, nuts1_code) AS (
|
||||||
|
VALUES
|
||||||
|
('DE', '01', 'DE1'), -- Baden-Württemberg
|
||||||
|
('DE', '02', 'DE2'), -- Bayern
|
||||||
|
('DE', '03', 'DE5'), -- Bremen
|
||||||
|
('DE', '04', 'DE6'), -- Hamburg
|
||||||
|
('DE', '05', 'DE7'), -- Hessen
|
||||||
|
('DE', '06', 'DE9'), -- Niedersachsen
|
||||||
|
('DE', '07', 'DEA'), -- Nordrhein-Westfalen
|
||||||
|
('DE', '08', 'DEB'), -- Rheinland-Pfalz
|
||||||
|
('DE', '09', 'DEC'), -- Saarland
|
||||||
|
('DE', '10', 'DEF'), -- Schleswig-Holstein
|
||||||
|
('DE', '11', 'DE4'), -- Brandenburg
|
||||||
|
('DE', '12', 'DE8'), -- Mecklenburg-Vorpommern
|
||||||
|
('DE', '13', 'DED'), -- Sachsen
|
||||||
|
('DE', '14', 'DEE'), -- Sachsen-Anhalt
|
||||||
|
('DE', '15', 'DEG'), -- Thüringen
|
||||||
|
('DE', '16', 'DE3') -- Berlin
|
||||||
|
),
|
||||||
|
-- Latest NUTS-1 regional income per region
|
||||||
|
regional_income AS (
|
||||||
|
SELECT
|
||||||
|
m.country_code,
|
||||||
|
m.admin1_code,
|
||||||
|
r.regional_income_pps,
|
||||||
|
r.ref_year AS regional_income_year
|
||||||
|
FROM admin1_to_nuts1 m
|
||||||
|
JOIN staging.stg_regional_income r ON r.nuts1_code = m.nuts1_code
|
||||||
|
QUALIFY ROW_NUMBER() OVER (
|
||||||
|
PARTITION BY m.country_code, m.admin1_code ORDER BY r.ref_year DESC
|
||||||
|
) = 1
|
||||||
|
),
|
||||||
-- Padel court lat/lon for distance and density calculations
|
-- Padel court lat/lon for distance and density calculations
|
||||||
padel_courts AS (
|
padel_courts AS (
|
||||||
SELECT lat, lon, country_code
|
SELECT lat, lon, country_code
|
||||||
@@ -159,8 +197,8 @@ SELECT
|
|||||||
l.admin2_code,
|
l.admin2_code,
|
||||||
l.population,
|
l.population,
|
||||||
l.ref_year AS population_year,
|
l.ref_year AS population_year,
|
||||||
ci.median_income_pps,
|
COALESCE(ri.regional_income_pps, ci.median_income_pps) AS median_income_pps,
|
||||||
ci.income_year,
|
COALESCE(ri.regional_income_year, ci.income_year) AS income_year,
|
||||||
COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count,
|
COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count,
|
||||||
-- Venues per 100K residents (NULL if population = 0)
|
-- Venues per 100K residents (NULL if population = 0)
|
||||||
CASE WHEN l.population > 0
|
CASE WHEN l.population > 0
|
||||||
@@ -172,6 +210,8 @@ SELECT
|
|||||||
CURRENT_DATE AS refreshed_date
|
CURRENT_DATE AS refreshed_date
|
||||||
FROM locations l
|
FROM locations l
|
||||||
LEFT JOIN country_income ci ON l.country_code = ci.country_code
|
LEFT JOIN country_income ci ON l.country_code = ci.country_code
|
||||||
|
LEFT JOIN regional_income ri ON l.country_code = ri.country_code
|
||||||
|
AND l.admin1_code = ri.admin1_code
|
||||||
LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id
|
LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id
|
||||||
LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id
|
LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id
|
||||||
LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id
|
LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id
|
||||||
|
|||||||
@@ -0,0 +1,44 @@
|
|||||||
|
-- Eurostat NUTS-1 regional household income in PPS (dataset: nama_10r_2hhinc).
|
||||||
|
-- Filters to NUTS-1 codes (exactly 3 characters, e.g. DE1, DE2, …).
|
||||||
|
-- One row per (nuts1_code, ref_year).
|
||||||
|
--
|
||||||
|
-- Source: data/landing/eurostat/{year}/{month}/nama_10r_2hhinc.json.gz
|
||||||
|
-- Format: {"rows": [{"geo_code": "DE1", "ref_year": "2022", "value": 29400}, ...]}
|
||||||
|
|
||||||
|
MODEL (
|
||||||
|
name staging.stg_regional_income,
|
||||||
|
kind FULL,
|
||||||
|
cron '@daily',
|
||||||
|
grain (nuts1_code, ref_year)
|
||||||
|
);
|
||||||
|
|
||||||
|
WITH source AS (
|
||||||
|
SELECT unnest(rows) AS r
|
||||||
|
FROM read_json(
|
||||||
|
@LANDING_DIR || '/eurostat/*/*/nama_10r_2hhinc.json.gz',
|
||||||
|
auto_detect = true
|
||||||
|
)
|
||||||
|
),
|
||||||
|
parsed AS (
|
||||||
|
SELECT
|
||||||
|
UPPER(TRIM(r.geo_code)) AS geo_code,
|
||||||
|
CAST(r.ref_year AS INTEGER) AS ref_year,
|
||||||
|
CAST(r.value AS DOUBLE) AS regional_income_pps,
|
||||||
|
CURRENT_DATE AS extracted_date
|
||||||
|
FROM source
|
||||||
|
WHERE r.value IS NOT NULL
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
-- Normalise to ISO 3166-1 alpha-2 prefix: EL→GR, UK→GB
|
||||||
|
CASE
|
||||||
|
WHEN geo_code LIKE 'EL%' THEN 'GR' || SUBSTR(geo_code, 3)
|
||||||
|
WHEN geo_code LIKE 'UK%' THEN 'GB' || SUBSTR(geo_code, 3)
|
||||||
|
ELSE geo_code
|
||||||
|
END AS nuts1_code,
|
||||||
|
ref_year,
|
||||||
|
regional_income_pps,
|
||||||
|
extracted_date
|
||||||
|
FROM parsed
|
||||||
|
-- NUTS-1 codes are exactly 3 characters (country 2 + region 1)
|
||||||
|
WHERE LENGTH(geo_code) = 3
|
||||||
|
AND regional_income_pps > 0
|
||||||
Reference in New Issue
Block a user