feat(data): Phase 2a — NUTS-1 regional income for opportunity score
- eurostat.py: add nama_10r_2hhinc dataset config; append filter params to request URL so server pre-filters the large cube before download - stg_regional_income.sql: new staging model — reads nama_10r_2hhinc.json.gz, filters to NUTS-1 codes (3-char), normalises EL→GR / UK→GB - dim_locations.sql: add admin1_to_nuts1 VALUES CTE (16 German Bundesländer) + regional_income CTE; final SELECT uses COALESCE(regional, country) income - init_landing_seeds.py: add empty seed for nama_10r_2hhinc.json.gz Munich/Bayern now scores ~29K PPS vs Chemnitz/Sachsen ~19K PPS instead of both inheriting the same national average (~25.5K PPS). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -42,6 +42,15 @@ DATASETS: dict[str, dict] = {
|
||||
"geo_dim": "geo",
|
||||
"time_dim": "time",
|
||||
},
|
||||
"nama_10r_2hhinc": {
|
||||
"filters": { # Net household income per inhabitant in PPS (NUTS-2 grain, contains NUTS-1)
|
||||
"unit": "PPS_EU27_2020_HAB",
|
||||
"na_item": "B6N",
|
||||
"direct": "BAL",
|
||||
},
|
||||
"geo_dim": "geo",
|
||||
"time_dim": "time",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -189,6 +198,8 @@ def extract(
|
||||
|
||||
for dataset_code, config in DATASETS.items():
|
||||
url = f"{EUROSTAT_BASE_URL}/{dataset_code}?format=JSON&lang=EN"
|
||||
for key, val in config.get("filters", {}).items():
|
||||
url += f"&{key}={val}"
|
||||
dest_dir = landing_path(landing_dir, "eurostat", year, month)
|
||||
dest = dest_dir / f"{dataset_code}.json.gz"
|
||||
|
||||
|
||||
@@ -85,6 +85,8 @@ def main() -> None:
|
||||
json.dumps({"rows": [], "count": 0}).encode(),
|
||||
"eurostat/1970/01/ilc_di03.json.gz":
|
||||
json.dumps({"rows": [], "count": 0}).encode(),
|
||||
"eurostat/1970/01/nama_10r_2hhinc.json.gz":
|
||||
json.dumps({"rows": [], "count": 0}).encode(),
|
||||
"eurostat_city_labels/1970/01/cities_codelist.json.gz":
|
||||
json.dumps({"rows": [], "count": 0}).encode(),
|
||||
|
||||
|
||||
@@ -6,10 +6,15 @@
|
||||
-- covers all locations with population ≥ 1K so zero-court Gemeinden score fully.
|
||||
--
|
||||
-- Enriched with:
|
||||
-- stg_income → country-level median income PPS
|
||||
-- stg_income → country-level median income PPS (fallback)
|
||||
-- stg_regional_income → NUTS-1 regional income PPS (preferred where mapped)
|
||||
-- stg_padel_courts → padel venue count + nearest court distance (km)
|
||||
-- stg_tennis_courts → tennis court count within 25km radius
|
||||
--
|
||||
-- Income resolution: COALESCE(regional_income, country_income) AS median_income_pps.
|
||||
-- Germany: all 16 Bundesländer mapped via admin1_code → NUTS-1.
|
||||
-- All other countries fall back to country-level income (stg_income).
|
||||
--
|
||||
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
|
||||
-- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the
|
||||
-- exact sphere distance is computed.
|
||||
@@ -39,12 +44,45 @@ locations AS (
|
||||
FROM staging.stg_population_geonames
|
||||
WHERE lat IS NOT NULL AND lon IS NOT NULL
|
||||
),
|
||||
-- Country income (same source and pattern as dim_cities)
|
||||
-- Country income (same source and pattern as dim_cities) — fallback for unmapped regions
|
||||
country_income AS (
|
||||
SELECT country_code, median_income_pps, ref_year AS income_year
|
||||
FROM staging.stg_income
|
||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
|
||||
),
|
||||
-- Static mapping: GeoNames admin1_code → NUTS-1 code (Germany only; extend by adding rows)
|
||||
admin1_to_nuts1 (country_code, admin1_code, nuts1_code) AS (
|
||||
VALUES
|
||||
('DE', '01', 'DE1'), -- Baden-Württemberg
|
||||
('DE', '02', 'DE2'), -- Bayern
|
||||
('DE', '03', 'DE5'), -- Bremen
|
||||
('DE', '04', 'DE6'), -- Hamburg
|
||||
('DE', '05', 'DE7'), -- Hessen
|
||||
('DE', '06', 'DE9'), -- Niedersachsen
|
||||
('DE', '07', 'DEA'), -- Nordrhein-Westfalen
|
||||
('DE', '08', 'DEB'), -- Rheinland-Pfalz
|
||||
('DE', '09', 'DEC'), -- Saarland
|
||||
('DE', '10', 'DEF'), -- Schleswig-Holstein
|
||||
('DE', '11', 'DE4'), -- Brandenburg
|
||||
('DE', '12', 'DE8'), -- Mecklenburg-Vorpommern
|
||||
('DE', '13', 'DED'), -- Sachsen
|
||||
('DE', '14', 'DEE'), -- Sachsen-Anhalt
|
||||
('DE', '15', 'DEG'), -- Thüringen
|
||||
('DE', '16', 'DE3') -- Berlin
|
||||
),
|
||||
-- Latest NUTS-1 regional income per region
|
||||
regional_income AS (
|
||||
SELECT
|
||||
m.country_code,
|
||||
m.admin1_code,
|
||||
r.regional_income_pps,
|
||||
r.ref_year AS regional_income_year
|
||||
FROM admin1_to_nuts1 m
|
||||
JOIN staging.stg_regional_income r ON r.nuts1_code = m.nuts1_code
|
||||
QUALIFY ROW_NUMBER() OVER (
|
||||
PARTITION BY m.country_code, m.admin1_code ORDER BY r.ref_year DESC
|
||||
) = 1
|
||||
),
|
||||
-- Padel court lat/lon for distance and density calculations
|
||||
padel_courts AS (
|
||||
SELECT lat, lon, country_code
|
||||
@@ -159,8 +197,8 @@ SELECT
|
||||
l.admin2_code,
|
||||
l.population,
|
||||
l.ref_year AS population_year,
|
||||
ci.median_income_pps,
|
||||
ci.income_year,
|
||||
COALESCE(ri.regional_income_pps, ci.median_income_pps) AS median_income_pps,
|
||||
COALESCE(ri.regional_income_year, ci.income_year) AS income_year,
|
||||
COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count,
|
||||
-- Venues per 100K residents (NULL if population = 0)
|
||||
CASE WHEN l.population > 0
|
||||
@@ -172,6 +210,8 @@ SELECT
|
||||
CURRENT_DATE AS refreshed_date
|
||||
FROM locations l
|
||||
LEFT JOIN country_income ci ON l.country_code = ci.country_code
|
||||
LEFT JOIN regional_income ri ON l.country_code = ri.country_code
|
||||
AND l.admin1_code = ri.admin1_code
|
||||
LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id
|
||||
LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id
|
||||
LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
-- Eurostat NUTS-1 regional household income in PPS (dataset: nama_10r_2hhinc).
|
||||
-- Filters to NUTS-1 codes (exactly 3 characters, e.g. DE1, DE2, …).
|
||||
-- One row per (nuts1_code, ref_year).
|
||||
--
|
||||
-- Source: data/landing/eurostat/{year}/{month}/nama_10r_2hhinc.json.gz
|
||||
-- Format: {"rows": [{"geo_code": "DE1", "ref_year": "2022", "value": 29400}, ...]}
|
||||
|
||||
MODEL (
|
||||
name staging.stg_regional_income,
|
||||
kind FULL,
|
||||
cron '@daily',
|
||||
grain (nuts1_code, ref_year)
|
||||
);
|
||||
|
||||
WITH source AS (
|
||||
SELECT unnest(rows) AS r
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/eurostat/*/*/nama_10r_2hhinc.json.gz',
|
||||
auto_detect = true
|
||||
)
|
||||
),
|
||||
parsed AS (
|
||||
SELECT
|
||||
UPPER(TRIM(r.geo_code)) AS geo_code,
|
||||
CAST(r.ref_year AS INTEGER) AS ref_year,
|
||||
CAST(r.value AS DOUBLE) AS regional_income_pps,
|
||||
CURRENT_DATE AS extracted_date
|
||||
FROM source
|
||||
WHERE r.value IS NOT NULL
|
||||
)
|
||||
SELECT
|
||||
-- Normalise to ISO 3166-1 alpha-2 prefix: EL→GR, UK→GB
|
||||
CASE
|
||||
WHEN geo_code LIKE 'EL%' THEN 'GR' || SUBSTR(geo_code, 3)
|
||||
WHEN geo_code LIKE 'UK%' THEN 'GB' || SUBSTR(geo_code, 3)
|
||||
ELSE geo_code
|
||||
END AS nuts1_code,
|
||||
ref_year,
|
||||
regional_income_pps,
|
||||
extracted_date
|
||||
FROM parsed
|
||||
-- NUTS-1 codes are exactly 3 characters (country 2 + region 1)
|
||||
WHERE LENGTH(geo_code) = 3
|
||||
AND regional_income_pps > 0
|
||||
Reference in New Issue
Block a user