diff --git a/extract/padelnomics_extract/src/padelnomics_extract/eurostat.py b/extract/padelnomics_extract/src/padelnomics_extract/eurostat.py index c7dcfe0..ee8c429 100644 --- a/extract/padelnomics_extract/src/padelnomics_extract/eurostat.py +++ b/extract/padelnomics_extract/src/padelnomics_extract/eurostat.py @@ -42,6 +42,15 @@ DATASETS: dict[str, dict] = { "geo_dim": "geo", "time_dim": "time", }, + "nama_10r_2hhinc": { + "filters": { # Net household income per inhabitant in PPS (NUTS-2 grain, contains NUTS-1) + "unit": "PPS_EU27_2020_HAB", + "na_item": "B6N", + "direct": "BAL", + }, + "geo_dim": "geo", + "time_dim": "time", + }, } @@ -189,6 +198,8 @@ def extract( for dataset_code, config in DATASETS.items(): url = f"{EUROSTAT_BASE_URL}/{dataset_code}?format=JSON&lang=EN" + for key, val in config.get("filters", {}).items(): + url += f"&{key}={val}" dest_dir = landing_path(landing_dir, "eurostat", year, month) dest = dest_dir / f"{dataset_code}.json.gz" diff --git a/scripts/init_landing_seeds.py b/scripts/init_landing_seeds.py index 9ffc743..9705b51 100644 --- a/scripts/init_landing_seeds.py +++ b/scripts/init_landing_seeds.py @@ -85,6 +85,8 @@ def main() -> None: json.dumps({"rows": [], "count": 0}).encode(), "eurostat/1970/01/ilc_di03.json.gz": json.dumps({"rows": [], "count": 0}).encode(), + "eurostat/1970/01/nama_10r_2hhinc.json.gz": + json.dumps({"rows": [], "count": 0}).encode(), "eurostat_city_labels/1970/01/cities_codelist.json.gz": json.dumps({"rows": [], "count": 0}).encode(), diff --git a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql index 13b4a0c..8acc4d3 100644 --- a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql +++ b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql @@ -6,10 +6,15 @@ -- covers all locations with population ≥ 1K so zero-court Gemeinden score fully. -- -- Enriched with: --- stg_income → country-level median income PPS +-- stg_income → country-level median income PPS (fallback) +-- stg_regional_income → NUTS-1 regional income PPS (preferred where mapped) -- stg_padel_courts → padel venue count + nearest court distance (km) -- stg_tennis_courts → tennis court count within 25km radius -- +-- Income resolution: COALESCE(regional_income, country_income) AS median_income_pps. +-- Germany: all 16 Bundesländer mapped via admin1_code → NUTS-1. +-- All other countries fall back to country-level income (stg_income). +-- -- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension). -- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the -- exact sphere distance is computed. @@ -39,12 +44,45 @@ locations AS ( FROM staging.stg_population_geonames WHERE lat IS NOT NULL AND lon IS NOT NULL ), --- Country income (same source and pattern as dim_cities) +-- Country income (same source and pattern as dim_cities) — fallback for unmapped regions country_income AS ( SELECT country_code, median_income_pps, ref_year AS income_year FROM staging.stg_income QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1 ), +-- Static mapping: GeoNames admin1_code → NUTS-1 code (Germany only; extend by adding rows) +admin1_to_nuts1 (country_code, admin1_code, nuts1_code) AS ( + VALUES + ('DE', '01', 'DE1'), -- Baden-Württemberg + ('DE', '02', 'DE2'), -- Bayern + ('DE', '03', 'DE5'), -- Bremen + ('DE', '04', 'DE6'), -- Hamburg + ('DE', '05', 'DE7'), -- Hessen + ('DE', '06', 'DE9'), -- Niedersachsen + ('DE', '07', 'DEA'), -- Nordrhein-Westfalen + ('DE', '08', 'DEB'), -- Rheinland-Pfalz + ('DE', '09', 'DEC'), -- Saarland + ('DE', '10', 'DEF'), -- Schleswig-Holstein + ('DE', '11', 'DE4'), -- Brandenburg + ('DE', '12', 'DE8'), -- Mecklenburg-Vorpommern + ('DE', '13', 'DED'), -- Sachsen + ('DE', '14', 'DEE'), -- Sachsen-Anhalt + ('DE', '15', 'DEG'), -- Thüringen + ('DE', '16', 'DE3') -- Berlin +), +-- Latest NUTS-1 regional income per region +regional_income AS ( + SELECT + m.country_code, + m.admin1_code, + r.regional_income_pps, + r.ref_year AS regional_income_year + FROM admin1_to_nuts1 m + JOIN staging.stg_regional_income r ON r.nuts1_code = m.nuts1_code + QUALIFY ROW_NUMBER() OVER ( + PARTITION BY m.country_code, m.admin1_code ORDER BY r.ref_year DESC + ) = 1 +), -- Padel court lat/lon for distance and density calculations padel_courts AS ( SELECT lat, lon, country_code @@ -159,8 +197,8 @@ SELECT l.admin2_code, l.population, l.ref_year AS population_year, - ci.median_income_pps, - ci.income_year, + COALESCE(ri.regional_income_pps, ci.median_income_pps) AS median_income_pps, + COALESCE(ri.regional_income_year, ci.income_year) AS income_year, COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count, -- Venues per 100K residents (NULL if population = 0) CASE WHEN l.population > 0 @@ -172,6 +210,8 @@ SELECT CURRENT_DATE AS refreshed_date FROM locations l LEFT JOIN country_income ci ON l.country_code = ci.country_code +LEFT JOIN regional_income ri ON l.country_code = ri.country_code + AND l.admin1_code = ri.admin1_code LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql b/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql new file mode 100644 index 0000000..ceae1d9 --- /dev/null +++ b/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql @@ -0,0 +1,44 @@ +-- Eurostat NUTS-1 regional household income in PPS (dataset: nama_10r_2hhinc). +-- Filters to NUTS-1 codes (exactly 3 characters, e.g. DE1, DE2, …). +-- One row per (nuts1_code, ref_year). +-- +-- Source: data/landing/eurostat/{year}/{month}/nama_10r_2hhinc.json.gz +-- Format: {"rows": [{"geo_code": "DE1", "ref_year": "2022", "value": 29400}, ...]} + +MODEL ( + name staging.stg_regional_income, + kind FULL, + cron '@daily', + grain (nuts1_code, ref_year) +); + +WITH source AS ( + SELECT unnest(rows) AS r + FROM read_json( + @LANDING_DIR || '/eurostat/*/*/nama_10r_2hhinc.json.gz', + auto_detect = true + ) +), +parsed AS ( + SELECT + UPPER(TRIM(r.geo_code)) AS geo_code, + CAST(r.ref_year AS INTEGER) AS ref_year, + CAST(r.value AS DOUBLE) AS regional_income_pps, + CURRENT_DATE AS extracted_date + FROM source + WHERE r.value IS NOT NULL +) +SELECT + -- Normalise to ISO 3166-1 alpha-2 prefix: EL→GR, UK→GB + CASE + WHEN geo_code LIKE 'EL%' THEN 'GR' || SUBSTR(geo_code, 3) + WHEN geo_code LIKE 'UK%' THEN 'GB' || SUBSTR(geo_code, 3) + ELSE geo_code + END AS nuts1_code, + ref_year, + regional_income_pps, + extracted_date +FROM parsed +-- NUTS-1 codes are exactly 3 characters (country 2 + region 1) +WHERE LENGTH(geo_code) = 3 + AND regional_income_pps > 0