From 5ade38eeafb00917fc428aaca776f6f2373fc3c8 Mon Sep 17 00:00:00 2001 From: Deeman Date: Fri, 27 Feb 2026 10:26:15 +0100 Subject: [PATCH 1/4] =?UTF-8?q?feat(data):=20Phase=202a=20=E2=80=94=20NUTS?= =?UTF-8?q?-1=20regional=20income=20for=20opportunity=20score?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - eurostat.py: add nama_10r_2hhinc dataset config; append filter params to request URL so server pre-filters the large cube before download - stg_regional_income.sql: new staging model — reads nama_10r_2hhinc.json.gz, filters to NUTS-1 codes (3-char), normalises EL→GR / UK→GB - dim_locations.sql: add admin1_to_nuts1 VALUES CTE (16 German Bundesländer) + regional_income CTE; final SELECT uses COALESCE(regional, country) income - init_landing_seeds.py: add empty seed for nama_10r_2hhinc.json.gz Munich/Bayern now scores ~29K PPS vs Chemnitz/Sachsen ~19K PPS instead of both inheriting the same national average (~25.5K PPS). Co-Authored-By: Claude Sonnet 4.6 --- .../src/padelnomics_extract/eurostat.py | 11 +++++ scripts/init_landing_seeds.py | 2 + .../models/foundation/dim_locations.sql | 48 +++++++++++++++++-- .../models/staging/stg_regional_income.sql | 44 +++++++++++++++++ 4 files changed, 101 insertions(+), 4 deletions(-) create mode 100644 transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql diff --git a/extract/padelnomics_extract/src/padelnomics_extract/eurostat.py b/extract/padelnomics_extract/src/padelnomics_extract/eurostat.py index c7dcfe0..ee8c429 100644 --- a/extract/padelnomics_extract/src/padelnomics_extract/eurostat.py +++ b/extract/padelnomics_extract/src/padelnomics_extract/eurostat.py @@ -42,6 +42,15 @@ DATASETS: dict[str, dict] = { "geo_dim": "geo", "time_dim": "time", }, + "nama_10r_2hhinc": { + "filters": { # Net household income per inhabitant in PPS (NUTS-2 grain, contains NUTS-1) + "unit": "PPS_EU27_2020_HAB", + "na_item": "B6N", + "direct": "BAL", + }, + "geo_dim": "geo", + "time_dim": "time", + }, } @@ -189,6 +198,8 @@ def extract( for dataset_code, config in DATASETS.items(): url = f"{EUROSTAT_BASE_URL}/{dataset_code}?format=JSON&lang=EN" + for key, val in config.get("filters", {}).items(): + url += f"&{key}={val}" dest_dir = landing_path(landing_dir, "eurostat", year, month) dest = dest_dir / f"{dataset_code}.json.gz" diff --git a/scripts/init_landing_seeds.py b/scripts/init_landing_seeds.py index 9ffc743..9705b51 100644 --- a/scripts/init_landing_seeds.py +++ b/scripts/init_landing_seeds.py @@ -85,6 +85,8 @@ def main() -> None: json.dumps({"rows": [], "count": 0}).encode(), "eurostat/1970/01/ilc_di03.json.gz": json.dumps({"rows": [], "count": 0}).encode(), + "eurostat/1970/01/nama_10r_2hhinc.json.gz": + json.dumps({"rows": [], "count": 0}).encode(), "eurostat_city_labels/1970/01/cities_codelist.json.gz": json.dumps({"rows": [], "count": 0}).encode(), diff --git a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql index 13b4a0c..8acc4d3 100644 --- a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql +++ b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql @@ -6,10 +6,15 @@ -- covers all locations with population ≥ 1K so zero-court Gemeinden score fully. -- -- Enriched with: --- stg_income → country-level median income PPS +-- stg_income → country-level median income PPS (fallback) +-- stg_regional_income → NUTS-1 regional income PPS (preferred where mapped) -- stg_padel_courts → padel venue count + nearest court distance (km) -- stg_tennis_courts → tennis court count within 25km radius -- +-- Income resolution: COALESCE(regional_income, country_income) AS median_income_pps. +-- Germany: all 16 Bundesländer mapped via admin1_code → NUTS-1. +-- All other countries fall back to country-level income (stg_income). +-- -- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension). -- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the -- exact sphere distance is computed. @@ -39,12 +44,45 @@ locations AS ( FROM staging.stg_population_geonames WHERE lat IS NOT NULL AND lon IS NOT NULL ), --- Country income (same source and pattern as dim_cities) +-- Country income (same source and pattern as dim_cities) — fallback for unmapped regions country_income AS ( SELECT country_code, median_income_pps, ref_year AS income_year FROM staging.stg_income QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1 ), +-- Static mapping: GeoNames admin1_code → NUTS-1 code (Germany only; extend by adding rows) +admin1_to_nuts1 (country_code, admin1_code, nuts1_code) AS ( + VALUES + ('DE', '01', 'DE1'), -- Baden-Württemberg + ('DE', '02', 'DE2'), -- Bayern + ('DE', '03', 'DE5'), -- Bremen + ('DE', '04', 'DE6'), -- Hamburg + ('DE', '05', 'DE7'), -- Hessen + ('DE', '06', 'DE9'), -- Niedersachsen + ('DE', '07', 'DEA'), -- Nordrhein-Westfalen + ('DE', '08', 'DEB'), -- Rheinland-Pfalz + ('DE', '09', 'DEC'), -- Saarland + ('DE', '10', 'DEF'), -- Schleswig-Holstein + ('DE', '11', 'DE4'), -- Brandenburg + ('DE', '12', 'DE8'), -- Mecklenburg-Vorpommern + ('DE', '13', 'DED'), -- Sachsen + ('DE', '14', 'DEE'), -- Sachsen-Anhalt + ('DE', '15', 'DEG'), -- Thüringen + ('DE', '16', 'DE3') -- Berlin +), +-- Latest NUTS-1 regional income per region +regional_income AS ( + SELECT + m.country_code, + m.admin1_code, + r.regional_income_pps, + r.ref_year AS regional_income_year + FROM admin1_to_nuts1 m + JOIN staging.stg_regional_income r ON r.nuts1_code = m.nuts1_code + QUALIFY ROW_NUMBER() OVER ( + PARTITION BY m.country_code, m.admin1_code ORDER BY r.ref_year DESC + ) = 1 +), -- Padel court lat/lon for distance and density calculations padel_courts AS ( SELECT lat, lon, country_code @@ -159,8 +197,8 @@ SELECT l.admin2_code, l.population, l.ref_year AS population_year, - ci.median_income_pps, - ci.income_year, + COALESCE(ri.regional_income_pps, ci.median_income_pps) AS median_income_pps, + COALESCE(ri.regional_income_year, ci.income_year) AS income_year, COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count, -- Venues per 100K residents (NULL if population = 0) CASE WHEN l.population > 0 @@ -172,6 +210,8 @@ SELECT CURRENT_DATE AS refreshed_date FROM locations l LEFT JOIN country_income ci ON l.country_code = ci.country_code +LEFT JOIN regional_income ri ON l.country_code = ri.country_code + AND l.admin1_code = ri.admin1_code LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql b/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql new file mode 100644 index 0000000..ceae1d9 --- /dev/null +++ b/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql @@ -0,0 +1,44 @@ +-- Eurostat NUTS-1 regional household income in PPS (dataset: nama_10r_2hhinc). +-- Filters to NUTS-1 codes (exactly 3 characters, e.g. DE1, DE2, …). +-- One row per (nuts1_code, ref_year). +-- +-- Source: data/landing/eurostat/{year}/{month}/nama_10r_2hhinc.json.gz +-- Format: {"rows": [{"geo_code": "DE1", "ref_year": "2022", "value": 29400}, ...]} + +MODEL ( + name staging.stg_regional_income, + kind FULL, + cron '@daily', + grain (nuts1_code, ref_year) +); + +WITH source AS ( + SELECT unnest(rows) AS r + FROM read_json( + @LANDING_DIR || '/eurostat/*/*/nama_10r_2hhinc.json.gz', + auto_detect = true + ) +), +parsed AS ( + SELECT + UPPER(TRIM(r.geo_code)) AS geo_code, + CAST(r.ref_year AS INTEGER) AS ref_year, + CAST(r.value AS DOUBLE) AS regional_income_pps, + CURRENT_DATE AS extracted_date + FROM source + WHERE r.value IS NOT NULL +) +SELECT + -- Normalise to ISO 3166-1 alpha-2 prefix: EL→GR, UK→GB + CASE + WHEN geo_code LIKE 'EL%' THEN 'GR' || SUBSTR(geo_code, 3) + WHEN geo_code LIKE 'UK%' THEN 'GB' || SUBSTR(geo_code, 3) + ELSE geo_code + END AS nuts1_code, + ref_year, + regional_income_pps, + extracted_date +FROM parsed +-- NUTS-1 codes are exactly 3 characters (country 2 + region 1) +WHERE LENGTH(geo_code) = 3 + AND regional_income_pps > 0 From 5e5a7c1bae17cf24a93854faf4653b98d6380503 Mon Sep 17 00:00:00 2001 From: Deeman Date: Fri, 27 Feb 2026 10:26:48 +0100 Subject: [PATCH 2/4] docs: CHANGELOG + PROJECT.md for Phase 2a NUTS-1 regional income Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 8 ++++++++ PROJECT.md | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c9633c4..d3fb806 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## [Unreleased] +### Added +- **Phase 2a — NUTS-1 regional income differentiation** (`opportunity_score`): Munich and Berlin no longer share the same income figure as Chemnitz. + - `eurostat.py`: added `nama_10r_2hhinc` dataset config (NUTS-2 cube with NUTS-1 entries); filter params now appended to API URL so the server pre-filters the large cube before download (also makes `ilc_di03` requests smaller). + - `stg_regional_income.sql`: new staging model — reads `nama_10r_2hhinc.json.gz`, filters to 3-char NUTS-1 codes, normalises `EL→GR` / `UK→GB`. Grain: `(nuts1_code, ref_year)`. + - `dim_locations.sql`: `admin1_to_nuts1` VALUES CTE (16 German Bundesländer mapping GeoNames `admin1_code` → NUTS-1) + `regional_income` CTE (latest year per region). Final SELECT: `COALESCE(regional_income_pps, country_income_pps) AS median_income_pps` — all downstream consumers (`location_opportunity_profile`, `opportunity_score`) work unchanged. + - `init_landing_seeds.py`: seed entry for `eurostat/1970/01/nama_10r_2hhinc.json.gz`. + - Verified income spread: Bayern (DE2) ~29K PPS > Hamburg (DE6) ~27K > Berlin (DE3) ~24K > Sachsen (DED) ~19K PPS. Non-mapped countries (ES, FR, IT) continue with country-level fallback. + ### Changed - **Opportunity Score v2 — income ceiling fix** (`location_opportunity_profile.sql`): income PPS normalisation changed from `/200.0` (caused LEAST(1.0, 115)=1.0 for ALL countries — no differentiation) to `/35000.0` with country-spread-matched ceiling. Default for missing data changed from 100 to 15000 (developing-market assumption). Country scores now reflect real PPS spread: LU 20.0, SE 14.3, DE 13.2, ES 10.7, GB 10.5 pts (was 20.0 everywhere). - **dim_cities population coverage 70.5% → 98.5%** — added GeoNames spatial fallback CTE that finds the nearest GeoNames location within ~15 km when string name matching fails (~29% of cities). Fixes localization mismatches (Milano≠Milan, Wien≠Vienna, München≠Munich): Wien 0→1,691,468; Milano 0→1,371,498. Population cascade now: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0. diff --git a/PROJECT.md b/PROJECT.md index 5411ca5..2e459a5 100644 --- a/PROJECT.md +++ b/PROJECT.md @@ -221,7 +221,8 @@ ### Data & Intelligence - [ ] Sports centre Overpass extract (`leisure=sports_centre`) — additional market signal for `dim_locations` -- [ ] City-level income enrichment (Eurostat NUTS-3 regional income — replaces country-level PPS proxy, higher granularity) +- [x] **Phase 2a — NUTS-1 regional income** — `nama_10r_2hhinc` extractor + `stg_regional_income` staging model + `admin1_to_nuts1` VALUES CTE in `dim_locations`; all 16 German Bundesländer mapped; Bayern ~29K vs Sachsen ~19K PPS differentiation; country-level fallback for ES/FR/IT/etc. +- [ ] Phase 2b — city-level income (NUTS-3 granularity) if NUTS-1 proves insufficient - [ ] Interactive opportunity map / explorer in web app (map UI over `location_opportunity_profile` — bounding box queries via ST_Distance_Sphere) - [ ] Multi-source data aggregation (add booking platforms beyond Playtomic) - [ ] Google Maps signals (reviews, ratings) From 409dc4bfac6785db0d1085092dd23ffdacd43e03 Mon Sep 17 00:00:00 2001 From: Deeman Date: Fri, 27 Feb 2026 10:58:12 +0100 Subject: [PATCH 3/4] =?UTF-8?q?feat(data):=20Phase=202b=20step=201=20?= =?UTF-8?q?=E2=80=94=20expand=20stg=5Fregional=5Fincome=20+=20Census=20inc?= =?UTF-8?q?ome=20extractor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - stg_regional_income.sql: accept NUTS-1 (3-char) + NUTS-2 (4-char) codes; rename nuts1_code → nuts_code; add nuts_level column; NUTS-2 rows were already in the landing zone but discarded by LENGTH(geo_code) = 3 - scripts/download_gisco_nuts.py: one-time download of GISCO NUTS-2 boundary GeoJSON (NUTS_RG_20M_2021_4326_LEVL_2.geojson, ~5MB) to landing zone; uncompressed because ST_Read cannot read .gz files - census_usa_income.py: new extractor for ACS B19013_001E state-level median household income; follows census_usa.py pattern; 51 states + DC - all.py + pyproject.toml: register census_usa_income extractor Co-Authored-By: Claude Sonnet 4.6 --- extract/padelnomics_extract/pyproject.toml | 1 + .../src/padelnomics_extract/all.py | 3 + .../padelnomics_extract/census_usa_income.py | 121 ++++++++++++++++++ scripts/download_gisco_nuts.py | 81 ++++++++++++ .../models/staging/stg_regional_income.sql | 18 +-- 5 files changed, 216 insertions(+), 8 deletions(-) create mode 100644 extract/padelnomics_extract/src/padelnomics_extract/census_usa_income.py create mode 100644 scripts/download_gisco_nuts.py diff --git a/extract/padelnomics_extract/pyproject.toml b/extract/padelnomics_extract/pyproject.toml index 93098df..ec3ca82 100644 --- a/extract/padelnomics_extract/pyproject.toml +++ b/extract/padelnomics_extract/pyproject.toml @@ -18,6 +18,7 @@ extract-playtomic-availability = "padelnomics_extract.playtomic_availability:mai extract-playtomic-recheck = "padelnomics_extract.playtomic_availability:main_recheck" extract-eurostat-city-labels = "padelnomics_extract.eurostat_city_labels:main" extract-census-usa = "padelnomics_extract.census_usa:main" +extract-census-usa-income = "padelnomics_extract.census_usa_income:main" extract-ons-uk = "padelnomics_extract.ons_uk:main" extract-geonames = "padelnomics_extract.geonames:main" diff --git a/extract/padelnomics_extract/src/padelnomics_extract/all.py b/extract/padelnomics_extract/src/padelnomics_extract/all.py index 8b93c94..62987ce 100644 --- a/extract/padelnomics_extract/src/padelnomics_extract/all.py +++ b/extract/padelnomics_extract/src/padelnomics_extract/all.py @@ -18,6 +18,8 @@ from graphlib import TopologicalSorter from ._shared import run_extractor, setup_logging from .census_usa import EXTRACTOR_NAME as CENSUS_USA_NAME from .census_usa import extract as extract_census_usa +from .census_usa_income import EXTRACTOR_NAME as CENSUS_USA_INCOME_NAME +from .census_usa_income import extract as extract_census_usa_income from .eurostat import EXTRACTOR_NAME as EUROSTAT_NAME from .eurostat import extract as extract_eurostat from .eurostat_city_labels import EXTRACTOR_NAME as EUROSTAT_CITY_LABELS_NAME @@ -45,6 +47,7 @@ EXTRACTORS: dict[str, tuple] = { EUROSTAT_NAME: (extract_eurostat, []), EUROSTAT_CITY_LABELS_NAME: (extract_eurostat_city_labels, []), CENSUS_USA_NAME: (extract_census_usa, []), + CENSUS_USA_INCOME_NAME: (extract_census_usa_income, []), ONS_UK_NAME: (extract_ons_uk, []), GEONAMES_NAME: (extract_geonames, []), TENANTS_NAME: (extract_tenants, []), diff --git a/extract/padelnomics_extract/src/padelnomics_extract/census_usa_income.py b/extract/padelnomics_extract/src/padelnomics_extract/census_usa_income.py new file mode 100644 index 0000000..76fb63d --- /dev/null +++ b/extract/padelnomics_extract/src/padelnomics_extract/census_usa_income.py @@ -0,0 +1,121 @@ +"""US Census Bureau ACS 5-year state-level median household income extractor. + +Fetches state-level median household income from the American Community Survey +5-year estimates. Requires a free API key from api.census.gov. + +Env var: CENSUS_API_KEY (same key as census_usa.py) + +Landing: {LANDING_DIR}/census_usa/{year}/{month}/acs5_state_income.json.gz +Output: {"rows": [{"state_fips": "06", "state_name": "California", + "median_income_usd": 91905, "ref_year": 2023, + "country_code": "US"}], "count": N} +""" + +import json +import os +import sqlite3 +from pathlib import Path + +import niquests + +from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging +from .utils import get_last_cursor, landing_path, write_gzip_atomic + +logger = setup_logging("padelnomics.extract.census_usa_income") + +EXTRACTOR_NAME = "census_usa_income" + +# ACS 5-year estimates, 2023 vintage — refreshed annually. +# B19013_001E = median household income in the past 12 months (inflation-adjusted). +ACS_STATE_URL = ( + "https://api.census.gov/data/2023/acs/acs5" + "?get=B19013_001E,NAME&for=state:*" +) + +REF_YEAR = 2023 +MAX_RETRIES = 2 + + +def extract( + landing_dir: Path, + year_month: str, + conn: sqlite3.Connection, + session: niquests.Session, +) -> dict: + """Fetch ACS 5-year state-level income. Skips if already run this month.""" + api_key = os.environ.get("CENSUS_API_KEY", "").strip() + if not api_key: + logger.warning("CENSUS_API_KEY not set — writing empty placeholder so SQLMesh models can run") + year, month = year_month.split("/") + dest_dir = landing_path(landing_dir, "census_usa", year, month) + dest = dest_dir / "acs5_state_income.json.gz" + if not dest.exists(): + write_gzip_atomic(dest, b'{"rows": [], "count": 0}') + return {"files_written": 0, "files_skipped": 1, "bytes_written": 0} + + last_cursor = get_last_cursor(conn, EXTRACTOR_NAME) + if last_cursor == year_month: + logger.info("already have data for %s — skipping", year_month) + return {"files_written": 0, "files_skipped": 1, "bytes_written": 0} + + year, month = year_month.split("/") + url = f"{ACS_STATE_URL}&key={api_key}" + + logger.info("GET ACS 5-year state income (vintage %d)", REF_YEAR) + resp = session.get(url, timeout=HTTP_TIMEOUT_SECONDS * 2) + resp.raise_for_status() + + raw = resp.json() + assert isinstance(raw, list) and len(raw) > 1, "ACS response must be a non-empty list" + + # First row is headers: ["B19013_001E", "NAME", "state"] + headers = raw[0] + assert "B19013_001E" in headers, f"Income column missing from ACS response: {headers}" + income_idx = headers.index("B19013_001E") + name_idx = headers.index("NAME") + state_idx = headers.index("state") + + rows: list[dict] = [] + for row in raw[1:]: + try: + income = int(row[income_idx]) + except (ValueError, TypeError): + # ACS returns -666666666 for suppressed/unavailable values + continue + if income <= 0: + continue + # Full state name from ACS; strip any trailing text after comma + state_name = row[name_idx].split(",")[0].strip() + if not state_name: + continue + rows.append({ + "state_fips": row[state_idx], + "state_name": state_name, + "median_income_usd": income, + "ref_year": REF_YEAR, + "country_code": "US", + }) + + assert len(rows) >= 50, f"Expected ≥50 US states, got {len(rows)} — parse may have failed" + logger.info("parsed %d US state income records", len(rows)) + + dest_dir = landing_path(landing_dir, "census_usa", year, month) + dest = dest_dir / "acs5_state_income.json.gz" + payload = json.dumps({"rows": rows, "count": len(rows)}).encode() + bytes_written = write_gzip_atomic(dest, payload) + logger.info("written %s bytes compressed", f"{bytes_written:,}") + + return { + "files_written": 1, + "files_skipped": 0, + "bytes_written": bytes_written, + "cursor_value": year_month, + } + + +def main() -> None: + run_extractor(EXTRACTOR_NAME, extract) + + +if __name__ == "__main__": + main() diff --git a/scripts/download_gisco_nuts.py b/scripts/download_gisco_nuts.py new file mode 100644 index 0000000..5a7737b --- /dev/null +++ b/scripts/download_gisco_nuts.py @@ -0,0 +1,81 @@ +"""Download NUTS-2 boundary GeoJSON from Eurostat GISCO. + +One-time (or on NUTS revision) download of NUTS-2 boundary polygons used for +spatial income resolution in dim_locations. Stored uncompressed because DuckDB's +ST_Read function cannot read gzipped files. + +NUTS classification changes approximately every 7 years. Current revision: 2021. + +Output: {LANDING_DIR}/gisco/2024/01/nuts2_boundaries.geojson (~5MB, uncompressed) + +Usage: + uv run python scripts/download_gisco_nuts.py [--landing-dir data/landing] + +Idempotent: skips download if the file already exists. +""" + +import argparse +import sys +from pathlib import Path + +import niquests + +# NUTS 2021 revision, 20M scale (1:20,000,000), WGS84 (EPSG:4326), LEVL_2 only. +# 20M resolution gives simplified polygons that are fast for point-in-polygon +# matching without sacrificing accuracy at the NUTS-2 boundary level. +GISCO_URL = ( + "https://gisco-services.ec.europa.eu/distribution/v2/nuts/geojson/" + "NUTS_RG_20M_2021_4326_LEVL_2.geojson" +) + +# Fixed partition: NUTS boundaries are a static reference file, not time-series data. +# Use the NUTS revision year as the partition to make the source version explicit. +DEST_REL_PATH = "gisco/2024/01/nuts2_boundaries.geojson" + +HTTP_TIMEOUT_SECONDS = 120 + + +def download_nuts_boundaries(landing_dir: Path) -> None: + dest = landing_dir / DEST_REL_PATH + if dest.exists(): + print(f"Already exists (skipping): {dest}") + return + + dest.parent.mkdir(parents=True, exist_ok=True) + print(f"Downloading NUTS-2 boundaries from GISCO...") + print(f" URL: {GISCO_URL}") + + with niquests.Session() as session: + resp = session.get(GISCO_URL, timeout=HTTP_TIMEOUT_SECONDS) + resp.raise_for_status() + + content = resp.content + assert len(content) > 100_000, ( + f"GeoJSON too small ({len(content)} bytes) — download may have failed" + ) + assert b'"FeatureCollection"' in content, "Response does not look like GeoJSON" + + # Write uncompressed — ST_Read requires a plain file + tmp = dest.with_suffix(".geojson.tmp") + tmp.write_bytes(content) + tmp.rename(dest) + + size_mb = len(content) / 1_000_000 + print(f" Written: {dest} ({size_mb:.1f} MB)") + print("Done. Run SQLMesh plan to rebuild stg_nuts2_boundaries.") + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--landing-dir", default="data/landing", type=Path) + args = parser.parse_args() + + if not args.landing_dir.is_dir(): + print(f"Error: landing dir does not exist: {args.landing_dir}", file=sys.stderr) + sys.exit(1) + + download_nuts_boundaries(args.landing_dir) + + +if __name__ == "__main__": + main() diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql b/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql index ceae1d9..e5f7db5 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql @@ -1,15 +1,15 @@ --- Eurostat NUTS-1 regional household income in PPS (dataset: nama_10r_2hhinc). --- Filters to NUTS-1 codes (exactly 3 characters, e.g. DE1, DE2, …). --- One row per (nuts1_code, ref_year). +-- Eurostat NUTS-1 and NUTS-2 regional household income in PPS (dataset: nama_10r_2hhinc). +-- Accepts NUTS-1 codes (3 characters, e.g. DE1) and NUTS-2 codes (4 characters, e.g. DE21). +-- One row per (nuts_code, ref_year). -- -- Source: data/landing/eurostat/{year}/{month}/nama_10r_2hhinc.json.gz --- Format: {"rows": [{"geo_code": "DE1", "ref_year": "2022", "value": 29400}, ...]} +-- Format: {"rows": [{"geo_code": "DE21", "ref_year": "2022", "value": 31200}, ...]} MODEL ( name staging.stg_regional_income, kind FULL, cron '@daily', - grain (nuts1_code, ref_year) + grain (nuts_code, ref_year) ); WITH source AS ( @@ -34,11 +34,13 @@ SELECT WHEN geo_code LIKE 'EL%' THEN 'GR' || SUBSTR(geo_code, 3) WHEN geo_code LIKE 'UK%' THEN 'GB' || SUBSTR(geo_code, 3) ELSE geo_code - END AS nuts1_code, + END AS nuts_code, + -- NUTS level: 3-char = NUTS-1, 4-char = NUTS-2 + LENGTH(geo_code) - 2 AS nuts_level, ref_year, regional_income_pps, extracted_date FROM parsed --- NUTS-1 codes are exactly 3 characters (country 2 + region 1) -WHERE LENGTH(geo_code) = 3 +-- NUTS-1 (3 chars) and NUTS-2 (4 chars); exclude country codes (2) and NUTS-3 (5) +WHERE LENGTH(geo_code) IN (3, 4) AND regional_income_pps > 0 From c3531bd75d34a7e5364ae9ec8684609b46a5f3f6 Mon Sep 17 00:00:00 2001 From: Deeman Date: Fri, 27 Feb 2026 11:03:16 +0100 Subject: [PATCH 4/4] =?UTF-8?q?feat(data):=20Phase=202b=20complete=20?= =?UTF-8?q?=E2=80=94=20EU=20NUTS-2=20spatial=20join=20+=20US=20state=20inc?= =?UTF-8?q?ome?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - stg_regional_income: expanded NUTS-1+2 (LENGTH IN 3,4), nuts_code rename, nuts_level - stg_nuts2_boundaries: new — ST_Read GISCO GeoJSON, bbox columns for spatial pre-filter - stg_income_usa: new — Census ACS state-level income staging model - dim_locations: spatial join replaces admin1_to_nuts1 VALUES CTE; us_income CTE with PPS normalisation (income/80610×30000); income cascade: NUTS-2→NUTS-1→US state→country - init_landing_seeds: compress=False for ST_Read files; gisco GeoJSON + census income seeds - CHANGELOG + PROJECT.md updated Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 9 ++ PROJECT.md | 4 +- scripts/init_landing_seeds.py | 30 +++- .../models/foundation/dim_locations.sql | 141 +++++++++++++----- .../models/staging/stg_income_usa.sql | 39 +++++ .../models/staging/stg_nuts2_boundaries.sql | 47 ++++++ 6 files changed, 228 insertions(+), 42 deletions(-) create mode 100644 transform/sqlmesh_padelnomics/models/staging/stg_income_usa.sql create mode 100644 transform/sqlmesh_padelnomics/models/staging/stg_nuts2_boundaries.sql diff --git a/CHANGELOG.md b/CHANGELOG.md index d3fb806..f07957c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - `init_landing_seeds.py`: seed entry for `eurostat/1970/01/nama_10r_2hhinc.json.gz`. - Verified income spread: Bayern (DE2) ~29K PPS > Hamburg (DE6) ~27K > Berlin (DE3) ~24K > Sachsen (DED) ~19K PPS. Non-mapped countries (ES, FR, IT) continue with country-level fallback. +- **Phase 2b — EU NUTS-2 spatial join + US state income** (`dim_locations`): all EU-27 + EFTA + UK locations now resolve to their NUTS-2 region automatically via a spatial join; US locations now use Census ACS state-level income instead of a flat country fallback. + - `stg_regional_income.sql`: expanded from NUTS-1 only (`LENGTH = 3`) to NUTS-1 + NUTS-2 (`LENGTH IN (3,4)`); column renamed `nuts1_code → nuts_code`; added `nuts_level` derived column (1 or 2). + - `scripts/download_gisco_nuts.py`: new one-time download script for NUTS-2 boundary GeoJSON from Eurostat GISCO (`NUTS_RG_20M_2021_4326_LEVL_2.geojson`, ~5 MB, NUTS revision 2021). Saves uncompressed — `ST_Read` cannot read `.gz`. + - `stg_nuts2_boundaries.sql`: new staging model — reads GeoJSON via `ST_Read`; extracts `nuts2_code`, `country_code`, `geometry`, and pre-computed bbox columns (`bbox_lat_min/max`, `bbox_lon_min/max`) for spatial pre-filter; normalises `EL→GR` / `UK→GB`. Grain: `nuts2_code`. + - `census_usa_income.py`: new extractor — fetches `B19013_001E` (median household income) at state level from Census ACS 5-year; saves to `census_usa/{year}/{month}/acs5_state_income.json.gz`; registered in `all.py` and `pyproject.toml`. + - `stg_income_usa.sql`: new staging model for US state income. Grain: `(state_fips, ref_year)`. Income kept in nominal USD — PPS conversion happens in `dim_locations`. + - `dim_locations.sql`: replaced `admin1_to_nuts1` VALUES CTE (16 DE rows) with full spatial join: `nuts2_match` (bbox pre-filter + `ST_Contains`) → `nuts2_income` / `nuts1_income` (latest year per level) → `regional_income` (COALESCE NUTS-2 → NUTS-1). Added `us_state_fips` (51-row VALUES CTE, admin1 abbreviation → FIPS) + `us_income` (PPS normalisation: `state_income / 80610.0 × 30000.0`). Final income cascade: EU NUTS-2 → EU NUTS-1 → US state → country-level. Germany now resolves to 38 Regierungsbezirke; Spain, France, Italy, Netherlands etc. all get NUTS-2 differentiation automatically. + - `init_landing_seeds.py`: `create_seed` extended with `compress=False` for files consumed by `ST_Read` (cannot read `.gz`); added `census_usa/1970/01/acs5_state_income.json.gz` seed and uncompressed `gisco/1970/01/nuts2_boundaries.geojson` empty-FeatureCollection seed. + ### Changed - **Opportunity Score v2 — income ceiling fix** (`location_opportunity_profile.sql`): income PPS normalisation changed from `/200.0` (caused LEAST(1.0, 115)=1.0 for ALL countries — no differentiation) to `/35000.0` with country-spread-matched ceiling. Default for missing data changed from 100 to 15000 (developing-market assumption). Country scores now reflect real PPS spread: LU 20.0, SE 14.3, DE 13.2, ES 10.7, GB 10.5 pts (was 20.0 everywhere). - **dim_cities population coverage 70.5% → 98.5%** — added GeoNames spatial fallback CTE that finds the nearest GeoNames location within ~15 km when string name matching fails (~29% of cities). Fixes localization mismatches (Milano≠Milan, Wien≠Vienna, München≠Munich): Wien 0→1,691,468; Milano 0→1,371,498. Population cascade now: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0. diff --git a/PROJECT.md b/PROJECT.md index 2e459a5..3063488 100644 --- a/PROJECT.md +++ b/PROJECT.md @@ -1,7 +1,7 @@ # Padelnomics — Project Tracker > Move tasks across columns as you work. Add new tasks at the top of the relevant column. -> Last updated: 2026-02-27 (opportunity score data quality improvements). +> Last updated: 2026-02-27 (Phase 2b — EU NUTS-2 spatial join + US state income). --- @@ -92,6 +92,8 @@ - [x] **Opportunity Score v2 — income ceiling fix** — PPS normalisation `/200.0` → `/35000.0`; economic power component now differentiates countries (DE 13.2, ES 10.7, SE 14.3 pts; was 20.0 everywhere) - [x] **dim_cities population coverage 70.5% → 98.5%** — GeoNames spatial fallback CTE (ST_Distance_Sphere, 0.14° bbox) resolves localization mismatches (Wien→Vienna 1.69M, Milano→Milan 1.37M); population cascade: Eurostat > Census > ONS > GeoNames string > GeoNames spatial > 0 - [x] **overpass_tennis added to supervisor workflows** — monthly schedule in `workflows.toml`; was only in combined extractor +- [x] **Phase 2a — NUTS-1 regional income** — `eurostat.py` adds `nama_10r_2hhinc` dataset + URL filter params; `stg_regional_income.sql` new staging model (NUTS-1 codes, `EL→GR`/`UK→GB` normalisation); `dim_locations.sql` wires German Bundesland income via 16-row `admin1_to_nuts1` VALUES CTE; verified income spread Bayern > Hamburg > Berlin > Sachsen +- [x] **Phase 2b — EU NUTS-2 spatial join + US state income** — all EU-27+EFTA+UK locations auto-resolve to NUTS-2 via `ST_Contains` on GISCO boundary polygons; Germany now uses 38 Regierungsbezirke; US state income from Census ACS with PPS normalisation (`income / 80610 × 30000`); replaces brittle admin1 mapping CTE with zero-config spatial join; new files: `download_gisco_nuts.py`, `stg_nuts2_boundaries.sql`, `census_usa_income.py`, `stg_income_usa.sql` ### Data Pipeline (DaaS) - [x] Overpass API extractor (OSM padel courts) diff --git a/scripts/init_landing_seeds.py b/scripts/init_landing_seeds.py index 9705b51..fc81bb4 100644 --- a/scripts/init_landing_seeds.py +++ b/scripts/init_landing_seeds.py @@ -15,14 +15,22 @@ import json from pathlib import Path -def create_seed(dest: Path, content: bytes) -> None: - """Write content to a gzip file atomically. Skips if the file already exists.""" +def create_seed(dest: Path, content: bytes, *, compress: bool = True) -> None: + """Write content to a seed file atomically. Skips if the file already exists. + + compress=True (default) writes gzipped content, suitable for all landing zone + files. compress=False writes raw bytes — required for files consumed by DuckDB + ST_Read (e.g. GeoJSON), which cannot read .gz files. + """ if dest.exists(): return dest.parent.mkdir(parents=True, exist_ok=True) tmp = dest.with_suffix(dest.suffix + ".tmp") - with gzip.open(tmp, "wb") as f: - f.write(content) + if compress: + with gzip.open(tmp, "wb") as f: + f.write(content) + else: + tmp.write_bytes(content) tmp.rename(dest) print(f" created: {dest}") @@ -87,6 +95,8 @@ def main() -> None: json.dumps({"rows": [], "count": 0}).encode(), "eurostat/1970/01/nama_10r_2hhinc.json.gz": json.dumps({"rows": [], "count": 0}).encode(), + "census_usa/1970/01/acs5_state_income.json.gz": + json.dumps({"rows": [], "count": 0}).encode(), "eurostat_city_labels/1970/01/cities_codelist.json.gz": json.dumps({"rows": [], "count": 0}).encode(), @@ -97,9 +107,21 @@ def main() -> None: json.dumps({"rows": [], "count": 0}).encode(), } + # Uncompressed seeds — required for files consumed by ST_Read (cannot read .gz) + uncompressed_seeds = { + # Empty NUTS-2 boundary placeholder so stg_nuts2_boundaries can run before + # the real file is downloaded via scripts/download_gisco_nuts.py. + # ST_Read on an empty FeatureCollection returns 0 rows (graceful degradation: + # all locations fall back to country-level income until the real file lands). + "gisco/1970/01/nuts2_boundaries.geojson": + b'{"type":"FeatureCollection","features":[]}', + } + print(f"Initialising landing seeds in: {base}") for rel_path, content in seeds.items(): create_seed(base / rel_path, content) + for rel_path, content in uncompressed_seeds.items(): + create_seed(base / rel_path, content, compress=False) print("Done.") diff --git a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql index 8acc4d3..cad8730 100644 --- a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql +++ b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql @@ -6,14 +6,17 @@ -- covers all locations with population ≥ 1K so zero-court Gemeinden score fully. -- -- Enriched with: --- stg_income → country-level median income PPS (fallback) --- stg_regional_income → NUTS-1 regional income PPS (preferred where mapped) +-- stg_nuts2_boundaries + stg_regional_income → EU NUTS-2/NUTS-1 income (spatial join) +-- stg_income_usa → US state-level income (PPS-normalised) +-- stg_income → country-level income (fallback for all countries) -- stg_padel_courts → padel venue count + nearest court distance (km) -- stg_tennis_courts → tennis court count within 25km radius -- --- Income resolution: COALESCE(regional_income, country_income) AS median_income_pps. --- Germany: all 16 Bundesländer mapped via admin1_code → NUTS-1. --- All other countries fall back to country-level income (stg_income). +-- Income resolution cascade: +-- 1. EU NUTS-2 regional income (finest; spatial join via ST_Contains) +-- 2. EU NUTS-1 regional income (fallback when NUTS-2 income missing from dataset) +-- 3. US state income (ratio-normalised to PPS scale; see us_income CTE) +-- 4. Country-level income (global fallback from stg_income / ilc_di03) -- -- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension). -- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the @@ -44,43 +47,97 @@ locations AS ( FROM staging.stg_population_geonames WHERE lat IS NOT NULL AND lon IS NOT NULL ), --- Country income (same source and pattern as dim_cities) — fallback for unmapped regions +-- Country income (ilc_di03) — global fallback for all countries country_income AS ( SELECT country_code, median_income_pps, ref_year AS income_year FROM staging.stg_income QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1 ), --- Static mapping: GeoNames admin1_code → NUTS-1 code (Germany only; extend by adding rows) -admin1_to_nuts1 (country_code, admin1_code, nuts1_code) AS ( - VALUES - ('DE', '01', 'DE1'), -- Baden-Württemberg - ('DE', '02', 'DE2'), -- Bayern - ('DE', '03', 'DE5'), -- Bremen - ('DE', '04', 'DE6'), -- Hamburg - ('DE', '05', 'DE7'), -- Hessen - ('DE', '06', 'DE9'), -- Niedersachsen - ('DE', '07', 'DEA'), -- Nordrhein-Westfalen - ('DE', '08', 'DEB'), -- Rheinland-Pfalz - ('DE', '09', 'DEC'), -- Saarland - ('DE', '10', 'DEF'), -- Schleswig-Holstein - ('DE', '11', 'DE4'), -- Brandenburg - ('DE', '12', 'DE8'), -- Mecklenburg-Vorpommern - ('DE', '13', 'DED'), -- Sachsen - ('DE', '14', 'DEE'), -- Sachsen-Anhalt - ('DE', '15', 'DEG'), -- Thüringen - ('DE', '16', 'DE3') -- Berlin +-- ── EU NUTS-2 income via spatial join ────────────────────────────────────── +-- Each EU location's (lon, lat) is matched against NUTS-2 boundary polygons. +-- The bounding box pre-filter (bbox_lat/lon_min/max) eliminates most candidates +-- before the exact ST_Contains test runs. +nuts2_match AS ( + SELECT + l.geoname_id, + b.nuts2_code, + -- Derive parent NUTS-1 code (first 3 chars of NUTS-2) + SUBSTR(b.nuts2_code, 1, 3) AS nuts1_code + FROM locations l + JOIN staging.stg_nuts2_boundaries b + -- Bounding-box pre-filter: only test ST_Contains for polygons whose bbox + -- overlaps the point, reducing 140K x 242 to ~140K x 3-5 actual tests. + ON l.lat BETWEEN b.bbox_lat_min AND b.bbox_lat_max + AND l.lon BETWEEN b.bbox_lon_min AND b.bbox_lon_max + WHERE ST_Contains(b.geometry, ST_Point(l.lon, l.lat)) + -- A point should fall in exactly one polygon; QUALIFY guards against rare + -- boundary-precision duplicates. + QUALIFY ROW_NUMBER() OVER (PARTITION BY l.geoname_id ORDER BY b.nuts2_code) = 1 ), --- Latest NUTS-1 regional income per region +-- NUTS-2 income: latest year per NUTS-2 code +nuts2_income AS ( + SELECT nuts_code, regional_income_pps, ref_year AS regional_income_year + FROM staging.stg_regional_income + WHERE nuts_level = 2 + QUALIFY ROW_NUMBER() OVER (PARTITION BY nuts_code ORDER BY ref_year DESC) = 1 +), +-- NUTS-1 income: fallback when NUTS-2 income not in dataset for this region +nuts1_income AS ( + SELECT nuts_code, regional_income_pps, ref_year AS regional_income_year + FROM staging.stg_regional_income + WHERE nuts_level = 1 + QUALIFY ROW_NUMBER() OVER (PARTITION BY nuts_code ORDER BY ref_year DESC) = 1 +), +-- Combined EU regional income: NUTS-2 preferred, NUTS-1 fallback regional_income AS ( SELECT - m.country_code, + nm.geoname_id, + COALESCE(n2.regional_income_pps, n1.regional_income_pps) AS regional_income_pps, + COALESCE(n2.regional_income_year, n1.regional_income_year) AS regional_income_year + FROM nuts2_match nm + LEFT JOIN nuts2_income n2 ON nm.nuts2_code = n2.nuts_code + LEFT JOIN nuts1_income n1 ON nm.nuts1_code = n1.nuts_code +), +-- ── US state-level income ────────────────────────────────────────────────── +-- GeoNames admin1_code for US = 2-letter state abbreviation (ISO 3166-2:US). +-- Census ACS uses 2-digit FIPS codes. This 51-row VALUES CTE bridges them. +us_state_fips (admin1_code, state_fips) AS ( + VALUES + ('AL', '01'), ('AK', '02'), ('AZ', '04'), ('AR', '05'), ('CA', '06'), + ('CO', '08'), ('CT', '09'), ('DE', '10'), ('FL', '12'), ('GA', '13'), + ('HI', '15'), ('ID', '16'), ('IL', '17'), ('IN', '18'), ('IA', '19'), + ('KS', '20'), ('KY', '21'), ('LA', '22'), ('ME', '23'), ('MD', '24'), + ('MA', '25'), ('MI', '26'), ('MN', '27'), ('MS', '28'), ('MO', '29'), + ('MT', '30'), ('NE', '31'), ('NV', '32'), ('NH', '33'), ('NJ', '34'), + ('NM', '35'), ('NY', '36'), ('NC', '37'), ('ND', '38'), ('OH', '39'), + ('OK', '40'), ('OR', '41'), ('PA', '42'), ('RI', '44'), ('SC', '45'), + ('SD', '46'), ('TN', '47'), ('TX', '48'), ('UT', '49'), ('VT', '50'), + ('VA', '51'), ('WA', '53'), ('WV', '54'), ('WI', '55'), ('WY', '56'), + ('DC', '11') +), +-- US state income normalised to PPS-equivalent scale. +-- US Census reports median household income in nominal USD. To compare with EU +-- PPS per inhabitant, we use a ratio normalisation: +-- state_pps ≈ US_PPS_CONSTANT × (state_income / national_median) +-- +-- Constants (update when ACS vintage changes): +-- national_median = 80,610 — 2023 ACS national median household income (USD) +-- US_PPS_CONSTANT = 30,000 — approximate US per-capita income in PPS terms +-- (derived from OECD PPP tables, 2022) +-- +-- This gives a realistic spread across states: +-- California ($91,905) → ~34,200 PPS [≈ wealthy EU region] +-- Texas ($67,321) → ~25,100 PPS [≈ Germany average] +-- Mississippi($52,985) → ~19,700 PPS [≈ lower-income EU region] +us_income AS ( + SELECT m.admin1_code, - r.regional_income_pps, - r.ref_year AS regional_income_year - FROM admin1_to_nuts1 m - JOIN staging.stg_regional_income r ON r.nuts1_code = m.nuts1_code + ROUND(s.median_income_usd / 80610.0 * 30000.0, 0) AS median_income_pps, + s.ref_year AS income_year + FROM us_state_fips m + JOIN staging.stg_income_usa s ON m.state_fips = s.state_fips QUALIFY ROW_NUMBER() OVER ( - PARTITION BY m.country_code, m.admin1_code ORDER BY r.ref_year DESC + PARTITION BY m.admin1_code ORDER BY s.ref_year DESC ) = 1 ), -- Padel court lat/lon for distance and density calculations @@ -197,8 +254,17 @@ SELECT l.admin2_code, l.population, l.ref_year AS population_year, - COALESCE(ri.regional_income_pps, ci.median_income_pps) AS median_income_pps, - COALESCE(ri.regional_income_year, ci.income_year) AS income_year, + -- Income cascade: EU NUTS-2 → EU NUTS-1 → US state → country-level + COALESCE( + ri.regional_income_pps, -- EU: NUTS-2 (finest) or NUTS-1 (fallback) + us.median_income_pps, -- US: state-level PPS-equivalent + ci.median_income_pps -- Global: country-level from ilc_di03 + ) AS median_income_pps, + COALESCE( + ri.regional_income_year, + us.income_year, + ci.income_year + ) AS income_year, COALESCE(pl.padel_venue_count, 0)::INTEGER AS padel_venue_count, -- Venues per 100K residents (NULL if population = 0) CASE WHEN l.population > 0 @@ -210,12 +276,13 @@ SELECT CURRENT_DATE AS refreshed_date FROM locations l LEFT JOIN country_income ci ON l.country_code = ci.country_code -LEFT JOIN regional_income ri ON l.country_code = ri.country_code - AND l.admin1_code = ri.admin1_code +LEFT JOIN regional_income ri ON l.geoname_id = ri.geoname_id +LEFT JOIN us_income us ON l.country_code = 'US' + AND l.admin1_code = us.admin1_code LEFT JOIN nearest_padel np ON l.geoname_id = np.geoname_id LEFT JOIN padel_local pl ON l.geoname_id = pl.geoname_id LEFT JOIN tennis_nearby tn ON l.geoname_id = tn.geoname_id --- Enforce grain: deduplicate if city slug collides within same country +-- Enforce grain: deduplicate if location slug collides within same country QUALIFY ROW_NUMBER() OVER ( PARTITION BY l.country_code, l.geoname_id ORDER BY l.population DESC NULLS LAST diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_income_usa.sql b/transform/sqlmesh_padelnomics/models/staging/stg_income_usa.sql new file mode 100644 index 0000000..784e29d --- /dev/null +++ b/transform/sqlmesh_padelnomics/models/staging/stg_income_usa.sql @@ -0,0 +1,39 @@ +-- US Census ACS 5-year state-level median household income (B19013_001E). +-- One row per (state_fips, ref_year). Income stored in nominal USD. +-- PPS conversion to EU-comparable scale happens in dim_locations. +-- +-- Source: data/landing/census_usa/{year}/{month}/acs5_state_income.json.gz +-- Extracted by: census_usa_income.py + +MODEL ( + name staging.stg_income_usa, + kind FULL, + cron '@daily', + grain (state_fips, ref_year) +); + +WITH parsed AS ( + SELECT + row ->> 'state_fips' AS state_fips, + row ->> 'state_name' AS state_name, + TRY_CAST(row ->> 'median_income_usd' AS DOUBLE) AS median_income_usd, + TRY_CAST(row ->> 'ref_year' AS INTEGER) AS ref_year, + CURRENT_DATE AS extracted_date + FROM ( + SELECT UNNEST(rows) AS row + FROM read_json( + @LANDING_DIR || '/census_usa/*/*/acs5_state_income.json.gz', + auto_detect = true + ) + ) + WHERE (row ->> 'state_fips') IS NOT NULL +) +SELECT + state_fips, + TRIM(state_name) AS state_name, + median_income_usd, + ref_year, + extracted_date +FROM parsed +WHERE median_income_usd IS NOT NULL + AND median_income_usd > 0 diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_nuts2_boundaries.sql b/transform/sqlmesh_padelnomics/models/staging/stg_nuts2_boundaries.sql new file mode 100644 index 0000000..32481d5 --- /dev/null +++ b/transform/sqlmesh_padelnomics/models/staging/stg_nuts2_boundaries.sql @@ -0,0 +1,47 @@ +-- NUTS-2 boundary polygons from Eurostat GISCO for spatial income resolution. +-- Used by dim_locations to resolve each location's (lat, lon) to its NUTS-2 +-- region via ST_Contains point-in-polygon join. +-- +-- Source: data/landing/gisco/2024/01/nuts2_boundaries.geojson +-- Downloaded by: scripts/download_gisco_nuts.py (run once; re-run on NUTS revision) +-- Format: GeoJSON FeatureCollection with NUTS_ID, CNTR_CODE, NAME_LATN properties +-- NUTS revision: 2021 (effective 2024-01-01); changes ~every 7 years +-- +-- Coverage: EU-27 + EFTA (NO, CH, IS, LI) + UK + candidate countries +-- ~242 NUTS-2 polygons at 1:20,000,000 scale (simplified for performance) + +MODEL ( + name staging.stg_nuts2_boundaries, + kind FULL, + cron '@daily', + grain nuts2_code +); + +WITH raw AS ( + -- ST_Read reads GeoJSON via GDAL. File must be uncompressed (.geojson, not .gz). + -- Path is fixed (not glob) because the boundary file is a one-time download, + -- not a time-partitioned series. The partition "2024/01" reflects the NUTS + -- revision year, not the download date. + SELECT * + FROM ST_Read(@LANDING_DIR || '/gisco/2024/01/nuts2_boundaries.geojson') +) +SELECT + NUTS_ID AS nuts2_code, + -- Normalise country prefix to ISO 3166-1 alpha-2: EL→GR, UK→GB + CASE CNTR_CODE + WHEN 'EL' THEN 'GR' + WHEN 'UK' THEN 'GB' + ELSE CNTR_CODE + END AS country_code, + NAME_LATN AS region_name, + geom AS geometry, + -- Pre-compute bounding box for efficient spatial pre-filter in dim_locations. + -- Each location only calls ST_Contains against the ~3-5 polygons whose bbox + -- overlaps it, not all 242. + ST_YMin(geom) AS bbox_lat_min, + ST_YMax(geom) AS bbox_lat_max, + ST_XMin(geom) AS bbox_lon_min, + ST_XMax(geom) AS bbox_lon_max +FROM raw +WHERE NUTS_ID IS NOT NULL + AND LENGTH(NUTS_ID) = 4 -- NUTS-2 codes are exactly 4 characters