feat(data): Phase 2b step 1 — expand stg_regional_income + Census income extractor
- stg_regional_income.sql: accept NUTS-1 (3-char) + NUTS-2 (4-char) codes; rename nuts1_code → nuts_code; add nuts_level column; NUTS-2 rows were already in the landing zone but discarded by LENGTH(geo_code) = 3
- scripts/download_gisco_nuts.py: one-time download of GISCO NUTS-2 boundary GeoJSON (NUTS_RG_20M_2021_4326_LEVL_2.geojson, ~5MB) to landing zone; uncompressed because ST_Read cannot read .gz files
- census_usa_income.py: new extractor for ACS B19013_001E state-level median household income; follows census_usa.py pattern; 51 states + DC
- all.py + pyproject.toml: register census_usa_income extractor

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,15 +1,15 @@
|
||||
-- Eurostat NUTS-1 regional household income in PPS (dataset: nama_10r_2hhinc).
|
||||
-- Filters to NUTS-1 codes (exactly 3 characters, e.g. DE1, DE2, …).
|
||||
-- One row per (nuts1_code, ref_year).
|
||||
-- Eurostat NUTS-1 and NUTS-2 regional household income in PPS (dataset: nama_10r_2hhinc).
|
||||
-- Accepts NUTS-1 codes (3 characters, e.g. DE1) and NUTS-2 codes (4 characters, e.g. DE21).
|
||||
-- One row per (nuts_code, ref_year).
|
||||
--
|
||||
-- Source: data/landing/eurostat/{year}/{month}/nama_10r_2hhinc.json.gz
|
||||
-- Format: {"rows": [{"geo_code": "DE1", "ref_year": "2022", "value": 29400}, ...]}
|
||||
-- Format: {"rows": [{"geo_code": "DE21", "ref_year": "2022", "value": 31200}, ...]}
|
||||
|
||||
MODEL (
|
||||
name staging.stg_regional_income,
|
||||
kind FULL,
|
||||
cron '@daily',
|
||||
grain (nuts1_code, ref_year)
|
||||
grain (nuts_code, ref_year)
|
||||
);
|
||||
|
||||
WITH source AS (
|
||||
@@ -34,11 +34,13 @@ SELECT
|
||||
WHEN geo_code LIKE 'EL%' THEN 'GR' || SUBSTR(geo_code, 3)
|
||||
WHEN geo_code LIKE 'UK%' THEN 'GB' || SUBSTR(geo_code, 3)
|
||||
ELSE geo_code
|
||||
END AS nuts1_code,
|
||||
END AS nuts_code,
|
||||
-- NUTS level = code length minus the 2-char country prefix: 3-char = NUTS-1 (1), 4-char = NUTS-2 (2)
|
||||
LENGTH(geo_code) - 2 AS nuts_level,
|
||||
ref_year,
|
||||
regional_income_pps,
|
||||
extracted_date
|
||||
FROM parsed
|
||||
-- NUTS-1 codes are exactly 3 characters (country 2 + region 1)
|
||||
WHERE LENGTH(geo_code) = 3
|
||||
-- NUTS-1 (3 chars) and NUTS-2 (4 chars); exclude country codes (2) and NUTS-3 (5)
|
||||
WHERE LENGTH(geo_code) IN (3, 4)
|
||||
AND regional_income_pps > 0
|
||||
|
||||
Reference in New Issue
Block a user