feat(data): Phase 2b step 1 — expand stg_regional_income + Census income extractor
- stg_regional_income.sql: accept NUTS-1 (3-char) + NUTS-2 (4-char) codes; rename nuts1_code → nuts_code; add nuts_level column; NUTS-2 rows were already in the landing zone but discarded by LENGTH(geo_code) = 3
- scripts/download_gisco_nuts.py: one-time download of GISCO NUTS-2 boundary GeoJSON (NUTS_RG_20M_2021_4326_LEVL_2.geojson, ~5MB) to landing zone; uncompressed because ST_Read cannot read .gz files
- census_usa_income.py: new extractor for ACS B19013_001E state-level median household income; follows census_usa.py pattern; 51 states + DC
- all.py + pyproject.toml: register census_usa_income extractor

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
81
scripts/download_gisco_nuts.py
Normal file
81
scripts/download_gisco_nuts.py
Normal file
@@ -0,0 +1,81 @@
|
||||
"""Download NUTS-2 boundary GeoJSON from Eurostat GISCO.
|
||||
|
||||
One-time (or on NUTS revision) download of NUTS-2 boundary polygons used for
|
||||
spatial income resolution in dim_locations. Stored uncompressed because DuckDB's
|
||||
ST_Read function cannot read gzipped files.
|
||||
|
||||
NUTS classification changes approximately every 7 years. Current revision: 2021.
|
||||
|
||||
Output: {LANDING_DIR}/gisco/2024/01/nuts2_boundaries.geojson (~5MB, uncompressed)
|
||||
|
||||
Usage:
|
||||
uv run python scripts/download_gisco_nuts.py [--landing-dir data/landing]
|
||||
|
||||
Idempotent: skips download if the file already exists.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import niquests
|
||||
|
||||
# Source file: NUTS 2021 revision, 1:20,000,000 scale ("20M"), WGS84
# (EPSG:4326), level-2 regions only. The 20M polygons are simplified, which
# keeps point-in-polygon matching fast without sacrificing accuracy at the
# NUTS-2 boundary level.
_GISCO_BASE_URL = "https://gisco-services.ec.europa.eu/distribution/v2/nuts/geojson/"
_GISCO_FILENAME = "NUTS_RG_20M_2021_4326_LEVL_2.geojson"
GISCO_URL = _GISCO_BASE_URL + _GISCO_FILENAME

# Destination relative to the landing dir. The boundaries are a static
# reference file rather than time-series data, so the partition is fixed.
# NOTE(review): the original comment says the partition is the NUTS revision
# year, but the path uses 2024/01 while the revision is 2021 — confirm which
# is intended before changing either.
DEST_REL_PATH = "gisco/2024/01/nuts2_boundaries.geojson"

# Timeout, in seconds, passed to the HTTP GET for the boundary file.
HTTP_TIMEOUT_SECONDS = 120
|
||||
|
||||
|
||||
def download_nuts_boundaries(landing_dir: Path) -> None:
    """Download the GISCO NUTS-2 boundary GeoJSON into the landing zone.

    The file is written to ``landing_dir / DEST_REL_PATH``. If that file
    already exists, a notice is printed and nothing is downloaded.

    Args:
        landing_dir: Root of the landing zone. Missing parent directories of
            the destination file are created.

    Raises:
        ValueError: If the response body is 100,000 bytes or smaller, or does
            not contain the ``"FeatureCollection"`` marker.

    HTTP errors raised by ``raise_for_status()`` and filesystem errors
    propagate unchanged.
    """
    dest = landing_dir / DEST_REL_PATH
    if dest.exists():
        print(f"Already exists (skipping): {dest}")
        return

    dest.parent.mkdir(parents=True, exist_ok=True)
    print("Downloading NUTS-2 boundaries from GISCO...")
    print(f" URL: {GISCO_URL}")

    with niquests.Session() as session:
        resp = session.get(GISCO_URL, timeout=HTTP_TIMEOUT_SECONDS)
        resp.raise_for_status()
        # Treat a missing body as empty so the size check below reports it
        # instead of len() failing with a TypeError.
        content = resp.content or b""

    # Explicit raises rather than ``assert``: assertions are stripped under
    # ``python -O``, which would let a truncated or non-GeoJSON payload through.
    min_bytes = 100_000
    if len(content) <= min_bytes:
        raise ValueError(
            f"GeoJSON too small ({len(content)} bytes) — download may have failed"
        )
    if b'"FeatureCollection"' not in content:
        raise ValueError("Response does not look like GeoJSON")

    # Write uncompressed — ST_Read requires a plain file.
    # Write to a sibling temp file first so readers never see a partial file.
    tmp = dest.with_suffix(".geojson.tmp")
    try:
        tmp.write_bytes(content)
        # replace() overwrites on every platform; rename() raises on Windows
        # if the destination appeared in the meantime.
        tmp.replace(dest)
    finally:
        # No-op after a successful replace; removes the partial file otherwise.
        tmp.unlink(missing_ok=True)

    size_mb = len(content) / 1_000_000
    print(f" Written: {dest} ({size_mb:.1f} MB)")
    print("Done. Run SQLMesh plan to rebuild stg_nuts2_boundaries.")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and download the boundaries.

    Accepts ``--landing-dir`` (default ``data/landing``). If that directory
    does not exist, prints an error to stderr and exits with status 1.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--landing-dir", type=Path, default="data/landing")
    landing_dir = cli.parse_args().landing_dir

    if landing_dir.is_dir():
        download_nuts_boundaries(landing_dir)
        return

    print(f"Error: landing dir does not exist: {landing_dir}", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user