"""Download NUTS-2 boundary GeoJSON from Eurostat GISCO. One-time (or on NUTS revision) download of NUTS-2 boundary polygons used for spatial income resolution in dim_locations. Stored uncompressed because DuckDB's ST_Read function cannot read gzipped files. NUTS classification changes approximately every 7 years. Current revision: 2021. Output: {LANDING_DIR}/gisco/2024/01/nuts2_boundaries.geojson (~5MB, uncompressed) Usage: uv run python scripts/download_gisco_nuts.py [--landing-dir data/landing] Idempotent: skips download if the file already exists. """ import argparse import sys from pathlib import Path import niquests # NUTS 2021 revision, 20M scale (1:20,000,000), WGS84 (EPSG:4326), LEVL_2 only. # 20M resolution gives simplified polygons that are fast for point-in-polygon # matching without sacrificing accuracy at the NUTS-2 boundary level. GISCO_URL = ( "https://gisco-services.ec.europa.eu/distribution/v2/nuts/geojson/" "NUTS_RG_20M_2021_4326_LEVL_2.geojson" ) # Fixed partition: NUTS boundaries are a static reference file, not time-series data. # Use the NUTS revision year as the partition to make the source version explicit. DEST_REL_PATH = "gisco/2024/01/nuts2_boundaries.geojson" HTTP_TIMEOUT_SECONDS = 120 def download_nuts_boundaries(landing_dir: Path) -> None: dest = landing_dir / DEST_REL_PATH if dest.exists(): print(f"Already exists (skipping): {dest}") return dest.parent.mkdir(parents=True, exist_ok=True) print(f"Downloading NUTS-2 boundaries from GISCO...") print(f" URL: {GISCO_URL}") with niquests.Session() as session: resp = session.get(GISCO_URL, timeout=HTTP_TIMEOUT_SECONDS) resp.raise_for_status() content = resp.content assert len(content) > 100_000, ( f"GeoJSON too small ({len(content)} bytes) — download may have failed" ) assert b'"FeatureCollection"' in content, "Response does not look like GeoJSON" # Write uncompressed — ST_Read requires a plain file tmp = dest.with_suffix(".geojson.tmp") tmp.write_bytes(content) tmp.rename(dest) size_mb = len(content) / 1_000_000 print(f" Written: {dest} ({size_mb:.1f} MB)") print("Done. Run SQLMesh plan to rebuild stg_nuts2_boundaries.") def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--landing-dir", default="data/landing", type=Path) args = parser.parse_args() if not args.landing_dir.is_dir(): print(f"Error: landing dir does not exist: {args.landing_dir}", file=sys.stderr) sys.exit(1) download_nuts_boundaries(args.landing_dir) if __name__ == "__main__": main()