feat(data): Phase 2b complete — EU NUTS-2 spatial join + US state income
- stg_regional_income: expanded NUTS-1+2 (LENGTH IN 3,4), nuts_code rename, nuts_level
- stg_nuts2_boundaries: new — ST_Read GISCO GeoJSON, bbox columns for spatial pre-filter
- stg_income_usa: new — Census ACS state-level income staging model
- dim_locations: spatial join replaces admin1_to_nuts1 VALUES CTE; us_income CTE with PPS normalisation (income/80610×30000); income cascade: NUTS-2→NUTS-1→US state→country
- init_landing_seeds: compress=False for ST_Read files; gisco GeoJSON + census income seeds
- CHANGELOG + PROJECT.md updated

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -15,14 +15,22 @@ import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def create_seed(dest: Path, content: bytes, *, compress: bool = True) -> None:
    """Write content to a seed file atomically. Skips if the file already exists.

    compress=True (default) writes gzipped content, suitable for all landing zone
    files. compress=False writes raw bytes — required for files consumed by DuckDB
    ST_Read (e.g. GeoJSON), which cannot read .gz files.

    The payload is first written to a sibling "<name>.tmp" file and then moved
    onto dest, so a partially written seed never appears under its final name.
    If writing or moving fails, the temporary file is removed before the error
    propagates, so no stray ".tmp" files are left in the landing zone.

    Args:
        dest: Final path of the seed file; parent directories are created.
        content: Raw bytes to store (gzipped on disk when compress is True).
        compress: Keyword-only; gzip the content (True) or write it verbatim.
    """
    if dest.exists():
        return
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(dest.suffix + ".tmp")
    try:
        if compress:
            with gzip.open(tmp, "wb") as f:
                f.write(content)
        else:
            tmp.write_bytes(content)
        # Path.replace overwrites atomically on every platform, whereas
        # Path.rename raises on Windows when dest appeared in the meantime.
        tmp.replace(dest)
    except BaseException:
        # Do not leave a half-written temp file behind on failure.
        tmp.unlink(missing_ok=True)
        raise
    print(f" created: {dest}")
|
||||
|
||||
@@ -87,6 +95,8 @@ def main() -> None:
|
||||
json.dumps({"rows": [], "count": 0}).encode(),
|
||||
"eurostat/1970/01/nama_10r_2hhinc.json.gz":
|
||||
json.dumps({"rows": [], "count": 0}).encode(),
|
||||
"census_usa/1970/01/acs5_state_income.json.gz":
|
||||
json.dumps({"rows": [], "count": 0}).encode(),
|
||||
"eurostat_city_labels/1970/01/cities_codelist.json.gz":
|
||||
json.dumps({"rows": [], "count": 0}).encode(),
|
||||
|
||||
@@ -97,9 +107,21 @@ def main() -> None:
|
||||
json.dumps({"rows": [], "count": 0}).encode(),
|
||||
}
|
||||
|
||||
# Uncompressed seeds — required for files consumed by ST_Read (cannot read .gz)
|
||||
uncompressed_seeds = {
|
||||
# Empty NUTS-2 boundary placeholder so stg_nuts2_boundaries can run before
|
||||
# the real file is downloaded via scripts/download_gisco_nuts.py.
|
||||
# ST_Read on an empty FeatureCollection returns 0 rows (graceful degradation:
|
||||
# all locations fall back to country-level income until the real file lands).
|
||||
"gisco/1970/01/nuts2_boundaries.geojson":
|
||||
b'{"type":"FeatureCollection","features":[]}',
|
||||
}
|
||||
|
||||
print(f"Initialising landing seeds in: {base}")
|
||||
for rel_path, content in seeds.items():
|
||||
create_seed(base / rel_path, content)
|
||||
for rel_path, content in uncompressed_seeds.items():
|
||||
create_seed(base / rel_path, content, compress=False)
|
||||
print("Done.")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user