Files
padelnomics/scripts/init_landing_seeds.py
Deeman 5ade38eeaf feat(data): Phase 2a — NUTS-1 regional income for opportunity score
- eurostat.py: add nama_10r_2hhinc dataset config; append filter params to
  request URL so server pre-filters the large cube before download
- stg_regional_income.sql: new staging model — reads nama_10r_2hhinc.json.gz,
  filters to NUTS-1 codes (3-char), normalises EL→GR / UK→GB
- dim_locations.sql: add admin1_to_nuts1 VALUES CTE (16 German Bundesländer)
  + regional_income CTE; final SELECT uses COALESCE(regional, country) income
- init_landing_seeds.py: add empty seed for nama_10r_2hhinc.json.gz

Munich/Bayern now scores ~29K PPS vs Chemnitz/Sachsen ~19K PPS instead of
both inheriting the same national average (~25.5K PPS).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-27 10:26:15 +01:00

108 lines
4.3 KiB
Python

"""Create minimal landing zone seed files so SQLMesh models can run before real data arrives.
Each seed contains one null/empty record that is filtered out by the staging model's
WHERE clause. Seeds live in the 1970/01 epoch so they're never confused with real data.
Usage:
uv run python scripts/init_landing_seeds.py [--landing-dir data/landing]
Idempotent: skips existing files.
"""
import argparse
import gzip
import json
from pathlib import Path
def create_seed(dest: Path, content: bytes) -> None:
    """Write ``content`` gzip-compressed to ``dest`` via a temp file and rename.

    The payload is first written to a sibling ``<dest>.tmp`` file and then
    renamed onto ``dest``, so ``dest`` never appears in a half-written state.
    If ``dest`` already exists the call is a no-op, which keeps repeated runs
    idempotent.

    Args:
        dest: Final path of the gzip file; missing parent directories are created.
        content: Raw (uncompressed) bytes to store in the gzip file.

    Raises:
        OSError: If the directory, temp file or rename cannot be completed.
        TypeError: If ``content`` is not bytes-like.
    """
    if dest.exists():
        return

    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(dest.suffix + ".tmp")
    try:
        with gzip.open(tmp, "wb") as f:
            f.write(content)
        tmp.rename(dest)
    finally:
        # After a successful rename the temp path no longer exists, so this is
        # a no-op; on any failure it removes the partial file instead of
        # leaving stray *.tmp debris in the landing zone.
        tmp.unlink(missing_ok=True)
    print(f" created: {dest}")
def main() -> None:
    """Create every landing-zone seed file under the chosen landing directory.

    Reads ``--landing-dir`` (default ``data/landing``) from the command line
    and passes each relative path / payload pair to ``create_seed``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--landing-dir", default="data/landing", type=Path)
    base: Path = cli.parse_args().landing_dir

    # Payloads reused by several sources: an empty "rows" blob and an empty
    # Overpass-style "elements" blob.
    empty_rows = json.dumps({"rows": [], "count": 0}).encode()
    empty_elements = json.dumps({"version": 0.6, "elements": []}).encode()

    # (relative path, uncompressed payload) pairs, in creation order.
    seeds = [
        # --- Playtomic tenants ---
        # JSONL: a single null tenant (filtered by WHERE tenant_id IS NOT NULL)
        ("playtomic/1970/01/tenants.jsonl.gz", b'{"tenant_id":null}\n'),
        # Blob: empty tenants array
        (
            "playtomic/1970/01/tenants.json.gz",
            json.dumps({"tenants": [], "count": 0}).encode(),
        ),
        # --- Playtomic availability (morning) ---
        # JSONL: a single null venue (filtered by WHERE tenant_id IS NOT NULL)
        (
            "playtomic/1970/01/availability_1970-01-01.jsonl.gz",
            b'{"tenant_id":null,"date":"1970-01-01","captured_at_utc":"1970-01-01T00:00:00Z","slots":null}\n',
        ),
        # Blob: empty venues array
        (
            "playtomic/1970/01/availability_1970-01-01.json.gz",
            json.dumps(
                {
                    "date": "1970-01-01",
                    "captured_at_utc": "1970-01-01T00:00:00Z",
                    "venue_count": 0,
                    "venues": [],
                }
            ).encode(),
        ),
        # --- Playtomic recheck ---
        # JSONL: a single null venue (filtered by WHERE tenant_id IS NOT NULL)
        (
            "playtomic/1970/01/availability_1970-01-01_recheck_00.jsonl.gz",
            b'{"tenant_id":null,"date":"1970-01-01","captured_at_utc":"1970-01-01T00:00:00Z","recheck_hour":0,"slots":null}\n',
        ),
        # Blob: empty venues array (old format, kept for transition)
        (
            "playtomic/1970/01/availability_1970-01-01_recheck_00.json.gz",
            json.dumps(
                {
                    "date": "1970-01-01",
                    "captured_at_utc": "1970-01-01T00:00:00Z",
                    "recheck_hour": 0,
                    "venues": [],
                }
            ).encode(),
        ),
        # --- GeoNames ---
        # JSONL: a single null city (filtered by WHERE geoname_id IS NOT NULL)
        ("geonames/1970/01/cities_global.jsonl.gz", b'{"geoname_id":null}\n'),
        # Blob: empty rows array
        ("geonames/1970/01/cities_global.json.gz", empty_rows),
        # --- Overpass tennis ---
        # JSONL: a single null element (filtered by WHERE type IS NOT NULL)
        ("overpass_tennis/1970/01/courts.jsonl.gz", b'{"type":null,"id":null}\n'),
        # Blob: empty elements array
        ("overpass_tennis/1970/01/courts.json.gz", empty_elements),
        # --- Overpass padel (unchanged format) ---
        ("overpass/1970/01/courts.json.gz", empty_elements),
        # --- Eurostat ---
        ("eurostat/1970/01/urb_cpop1.json.gz", empty_rows),
        ("eurostat/1970/01/ilc_di03.json.gz", empty_rows),
        ("eurostat/1970/01/nama_10r_2hhinc.json.gz", empty_rows),
        ("eurostat_city_labels/1970/01/cities_codelist.json.gz", empty_rows),
        # --- National statistics ---
        ("ons_uk/1970/01/lad_population.json.gz", empty_rows),
        ("census_usa/1970/01/acs5_places.json.gz", empty_rows),
    ]

    print(f"Initialising landing seeds in: {base}")
    for rel_path, payload in seeds:
        create_seed(base / rel_path, payload)
    print("Done.")
# Run the seeding only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()