# Recovered from a flattened git diff that added scripts/init_landing_seeds.py
# (new file, blob cc61bd5); diff markers removed and formatting restored.
"""Create minimal landing zone seed files so SQLMesh models can run before real data arrives.

Each seed contains one null/empty record that is filtered out by the staging model's
WHERE clause. Seeds live in the 1970/01 epoch so they're never confused with real data.

Usage:
    uv run python scripts/init_landing_seeds.py [--landing-dir data/landing]

Idempotent: skips existing files.
"""

import argparse
import gzip
import json
from pathlib import Path

# Payload shared by every "rows"-style blob seed (GeoNames, Eurostat, national
# statistics): an empty row list with a zero count.
_EMPTY_ROWS_BLOB = json.dumps({"rows": [], "count": 0}).encode()


def create_seed(dest: Path, content: bytes) -> None:
    """Write *content* gzip-compressed to *dest*, skipping files that already exist.

    The data is first written to a sibling ``<name>.tmp`` file and then moved
    into place, so a reader never observes a half-written seed.

    Args:
        dest: Final path of the gzip file. Missing parent directories are created.
        content: Uncompressed bytes to store.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written or moved into place.
    """
    if dest.exists():
        return
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_name(dest.name + ".tmp")
    try:
        with gzip.open(tmp, "wb") as f:
            f.write(content)
        # replace() overwrites an existing target on every platform, whereas
        # rename() raises on Windows if dest appeared after the exists() check.
        tmp.replace(dest)
    finally:
        # No-op after a successful replace; removes the partial file otherwise.
        tmp.unlink(missing_ok=True)
    print(f" created: {dest}")


def _seed_payloads() -> dict[str, bytes]:
    """Return the seed table: landing-relative path -> uncompressed file content."""
    return {
        # --- Playtomic tenants ---
        # JSONL: one null tenant (filtered by WHERE tenant_id IS NOT NULL)
        "playtomic/1970/01/tenants.jsonl.gz":
            b'{"tenant_id":null}\n',
        # Blob: empty tenants array
        "playtomic/1970/01/tenants.json.gz":
            json.dumps({"tenants": [], "count": 0}).encode(),

        # --- Playtomic availability (morning) ---
        # JSONL: one null venue (filtered by WHERE tenant_id IS NOT NULL)
        "playtomic/1970/01/availability_1970-01-01.jsonl.gz":
            b'{"tenant_id":null,"date":"1970-01-01","captured_at_utc":"1970-01-01T00:00:00Z","slots":null}\n',
        # Blob: empty venues array
        "playtomic/1970/01/availability_1970-01-01.json.gz":
            json.dumps({"date": "1970-01-01", "captured_at_utc": "1970-01-01T00:00:00Z",
                        "venue_count": 0, "venues": []}).encode(),

        # --- Playtomic recheck (blob only, small format) ---
        "playtomic/1970/01/availability_1970-01-01_recheck_00.json.gz":
            json.dumps({"date": "1970-01-01", "captured_at_utc": "1970-01-01T00:00:00Z",
                        "recheck_hour": 0, "venues": []}).encode(),

        # --- GeoNames ---
        # JSONL: one null city (filtered by WHERE geoname_id IS NOT NULL)
        "geonames/1970/01/cities_global.jsonl.gz":
            b'{"geoname_id":null}\n',
        # Blob: empty rows array
        "geonames/1970/01/cities_global.json.gz": _EMPTY_ROWS_BLOB,

        # --- Overpass tennis ---
        # JSONL: one null element (filtered by WHERE type IS NOT NULL)
        "overpass_tennis/1970/01/courts.jsonl.gz":
            b'{"type":null,"id":null}\n',
        # Blob: empty elements array
        "overpass_tennis/1970/01/courts.json.gz":
            json.dumps({"version": 0.6, "elements": []}).encode(),

        # --- Overpass padel (unchanged format) ---
        "overpass/1970/01/courts.json.gz":
            json.dumps({"version": 0.6, "elements": []}).encode(),

        # --- Eurostat ---
        "eurostat/1970/01/urb_cpop1.json.gz": _EMPTY_ROWS_BLOB,
        "eurostat/1970/01/ilc_di03.json.gz": _EMPTY_ROWS_BLOB,
        "eurostat_city_labels/1970/01/cities_codelist.json.gz": _EMPTY_ROWS_BLOB,

        # --- National statistics ---
        "ons_uk/1970/01/lad_population.json.gz": _EMPTY_ROWS_BLOB,
        "census_usa/1970/01/acs5_places.json.gz": _EMPTY_ROWS_BLOB,
    }


def main(argv: "list[str] | None" = None) -> None:
    """Create every seed file below the landing directory.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the docstring's line breaks so the "Usage:" block stays readable.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--landing-dir", default="data/landing", type=Path)
    args = parser.parse_args(argv)
    base: Path = args.landing_dir

    print(f"Initialising landing seeds in: {base}")
    for rel_path, content in _seed_payloads().items():
        create_seed(base / rel_path, content)
    print("Done.")


if __name__ == "__main__":
    main()