feat: add init_landing_seeds.py for empty-landing bootstrap
Creates minimal .jsonl.gz and .json.gz seed files so all SQLMesh staging
models can compile and run before real extraction data arrives.
Each JSONL seed has a single null record filtered by the staging model's WHERE
clause (tenant_id IS NOT NULL, geoname_id IS NOT NULL, type IS NOT NULL, etc.); each blob seed wraps an empty array.
Covers both formats (JSONL + blob) for the UNION ALL transition CTEs:
playtomic/1970/01/: tenants.{jsonl,json}.gz, availability seeds (morning + recheck)
geonames/1970/01/: cities_global.{jsonl,json}.gz
overpass_tennis/1970/01/: courts.{jsonl,json}.gz
overpass/1970/01/: courts.json.gz (padel, unchanged format)
eurostat/1970/01/: urb_cpop1.json.gz, ilc_di03.json.gz
eurostat_city_labels/1970/01/: cities_codelist.json.gz
ons_uk/1970/01/: lad_population.json.gz
census_usa/1970/01/: acs5_places.json.gz
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
101
scripts/init_landing_seeds.py
Normal file
101
scripts/init_landing_seeds.py
Normal file
@@ -0,0 +1,101 @@
"""Create minimal landing zone seed files so SQLMesh models can run before real data arrives.

Each JSONL seed contains one null record that is filtered out by the staging model's
WHERE clause; each blob (.json.gz) seed wraps an empty array instead. Seeds live under
the 1970/01 epoch partition so they are never confused with real data.

Usage:
    uv run python scripts/init_landing_seeds.py [--landing-dir data/landing]

Idempotent: skips existing files.
"""
|
||||
|
||||
import argparse
|
||||
import gzip
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def create_seed(dest: Path, content: bytes) -> None:
    """Write ``content`` gzip-compressed to ``dest`` unless ``dest`` already exists.

    The payload is first written to a ``<name>.tmp`` sibling and then moved
    into place, so a reader never observes a partially written seed. If the
    write or the move fails, the temporary file is removed before the
    exception propagates, so no stray ``.tmp`` file is left behind.

    Args:
        dest: Final path of the gzip file. Missing parent directories are created.
        content: Raw (uncompressed) bytes to store.
    """
    if dest.exists():
        # Idempotent: never overwrite a seed (or real data) that is already there.
        return

    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_name(dest.name + ".tmp")
    try:
        with gzip.open(tmp, "wb") as f:
            f.write(content)
        # replace() has the same overwrite semantics on every platform.
        tmp.replace(dest)
    finally:
        # After a successful move the temp file is gone and this is a no-op;
        # after a failure it removes the partial file.
        tmp.unlink(missing_ok=True)
    print(f" created: {dest}")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and create every landing-zone seed file.

    Reads ``--landing-dir`` (default ``data/landing``) and calls
    ``create_seed`` once per seed, in the order listed below, with paths
    relative to that directory.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--landing-dir", default="data/landing", type=Path)
    landing_root: Path = cli.parse_args().landing_dir

    def blob(document: dict) -> bytes:
        """Serialise a blob-format payload to JSON bytes."""
        return json.dumps(document).encode()

    # Payloads shared by several sources.
    empty_rows = blob({"rows": [], "count": 0})
    empty_elements = blob({"version": 0.6, "elements": []})

    seed_files: list[tuple[str, bytes]] = [
        # --- Playtomic tenants ---
        # JSONL: one null tenant (filtered by WHERE tenant_id IS NOT NULL)
        ("playtomic/1970/01/tenants.jsonl.gz", b'{"tenant_id":null}\n'),
        # Blob: empty tenants array
        ("playtomic/1970/01/tenants.json.gz", blob({"tenants": [], "count": 0})),

        # --- Playtomic availability (morning) ---
        # JSONL: one null venue (filtered by WHERE tenant_id IS NOT NULL)
        (
            "playtomic/1970/01/availability_1970-01-01.jsonl.gz",
            b'{"tenant_id":null,"date":"1970-01-01","captured_at_utc":"1970-01-01T00:00:00Z","slots":null}\n',
        ),
        # Blob: empty venues array
        (
            "playtomic/1970/01/availability_1970-01-01.json.gz",
            blob(
                {
                    "date": "1970-01-01",
                    "captured_at_utc": "1970-01-01T00:00:00Z",
                    "venue_count": 0,
                    "venues": [],
                }
            ),
        ),

        # --- Playtomic recheck (blob only, small format) ---
        (
            "playtomic/1970/01/availability_1970-01-01_recheck_00.json.gz",
            blob(
                {
                    "date": "1970-01-01",
                    "captured_at_utc": "1970-01-01T00:00:00Z",
                    "recheck_hour": 0,
                    "venues": [],
                }
            ),
        ),

        # --- GeoNames ---
        # JSONL: one null city (filtered by WHERE geoname_id IS NOT NULL)
        ("geonames/1970/01/cities_global.jsonl.gz", b'{"geoname_id":null}\n'),
        # Blob: empty rows array
        ("geonames/1970/01/cities_global.json.gz", empty_rows),

        # --- Overpass tennis ---
        # JSONL: one null element (filtered by WHERE type IS NOT NULL)
        ("overpass_tennis/1970/01/courts.jsonl.gz", b'{"type":null,"id":null}\n'),
        # Blob: empty elements array
        ("overpass_tennis/1970/01/courts.json.gz", empty_elements),

        # --- Overpass padel (unchanged format) ---
        ("overpass/1970/01/courts.json.gz", empty_elements),

        # --- Eurostat ---
        ("eurostat/1970/01/urb_cpop1.json.gz", empty_rows),
        ("eurostat/1970/01/ilc_di03.json.gz", empty_rows),
        ("eurostat_city_labels/1970/01/cities_codelist.json.gz", empty_rows),

        # --- National statistics ---
        ("ons_uk/1970/01/lad_population.json.gz", empty_rows),
        ("census_usa/1970/01/acs5_places.json.gz", empty_rows),
    ]

    print(f"Initialising landing seeds in: {landing_root}")
    for relative_path, payload in seed_files:
        create_seed(landing_root / relative_path, payload)
    print("Done.")
|
||||
|
||||
|
||||
# Run the seeding only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user