feat(extract): convert playtomic_availability to JSONL output

- availability_{date}.jsonl.gz replaces .json.gz for morning snapshots
- Each JSONL line = one venue object with date + captured_at_utc injected
- Eliminates in-memory consolidation: working.jsonl IS the final file
  (compress_jsonl_atomic at end instead of write_gzip_atomic blob)
- Crash recovery unchanged: working.jsonl accumulates via flush_partial_batch
- _load_morning_availability tries .jsonl.gz first, falls back to .json.gz
- Skip check covers both formats during transition
- Recheck files stay blob format (small, infrequent)

stg_playtomic_availability: UNION ALL transition (morning_jsonl + morning_blob + recheck_blob)
  - morning_jsonl: read_json JSONL, tenant_id direct column, no outer UNNEST
  - morning_blob / recheck_blob: subquery + LATERAL UNNEST (unchanged semantics)
  - All three produce (snapshot_date, captured_at_utc, snapshot_type, recheck_hour, tenant_id, slots_json)
  - Downstream raw_resources / raw_slots CTEs unchanged

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-25 12:14:38 +01:00
parent 9bef055e6d
commit 7b03fd71f9
2 changed files with 139 additions and 83 deletions

View File

@@ -16,7 +16,7 @@ records (a few seconds of work with 10 workers) are lost on crash.
Recheck mode: re-queries venues with slots starting within the next 90 minutes.
Writes a separate recheck file for more accurate occupancy measurement.
Landing: {LANDING_DIR}/playtomic/{year}/{month}/availability_{date}.json.gz
Landing: {LANDING_DIR}/playtomic/{year}/{month}/availability_{date}.jsonl.gz
Recheck: {LANDING_DIR}/playtomic/{year}/{month}/availability_{date}_recheck_{HH}.json.gz
"""
@@ -34,7 +34,13 @@ import niquests
from ._shared import HTTP_TIMEOUT_SECONDS, USER_AGENT, run_extractor, setup_logging
from .proxy import load_fallback_proxy_urls, load_proxy_urls, make_tiered_cycler
from .utils import flush_partial_batch, landing_path, load_partial_results, write_gzip_atomic
from .utils import (
compress_jsonl_atomic,
flush_partial_batch,
landing_path,
load_partial_results,
write_gzip_atomic,
)
logger = setup_logging("padelnomics.extract.playtomic_availability")
@@ -273,14 +279,14 @@ def extract(
year, month = year_month.split("/")
dest_dir = landing_path(landing_dir, "playtomic", year, month)
dest = dest_dir / f"availability_{target_date}.json.gz"
if dest.exists():
logger.info("Already have %s — skipping", dest)
dest = dest_dir / f"availability_{target_date}.jsonl.gz"
old_blob = dest_dir / f"availability_{target_date}.json.gz"
if dest.exists() or old_blob.exists():
logger.info("Already have availability for %s — skipping", target_date)
return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}
# Crash resumption: load already-fetched venues from partial file
partial_path = dest.with_suffix(".partial.jsonl")
# Crash resumption: load already-fetched venues from working file
partial_path = dest_dir / f"availability_{target_date}.working.jsonl"
prior_results, already_done = load_partial_results(partial_path, id_key="tenant_id")
if already_done:
logger.info("Resuming: %d venues already fetched from partial file", len(already_done))
@@ -297,7 +303,10 @@ def extract(
start_min_str = start_min.strftime("%Y-%m-%dT%H:%M:%S")
start_max_str = start_max.strftime("%Y-%m-%dT%H:%M:%S")
# Partial file for incremental crash-safe progress
# Timestamp stamped into every JSONL line — computed once before the fetch loop.
captured_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
# Working file for incremental crash-safe progress (IS the final file).
partial_file = open(partial_path, "a") # noqa: SIM115
partial_lock = threading.Lock()
pending_batch: list[dict] = []
@@ -305,6 +314,9 @@ def extract(
def _on_result(result: dict) -> None:
# Called inside _fetch_venues_parallel's lock — no additional locking needed.
# In serial mode, called single-threaded — also safe without extra locking.
# Inject date + captured_at so every JSONL line is self-contained.
result["date"] = target_date
result["captured_at_utc"] = captured_at
pending_batch.append(result)
if len(pending_batch) >= PARTIAL_FLUSH_SIZE:
flush_partial_batch(partial_file, partial_lock, pending_batch)
@@ -348,24 +360,13 @@ def extract(
pending_batch.clear()
partial_file.close()
# Consolidate prior (resumed) + new results into final file
venues_data = prior_results + new_venues_data
captured_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
payload = json.dumps({
"date": target_date,
"captured_at_utc": captured_at,
"venue_count": len(venues_data),
"venues_errored": venues_errored,
"venues": venues_data,
}).encode()
bytes_written = write_gzip_atomic(dest, payload)
if partial_path.exists():
partial_path.unlink()
# Working file IS the output — compress atomically (deletes source).
total_venues = len(prior_results) + len(new_venues_data)
bytes_written = compress_jsonl_atomic(partial_path, dest)
logger.info(
"%d venues scraped (%d errors) -> %s (%s bytes)",
len(venues_data), venues_errored, dest, f"{bytes_written:,}",
total_venues, venues_errored, dest, f"{bytes_written:,}",
)
return {
@@ -380,14 +381,36 @@ def extract(
# Recheck mode — re-query venues with upcoming slots for accurate occupancy
# ---------------------------------------------------------------------------
def _read_availability_jsonl(path: Path) -> dict:
    """Read a gzipped JSONL availability file into the blob dict shape recheck expects.

    Each line is one venue record; the first record's ``date`` and
    ``captured_at_utc`` fields are lifted to the top level of the result.
    A line that fails to parse ends the read: it is assumed to be the
    truncated final line left behind by a crash mid-write.
    """
    records: list[dict] = []
    snapshot_date = None
    snapshot_captured_at = None
    with gzip.open(path, "rt") as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                parsed = json.loads(stripped)
            except json.JSONDecodeError:
                break  # truncated last line on crash
            if snapshot_date is None:
                # First parseable record supplies the snapshot-level metadata.
                snapshot_date = parsed.get("date")
                snapshot_captured_at = parsed.get("captured_at_utc")
            records.append(parsed)
    return {
        "date": snapshot_date,
        "captured_at_utc": snapshot_captured_at,
        "venues": records,
    }
def _load_morning_availability(landing_dir: Path, target_date: str) -> dict | None:
    """Load today's morning availability file (JSONL or blob). Returns dict or None.

    Globs across all year/month subdirectories under the playtomic landing
    dir for *target_date*. The new ``.jsonl.gz`` format is preferred; the
    legacy ``.json.gz`` blob is the fallback during the format transition.
    Returns the parsed dict, or None when no file exists for the date.
    """
    playtomic_dir = landing_dir / "playtomic"
    # Try JSONL first (new format), fall back to blob (old format).
    matches = list(playtomic_dir.glob(f"*/*/availability_{target_date}.jsonl.gz"))
    if matches:
        return _read_availability_jsonl(matches[0])
    matches = list(playtomic_dir.glob(f"*/*/availability_{target_date}.json.gz"))
    if not matches:
        return None
    with gzip.open(matches[0], "rb") as f:
        return json.loads(f.read())