merge: standardise recheck availability to JSONL + update docs

This commit is contained in:
Deeman
2026-02-25 15:45:23 +01:00
6 changed files with 63 additions and 31 deletions

View File

@@ -5,7 +5,8 @@
-- Reads BOTH morning snapshots and recheck files:
-- Morning (new): availability_{date}.jsonl.gz → snapshot_type = 'morning'
-- Morning (old): availability_{date}.json.gz → snapshot_type = 'morning'
-- Recheck: availability_{date}_recheck_{HH}.json.gz → snapshot_type = 'recheck'
-- Recheck (new): availability_{date}_recheck_{HH}.jsonl.gz → snapshot_type = 'recheck'
-- Recheck (old): availability_{date}_recheck_{HH}.json.gz → snapshot_type = 'recheck'
--
-- Only 60-min duration slots are kept (canonical hourly rate + occupancy unit).
-- Price parsed from strings like "14.56 EUR" or "48 GBP".
@@ -77,7 +78,30 @@ morning_blob AS (
) af,
LATERAL UNNEST(af.venues) AS t(venue_json)
),
-- Recheck snapshots (blob format only — small files, no JSONL conversion needed)
-- Recheck snapshots (new JSONL format — one venue per line)
recheck_jsonl AS (
    -- Recheck captures stored as gzipped newline-delimited JSON.
    -- Projects the same six columns, in the same order, as the other
    -- snapshot CTEs so the result can be combined positionally in all_venues.
    SELECT
        rj.date                              AS snapshot_date,
        rj.captured_at_utc,
        'recheck'                            AS snapshot_type,
        TRY_CAST(rj.recheck_hour AS INTEGER) AS recheck_hour,  -- read as text; NULL when not an integer
        rj.tenant_id,
        rj.slots                             AS slots_json     -- left as raw JSON here
    -- The schema is declared explicitly, so every field arrives as VARCHAR
    -- (or JSON for slots) rather than being inferred per file.
    -- NOTE(review): DuckDB's read_json is expected to raise when the glob
    -- matches no files — confirm at least one *_recheck_*.jsonl.gz file
    -- exists in every environment this model runs in.
    FROM read_json(
        @LANDING_DIR || '/playtomic/*/*/availability_*_recheck_*.jsonl.gz',
        format = 'newline_delimited',
        columns = {
            date: 'VARCHAR',
            captured_at_utc: 'VARCHAR',
            recheck_hour: 'VARCHAR',
            tenant_id: 'VARCHAR',
            slots: 'JSON'
        },
        filename = true
    ) AS rj
    -- Drop lines that carry no tenant identifier.
    WHERE rj.tenant_id IS NOT NULL
),
-- Recheck snapshots (old blob format, kept for transition)
recheck_blob AS (
SELECT
rf.date AS snapshot_date,
@@ -111,6 +135,8 @@ all_venues AS (
UNION ALL
SELECT * FROM morning_blob
UNION ALL
SELECT * FROM recheck_jsonl
UNION ALL
SELECT * FROM recheck_blob
),
raw_resources AS (