feat: standardise recheck availability to JSONL output
- extract_recheck() now writes availability_{date}_recheck_{HH}.jsonl.gz
(one venue per line with date/captured_at_utc/recheck_hour injected);
uses compress_jsonl_atomic; removes write_gzip_atomic import
- stg_playtomic_availability: add recheck_jsonl CTE (newline_delimited
read_json on *.jsonl.gz recheck files); include in all_venues UNION ALL;
old recheck_blob CTE kept for transition
- init_landing_seeds.py: add JSONL recheck seed alongside blob seed
- Docs: README landing structure + data sources table updated; CHANGELOG
availability bullets updated; data-sources-inventory paths corrected
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,7 +5,8 @@
|
||||
-- Reads BOTH morning snapshots and recheck files:
|
||||
-- Morning (new): availability_{date}.jsonl.gz → snapshot_type = 'morning'
|
||||
-- Morning (old): availability_{date}.json.gz → snapshot_type = 'morning'
|
||||
-- Recheck: availability_{date}_recheck_{HH}.json.gz → snapshot_type = 'recheck'
|
||||
-- Recheck (new): availability_{date}_recheck_{HH}.jsonl.gz → snapshot_type = 'recheck'
|
||||
-- Recheck (old): availability_{date}_recheck_{HH}.json.gz → snapshot_type = 'recheck'
|
||||
--
|
||||
-- Only 60-min duration slots are kept (canonical hourly rate + occupancy unit).
|
||||
-- Price parsed from strings like "14.56 EUR" or "48 GBP".
|
||||
@@ -77,7 +78,30 @@ morning_blob AS (
|
||||
) af,
|
||||
LATERAL UNNEST(af.venues) AS t(venue_json)
|
||||
),
|
||||
-- Recheck snapshots (blob format only — small files, no JSONL conversion needed)
|
||||
-- Recheck snapshots (new JSONL format — one venue per line)
|
||||
-- Produces one row per line of the recheck JSONL files.
-- all_venues combines this CTE with its siblings via SELECT * ... UNION ALL,
-- which matches columns by position, not by name — keep the column order
-- below in step with the other CTEs in that union.
recheck_jsonl AS (
|
||||
SELECT
|
||||
date AS snapshot_date,
|
||||
captured_at_utc,
|
||||
-- Constant tag marking these rows as recheck snapshots.
'recheck' AS snapshot_type,
|
||||
-- recheck_hour is read as VARCHAR (see columns below); TRY_CAST yields NULL
-- instead of raising when the value is not a valid integer.
TRY_CAST(recheck_hour AS INTEGER) AS recheck_hour,
|
||||
tenant_id,
|
||||
-- slots is passed through as raw JSON; it is not unpacked in this CTE.
slots AS slots_json
|
||||
FROM read_json(
|
||||
-- Glob: recheck files with the .jsonl.gz extension, two directory levels
-- below <LANDING_DIR>/playtomic/.
@LANDING_DIR || '/playtomic/*/*/availability_*_recheck_*.jsonl.gz',
|
||||
-- One JSON object per line.
format = 'newline_delimited',
|
||||
-- Explicit schema: every field is read as text except slots, which stays JSON.
columns = {
|
||||
date: 'VARCHAR',
|
||||
captured_at_utc: 'VARCHAR',
|
||||
recheck_hour: 'VARCHAR',
|
||||
tenant_id: 'VARCHAR',
|
||||
slots: 'JSON'
|
||||
},
|
||||
-- Asks the reader to expose the source file path as an extra column;
-- this SELECT does not reference it.
filename = true
|
||||
)
|
||||
-- Drop lines that carry no tenant_id.
WHERE tenant_id IS NOT NULL
|
||||
),
|
||||
-- Recheck snapshots (old blob format, kept for transition)
|
||||
recheck_blob AS (
|
||||
SELECT
|
||||
rf.date AS snapshot_date,
|
||||
@@ -111,6 +135,8 @@ all_venues AS (
|
||||
UNION ALL
|
||||
SELECT * FROM morning_blob
|
||||
UNION ALL
|
||||
SELECT * FROM recheck_jsonl
|
||||
UNION ALL
|
||||
SELECT * FROM recheck_blob
|
||||
),
|
||||
raw_resources AS (
|
||||
|
||||
Reference in New Issue
Block a user