feat(extract): convert playtomic_availability to JSONL output
- availability_{date}.jsonl.gz replaces .json.gz for morning snapshots
- Each JSONL line = one venue object with date + captured_at_utc injected
- Eliminates in-memory consolidation: working.jsonl IS the final file
(compress_jsonl_atomic at end instead of write_gzip_atomic blob)
- Crash recovery unchanged: working.jsonl accumulates via flush_partial_batch
- _load_morning_availability tries .jsonl.gz first, falls back to .json.gz
- Skip check covers both formats during transition
- Recheck files stay blob format (small, infrequent)
stg_playtomic_availability: UNION ALL transition (morning_jsonl + morning_blob + recheck_blob)
- morning_jsonl: read_json JSONL, tenant_id direct column, no outer UNNEST
- morning_blob / recheck_blob: subquery + LATERAL UNNEST (unchanged semantics)
- All three produce (snapshot_date, captured_at_utc, snapshot_type, recheck_hour, tenant_id, slots_json)
- Downstream raw_resources / raw_slots CTEs unchanged
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,12 +3,17 @@
|
||||
-- "Available" = the slot was NOT booked at capture time. Missing slots = booked.
|
||||
--
|
||||
-- Reads BOTH morning snapshots and recheck files:
|
||||
-- Morning: availability_{date}.json.gz → snapshot_type = 'morning'
|
||||
-- Recheck: availability_{date}_recheck_{HH}.json.gz → snapshot_type = 'recheck'
|
||||
-- Morning (new): availability_{date}.jsonl.gz → snapshot_type = 'morning'
|
||||
-- Morning (old): availability_{date}.json.gz → snapshot_type = 'morning'
|
||||
-- Recheck: availability_{date}_recheck_{HH}.json.gz → snapshot_type = 'recheck'
|
||||
--
|
||||
-- Only 60-min duration slots are kept (canonical hourly rate + occupancy unit).
|
||||
-- Price parsed from strings like "14.56 EUR" or "48 GBP".
|
||||
--
|
||||
-- Supports two morning landing formats (UNION ALL during migration):
|
||||
-- New: availability_{date}.jsonl.gz — one venue per line, columns: tenant_id, slots, date, captured_at_utc
|
||||
-- Old: availability_{date}.json.gz — {"date":..., "venues": [...]} blob (UNNEST required)
|
||||
--
|
||||
-- Requires: at least one availability file in the landing zone.
|
||||
-- A seed file (data/landing/playtomic/1970/01/availability_1970-01-01.json.gz)
|
||||
-- with empty venues[] ensures this model runs before real data arrives.
|
||||
@@ -20,77 +25,105 @@ MODEL (
|
||||
grain (snapshot_date, tenant_id, resource_id, slot_start_time, snapshot_type, captured_at_utc)
|
||||
);
|
||||
|
||||
-- Morning snapshots (filename does NOT contain '_recheck_')
|
||||
WITH morning_files AS (
|
||||
WITH
|
||||
-- New format: one venue per JSONL line — no outer UNNEST needed
|
||||
morning_jsonl AS (
|
||||
SELECT
|
||||
*,
|
||||
'morning' AS snapshot_type,
|
||||
NULL::INTEGER AS recheck_hour
|
||||
date AS snapshot_date,
|
||||
captured_at_utc,
|
||||
'morning' AS snapshot_type,
|
||||
NULL::INTEGER AS recheck_hour,
|
||||
tenant_id,
|
||||
slots AS slots_json
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/playtomic/*/*/availability_*.json.gz',
|
||||
format = 'auto',
|
||||
@LANDING_DIR || '/playtomic/*/*/availability_*.jsonl.gz',
|
||||
format = 'newline_delimited',
|
||||
columns = {
|
||||
date: 'VARCHAR',
|
||||
captured_at_utc: 'VARCHAR',
|
||||
venues: 'JSON[]'
|
||||
tenant_id: 'VARCHAR',
|
||||
slots: 'JSON'
|
||||
},
|
||||
filename = true,
|
||||
maximum_object_size = 134217728 -- 128 MB; daily files grow with venue count
|
||||
filename = true
|
||||
)
|
||||
WHERE filename NOT LIKE '%_recheck_%'
|
||||
AND venues IS NOT NULL
|
||||
AND json_array_length(venues) > 0
|
||||
AND tenant_id IS NOT NULL
|
||||
),
|
||||
-- Recheck snapshots (filename contains '_recheck_')
|
||||
-- Use TRY_CAST on a regex-extracted hour to get the recheck_hour.
|
||||
-- If no recheck files exist yet, this CTE produces zero rows (safe).
|
||||
recheck_files AS (
|
||||
-- Old format: {"date":..., "venues": [...]} blob — kept for transition
|
||||
morning_blob AS (
|
||||
SELECT
|
||||
*,
|
||||
'recheck' AS snapshot_type,
|
||||
TRY_CAST(
|
||||
regexp_extract(filename, '_recheck_(\d+)', 1) AS INTEGER
|
||||
) AS recheck_hour
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/playtomic/*/*/availability_*_recheck_*.json.gz',
|
||||
format = 'auto',
|
||||
columns = {
|
||||
date: 'VARCHAR',
|
||||
captured_at_utc: 'VARCHAR',
|
||||
venues: 'JSON[]'
|
||||
},
|
||||
filename = true,
|
||||
maximum_object_size = 134217728 -- 128 MB; matches morning snapshot limit
|
||||
)
|
||||
WHERE venues IS NOT NULL
|
||||
AND json_array_length(venues) > 0
|
||||
),
|
||||
all_files AS (
|
||||
SELECT date, captured_at_utc, venues, snapshot_type, recheck_hour FROM morning_files
|
||||
UNION ALL
|
||||
SELECT date, captured_at_utc, venues, snapshot_type, recheck_hour FROM recheck_files
|
||||
),
|
||||
raw_venues AS (
|
||||
SELECT
|
||||
af.date AS snapshot_date,
|
||||
af.date AS snapshot_date,
|
||||
af.captured_at_utc,
|
||||
af.snapshot_type,
|
||||
af.recheck_hour,
|
||||
venue_json
|
||||
FROM all_files af,
|
||||
'morning' AS snapshot_type,
|
||||
NULL::INTEGER AS recheck_hour,
|
||||
venue_json ->> 'tenant_id' AS tenant_id,
|
||||
venue_json -> 'slots' AS slots_json
|
||||
FROM (
|
||||
SELECT date, captured_at_utc, venues
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/playtomic/*/*/availability_*.json.gz',
|
||||
format = 'auto',
|
||||
columns = {
|
||||
date: 'VARCHAR',
|
||||
captured_at_utc: 'VARCHAR',
|
||||
venues: 'JSON[]'
|
||||
},
|
||||
filename = true,
|
||||
maximum_object_size = 134217728 -- 128 MB; daily files grow with venue count
|
||||
)
|
||||
WHERE filename NOT LIKE '%_recheck_%'
|
||||
AND venues IS NOT NULL
|
||||
AND json_array_length(venues) > 0
|
||||
) af,
|
||||
LATERAL UNNEST(af.venues) AS t(venue_json)
|
||||
),
|
||||
-- Recheck snapshots (blob format only — small files, no JSONL conversion needed)
|
||||
recheck_blob AS (
|
||||
SELECT
|
||||
rf.date AS snapshot_date,
|
||||
rf.captured_at_utc,
|
||||
'recheck' AS snapshot_type,
|
||||
TRY_CAST(
|
||||
regexp_extract(rf.filename, '_recheck_(\d+)', 1) AS INTEGER
|
||||
) AS recheck_hour,
|
||||
venue_json ->> 'tenant_id' AS tenant_id,
|
||||
venue_json -> 'slots' AS slots_json
|
||||
FROM (
|
||||
SELECT date, captured_at_utc, venues, filename
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/playtomic/*/*/availability_*_recheck_*.json.gz',
|
||||
format = 'auto',
|
||||
columns = {
|
||||
date: 'VARCHAR',
|
||||
captured_at_utc: 'VARCHAR',
|
||||
venues: 'JSON[]'
|
||||
},
|
||||
filename = true,
|
||||
maximum_object_size = 134217728 -- 128 MB; matches morning snapshot limit
|
||||
)
|
||||
WHERE venues IS NOT NULL
|
||||
AND json_array_length(venues) > 0
|
||||
) rf,
|
||||
LATERAL UNNEST(rf.venues) AS t(venue_json)
|
||||
),
|
||||
all_venues AS (
|
||||
SELECT * FROM morning_jsonl
|
||||
UNION ALL
|
||||
SELECT * FROM morning_blob
|
||||
UNION ALL
|
||||
SELECT * FROM recheck_blob
|
||||
),
|
||||
raw_resources AS (
|
||||
SELECT
|
||||
rv.snapshot_date,
|
||||
rv.captured_at_utc,
|
||||
rv.snapshot_type,
|
||||
rv.recheck_hour,
|
||||
rv.venue_json ->> 'tenant_id' AS tenant_id,
|
||||
av.snapshot_date,
|
||||
av.captured_at_utc,
|
||||
av.snapshot_type,
|
||||
av.recheck_hour,
|
||||
av.tenant_id,
|
||||
resource_json
|
||||
FROM raw_venues rv,
|
||||
FROM all_venues av,
|
||||
LATERAL UNNEST(
|
||||
from_json(rv.venue_json -> 'slots', '["JSON"]')
|
||||
from_json(av.slots_json, '["JSON"]')
|
||||
) AS t(resource_json)
|
||||
),
|
||||
raw_slots AS (
|
||||
|
||||
Reference in New Issue
Block a user