feat(extract): regional overpass_tennis splitting + JSONL output
Replace single global Overpass query (150K+ elements, times out) with
10 regional bbox queries (~10-40K elements each, 150s server / 180s client).
- REGIONS: 10 bboxes covering all continents
- Crash recovery: working.jsonl accumulates per-region results;
already_seen_ids deduplication skips re-written elements on restart
- Overlapping bbox elements deduped by OSM id across regions
- Retry per region: up to 2 retries with 30s cooldown
- Polite 5s inter-region delay
- Skip if courts.jsonl.gz or courts.json.gz already exists for the month
stg_tennis_courts: UNION ALL transition (jsonl_elements + blob_elements)
- jsonl_elements: JSONL, explicit columns, COALESCE lat/lon with center coords
(supports both node direct lat/lon and way/relation Overpass out center)
- blob_elements: existing UNNEST(elements) pattern, unchanged
- Removed osm_type='node' filter — ways/relations now usable via center coords
- Dedup on (osm_id, extracted_date DESC) unchanged
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,12 @@
|
||||
-- Used as a "racket sport culture" signal in the opportunity score:
|
||||
-- areas with high tennis court density are prime padel adoption markets.
|
||||
--
|
||||
-- Source: data/landing/overpass_tennis/{year}/{month}/courts.json.gz
|
||||
-- Supports two landing formats (UNION ALL during migration):
|
||||
-- New: courts.jsonl.gz — one OSM element per line; nodes have lat/lon directly,
|
||||
-- ways/relations have center.lat/center.lon (Overpass out center)
|
||||
-- Old: courts.json.gz — {"elements": [...]} blob (UNNEST required)
|
||||
--
|
||||
-- Source: data/landing/overpass_tennis/{year}/{month}/courts.{jsonl,json}.gz
|
||||
|
||||
MODEL (
|
||||
name staging.stg_tennis_courts,
|
||||
@@ -11,7 +16,39 @@ MODEL (
|
||||
grain osm_id
|
||||
);
|
||||
|
||||
WITH parsed AS (
|
||||
WITH
|
||||
-- New format: one OSM element per JSONL line
|
||||
jsonl_elements AS (
|
||||
SELECT
|
||||
type AS osm_type,
|
||||
TRY_CAST(id AS BIGINT) AS osm_id,
|
||||
-- Nodes: lat/lon direct. Ways/relations: center object (Overpass out center).
|
||||
COALESCE(
|
||||
TRY_CAST(lat AS DOUBLE),
|
||||
TRY_CAST(center ->> 'lat' AS DOUBLE)
|
||||
) AS lat,
|
||||
COALESCE(
|
||||
TRY_CAST(lon AS DOUBLE),
|
||||
TRY_CAST(center ->> 'lon' AS DOUBLE)
|
||||
) AS lon,
|
||||
tags ->> 'name' AS name,
|
||||
tags ->> 'addr:country' AS country_code,
|
||||
tags ->> 'addr:city' AS city_tag,
|
||||
filename AS source_file,
|
||||
CURRENT_DATE AS extracted_date
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/overpass_tennis/*/*/courts.jsonl.gz',
|
||||
format = 'newline_delimited',
|
||||
columns = {
|
||||
type: 'VARCHAR', id: 'BIGINT', lat: 'DOUBLE', lon: 'DOUBLE',
|
||||
center: 'JSON', tags: 'JSON'
|
||||
},
|
||||
filename = true
|
||||
)
|
||||
WHERE type IS NOT NULL
|
||||
),
|
||||
-- Old format: {"elements": [...]} blob — kept for transition
|
||||
blob_elements AS (
|
||||
SELECT
|
||||
elem ->> 'type' AS osm_type,
|
||||
(elem ->> 'id')::BIGINT AS osm_id,
|
||||
@@ -32,12 +69,16 @@ WITH parsed AS (
|
||||
)
|
||||
WHERE (elem ->> 'type') IS NOT NULL
|
||||
),
|
||||
parsed AS (
|
||||
SELECT * FROM jsonl_elements
|
||||
UNION ALL
|
||||
SELECT * FROM blob_elements
|
||||
),
|
||||
deduped AS (
|
||||
SELECT *,
|
||||
ROW_NUMBER() OVER (PARTITION BY osm_id ORDER BY extracted_date DESC) AS rn
|
||||
FROM parsed
|
||||
WHERE osm_type = 'node'
|
||||
AND lat IS NOT NULL AND lon IS NOT NULL
|
||||
WHERE lat IS NOT NULL AND lon IS NOT NULL
|
||||
AND lat BETWEEN -90 AND 90
|
||||
AND lon BETWEEN -180 AND 180
|
||||
),
|
||||
@@ -54,8 +95,8 @@ with_country AS (
|
||||
WHEN lat BETWEEN 36.35 AND 47.09 AND lon BETWEEN 6.62 AND 18.51 THEN 'IT'
|
||||
WHEN lat BETWEEN 37.00 AND 42.15 AND lon BETWEEN -9.50 AND -6.19 THEN 'PT'
|
||||
ELSE NULL
|
||||
END) AS country_code,
|
||||
NULLIF(TRIM(name), '') AS name,
|
||||
END) AS country_code,
|
||||
NULLIF(TRIM(name), '') AS name,
|
||||
NULLIF(TRIM(city_tag), '') AS city,
|
||||
extracted_date
|
||||
FROM deduped
|
||||
|
||||
Reference in New Issue
Block a user