feat(extract): regional overpass_tennis splitting + JSONL output

Replace single global Overpass query (150K+ elements, times out) with
10 regional bbox queries (~10-40K elements each, 150s server / 180s client).

- REGIONS: 10 bboxes covering all continents
- Crash recovery: working.jsonl accumulates per-region results;
  already_seen_ids deduplication skips re-written elements on restart
- Overlapping bbox elements deduped by OSM id across regions
- Retry per region: up to 2 retries with 30s cooldown
- Polite 5s inter-region delay
- Skip if courts.jsonl.gz or courts.json.gz already exists for the month

stg_tennis_courts: UNION ALL transition (jsonl_elements + blob_elements)
  - jsonl_elements: JSONL, explicit columns, COALESCE lat/lon with center coords
    (supports both node direct lat/lon and way/relation Overpass out center)
  - blob_elements: existing UNNEST(elements) pattern, unchanged
  - Removed osm_type='node' filter — ways/relations now usable via center coords
  - Dedup on (osm_id, extracted_date DESC) unchanged

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-25 12:19:37 +01:00
parent a4f246d69a
commit b5b8493543
2 changed files with 177 additions and 39 deletions

View File

@@ -2,7 +2,12 @@
-- Used as a "racket sport culture" signal in the opportunity score:
-- areas with high tennis court density are prime padel adoption markets.
--
-- Source: data/landing/overpass_tennis/{year}/{month}/courts.json.gz
-- Supports two landing formats (UNION ALL during migration):
-- New: courts.jsonl.gz — one OSM element per line; nodes have lat/lon directly,
-- ways/relations have center.lat/center.lon (Overpass out center)
-- Old: courts.json.gz — {"elements": [...]} blob (UNNEST required)
--
-- Source: data/landing/overpass_tennis/{year}/{month}/courts.{jsonl,json}.gz
MODEL (
name staging.stg_tennis_courts,
@@ -11,7 +16,39 @@ MODEL (
grain osm_id
);
WITH parsed AS (
WITH
-- New format: one OSM element per JSONL line
jsonl_elements AS (
SELECT
type AS osm_type,
TRY_CAST(id AS BIGINT) AS osm_id,
-- Nodes: lat/lon direct. Ways/relations: center object (Overpass out center).
COALESCE(
TRY_CAST(lat AS DOUBLE),
TRY_CAST(center ->> 'lat' AS DOUBLE)
) AS lat,
COALESCE(
TRY_CAST(lon AS DOUBLE),
TRY_CAST(center ->> 'lon' AS DOUBLE)
) AS lon,
tags ->> 'name' AS name,
tags ->> 'addr:country' AS country_code,
tags ->> 'addr:city' AS city_tag,
filename AS source_file,
CURRENT_DATE AS extracted_date
FROM read_json(
@LANDING_DIR || '/overpass_tennis/*/*/courts.jsonl.gz',
format = 'newline_delimited',
columns = {
type: 'VARCHAR', id: 'BIGINT', lat: 'DOUBLE', lon: 'DOUBLE',
center: 'JSON', tags: 'JSON'
},
filename = true
)
WHERE type IS NOT NULL
),
-- Old format: {"elements": [...]} blob — kept for transition
blob_elements AS (
SELECT
elem ->> 'type' AS osm_type,
(elem ->> 'id')::BIGINT AS osm_id,
@@ -32,12 +69,16 @@ WITH parsed AS (
)
WHERE (elem ->> 'type') IS NOT NULL
),
parsed AS (
SELECT * FROM jsonl_elements
UNION ALL
SELECT * FROM blob_elements
),
deduped AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY osm_id ORDER BY extracted_date DESC) AS rn
FROM parsed
WHERE osm_type = 'node'
AND lat IS NOT NULL AND lon IS NOT NULL
WHERE lat IS NOT NULL AND lon IS NOT NULL
AND lat BETWEEN -90 AND 90
AND lon BETWEEN -180 AND 180
),
@@ -54,8 +95,8 @@ with_country AS (
WHEN lat BETWEEN 36.35 AND 47.09 AND lon BETWEEN 6.62 AND 18.51 THEN 'IT'
WHEN lat BETWEEN 37.00 AND 42.15 AND lon BETWEEN -9.50 AND -6.19 THEN 'PT'
ELSE NULL
END) AS country_code,
NULLIF(TRIM(name), '') AS name,
END) AS country_code,
NULLIF(TRIM(name), '') AS name,
NULLIF(TRIM(city_tag), '') AS city,
extracted_date
FROM deduped