feat(extract): expand GeoNames to cities1000 + add tennis court extractor
GeoNames: - cities15000 → cities1000 (~140K global locations, pop ≥ 1K) - Add lat/lon, admin1_code, admin2_code to output (needed for dim_locations) - Expand feature codes to include PPLA3/4/5 (Gemeinden, cantons, etc.) - Remove MIN_POPULATION=50K floor — cities1000 already pre-filters to ≥1K - Update assertions for new scale (~100K+ expected) Tennis courts: - New overpass_tennis.py extractor (sport=tennis, 180s Overpass timeout) - Registered as extract-overpass-tennis, added to EXTRACTORS list - New stg_tennis_courts.sql staging model (grain: osm_id) stg_population_geonames: add lat, lon, admin1_code, admin2_code columns Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
-- GeoNames global city population (cities15000 bulk dataset, filtered to ≥50K).
|
||||
-- GeoNames global city/municipality population (cities1000 bulk dataset, pop ≥ 1K).
|
||||
-- Global fallback for countries not covered by Eurostat, Census, or ONS.
|
||||
-- Broad coverage (140K+ locations) enables Gemeinde-level market intelligence.
|
||||
-- One row per geoname_id (GeoNames stable numeric identifier).
|
||||
--
|
||||
-- Source: data/landing/geonames/{year}/{month}/cities_global.json.gz
|
||||
@@ -16,6 +17,10 @@ WITH parsed AS (
|
||||
TRY_CAST(row ->> 'geoname_id' AS INTEGER) AS geoname_id,
|
||||
row ->> 'city_name' AS city_name,
|
||||
row ->> 'country_code' AS country_code,
|
||||
TRY_CAST(row ->> 'lat' AS DOUBLE) AS lat,
|
||||
TRY_CAST(row ->> 'lon' AS DOUBLE) AS lon,
|
||||
row ->> 'admin1_code' AS admin1_code,
|
||||
row ->> 'admin2_code' AS admin2_code,
|
||||
TRY_CAST(row ->> 'population' AS BIGINT) AS population,
|
||||
TRY_CAST(row ->> 'ref_year' AS INTEGER) AS ref_year,
|
||||
CURRENT_DATE AS extracted_date
|
||||
@@ -32,6 +37,10 @@ SELECT
|
||||
geoname_id,
|
||||
TRIM(city_name) AS city_name,
|
||||
UPPER(country_code) AS country_code,
|
||||
lat,
|
||||
lon,
|
||||
NULLIF(TRIM(admin1_code), '') AS admin1_code,
|
||||
NULLIF(TRIM(admin2_code), '') AS admin2_code,
|
||||
population,
|
||||
ref_year,
|
||||
extracted_date
|
||||
@@ -40,3 +49,5 @@ WHERE population IS NOT NULL
|
||||
AND population > 0
|
||||
AND geoname_id IS NOT NULL
|
||||
AND city_name IS NOT NULL
|
||||
AND lat IS NOT NULL
|
||||
AND lon IS NOT NULL
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
-- Tennis court locations from OpenStreetMap via Overpass API (sport=tennis).
|
||||
-- Used as a "racket sport culture" signal in the opportunity score:
|
||||
-- areas with high tennis court density are prime padel adoption markets.
|
||||
--
|
||||
-- Source: data/landing/overpass_tennis/{year}/{month}/courts.json.gz
|
||||
|
||||
MODEL (
|
||||
name staging.stg_tennis_courts,
|
||||
kind FULL,
|
||||
cron '@daily',
|
||||
grain osm_id
|
||||
);
|
||||
|
||||
WITH parsed AS (
|
||||
SELECT
|
||||
elem ->> 'type' AS osm_type,
|
||||
(elem ->> 'id')::BIGINT AS osm_id,
|
||||
TRY_CAST(elem ->> 'lat' AS DOUBLE) AS lat,
|
||||
TRY_CAST(elem ->> 'lon' AS DOUBLE) AS lon,
|
||||
elem -> 'tags' ->> 'name' AS name,
|
||||
elem -> 'tags' ->> 'addr:country' AS country_code,
|
||||
elem -> 'tags' ->> 'addr:city' AS city_tag,
|
||||
filename AS source_file,
|
||||
CURRENT_DATE AS extracted_date
|
||||
FROM (
|
||||
SELECT UNNEST(elements) AS elem, filename
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/overpass_tennis/*/*/courts.json.gz',
|
||||
format = 'auto',
|
||||
filename = true
|
||||
)
|
||||
)
|
||||
WHERE (elem ->> 'type') IS NOT NULL
|
||||
),
|
||||
deduped AS (
|
||||
SELECT *,
|
||||
ROW_NUMBER() OVER (PARTITION BY osm_id ORDER BY extracted_date DESC) AS rn
|
||||
FROM parsed
|
||||
WHERE osm_type = 'node'
|
||||
AND lat IS NOT NULL AND lon IS NOT NULL
|
||||
AND lat BETWEEN -90 AND 90
|
||||
AND lon BETWEEN -180 AND 180
|
||||
),
|
||||
with_country AS (
|
||||
SELECT
|
||||
osm_id, lat, lon,
|
||||
COALESCE(NULLIF(TRIM(UPPER(country_code)), ''), CASE
|
||||
WHEN lat BETWEEN 47.27 AND 55.06 AND lon BETWEEN 5.87 AND 15.04 THEN 'DE'
|
||||
WHEN lat BETWEEN 35.95 AND 43.79 AND lon BETWEEN -9.39 AND 4.33 THEN 'ES'
|
||||
WHEN lat BETWEEN 49.90 AND 60.85 AND lon BETWEEN -8.62 AND 1.77 THEN 'GB'
|
||||
WHEN lat BETWEEN 41.36 AND 51.09 AND lon BETWEEN -5.14 AND 9.56 THEN 'FR'
|
||||
WHEN lat BETWEEN 45.46 AND 47.80 AND lon BETWEEN 5.96 AND 10.49 THEN 'CH'
|
||||
WHEN lat BETWEEN 46.37 AND 49.02 AND lon BETWEEN 9.53 AND 17.16 THEN 'AT'
|
||||
WHEN lat BETWEEN 36.35 AND 47.09 AND lon BETWEEN 6.62 AND 18.51 THEN 'IT'
|
||||
WHEN lat BETWEEN 37.00 AND 42.15 AND lon BETWEEN -9.50 AND -6.19 THEN 'PT'
|
||||
ELSE NULL
|
||||
END) AS country_code,
|
||||
NULLIF(TRIM(name), '') AS name,
|
||||
NULLIF(TRIM(city_tag), '') AS city,
|
||||
extracted_date
|
||||
FROM deduped
|
||||
WHERE rn = 1
|
||||
)
|
||||
SELECT
|
||||
osm_id,
|
||||
lat,
|
||||
lon,
|
||||
country_code,
|
||||
name,
|
||||
city,
|
||||
extracted_date
|
||||
FROM with_country
|
||||
Reference in New Issue
Block a user