feat(extract): expand GeoNames to cities1000 + add tennis court extractor

GeoNames:
- cities15000 → cities1000 (~140K global locations, pop ≥ 1K)
- Add lat/lon, admin1_code, admin2_code to output (needed for dim_locations)
- Expand feature codes to include PPLA3/4/5 (Gemeinden, cantons, etc.)
- Remove MIN_POPULATION=50K floor — cities1000 already pre-filters to ≥1K
- Update assertions for new scale (~100K+ expected)

Tennis courts:
- New overpass_tennis.py extractor (sport=tennis, 180s Overpass timeout)
- Registered as extract-overpass-tennis, added to EXTRACTORS list
- New stg_tennis_courts.sql staging model (grain: osm_id)

stg_population_geonames: add lat, lon, admin1_code, admin2_code columns

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-24 16:15:20 +01:00
parent edf1e30444
commit c109488d9d
6 changed files with 208 additions and 20 deletions

View File

@@ -1,5 +1,6 @@
-- GeoNames global city population (cities15000 bulk dataset, filtered to ≥50K).
-- GeoNames global city/municipality population (cities1000 bulk dataset, pop ≥ 1K).
-- Global fallback for countries not covered by Eurostat, Census, or ONS.
-- Broad coverage (140K+ locations) enables Gemeinde-level market intelligence.
-- One row per geoname_id (GeoNames stable numeric identifier).
--
-- Source: data/landing/geonames/{year}/{month}/cities_global.json.gz
@@ -16,6 +17,10 @@ WITH parsed AS (
TRY_CAST(row ->> 'geoname_id' AS INTEGER) AS geoname_id,
row ->> 'city_name' AS city_name,
row ->> 'country_code' AS country_code,
TRY_CAST(row ->> 'lat' AS DOUBLE) AS lat,
TRY_CAST(row ->> 'lon' AS DOUBLE) AS lon,
row ->> 'admin1_code' AS admin1_code,
row ->> 'admin2_code' AS admin2_code,
TRY_CAST(row ->> 'population' AS BIGINT) AS population,
TRY_CAST(row ->> 'ref_year' AS INTEGER) AS ref_year,
CURRENT_DATE AS extracted_date
@@ -32,6 +37,10 @@ SELECT
geoname_id,
TRIM(city_name) AS city_name,
UPPER(country_code) AS country_code,
lat,
lon,
NULLIF(TRIM(admin1_code), '') AS admin1_code,
NULLIF(TRIM(admin2_code), '') AS admin2_code,
population,
ref_year,
extracted_date
@@ -40,3 +49,5 @@ WHERE population IS NOT NULL
AND population > 0
AND geoname_id IS NOT NULL
AND city_name IS NOT NULL
AND lat IS NOT NULL
AND lon IS NOT NULL

View File

@@ -0,0 +1,72 @@
-- Tennis court locations from OpenStreetMap via Overpass API (sport=tennis).
-- Used as a "racket sport culture" signal in the opportunity score:
-- areas with high tennis court density are prime padel adoption markets.
--
-- Source: data/landing/overpass_tennis/{year}/{month}/courts.json.gz
MODEL (
name staging.stg_tennis_courts,
kind FULL,
cron '@daily',
grain osm_id
);
WITH parsed AS (
SELECT
elem ->> 'type' AS osm_type,
(elem ->> 'id')::BIGINT AS osm_id,
TRY_CAST(elem ->> 'lat' AS DOUBLE) AS lat,
TRY_CAST(elem ->> 'lon' AS DOUBLE) AS lon,
elem -> 'tags' ->> 'name' AS name,
elem -> 'tags' ->> 'addr:country' AS country_code,
elem -> 'tags' ->> 'addr:city' AS city_tag,
filename AS source_file,
CURRENT_DATE AS extracted_date
FROM (
SELECT UNNEST(elements) AS elem, filename
FROM read_json(
@LANDING_DIR || '/overpass_tennis/*/*/courts.json.gz',
format = 'auto',
filename = true
)
)
WHERE (elem ->> 'type') IS NOT NULL
),
deduped AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY osm_id ORDER BY extracted_date DESC) AS rn
FROM parsed
WHERE osm_type = 'node'
AND lat IS NOT NULL AND lon IS NOT NULL
AND lat BETWEEN -90 AND 90
AND lon BETWEEN -180 AND 180
),
with_country AS (
SELECT
osm_id, lat, lon,
COALESCE(NULLIF(TRIM(UPPER(country_code)), ''), CASE
WHEN lat BETWEEN 47.27 AND 55.06 AND lon BETWEEN 5.87 AND 15.04 THEN 'DE'
WHEN lat BETWEEN 35.95 AND 43.79 AND lon BETWEEN -9.39 AND 4.33 THEN 'ES'
WHEN lat BETWEEN 49.90 AND 60.85 AND lon BETWEEN -8.62 AND 1.77 THEN 'GB'
WHEN lat BETWEEN 41.36 AND 51.09 AND lon BETWEEN -5.14 AND 9.56 THEN 'FR'
WHEN lat BETWEEN 45.46 AND 47.80 AND lon BETWEEN 5.96 AND 10.49 THEN 'CH'
WHEN lat BETWEEN 46.37 AND 49.02 AND lon BETWEEN 9.53 AND 17.16 THEN 'AT'
WHEN lat BETWEEN 36.35 AND 47.09 AND lon BETWEEN 6.62 AND 18.51 THEN 'IT'
WHEN lat BETWEEN 37.00 AND 42.15 AND lon BETWEEN -9.50 AND -6.19 THEN 'PT'
ELSE NULL
END) AS country_code,
NULLIF(TRIM(name), '') AS name,
NULLIF(TRIM(city_tag), '') AS city,
extracted_date
FROM deduped
WHERE rn = 1
)
SELECT
osm_id,
lat,
lon,
country_code,
name,
city,
extracted_date
FROM with_country