feat(extract): expand GeoNames to cities1000 + add tennis court extractor

GeoNames:
- cities15000 → cities1000 (~140K global locations, pop ≥ 1K)
- Add lat/lon, admin1_code, admin2_code to output (needed for dim_locations)
- Expand feature codes to include PPLA3/4/5 (Gemeinden, cantons, etc.)
- Remove MIN_POPULATION=50K floor — cities1000 already pre-filters to ≥1K
- Update assertions for new scale (~100K+ expected)

Tennis courts:
- New overpass_tennis.py extractor (sport=tennis, 180s Overpass timeout)
- Registered as extract-overpass-tennis, added to EXTRACTORS list
- New stg_tennis_courts.sql staging model (grain: osm_id)

stg_population_geonames: add lat, lon, admin1_code, admin2_code columns

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-24 16:15:20 +01:00
parent edf1e30444
commit c109488d9d
6 changed files with 208 additions and 20 deletions

View File

@@ -11,6 +11,7 @@ dependencies = [
[project.scripts]
extract = "padelnomics_extract.all:main"
extract-overpass = "padelnomics_extract.overpass:main"
extract-overpass-tennis = "padelnomics_extract.overpass_tennis:main"
extract-eurostat = "padelnomics_extract.eurostat:main"
extract-playtomic-tenants = "padelnomics_extract.playtomic_tenants:main"
extract-playtomic-availability = "padelnomics_extract.playtomic_availability:main"

View File

@@ -17,6 +17,8 @@ from .ons_uk import EXTRACTOR_NAME as ONS_UK_NAME
from .ons_uk import extract as extract_ons_uk
from .overpass import EXTRACTOR_NAME as OVERPASS_NAME
from .overpass import extract as extract_overpass
from .overpass_tennis import EXTRACTOR_NAME as OVERPASS_TENNIS_NAME
from .overpass_tennis import extract as extract_overpass_tennis
from .playtomic_availability import EXTRACTOR_NAME as AVAILABILITY_NAME
from .playtomic_availability import extract as extract_availability
from .playtomic_tenants import EXTRACTOR_NAME as TENANTS_NAME
@@ -26,6 +28,7 @@ logger = setup_logging("padelnomics.extract")
EXTRACTORS = [
(OVERPASS_NAME, extract_overpass),
(OVERPASS_TENNIS_NAME, extract_overpass_tennis),
(EUROSTAT_NAME, extract_eurostat),
(EUROSTAT_CITY_LABELS_NAME, extract_eurostat_city_labels),
(CENSUS_USA_NAME, extract_census_usa),

View File

@@ -1,17 +1,20 @@
"""GeoNames global city population extractor.
Downloads the cities15000.zip bulk file (~1.5MB compressed, ~26K entries) from
GeoNames and filters to cities with population ≥ 50,000 and feature codes in
{PPLA, PPLA2, PPLC, PPL} (populated places, avoiding parks, airports, etc.).
Downloads the cities1000.zip bulk file (~30MB compressed, ~140K entries) from
GeoNames. Includes all populated places with population ≥ 1,000 and feature codes
in {PPLA, PPLA2, PPLA3, PPLA4, PPLA5, PPLC, PPL}.
Used as the global fallback for population when Eurostat/Census/ONS don't cover
a country. Padel is expanding globally so this catches UAE, Australia, Argentina, etc.
This broader coverage (vs. the old cities15000 with ≥50K filter) supports
Gemeinde-level market intelligence pages — small municipalities often have the
highest padel investment opportunity (white space markets).
Requires: GEONAMES_USERNAME env var (free registration at geonames.org)
Landing: {LANDING_DIR}/geonames/{year}/{month}/cities_global.json.gz
Output: {"rows": [{"geoname_id": 2950159, "city_name": "Berlin",
"country_code": "DE", "population": 3644826,
"lat": 52.524, "lon": 13.411,
"admin1_code": "16", "admin2_code": "00",
"ref_year": 2024}], "count": N}
"""
@@ -31,24 +34,33 @@ logger = setup_logging("padelnomics.extract.geonames")
EXTRACTOR_NAME = "geonames"
DOWNLOAD_URL = "https://download.geonames.org/export/dump/cities15000.zip"
DOWNLOAD_URL = "https://download.geonames.org/export/dump/cities1000.zip"
# Only populated place feature codes — excludes airports, parks, admin areas, etc.
# PPLC = capital of a political entity
# PPLA = seat of a first-order administrative division
# PPLA2 = seat of a second-order admin division
# PPLA3 = seat of a third-order admin division (Gemeinden, cantons, etc.)
# PPLA4 = seat of a fourth-order admin division
# PPLA5 = seat of a fifth-order admin division
# PPL = populated place
VALID_FEATURE_CODES = {"PPLC", "PPLA", "PPLA2", "PPL"}
VALID_FEATURE_CODES = {"PPLC", "PPLA", "PPLA2", "PPLA3", "PPLA4", "PPLA5", "PPL"}
MIN_POPULATION = 50_000
# No population floor — cities1000.zip is pre-filtered to ≥ 1,000.
# Accept all to maximise Gemeinde-level coverage.
MIN_POPULATION = 0
# GeoNames tab-separated column layout for cities15000.txt
# GeoNames tab-separated column layout for cities1000.txt
# https://download.geonames.org/export/dump/readme.txt
COL_GEONAME_ID = 0
COL_NAME = 1
COL_ASCIINAME = 2
COL_COUNTRY_CODE = 8
COL_LAT = 4
COL_LON = 5
COL_FEATURE_CODE = 7
COL_COUNTRY_CODE = 8
COL_ADMIN1_CODE = 10
COL_ADMIN2_CODE = 11
COL_POPULATION = 14
COL_MODIFICATION_DATE = 18
@@ -86,10 +98,21 @@ def _parse_cities_txt(content: bytes) -> list[dict]:
country_code = parts[COL_COUNTRY_CODE].strip().upper()
if not city_name or not country_code:
continue
try:
lat = float(parts[COL_LAT])
lon = float(parts[COL_LON])
except (ValueError, IndexError):
continue
admin1_code = parts[COL_ADMIN1_CODE].strip() if len(parts) > COL_ADMIN1_CODE else ""
admin2_code = parts[COL_ADMIN2_CODE].strip() if len(parts) > COL_ADMIN2_CODE else ""
rows.append({
"geoname_id": geoname_id,
"city_name": city_name,
"country_code": country_code,
"lat": lat,
"lon": lon,
"admin1_code": admin1_code or None,
"admin2_code": admin2_code or None,
"population": population,
"ref_year": REF_YEAR,
})
@@ -102,7 +125,7 @@ def extract(
conn: sqlite3.Connection,
session: niquests.Session,
) -> dict:
"""Download GeoNames cities15000.zip. Skips if already run this month."""
"""Download GeoNames cities1000.zip. Skips if already run this month."""
username = os.environ.get("GEONAMES_USERNAME", "").strip()
if not username:
logger.warning("GEONAMES_USERNAME not set — writing empty placeholder so SQLMesh models can run")
@@ -120,25 +143,25 @@ def extract(
year, month = year_month.split("/")
# GeoNames bulk downloads don't require the username in the URL for cities15000.zip,
# GeoNames bulk downloads don't require the username in the URL for cities1000.zip,
# but the username signals acceptance of their terms of use and helps their monitoring.
url = f"{DOWNLOAD_URL}?username={username}"
logger.info("GET cities15000.zip (~1.5MB compressed)")
resp = session.get(url, timeout=HTTP_TIMEOUT_SECONDS * 4)
logger.info("GET cities1000.zip (~30MB compressed, ~140K locations)")
resp = session.get(url, timeout=HTTP_TIMEOUT_SECONDS * 10)
resp.raise_for_status()
assert len(resp.content) > 100_000, (
f"cities15000.zip too small ({len(resp.content)} bytes) — download may have failed"
assert len(resp.content) > 1_000_000, (
f"cities1000.zip too small ({len(resp.content)} bytes) — download may have failed"
)
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
txt_name = next((n for n in zf.namelist() if n.endswith(".txt")), None)
assert txt_name, f"No .txt file in cities15000.zip: {zf.namelist()}"
assert txt_name, f"No .txt file in cities1000.zip: {zf.namelist()}"
txt_content = zf.read(txt_name)
rows = _parse_cities_txt(txt_content)
assert len(rows) > 5_000, f"Expected >5000 global cities ≥50K pop, got {len(rows)}"
logger.info("parsed %d global cities with population ≥%d", len(rows), MIN_POPULATION)
assert len(rows) > 100_000, f"Expected >100K global locations (pop ≥1K), got {len(rows)}"
logger.info("parsed %d global locations (pop ≥1K)", len(rows))
dest_dir = landing_path(landing_dir, "geonames", year, month)
dest = dest_dir / "cities_global.json.gz"

View File

@@ -0,0 +1,78 @@
"""Overpass API extractor — global tennis court locations from OpenStreetMap.
Queries the Overpass API for all nodes/ways/relations tagged sport=tennis.
Tennis court density near a location is a proxy for racket-sport culture —
areas with many tennis clubs are prime candidates for padel adoption.
The query returns ~150K+ results globally (vs ~5K for padel), so a higher
Overpass timeout is used.
Landing: {LANDING_DIR}/overpass_tennis/{year}/{month}/courts.json.gz
"""
import sqlite3
from pathlib import Path
import niquests
from ._shared import OVERPASS_TIMEOUT_SECONDS, run_extractor, setup_logging
from .utils import landing_path, write_gzip_atomic
logger = setup_logging("padelnomics.extract.overpass_tennis")
EXTRACTOR_NAME = "overpass_tennis"
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
# Tennis returns ~150K+ elements globally vs ~5K for padel — use 3× timeout.
TENNIS_OVERPASS_TIMEOUT_SECONDS = OVERPASS_TIMEOUT_SECONDS * 3
OVERPASS_QUERY = (
"[out:json][timeout:180];\n"
"(\n"
' node["sport"="tennis"];\n'
' way["sport"="tennis"];\n'
' relation["sport"="tennis"];\n'
");\n"
"out body;"
)
def extract(
landing_dir: Path,
year_month: str,
conn: sqlite3.Connection,
session: niquests.Session,
) -> dict:
"""POST OverpassQL query for tennis courts and write raw OSM JSON. Returns run metrics."""
year, month = year_month.split("/")
dest_dir = landing_path(landing_dir, "overpass_tennis", year, month)
dest = dest_dir / "courts.json.gz"
logger.info("POST %s (sport=tennis, ~150K+ results expected)", OVERPASS_URL)
resp = session.post(
OVERPASS_URL,
data={"data": OVERPASS_QUERY},
timeout=TENNIS_OVERPASS_TIMEOUT_SECONDS,
)
resp.raise_for_status()
size_bytes = len(resp.content)
logger.info("%s bytes received", f"{size_bytes:,}")
bytes_written = write_gzip_atomic(dest, resp.content)
logger.info("wrote %s (%s bytes compressed)", dest, f"{bytes_written:,}")
return {
"files_written": 1,
"files_skipped": 0,
"bytes_written": bytes_written,
"cursor_value": year_month,
}
def main() -> None:
run_extractor(EXTRACTOR_NAME, extract)
if __name__ == "__main__":
main()

View File

@@ -1,5 +1,6 @@
-- GeoNames global city population (cities15000 bulk dataset, filtered to ≥50K).
-- GeoNames global city/municipality population (cities1000 bulk dataset, pop ≥ 1K).
-- Global fallback for countries not covered by Eurostat, Census, or ONS.
-- Broad coverage (140K+ locations) enables Gemeinde-level market intelligence.
-- One row per geoname_id (GeoNames stable numeric identifier).
--
-- Source: data/landing/geonames/{year}/{month}/cities_global.json.gz
@@ -16,6 +17,10 @@ WITH parsed AS (
TRY_CAST(row ->> 'geoname_id' AS INTEGER) AS geoname_id,
row ->> 'city_name' AS city_name,
row ->> 'country_code' AS country_code,
TRY_CAST(row ->> 'lat' AS DOUBLE) AS lat,
TRY_CAST(row ->> 'lon' AS DOUBLE) AS lon,
row ->> 'admin1_code' AS admin1_code,
row ->> 'admin2_code' AS admin2_code,
TRY_CAST(row ->> 'population' AS BIGINT) AS population,
TRY_CAST(row ->> 'ref_year' AS INTEGER) AS ref_year,
CURRENT_DATE AS extracted_date
@@ -32,6 +37,10 @@ SELECT
geoname_id,
TRIM(city_name) AS city_name,
UPPER(country_code) AS country_code,
lat,
lon,
NULLIF(TRIM(admin1_code), '') AS admin1_code,
NULLIF(TRIM(admin2_code), '') AS admin2_code,
population,
ref_year,
extracted_date
@@ -40,3 +49,5 @@ WHERE population IS NOT NULL
AND population > 0
AND geoname_id IS NOT NULL
AND city_name IS NOT NULL
AND lat IS NOT NULL
AND lon IS NOT NULL

View File

@@ -0,0 +1,72 @@
-- Tennis court locations from OpenStreetMap via Overpass API (sport=tennis).
-- Used as a "racket sport culture" signal in the opportunity score:
-- areas with high tennis court density are prime padel adoption markets.
--
-- Source: data/landing/overpass_tennis/{year}/{month}/courts.json.gz
MODEL (
name staging.stg_tennis_courts,
kind FULL,
cron '@daily',
grain osm_id
);
WITH parsed AS (
SELECT
elem ->> 'type' AS osm_type,
(elem ->> 'id')::BIGINT AS osm_id,
TRY_CAST(elem ->> 'lat' AS DOUBLE) AS lat,
TRY_CAST(elem ->> 'lon' AS DOUBLE) AS lon,
elem -> 'tags' ->> 'name' AS name,
elem -> 'tags' ->> 'addr:country' AS country_code,
elem -> 'tags' ->> 'addr:city' AS city_tag,
filename AS source_file,
CURRENT_DATE AS extracted_date
FROM (
SELECT UNNEST(elements) AS elem, filename
FROM read_json(
@LANDING_DIR || '/overpass_tennis/*/*/courts.json.gz',
format = 'auto',
filename = true
)
)
WHERE (elem ->> 'type') IS NOT NULL
),
deduped AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY osm_id ORDER BY extracted_date DESC) AS rn
FROM parsed
WHERE osm_type = 'node'
AND lat IS NOT NULL AND lon IS NOT NULL
AND lat BETWEEN -90 AND 90
AND lon BETWEEN -180 AND 180
),
with_country AS (
SELECT
osm_id, lat, lon,
COALESCE(NULLIF(TRIM(UPPER(country_code)), ''), CASE
WHEN lat BETWEEN 47.27 AND 55.06 AND lon BETWEEN 5.87 AND 15.04 THEN 'DE'
WHEN lat BETWEEN 35.95 AND 43.79 AND lon BETWEEN -9.39 AND 4.33 THEN 'ES'
WHEN lat BETWEEN 49.90 AND 60.85 AND lon BETWEEN -8.62 AND 1.77 THEN 'GB'
WHEN lat BETWEEN 41.36 AND 51.09 AND lon BETWEEN -5.14 AND 9.56 THEN 'FR'
WHEN lat BETWEEN 45.46 AND 47.80 AND lon BETWEEN 5.96 AND 10.49 THEN 'CH'
WHEN lat BETWEEN 46.37 AND 49.02 AND lon BETWEEN 9.53 AND 17.16 THEN 'AT'
WHEN lat BETWEEN 36.35 AND 47.09 AND lon BETWEEN 6.62 AND 18.51 THEN 'IT'
WHEN lat BETWEEN 37.00 AND 42.15 AND lon BETWEEN -9.50 AND -6.19 THEN 'PT'
ELSE NULL
END) AS country_code,
NULLIF(TRIM(name), '') AS name,
NULLIF(TRIM(city_tag), '') AS city,
extracted_date
FROM deduped
WHERE rn = 1
)
SELECT
osm_id,
lat,
lon,
country_code,
name,
city,
extracted_date
FROM with_country