feat(extract): expand GeoNames to cities1000 + add tennis court extractor

GeoNames:
- cities15000 → cities1000 (~140K global locations, pop ≥ 1K)
- Add lat/lon, admin1_code, admin2_code to output (needed for dim_locations)
- Expand feature codes to include PPLA3/4/5 (Gemeinden, cantons, etc.)
- Remove MIN_POPULATION=50K floor — cities1000 already pre-filters to ≥1K
- Update assertions for new scale (~100K+ expected)

Tennis courts:
- New overpass_tennis.py extractor (sport=tennis, 180s Overpass timeout)
- Registered as extract-overpass-tennis, added to EXTRACTORS list
- New stg_tennis_courts.sql staging model (grain: osm_id; see payload sketch below)

stg_population_geonames: add lat, lon, admin1_code, admin2_code columns
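
For reference, the courts.json.gz landing file that stg_tennis_courts stages is raw Overpass/OSM JSON. A minimal reading sketch (standalone, not part of this commit; the path and field handling are illustrative, the real staging is the SQLMesh model):

import gzip, json

# Hypothetical landing path for illustration; see overpass_tennis.py for the real layout.
with gzip.open("landing/overpass_tennis/2026/02/courts.json.gz", "rt") as f:
    payload = json.load(f)

# Overpass returns {"elements": [{"type": "node"|"way"|"relation", "id": ..., "tags": {...}}]}.
# OSM ids are only unique per element type, so an osm_id grain typically combines both.
courts = {f'{el["type"]}/{el["id"]}': el.get("tags", {}) for el in payload["elements"]}
print(len(courts), "tennis features")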

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Deeman
2026-02-24 16:15:20 +01:00
parent edf1e30444
commit c109488d9d
6 changed files with 208 additions and 20 deletions

pyproject.toml

@@ -11,6 +11,7 @@ dependencies = [
 [project.scripts]
 extract = "padelnomics_extract.all:main"
 extract-overpass = "padelnomics_extract.overpass:main"
+extract-overpass-tennis = "padelnomics_extract.overpass_tennis:main"
 extract-eurostat = "padelnomics_extract.eurostat:main"
 extract-playtomic-tenants = "padelnomics_extract.playtomic_tenants:main"
 extract-playtomic-availability = "padelnomics_extract.playtomic_availability:main"

padelnomics_extract/all.py

@@ -17,6 +17,8 @@ from .ons_uk import EXTRACTOR_NAME as ONS_UK_NAME
 from .ons_uk import extract as extract_ons_uk
 from .overpass import EXTRACTOR_NAME as OVERPASS_NAME
 from .overpass import extract as extract_overpass
+from .overpass_tennis import EXTRACTOR_NAME as OVERPASS_TENNIS_NAME
+from .overpass_tennis import extract as extract_overpass_tennis
 from .playtomic_availability import EXTRACTOR_NAME as AVAILABILITY_NAME
 from .playtomic_availability import extract as extract_availability
 from .playtomic_tenants import EXTRACTOR_NAME as TENANTS_NAME
@@ -26,6 +28,7 @@ logger = setup_logging("padelnomics.extract")
 EXTRACTORS = [
     (OVERPASS_NAME, extract_overpass),
+    (OVERPASS_TENNIS_NAME, extract_overpass_tennis),
     (EUROSTAT_NAME, extract_eurostat),
     (EUROSTAT_CITY_LABELS_NAME, extract_eurostat_city_labels),
     (CENSUS_USA_NAME, extract_census_usa),
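
For orientation, the registry implies a simple fan-out in the package's entry point. The loop below is one plausible shape of that pattern; the actual main() is not part of this diff, so treat it as illustrative:

# Hypothetical fan-out over the registry; mirrors how each module's own
# main() calls run_extractor(EXTRACTOR_NAME, extract).
for name, extract_fn in EXTRACTORS:
    run_extractor(name, extract_fn)

Assuming sequential execution, slotting overpass_tennis directly after overpass keeps the two OSM pulls together ahead of the statistical sources.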

padelnomics_extract/geonames.py

@@ -1,17 +1,20 @@
 """GeoNames global city population extractor.
-Downloads the cities15000.zip bulk file (~1.5MB compressed, ~26K entries) from
-GeoNames and filters to cities with population ≥ 50,000 and feature codes in
-{PPLA, PPLA2, PPLC, PPL} (populated places, avoiding parks, airports, etc.).
+Downloads the cities1000.zip bulk file (~30MB compressed, ~140K entries) from
+GeoNames. Includes all populated places with population ≥ 1,000 and feature codes
+in {PPLA, PPLA2, PPLA3, PPLA4, PPLA5, PPLC, PPL}.
 Used as the global fallback for population when Eurostat/Census/ONS don't cover
 a country. Padel is expanding globally so this catches UAE, Australia, Argentina, etc.
+This broader coverage (vs. the old cities15000 with ≥50K filter) supports
+Gemeinde-level market intelligence pages — small municipalities often have the
+highest padel investment opportunity (white space markets).
 Requires: GEONAMES_USERNAME env var (free registration at geonames.org)
 Landing: {LANDING_DIR}/geonames/{year}/{month}/cities_global.json.gz
 Output: {"rows": [{"geoname_id": 2950159, "city_name": "Berlin",
                    "country_code": "DE", "population": 3644826,
+                   "lat": 52.524, "lon": 13.411,
+                   "admin1_code": "16", "admin2_code": "00",
                    "ref_year": 2024}], "count": N}
 """
@@ -31,24 +34,33 @@ logger = setup_logging("padelnomics.extract.geonames")
 EXTRACTOR_NAME = "geonames"
-DOWNLOAD_URL = "https://download.geonames.org/export/dump/cities15000.zip"
+DOWNLOAD_URL = "https://download.geonames.org/export/dump/cities1000.zip"
 # Only populated place feature codes — excludes airports, parks, admin areas, etc.
 # PPLC = capital of a political entity
 # PPLA = seat of a first-order administrative division
 # PPLA2 = seat of a second-order admin division
+# PPLA3 = seat of a third-order admin division (Gemeinden, cantons, etc.)
+# PPLA4 = seat of a fourth-order admin division
+# PPLA5 = seat of a fifth-order admin division
 # PPL = populated place
-VALID_FEATURE_CODES = {"PPLC", "PPLA", "PPLA2", "PPL"}
+VALID_FEATURE_CODES = {"PPLC", "PPLA", "PPLA2", "PPLA3", "PPLA4", "PPLA5", "PPL"}
-MIN_POPULATION = 50_000
+# No population floor — cities1000.zip is pre-filtered to ≥ 1,000.
+# Accept all to maximise Gemeinde-level coverage.
+MIN_POPULATION = 0
-# GeoNames tab-separated column layout for cities15000.txt
+# GeoNames tab-separated column layout for cities1000.txt
 # https://download.geonames.org/export/dump/readme.txt
 COL_GEONAME_ID = 0
 COL_NAME = 1
 COL_ASCIINAME = 2
-COL_COUNTRY_CODE = 8
+COL_LAT = 4
+COL_LON = 5
 COL_FEATURE_CODE = 7
+COL_COUNTRY_CODE = 8
+COL_ADMIN1_CODE = 10
+COL_ADMIN2_CODE = 11
 COL_POPULATION = 14
 COL_MODIFICATION_DATE = 18
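
To make the layout concrete, here is a standalone sketch (not part of the commit) that fabricates one abbreviated cities1000.txt line and reads it back through the COL_* indices:

# A fabricated 19-field GeoNames TSV line (see readme.txt for the full layout).
line = "\t".join([
    "2950159", "Berlin", "Berlin", "",       # geonameid, name, asciiname, alternatenames
    "52.52437", "13.41053",                  # latitude, longitude
    "P", "PPLC",                             # feature class, feature code
    "DE", "",                                # country code, cc2
    "16", "00", "", "",                      # admin1..admin4 codes
    "3644826", "", "74", "Europe/Berlin",    # population, elevation, dem, timezone
    "2023-10-12",                            # modification date
])
parts = line.split("\t")
assert parts[7] in {"PPLC", "PPLA", "PPLA2", "PPLA3", "PPLA4", "PPLA5", "PPL"}
print(parts[0], parts[4], parts[5], parts[10], parts[11], parts[14])
# -> 2950159 52.52437 13.41053 16 00 3644826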
@@ -86,10 +98,21 @@ def _parse_cities_txt(content: bytes) -> list[dict]:
         country_code = parts[COL_COUNTRY_CODE].strip().upper()
         if not city_name or not country_code:
             continue
+        try:
+            lat = float(parts[COL_LAT])
+            lon = float(parts[COL_LON])
+        except (ValueError, IndexError):
+            continue
+        admin1_code = parts[COL_ADMIN1_CODE].strip() if len(parts) > COL_ADMIN1_CODE else ""
+        admin2_code = parts[COL_ADMIN2_CODE].strip() if len(parts) > COL_ADMIN2_CODE else ""
         rows.append({
             "geoname_id": geoname_id,
             "city_name": city_name,
             "country_code": country_code,
+            "lat": lat,
+            "lon": lon,
+            "admin1_code": admin1_code or None,
+            "admin2_code": admin2_code or None,
             "population": population,
             "ref_year": REF_YEAR,
         })
@@ -102,7 +125,7 @@ def extract(
     conn: sqlite3.Connection,
     session: niquests.Session,
 ) -> dict:
-    """Download GeoNames cities15000.zip. Skips if already run this month."""
+    """Download GeoNames cities1000.zip. Skips if already run this month."""
     username = os.environ.get("GEONAMES_USERNAME", "").strip()
     if not username:
         logger.warning("GEONAMES_USERNAME not set — writing empty placeholder so SQLMesh models can run")
@@ -120,25 +143,25 @@ def extract(
     year, month = year_month.split("/")
-    # GeoNames bulk downloads don't require the username in the URL for cities15000.zip,
+    # GeoNames bulk downloads don't require the username in the URL for cities1000.zip,
     # but the username signals acceptance of their terms of use and helps their monitoring.
     url = f"{DOWNLOAD_URL}?username={username}"
-    logger.info("GET cities15000.zip (~1.5MB compressed)")
-    resp = session.get(url, timeout=HTTP_TIMEOUT_SECONDS * 4)
+    logger.info("GET cities1000.zip (~30MB compressed, ~140K locations)")
+    resp = session.get(url, timeout=HTTP_TIMEOUT_SECONDS * 10)
     resp.raise_for_status()
-    assert len(resp.content) > 100_000, (
-        f"cities15000.zip too small ({len(resp.content)} bytes) — download may have failed"
+    assert len(resp.content) > 1_000_000, (
+        f"cities1000.zip too small ({len(resp.content)} bytes) — download may have failed"
     )
     with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
         txt_name = next((n for n in zf.namelist() if n.endswith(".txt")), None)
-        assert txt_name, f"No .txt file in cities15000.zip: {zf.namelist()}"
+        assert txt_name, f"No .txt file in cities1000.zip: {zf.namelist()}"
         txt_content = zf.read(txt_name)
     rows = _parse_cities_txt(txt_content)
-    assert len(rows) > 5_000, f"Expected >5000 global cities ≥50K pop, got {len(rows)}"
-    logger.info("parsed %d global cities with population ≥%d", len(rows), MIN_POPULATION)
+    assert len(rows) > 100_000, f"Expected >100K global locations (pop ≥1K), got {len(rows)}"
+    logger.info("parsed %d global locations (pop ≥1K)", len(rows))
     dest_dir = landing_path(landing_dir, "geonames", year, month)
     dest = dest_dir / "cities_global.json.gz"
padelnomics_extract/overpass_tennis.py

@@ -0,0 +1,78 @@
+"""Overpass API extractor — global tennis court locations from OpenStreetMap.
+
+Queries the Overpass API for all nodes/ways/relations tagged sport=tennis.
+Tennis court density near a location is a proxy for racket-sport culture —
+areas with many tennis clubs are prime candidates for padel adoption.
+
+The query returns ~150K+ results globally (vs ~5K for padel), so a higher
+Overpass timeout is used.
+
+Landing: {LANDING_DIR}/overpass_tennis/{year}/{month}/courts.json.gz
+"""
+
+import sqlite3
+from pathlib import Path
+
+import niquests
+
+from ._shared import OVERPASS_TIMEOUT_SECONDS, run_extractor, setup_logging
+from .utils import landing_path, write_gzip_atomic
+
+logger = setup_logging("padelnomics.extract.overpass_tennis")
+
+EXTRACTOR_NAME = "overpass_tennis"
+OVERPASS_URL = "https://overpass-api.de/api/interpreter"
+
+# Tennis returns ~150K+ elements globally vs ~5K for padel — use 3× timeout.
+TENNIS_OVERPASS_TIMEOUT_SECONDS = OVERPASS_TIMEOUT_SECONDS * 3
+
+OVERPASS_QUERY = (
+    "[out:json][timeout:180];\n"
+    "(\n"
+    ' node["sport"="tennis"];\n'
+    ' way["sport"="tennis"];\n'
+    ' relation["sport"="tennis"];\n'
+    ");\n"
+    "out body;"
+)
+
+
+def extract(
+    landing_dir: Path,
+    year_month: str,
+    conn: sqlite3.Connection,
+    session: niquests.Session,
+) -> dict:
+    """POST OverpassQL query for tennis courts and write raw OSM JSON. Returns run metrics."""
+    year, month = year_month.split("/")
+    dest_dir = landing_path(landing_dir, "overpass_tennis", year, month)
+    dest = dest_dir / "courts.json.gz"
+
+    logger.info("POST %s (sport=tennis, ~150K+ results expected)", OVERPASS_URL)
+    resp = session.post(
+        OVERPASS_URL,
+        data={"data": OVERPASS_QUERY},
+        timeout=TENNIS_OVERPASS_TIMEOUT_SECONDS,
+    )
+    resp.raise_for_status()
+
+    size_bytes = len(resp.content)
+    logger.info("%s bytes received", f"{size_bytes:,}")
+    bytes_written = write_gzip_atomic(dest, resp.content)
+    logger.info("wrote %s (%s bytes compressed)", dest, f"{bytes_written:,}")
+
+    return {
+        "files_written": 1,
+        "files_skipped": 0,
+        "bytes_written": bytes_written,
+        "cursor_value": year_month,
+    }
+
+
+def main() -> None:
+    run_extractor(EXTRACTOR_NAME, extract)
+
+
+if __name__ == "__main__":
+    main()
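
As a cheap smoke test before trusting the full global pull, the same query can be scoped to a bounding box. The sketch below is standalone (not part of the commit) and uses illustrative bbox values around Berlin:

import niquests

# Same query shape as OVERPASS_QUERY, restricted to a (south, west, north, east) bbox
# so it returns in seconds rather than minutes.
QUERY = (
    "[out:json][timeout:60];\n"
    "(\n"
    ' node["sport"="tennis"](52.3,13.0,52.7,13.8);\n'
    ' way["sport"="tennis"](52.3,13.0,52.7,13.8);\n'
    ' relation["sport"="tennis"](52.3,13.0,52.7,13.8);\n'
    ");\n"
    "out body;"
)

resp = niquests.post("https://overpass-api.de/api/interpreter", data={"data": QUERY}, timeout=90)
resp.raise_for_status()
print(len(resp.json()["elements"]), "tennis features in the test bbox")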