feat(extract): expand GeoNames to cities1000 + add tennis court extractor

GeoNames: - cities15000 → cities1000 (~140K global locations, pop ≥ 1K) - Add lat/lon, admin1_code, admin2_code to output (needed for dim_locations) - Expand feature codes to include PPLA3/4/5 (Gemeinden, cantons, etc.) - Remove MIN_POPULATION=50K floor — cities1000 already pre-filters to ≥1K - Update assertions for new scale (~100K+ expected) Tennis courts: - New overpass_tennis.py extractor (sport=tennis, 180s Overpass timeout) - Registered as extract-overpass-tennis, added to EXTRACTORS list - New stg_tennis_courts.sql staging model (grain: osm_id) stg_population_geonames: add lat, lon, admin1_code, admin2_code columns Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-24 16:15:20 +01:00
parent edf1e30444
commit c109488d9d
6 changed files with 208 additions and 20 deletions
--- a/extract/padelnomics_extract/pyproject.toml
+++ b/extract/padelnomics_extract/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
 [project.scripts]
 extract = "padelnomics_extract.all:main"
 extract-overpass = "padelnomics_extract.overpass:main"
 extract-overpass-tennis = "padelnomics_extract.overpass_tennis:main"
 extract-eurostat = "padelnomics_extract.eurostat:main"
 extract-playtomic-tenants = "padelnomics_extract.playtomic_tenants:main"
 extract-playtomic-availability = "padelnomics_extract.playtomic_availability:main"
--- a/extract/padelnomics_extract/src/padelnomics_extract/all.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/all.py
@@ -17,6 +17,8 @@ from .ons_uk import EXTRACTOR_NAME as ONS_UK_NAME
 from .ons_uk import extract as extract_ons_uk
 from .overpass import EXTRACTOR_NAME as OVERPASS_NAME
 from .overpass import extract as extract_overpass
 from .overpass_tennis import EXTRACTOR_NAME as OVERPASS_TENNIS_NAME
 from .overpass_tennis import extract as extract_overpass_tennis
 from .playtomic_availability import EXTRACTOR_NAME as AVAILABILITY_NAME
 from .playtomic_availability import extract as extract_availability
 from .playtomic_tenants import EXTRACTOR_NAME as TENANTS_NAME
@@ -26,6 +28,7 @@ logger = setup_logging("padelnomics.extract")
 EXTRACTORS = [
    (OVERPASS_NAME, extract_overpass),
    (OVERPASS_TENNIS_NAME, extract_overpass_tennis),
    (EUROSTAT_NAME, extract_eurostat),
    (EUROSTAT_CITY_LABELS_NAME, extract_eurostat_city_labels),
    (CENSUS_USA_NAME, extract_census_usa),
--- a/extract/padelnomics_extract/src/padelnomics_extract/geonames.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/geonames.py
@@ -1,17 +1,20 @@
 """GeoNames global city population extractor.
-Downloads the cities15000.zip bulk file (~1.5MB compressed, ~26K entries) from
+Downloads the cities1000.zip bulk file (~30MB compressed, ~140K entries) from
-GeoNames and filters to cities with population ≥ 50,000 and feature codes in
+GeoNames. Includes all populated places with population ≥ 1,000 and feature codes
-{PPLA, PPLA2, PPLC, PPL} (populated places, avoiding parks, airports, etc.).
+in {PPLA, PPLA2, PPLA3, PPLA4, PPLA5, PPLC, PPL}.
-Used as the global fallback for population when Eurostat/Census/ONS don't cover
+This broader coverage (vs. the old cities15000 with ≥50K filter) supports
-a country. Padel is expanding globally so this catches UAE, Australia, Argentina, etc.
+Gemeinde-level market intelligence pages — small municipalities often have the
 highest padel investment opportunity (white space markets).
 Requires: GEONAMES_USERNAME env var (free registration at geonames.org)
 Landing: {LANDING_DIR}/geonames/{year}/{month}/cities_global.json.gz
 Output:  {"rows": [{"geoname_id": 2950159, "city_name": "Berlin",
                    "country_code": "DE", "population": 3644826,
                    "lat": 52.524, "lon": 13.411,
                    "admin1_code": "16", "admin2_code": "00",
                    "ref_year": 2024}], "count": N}
 """
@@ -31,24 +34,33 @@ logger = setup_logging("padelnomics.extract.geonames")
 EXTRACTOR_NAME = "geonames"
-DOWNLOAD_URL = "https://download.geonames.org/export/dump/cities15000.zip"
+DOWNLOAD_URL = "https://download.geonames.org/export/dump/cities1000.zip"
 # Only populated place feature codes — excludes airports, parks, admin areas, etc.
 # PPLC  = capital of a political entity
 # PPLA  = seat of a first-order administrative division
 # PPLA2 = seat of a second-order admin division
 # PPLA3 = seat of a third-order admin division (Gemeinden, cantons, etc.)
 # PPLA4 = seat of a fourth-order admin division
 # PPLA5 = seat of a fifth-order admin division
 # PPL   = populated place
-VALID_FEATURE_CODES = {"PPLC", "PPLA", "PPLA2", "PPL"}
+VALID_FEATURE_CODES = {"PPLC", "PPLA", "PPLA2", "PPLA3", "PPLA4", "PPLA5", "PPL"}
-MIN_POPULATION = 50_000
+# No population floor — cities1000.zip is pre-filtered to ≥ 1,000.
 # Accept all to maximise Gemeinde-level coverage.
 MIN_POPULATION = 0
-# GeoNames tab-separated column layout for cities15000.txt
+# GeoNames tab-separated column layout for cities1000.txt
 # https://download.geonames.org/export/dump/readme.txt
 COL_GEONAME_ID = 0
 COL_NAME = 1
 COL_ASCIINAME = 2
-COL_COUNTRY_CODE = 8
+COL_LAT = 4
 COL_LON = 5
 COL_FEATURE_CODE = 7
 COL_COUNTRY_CODE = 8
 COL_ADMIN1_CODE = 10
 COL_ADMIN2_CODE = 11
 COL_POPULATION = 14
 COL_MODIFICATION_DATE = 18
@@ -86,10 +98,21 @@ def _parse_cities_txt(content: bytes) -> list[dict]:
        country_code = parts[COL_COUNTRY_CODE].strip().upper()
        if not city_name or not country_code:
            continue
        try:
            lat = float(parts[COL_LAT])
            lon = float(parts[COL_LON])
        except (ValueError, IndexError):
            continue
        admin1_code = parts[COL_ADMIN1_CODE].strip() if len(parts) > COL_ADMIN1_CODE else ""
        admin2_code = parts[COL_ADMIN2_CODE].strip() if len(parts) > COL_ADMIN2_CODE else ""
        rows.append({
            "geoname_id": geoname_id,
            "city_name": city_name,
            "country_code": country_code,
            "lat": lat,
            "lon": lon,
            "admin1_code": admin1_code or None,
            "admin2_code": admin2_code or None,
            "population": population,
            "ref_year": REF_YEAR,
        })
@@ -102,7 +125,7 @@ def extract(
    conn: sqlite3.Connection,
    session: niquests.Session,
 ) -> dict:
-    """Download GeoNames cities15000.zip. Skips if already run this month."""
+    """Download GeoNames cities1000.zip. Skips if already run this month."""
    username = os.environ.get("GEONAMES_USERNAME", "").strip()
    if not username:
        logger.warning("GEONAMES_USERNAME not set — writing empty placeholder so SQLMesh models can run")
@@ -120,25 +143,25 @@ def extract(
    year, month = year_month.split("/")
-    # GeoNames bulk downloads don't require the username in the URL for cities15000.zip,
+    # GeoNames bulk downloads don't require the username in the URL for cities1000.zip,
    # but the username signals acceptance of their terms of use and helps their monitoring.
    url = f"{DOWNLOAD_URL}?username={username}"
-    logger.info("GET cities15000.zip (~1.5MB compressed)")
+    logger.info("GET cities1000.zip (~30MB compressed, ~140K locations)")
-    resp = session.get(url, timeout=HTTP_TIMEOUT_SECONDS * 4)
+    resp = session.get(url, timeout=HTTP_TIMEOUT_SECONDS * 10)
    resp.raise_for_status()
-    assert len(resp.content) > 100_000, (
+    assert len(resp.content) > 1_000_000, (
-        f"cities15000.zip too small ({len(resp.content)} bytes) — download may have failed"
+        f"cities1000.zip too small ({len(resp.content)} bytes) — download may have failed"
    )
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        txt_name = next((n for n in zf.namelist() if n.endswith(".txt")), None)
-        assert txt_name, f"No .txt file in cities15000.zip: {zf.namelist()}"
+        assert txt_name, f"No .txt file in cities1000.zip: {zf.namelist()}"
        txt_content = zf.read(txt_name)
    rows = _parse_cities_txt(txt_content)
-    assert len(rows) > 5_000, f"Expected >5000 global cities ≥50K pop, got {len(rows)}"
+    assert len(rows) > 100_000, f"Expected >100K global locations (pop ≥1K), got {len(rows)}"
-    logger.info("parsed %d global cities with population ≥%d", len(rows), MIN_POPULATION)
+    logger.info("parsed %d global locations (pop ≥1K)", len(rows))
    dest_dir = landing_path(landing_dir, "geonames", year, month)
    dest = dest_dir / "cities_global.json.gz"
--- a/extract/padelnomics_extract/src/padelnomics_extract/overpass_tennis.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/overpass_tennis.py
@@ -0,0 +1,78 @@
 """Overpass API extractor — global tennis court locations from OpenStreetMap.
 Queries the Overpass API for all nodes/ways/relations tagged sport=tennis.
 Tennis court density near a location is a proxy for racket-sport culture —
 areas with many tennis clubs are prime candidates for padel adoption.
 The query returns ~150K+ results globally (vs ~5K for padel), so a higher
 Overpass timeout is used.
 Landing: {LANDING_DIR}/overpass_tennis/{year}/{month}/courts.json.gz
 """
 import sqlite3
 from pathlib import Path
 import niquests
 from ._shared import OVERPASS_TIMEOUT_SECONDS, run_extractor, setup_logging
 from .utils import landing_path, write_gzip_atomic
 logger = setup_logging("padelnomics.extract.overpass_tennis")
 EXTRACTOR_NAME = "overpass_tennis"
 OVERPASS_URL = "https://overpass-api.de/api/interpreter"
 # Tennis returns ~150K+ elements globally vs ~5K for padel — use 3× timeout.
 TENNIS_OVERPASS_TIMEOUT_SECONDS = OVERPASS_TIMEOUT_SECONDS * 3
 OVERPASS_QUERY = (
    "[out:json][timeout:180];\n"
    "(\n"
    '  node["sport"="tennis"];\n'
    '  way["sport"="tennis"];\n'
    '  relation["sport"="tennis"];\n'
    ");\n"
    "out body;"
 )
 def extract(
    landing_dir: Path,
    year_month: str,
    conn: sqlite3.Connection,
    session: niquests.Session,
 ) -> dict:
    """POST OverpassQL query for tennis courts and write raw OSM JSON. Returns run metrics."""
    year, month = year_month.split("/")
    dest_dir = landing_path(landing_dir, "overpass_tennis", year, month)
    dest = dest_dir / "courts.json.gz"
    logger.info("POST %s (sport=tennis, ~150K+ results expected)", OVERPASS_URL)
    resp = session.post(
        OVERPASS_URL,
        data={"data": OVERPASS_QUERY},
        timeout=TENNIS_OVERPASS_TIMEOUT_SECONDS,
    )
    resp.raise_for_status()
    size_bytes = len(resp.content)
    logger.info("%s bytes received", f"{size_bytes:,}")
    bytes_written = write_gzip_atomic(dest, resp.content)
    logger.info("wrote %s (%s bytes compressed)", dest, f"{bytes_written:,}")
    return {
        "files_written": 1,
        "files_skipped": 0,
        "bytes_written": bytes_written,
        "cursor_value": year_month,
    }
 def main() -> None:
    run_extractor(EXTRACTOR_NAME, extract)
 if __name__ == "__main__":
    main()
--- a/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql
+++ b/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql
@@ -1,5 +1,6 @@
-- GeoNames global city population (cities15000 bulk dataset, filtered to ≥50K).
+-- GeoNames global city/municipality population (cities1000 bulk dataset, pop ≥ 1K).
 -- Global fallback for countries not covered by Eurostat, Census, or ONS.
 -- Broad coverage (140K+ locations) enables Gemeinde-level market intelligence.
 -- One row per geoname_id (GeoNames stable numeric identifier).
 --
 -- Source: data/landing/geonames/{year}/{month}/cities_global.json.gz
@@ -16,6 +17,10 @@ WITH parsed AS (
    TRY_CAST(row ->> 'geoname_id'  AS INTEGER)  AS geoname_id,
    row ->> 'city_name'                          AS city_name,
    row ->> 'country_code'                       AS country_code,
    TRY_CAST(row ->> 'lat'         AS DOUBLE)    AS lat,
    TRY_CAST(row ->> 'lon'         AS DOUBLE)    AS lon,
    row ->> 'admin1_code'                        AS admin1_code,
    row ->> 'admin2_code'                        AS admin2_code,
    TRY_CAST(row ->> 'population'  AS BIGINT)    AS population,
    TRY_CAST(row ->> 'ref_year'    AS INTEGER)   AS ref_year,
    CURRENT_DATE                                 AS extracted_date
@@ -32,6 +37,10 @@ SELECT
  geoname_id,
  TRIM(city_name)      AS city_name,
  UPPER(country_code)  AS country_code,
  lat,
  lon,
  NULLIF(TRIM(admin1_code), '')  AS admin1_code,
  NULLIF(TRIM(admin2_code), '')  AS admin2_code,
  population,
  ref_year,
  extracted_date
@@ -40,3 +49,5 @@ WHERE population IS NOT NULL
  AND population > 0
  AND geoname_id IS NOT NULL
  AND city_name IS NOT NULL
  AND lat IS NOT NULL
  AND lon IS NOT NULL
--- a/transform/sqlmesh_padelnomics/models/staging/stg_tennis_courts.sql
+++ b/transform/sqlmesh_padelnomics/models/staging/stg_tennis_courts.sql
@@ -0,0 +1,72 @@
 -- Tennis court locations from OpenStreetMap via Overpass API (sport=tennis).
 -- Used as a "racket sport culture" signal in the opportunity score:
 -- areas with high tennis court density are prime padel adoption markets.
 --
 -- Source: data/landing/overpass_tennis/{year}/{month}/courts.json.gz
 MODEL (
  name staging.stg_tennis_courts,
  kind FULL,
  cron '@daily',
  grain osm_id
 );
 WITH parsed AS (
  SELECT
    elem ->> 'type'                     AS osm_type,
    (elem ->> 'id')::BIGINT             AS osm_id,
    TRY_CAST(elem ->> 'lat' AS DOUBLE)  AS lat,
    TRY_CAST(elem ->> 'lon' AS DOUBLE)  AS lon,
    elem -> 'tags' ->> 'name'           AS name,
    elem -> 'tags' ->> 'addr:country'   AS country_code,
    elem -> 'tags' ->> 'addr:city'      AS city_tag,
    filename                            AS source_file,
    CURRENT_DATE                        AS extracted_date
  FROM (
    SELECT UNNEST(elements) AS elem, filename
    FROM read_json(
      @LANDING_DIR || '/overpass_tennis/*/*/courts.json.gz',
      format = 'auto',
      filename = true
    )
  )
  WHERE (elem ->> 'type') IS NOT NULL
 ),
 deduped AS (
  SELECT *,
    ROW_NUMBER() OVER (PARTITION BY osm_id ORDER BY extracted_date DESC) AS rn
  FROM parsed
  WHERE osm_type = 'node'
    AND lat IS NOT NULL AND lon IS NOT NULL
    AND lat BETWEEN -90  AND 90
    AND lon BETWEEN -180 AND 180
 ),
 with_country AS (
  SELECT
    osm_id, lat, lon,
    COALESCE(NULLIF(TRIM(UPPER(country_code)), ''), CASE
      WHEN lat BETWEEN 47.27 AND 55.06 AND lon BETWEEN  5.87 AND 15.04 THEN 'DE'
      WHEN lat BETWEEN 35.95 AND 43.79 AND lon BETWEEN -9.39 AND  4.33 THEN 'ES'
      WHEN lat BETWEEN 49.90 AND 60.85 AND lon BETWEEN -8.62 AND  1.77 THEN 'GB'
      WHEN lat BETWEEN 41.36 AND 51.09 AND lon BETWEEN -5.14 AND  9.56 THEN 'FR'
      WHEN lat BETWEEN 45.46 AND 47.80 AND lon BETWEEN  5.96 AND 10.49 THEN 'CH'
      WHEN lat BETWEEN 46.37 AND 49.02 AND lon BETWEEN  9.53 AND 17.16 THEN 'AT'
      WHEN lat BETWEEN 36.35 AND 47.09 AND lon BETWEEN  6.62 AND 18.51 THEN 'IT'
      WHEN lat BETWEEN 37.00 AND 42.15 AND lon BETWEEN -9.50 AND -6.19 THEN 'PT'
      ELSE NULL
    END)                      AS country_code,
    NULLIF(TRIM(name), '')    AS name,
    NULLIF(TRIM(city_tag), '') AS city,
    extracted_date
  FROM deduped
  WHERE rn = 1
 )
 SELECT
  osm_id,
  lat,
  lon,
  country_code,
  name,
  city,
  extracted_date
 FROM with_country