From c109488d9d3a35c96ea1b3f882bd055d133f7f23 Mon Sep 17 00:00:00 2001 From: Deeman Date: Tue, 24 Feb 2026 16:15:20 +0100 Subject: [PATCH] feat(extract): expand GeoNames to cities1000 + add tennis court extractor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GeoNames: - cities15000 → cities1000 (~140K global locations, pop ≥ 1K) - Add lat/lon, admin1_code, admin2_code to output (needed for dim_locations) - Expand feature codes to include PPLA3/4/5 (Gemeinden, cantons, etc.) - Remove MIN_POPULATION=50K floor — cities1000 already pre-filters to ≥1K - Update assertions for new scale (~100K+ expected) Tennis courts: - New overpass_tennis.py extractor (sport=tennis, 180s Overpass timeout) - Registered as extract-overpass-tennis, added to EXTRACTORS list - New stg_tennis_courts.sql staging model (grain: osm_id) stg_population_geonames: add lat, lon, admin1_code, admin2_code columns Co-Authored-By: Claude Sonnet 4.6 --- extract/padelnomics_extract/pyproject.toml | 1 + .../src/padelnomics_extract/all.py | 3 + .../src/padelnomics_extract/geonames.py | 61 ++++++++++----- .../padelnomics_extract/overpass_tennis.py | 78 +++++++++++++++++++ .../staging/stg_population_geonames.sql | 13 +++- .../models/staging/stg_tennis_courts.sql | 72 +++++++++++++++++ 6 files changed, 208 insertions(+), 20 deletions(-) create mode 100644 extract/padelnomics_extract/src/padelnomics_extract/overpass_tennis.py create mode 100644 transform/sqlmesh_padelnomics/models/staging/stg_tennis_courts.sql diff --git a/extract/padelnomics_extract/pyproject.toml b/extract/padelnomics_extract/pyproject.toml index 2b52b8e..93098df 100644 --- a/extract/padelnomics_extract/pyproject.toml +++ b/extract/padelnomics_extract/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ [project.scripts] extract = "padelnomics_extract.all:main" extract-overpass = "padelnomics_extract.overpass:main" +extract-overpass-tennis = "padelnomics_extract.overpass_tennis:main" extract-eurostat = "padelnomics_extract.eurostat:main" extract-playtomic-tenants = "padelnomics_extract.playtomic_tenants:main" extract-playtomic-availability = "padelnomics_extract.playtomic_availability:main" diff --git a/extract/padelnomics_extract/src/padelnomics_extract/all.py b/extract/padelnomics_extract/src/padelnomics_extract/all.py index 94db5ab..15c153f 100644 --- a/extract/padelnomics_extract/src/padelnomics_extract/all.py +++ b/extract/padelnomics_extract/src/padelnomics_extract/all.py @@ -17,6 +17,8 @@ from .ons_uk import EXTRACTOR_NAME as ONS_UK_NAME from .ons_uk import extract as extract_ons_uk from .overpass import EXTRACTOR_NAME as OVERPASS_NAME from .overpass import extract as extract_overpass +from .overpass_tennis import EXTRACTOR_NAME as OVERPASS_TENNIS_NAME +from .overpass_tennis import extract as extract_overpass_tennis from .playtomic_availability import EXTRACTOR_NAME as AVAILABILITY_NAME from .playtomic_availability import extract as extract_availability from .playtomic_tenants import EXTRACTOR_NAME as TENANTS_NAME @@ -26,6 +28,7 @@ logger = setup_logging("padelnomics.extract") EXTRACTORS = [ (OVERPASS_NAME, extract_overpass), + (OVERPASS_TENNIS_NAME, extract_overpass_tennis), (EUROSTAT_NAME, extract_eurostat), (EUROSTAT_CITY_LABELS_NAME, extract_eurostat_city_labels), (CENSUS_USA_NAME, extract_census_usa), diff --git a/extract/padelnomics_extract/src/padelnomics_extract/geonames.py b/extract/padelnomics_extract/src/padelnomics_extract/geonames.py index d47cad8..b6d6a8d 100644 --- a/extract/padelnomics_extract/src/padelnomics_extract/geonames.py +++ b/extract/padelnomics_extract/src/padelnomics_extract/geonames.py @@ -1,17 +1,20 @@ """GeoNames global city population extractor. -Downloads the cities15000.zip bulk file (~1.5MB compressed, ~26K entries) from -GeoNames and filters to cities with population ≥ 50,000 and feature codes in -{PPLA, PPLA2, PPLC, PPL} (populated places, avoiding parks, airports, etc.). +Downloads the cities1000.zip bulk file (~30MB compressed, ~140K entries) from +GeoNames. Includes all populated places with population ≥ 1,000 and feature codes +in {PPLA, PPLA2, PPLA3, PPLA4, PPLA5, PPLC, PPL}. -Used as the global fallback for population when Eurostat/Census/ONS don't cover -a country. Padel is expanding globally so this catches UAE, Australia, Argentina, etc. +This broader coverage (vs. the old cities15000 with ≥50K filter) supports +Gemeinde-level market intelligence pages — small municipalities often have the +highest padel investment opportunity (white space markets). Requires: GEONAMES_USERNAME env var (free registration at geonames.org) Landing: {LANDING_DIR}/geonames/{year}/{month}/cities_global.json.gz Output: {"rows": [{"geoname_id": 2950159, "city_name": "Berlin", "country_code": "DE", "population": 3644826, + "lat": 52.524, "lon": 13.411, + "admin1_code": "16", "admin2_code": "00", "ref_year": 2024}], "count": N} """ @@ -31,24 +34,33 @@ logger = setup_logging("padelnomics.extract.geonames") EXTRACTOR_NAME = "geonames" -DOWNLOAD_URL = "https://download.geonames.org/export/dump/cities15000.zip" +DOWNLOAD_URL = "https://download.geonames.org/export/dump/cities1000.zip" # Only populated place feature codes — excludes airports, parks, admin areas, etc. # PPLC = capital of a political entity # PPLA = seat of a first-order administrative division # PPLA2 = seat of a second-order admin division +# PPLA3 = seat of a third-order admin division (Gemeinden, cantons, etc.) +# PPLA4 = seat of a fourth-order admin division +# PPLA5 = seat of a fifth-order admin division # PPL = populated place -VALID_FEATURE_CODES = {"PPLC", "PPLA", "PPLA2", "PPL"} +VALID_FEATURE_CODES = {"PPLC", "PPLA", "PPLA2", "PPLA3", "PPLA4", "PPLA5", "PPL"} -MIN_POPULATION = 50_000 +# No population floor — cities1000.zip is pre-filtered to ≥ 1,000. +# Accept all to maximise Gemeinde-level coverage. +MIN_POPULATION = 0 -# GeoNames tab-separated column layout for cities15000.txt +# GeoNames tab-separated column layout for cities1000.txt # https://download.geonames.org/export/dump/readme.txt COL_GEONAME_ID = 0 COL_NAME = 1 COL_ASCIINAME = 2 -COL_COUNTRY_CODE = 8 +COL_LAT = 4 +COL_LON = 5 COL_FEATURE_CODE = 7 +COL_COUNTRY_CODE = 8 +COL_ADMIN1_CODE = 10 +COL_ADMIN2_CODE = 11 COL_POPULATION = 14 COL_MODIFICATION_DATE = 18 @@ -86,10 +98,21 @@ def _parse_cities_txt(content: bytes) -> list[dict]: country_code = parts[COL_COUNTRY_CODE].strip().upper() if not city_name or not country_code: continue + try: + lat = float(parts[COL_LAT]) + lon = float(parts[COL_LON]) + except (ValueError, IndexError): + continue + admin1_code = parts[COL_ADMIN1_CODE].strip() if len(parts) > COL_ADMIN1_CODE else "" + admin2_code = parts[COL_ADMIN2_CODE].strip() if len(parts) > COL_ADMIN2_CODE else "" rows.append({ "geoname_id": geoname_id, "city_name": city_name, "country_code": country_code, + "lat": lat, + "lon": lon, + "admin1_code": admin1_code or None, + "admin2_code": admin2_code or None, "population": population, "ref_year": REF_YEAR, }) @@ -102,7 +125,7 @@ def extract( conn: sqlite3.Connection, session: niquests.Session, ) -> dict: - """Download GeoNames cities15000.zip. Skips if already run this month.""" + """Download GeoNames cities1000.zip. Skips if already run this month.""" username = os.environ.get("GEONAMES_USERNAME", "").strip() if not username: logger.warning("GEONAMES_USERNAME not set — writing empty placeholder so SQLMesh models can run") @@ -120,25 +143,25 @@ def extract( year, month = year_month.split("/") - # GeoNames bulk downloads don't require the username in the URL for cities15000.zip, + # GeoNames bulk downloads don't require the username in the URL for cities1000.zip, # but the username signals acceptance of their terms of use and helps their monitoring. url = f"{DOWNLOAD_URL}?username={username}" - logger.info("GET cities15000.zip (~1.5MB compressed)") - resp = session.get(url, timeout=HTTP_TIMEOUT_SECONDS * 4) + logger.info("GET cities1000.zip (~30MB compressed, ~140K locations)") + resp = session.get(url, timeout=HTTP_TIMEOUT_SECONDS * 10) resp.raise_for_status() - assert len(resp.content) > 100_000, ( - f"cities15000.zip too small ({len(resp.content)} bytes) — download may have failed" + assert len(resp.content) > 1_000_000, ( + f"cities1000.zip too small ({len(resp.content)} bytes) — download may have failed" ) with zipfile.ZipFile(io.BytesIO(resp.content)) as zf: txt_name = next((n for n in zf.namelist() if n.endswith(".txt")), None) - assert txt_name, f"No .txt file in cities15000.zip: {zf.namelist()}" + assert txt_name, f"No .txt file in cities1000.zip: {zf.namelist()}" txt_content = zf.read(txt_name) rows = _parse_cities_txt(txt_content) - assert len(rows) > 5_000, f"Expected >5000 global cities ≥50K pop, got {len(rows)}" - logger.info("parsed %d global cities with population ≥%d", len(rows), MIN_POPULATION) + assert len(rows) > 100_000, f"Expected >100K global locations (pop ≥1K), got {len(rows)}" + logger.info("parsed %d global locations (pop ≥1K)", len(rows)) dest_dir = landing_path(landing_dir, "geonames", year, month) dest = dest_dir / "cities_global.json.gz" diff --git a/extract/padelnomics_extract/src/padelnomics_extract/overpass_tennis.py b/extract/padelnomics_extract/src/padelnomics_extract/overpass_tennis.py new file mode 100644 index 0000000..79c75e7 --- /dev/null +++ b/extract/padelnomics_extract/src/padelnomics_extract/overpass_tennis.py @@ -0,0 +1,78 @@ +"""Overpass API extractor — global tennis court locations from OpenStreetMap. + +Queries the Overpass API for all nodes/ways/relations tagged sport=tennis. +Tennis court density near a location is a proxy for racket-sport culture — +areas with many tennis clubs are prime candidates for padel adoption. + +The query returns ~150K+ results globally (vs ~5K for padel), so a higher +Overpass timeout is used. + +Landing: {LANDING_DIR}/overpass_tennis/{year}/{month}/courts.json.gz +""" + +import sqlite3 +from pathlib import Path + +import niquests + +from ._shared import OVERPASS_TIMEOUT_SECONDS, run_extractor, setup_logging +from .utils import landing_path, write_gzip_atomic + +logger = setup_logging("padelnomics.extract.overpass_tennis") + +EXTRACTOR_NAME = "overpass_tennis" +OVERPASS_URL = "https://overpass-api.de/api/interpreter" + +# Tennis returns ~150K+ elements globally vs ~5K for padel — use 3× timeout. +TENNIS_OVERPASS_TIMEOUT_SECONDS = OVERPASS_TIMEOUT_SECONDS * 3 + +OVERPASS_QUERY = ( + "[out:json][timeout:180];\n" + "(\n" + ' node["sport"="tennis"];\n' + ' way["sport"="tennis"];\n' + ' relation["sport"="tennis"];\n' + ");\n" + "out body;" +) + + +def extract( + landing_dir: Path, + year_month: str, + conn: sqlite3.Connection, + session: niquests.Session, +) -> dict: + """POST OverpassQL query for tennis courts and write raw OSM JSON. Returns run metrics.""" + year, month = year_month.split("/") + dest_dir = landing_path(landing_dir, "overpass_tennis", year, month) + dest = dest_dir / "courts.json.gz" + + logger.info("POST %s (sport=tennis, ~150K+ results expected)", OVERPASS_URL) + resp = session.post( + OVERPASS_URL, + data={"data": OVERPASS_QUERY}, + timeout=TENNIS_OVERPASS_TIMEOUT_SECONDS, + ) + resp.raise_for_status() + + size_bytes = len(resp.content) + logger.info("%s bytes received", f"{size_bytes:,}") + + bytes_written = write_gzip_atomic(dest, resp.content) + logger.info("wrote %s (%s bytes compressed)", dest, f"{bytes_written:,}") + + return { + "files_written": 1, + "files_skipped": 0, + "bytes_written": bytes_written, + "cursor_value": year_month, + } + + +def main() -> None: + run_extractor(EXTRACTOR_NAME, extract) + + +if __name__ == "__main__": + main() diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql b/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql index cad9eca..6f40d10 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql @@ -1,5 +1,6 @@ --- GeoNames global city population (cities15000 bulk dataset, filtered to ≥50K). +-- GeoNames global city/municipality population (cities1000 bulk dataset, pop ≥ 1K). -- Global fallback for countries not covered by Eurostat, Census, or ONS. +-- Broad coverage (140K+ locations) enables Gemeinde-level market intelligence. -- One row per geoname_id (GeoNames stable numeric identifier). -- -- Source: data/landing/geonames/{year}/{month}/cities_global.json.gz @@ -16,6 +17,10 @@ WITH parsed AS ( TRY_CAST(row ->> 'geoname_id' AS INTEGER) AS geoname_id, row ->> 'city_name' AS city_name, row ->> 'country_code' AS country_code, + TRY_CAST(row ->> 'lat' AS DOUBLE) AS lat, + TRY_CAST(row ->> 'lon' AS DOUBLE) AS lon, + row ->> 'admin1_code' AS admin1_code, + row ->> 'admin2_code' AS admin2_code, TRY_CAST(row ->> 'population' AS BIGINT) AS population, TRY_CAST(row ->> 'ref_year' AS INTEGER) AS ref_year, CURRENT_DATE AS extracted_date @@ -32,6 +37,10 @@ SELECT geoname_id, TRIM(city_name) AS city_name, UPPER(country_code) AS country_code, + lat, + lon, + NULLIF(TRIM(admin1_code), '') AS admin1_code, + NULLIF(TRIM(admin2_code), '') AS admin2_code, population, ref_year, extracted_date @@ -40,3 +49,5 @@ WHERE population IS NOT NULL AND population > 0 AND geoname_id IS NOT NULL AND city_name IS NOT NULL + AND lat IS NOT NULL + AND lon IS NOT NULL diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_tennis_courts.sql b/transform/sqlmesh_padelnomics/models/staging/stg_tennis_courts.sql new file mode 100644 index 0000000..8821f45 --- /dev/null +++ b/transform/sqlmesh_padelnomics/models/staging/stg_tennis_courts.sql @@ -0,0 +1,72 @@ +-- Tennis court locations from OpenStreetMap via Overpass API (sport=tennis). +-- Used as a "racket sport culture" signal in the opportunity score: +-- areas with high tennis court density are prime padel adoption markets. +-- +-- Source: data/landing/overpass_tennis/{year}/{month}/courts.json.gz + +MODEL ( + name staging.stg_tennis_courts, + kind FULL, + cron '@daily', + grain osm_id +); + +WITH parsed AS ( + SELECT + elem ->> 'type' AS osm_type, + (elem ->> 'id')::BIGINT AS osm_id, + TRY_CAST(elem ->> 'lat' AS DOUBLE) AS lat, + TRY_CAST(elem ->> 'lon' AS DOUBLE) AS lon, + elem -> 'tags' ->> 'name' AS name, + elem -> 'tags' ->> 'addr:country' AS country_code, + elem -> 'tags' ->> 'addr:city' AS city_tag, + filename AS source_file, + CURRENT_DATE AS extracted_date + FROM ( + SELECT UNNEST(elements) AS elem, filename + FROM read_json( + @LANDING_DIR || '/overpass_tennis/*/*/courts.json.gz', + format = 'auto', + filename = true + ) + ) + WHERE (elem ->> 'type') IS NOT NULL +), +deduped AS ( + SELECT *, + ROW_NUMBER() OVER (PARTITION BY osm_id ORDER BY extracted_date DESC) AS rn + FROM parsed + WHERE osm_type = 'node' + AND lat IS NOT NULL AND lon IS NOT NULL + AND lat BETWEEN -90 AND 90 + AND lon BETWEEN -180 AND 180 +), +with_country AS ( + SELECT + osm_id, lat, lon, + COALESCE(NULLIF(TRIM(UPPER(country_code)), ''), CASE + WHEN lat BETWEEN 47.27 AND 55.06 AND lon BETWEEN 5.87 AND 15.04 THEN 'DE' + WHEN lat BETWEEN 35.95 AND 43.79 AND lon BETWEEN -9.39 AND 4.33 THEN 'ES' + WHEN lat BETWEEN 49.90 AND 60.85 AND lon BETWEEN -8.62 AND 1.77 THEN 'GB' + WHEN lat BETWEEN 41.36 AND 51.09 AND lon BETWEEN -5.14 AND 9.56 THEN 'FR' + WHEN lat BETWEEN 45.46 AND 47.80 AND lon BETWEEN 5.96 AND 10.49 THEN 'CH' + WHEN lat BETWEEN 46.37 AND 49.02 AND lon BETWEEN 9.53 AND 17.16 THEN 'AT' + WHEN lat BETWEEN 36.35 AND 47.09 AND lon BETWEEN 6.62 AND 18.51 THEN 'IT' + WHEN lat BETWEEN 37.00 AND 42.15 AND lon BETWEEN -9.50 AND -6.19 THEN 'PT' + ELSE NULL + END) AS country_code, + NULLIF(TRIM(name), '') AS name, + NULLIF(TRIM(city_tag), '') AS city, + extracted_date + FROM deduped + WHERE rn = 1 +) +SELECT + osm_id, + lat, + lon, + country_code, + name, + city, + extracted_date +FROM with_country