feat(data): Sprint 1-5 population pipeline — city labels, US/UK/Global extractors
Part A: Data Layer — Sprints 1-5 Sprint 1 — Eurostat SDMX city labels (unblocks EU population): - New extractor: eurostat_city_labels.py — fetches ESTAT/CITIES codelist (city_code → city_name mapping) with ETag dedup - New staging model: stg_city_labels.sql — grain city_code - Updated dim_cities.sql — joins Eurostat population via city code lookup; replaces hardcoded 0::BIGINT population Sprint 2 — Market score formula v2: - city_market_profile.sql: 30pt population (LN/1M), 25pt income PPS (/200), 30pt demand (occupancy or density), 15pt data confidence - Moved venue_pricing_benchmarks join into base CTE so median_occupancy_rate is available to the scoring formula Sprint 3 — US Census ACS extractor: - New extractor: census_usa.py — ACS 5-year place population (vintage 2023) - New staging model: stg_population_usa.sql — grain (place_fips, ref_year) Sprint 4 — ONS UK extractor: - New extractor: ons_uk.py — 2021 Census LAD population via ONS beta API - New staging model: stg_population_uk.sql — grain (lad_code, ref_year) Sprint 5 — GeoNames global extractor: - New extractor: geonames.py — cities15000.zip bulk download, filtered to ≥50K pop - New staging model: stg_population_geonames.sql — grain geoname_id - dim_cities: 5-source population cascade (Eurostat > Census > ONS > GeoNames > 0) with case/whitespace-insensitive city name matching Registered all 4 new CLI entrypoints in pyproject.toml and all.py. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,14 +3,17 @@
|
||||
-- tracks cities where padel venues actually exist, not an administrative city list.
|
||||
--
|
||||
-- Conformed dimension: used by city_market_profile and all pSEO serving models.
|
||||
-- Integrates two sources:
|
||||
-- dim_venues → city list, venue count, coordinates (Playtomic + OSM)
|
||||
-- stg_income → country-level median income (Eurostat)
|
||||
-- Integrates four sources:
|
||||
-- dim_venues → city list, venue count, coordinates (Playtomic + OSM)
|
||||
-- stg_income → country-level median income (Eurostat)
|
||||
-- stg_city_labels → Eurostat city_code → city_name mapping (EU cities)
|
||||
-- stg_population → Eurostat city-level population (EU, joined via city code)
|
||||
-- stg_population_usa → US Census ACS place population
|
||||
-- stg_population_uk → ONS LAD population
|
||||
-- stg_population_geonames → GeoNames global fallback
|
||||
--
|
||||
-- Population note: Eurostat uses coded identifiers (e.g. DE001C = Berlin) with no
|
||||
-- city name column in the dataset we extract. City-level population requires a
|
||||
-- separate code→name lookup extract (future improvement). Population is set to 0
|
||||
-- until that source is available; market_score degrades gracefully.
|
||||
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames > 0.
|
||||
-- City name matching is case/whitespace-insensitive within each country.
|
||||
--
|
||||
-- Grain: (country_code, city_slug) — two cities in different countries can share a
|
||||
-- city name. QUALIFY enforces no duplicate (country_code, city_slug) pairs.
|
||||
@@ -42,6 +45,39 @@ country_income AS (
|
||||
SELECT country_code, median_income_pps, ref_year AS income_year
|
||||
FROM staging.stg_income
|
||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
|
||||
),
|
||||
-- Eurostat EU population: join city labels (code→name) with population values.
|
||||
-- QUALIFY keeps only the most recent year per (country, city name).
|
||||
eurostat_pop AS (
|
||||
SELECT
|
||||
cl.country_code,
|
||||
cl.city_name,
|
||||
p.population,
|
||||
p.ref_year
|
||||
FROM staging.stg_city_labels cl
|
||||
JOIN staging.stg_population p ON cl.city_code = p.city_code
|
||||
QUALIFY ROW_NUMBER() OVER (
|
||||
PARTITION BY cl.country_code, cl.city_name
|
||||
ORDER BY p.ref_year DESC
|
||||
) = 1
|
||||
),
|
||||
-- US Census ACS population (place-level, filtered to ≥50K)
|
||||
us_pop AS (
|
||||
SELECT city_name, country_code, population, ref_year
|
||||
FROM staging.stg_population_usa
|
||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY place_fips ORDER BY ref_year DESC) = 1
|
||||
),
|
||||
-- ONS UK Local Authority District population
|
||||
uk_pop AS (
|
||||
SELECT lad_name AS city_name, country_code, population, ref_year
|
||||
FROM staging.stg_population_uk
|
||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY lad_code ORDER BY ref_year DESC) = 1
|
||||
),
|
||||
-- GeoNames global fallback (all cities ≥50K)
|
||||
geonames_pop AS (
|
||||
SELECT city_name, country_code, population, ref_year
|
||||
FROM staging.stg_population_geonames
|
||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY geoname_id ORDER BY ref_year DESC) = 1
|
||||
)
|
||||
SELECT
|
||||
vc.country_code,
|
||||
@@ -99,15 +135,43 @@ SELECT
|
||||
)) AS country_slug,
|
||||
vc.centroid_lat AS lat,
|
||||
vc.centroid_lon AS lon,
|
||||
-- Population: requires code→name Eurostat lookup (not yet extracted); defaults to 0.
|
||||
-- market_score uses LOG(GREATEST(population, 1)) so 0 degrades score gracefully.
|
||||
0::BIGINT AS population,
|
||||
0::INTEGER AS population_year,
|
||||
-- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames > 0.
|
||||
-- City name match is case/whitespace-insensitive within each country.
|
||||
COALESCE(
|
||||
ep.population,
|
||||
usa.population,
|
||||
uk.population,
|
||||
gn.population,
|
||||
0
|
||||
)::BIGINT AS population,
|
||||
COALESCE(
|
||||
ep.ref_year,
|
||||
usa.ref_year,
|
||||
uk.ref_year,
|
||||
gn.ref_year,
|
||||
0
|
||||
)::INTEGER AS population_year,
|
||||
vc.padel_venue_count,
|
||||
ci.median_income_pps,
|
||||
ci.income_year
|
||||
FROM venue_cities vc
|
||||
LEFT JOIN country_income ci ON vc.country_code = ci.country_code
|
||||
-- Eurostat EU population (via city code→name lookup)
|
||||
LEFT JOIN eurostat_pop ep
|
||||
ON vc.country_code = ep.country_code
|
||||
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(ep.city_name))
|
||||
-- US Census population
|
||||
LEFT JOIN us_pop usa
|
||||
ON vc.country_code = usa.country_code
|
||||
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(usa.city_name))
|
||||
-- ONS UK population
|
||||
LEFT JOIN uk_pop uk
|
||||
ON vc.country_code = uk.country_code
|
||||
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(uk.city_name))
|
||||
-- GeoNames global fallback
|
||||
LEFT JOIN geonames_pop gn
|
||||
ON vc.country_code = gn.country_code
|
||||
AND LOWER(TRIM(vc.city_name)) = LOWER(TRIM(gn.city_name))
|
||||
-- Enforce grain: if two cities in the same country have the same slug
|
||||
-- (e.g. 'São Paulo' and 'Sao Paulo'), keep the one with more venues
|
||||
QUALIFY ROW_NUMBER() OVER (
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
-- One Big Table: per-city padel market intelligence.
|
||||
-- Consumed by: SEO article generation, planner city-select pre-fill, API endpoints.
|
||||
--
|
||||
-- Market score (0–100) is a simple composite:
|
||||
-- 40% population (log-scaled, city > 500K = max)
|
||||
-- 40% venue density (courts per 100K residents)
|
||||
-- 20% data confidence (completeness of both population + venue data)
|
||||
-- Market score v2 (0–100):
|
||||
-- 30 pts population — log-scaled to 1M+ city ceiling (was 40pts/500K)
|
||||
-- 25 pts income PPS — normalised to 200 ceiling (covers CH/NO/LU outliers)
|
||||
-- 30 pts demand — observed occupancy if available, else venue density
|
||||
-- 15 pts data quality — completeness discount, not a market signal
|
||||
|
||||
MODEL (
|
||||
name serving.city_market_profile,
|
||||
@@ -37,19 +38,41 @@ WITH base AS (
|
||||
WHEN c.population > 0 AND c.padel_venue_count > 0 THEN 1.0
|
||||
WHEN c.population > 0 OR c.padel_venue_count > 0 THEN 0.5
|
||||
ELSE 0.0
|
||||
END AS data_confidence
|
||||
END AS data_confidence,
|
||||
-- Pricing / occupancy from Playtomic (NULL when no availability data)
|
||||
vpb.median_hourly_rate,
|
||||
vpb.median_peak_rate,
|
||||
vpb.median_offpeak_rate,
|
||||
vpb.median_occupancy_rate,
|
||||
vpb.median_daily_revenue_per_venue,
|
||||
vpb.price_currency
|
||||
FROM foundation.dim_cities c
|
||||
LEFT JOIN serving.venue_pricing_benchmarks vpb
|
||||
ON c.country_code = vpb.country_code
|
||||
AND LOWER(TRIM(c.city_name)) = LOWER(TRIM(vpb.city))
|
||||
WHERE c.padel_venue_count > 0
|
||||
),
|
||||
scored AS (
|
||||
SELECT *,
|
||||
ROUND(
|
||||
-- Population component (log scale, 500K+ city → 40 pts)
|
||||
40.0 * LEAST(1.0, LN(GREATEST(population, 1)) / LN(500000))
|
||||
-- Density component (5 courts/100K → 40 pts)
|
||||
+ 40.0 * LEAST(1.0, COALESCE(venues_per_100k, 0) / 5.0)
|
||||
-- Confidence component
|
||||
+ 20.0 * data_confidence
|
||||
-- Population (30 pts): log-scale, 1M+ city = full marks.
|
||||
-- LN(1) = 0 so unpopulated cities score 0 here — they still score on demand.
|
||||
30.0 * LEAST(1.0, LN(GREATEST(population, 1)) / LN(1000000))
|
||||
-- Economic power (25 pts): income PPS normalised to 200 ceiling.
|
||||
-- 200 covers high-income outliers (CH ~190, NO ~180, LU ~200+).
|
||||
-- Drives pricing power and willingness-to-pay directly.
|
||||
+ 25.0 * LEAST(1.0, COALESCE(median_income_pps, 100) / 200.0)
|
||||
-- Demand evidence (30 pts): observed occupancy is the best signal
|
||||
-- (proves real demand). If unavailable, venue density is the proxy
|
||||
-- (proves market exists; caps at 4/100K to avoid penalising dense cities).
|
||||
+ 30.0 * CASE
|
||||
WHEN median_occupancy_rate IS NOT NULL
|
||||
THEN LEAST(1.0, median_occupancy_rate / 0.65)
|
||||
ELSE LEAST(1.0, COALESCE(venues_per_100k, 0) / 4.0)
|
||||
END
|
||||
-- Data quality (15 pts): measures completeness, not market quality.
|
||||
-- Reduced from 20pts — kept as confidence discount, not market signal.
|
||||
+ 15.0 * data_confidence
|
||||
, 1) AS market_score
|
||||
FROM base
|
||||
)
|
||||
@@ -69,16 +92,12 @@ SELECT
|
||||
s.market_score,
|
||||
s.median_income_pps,
|
||||
s.income_year,
|
||||
-- Playtomic pricing/occupancy (NULL when no availability data)
|
||||
vpb.median_hourly_rate,
|
||||
vpb.median_peak_rate,
|
||||
vpb.median_offpeak_rate,
|
||||
vpb.median_occupancy_rate,
|
||||
vpb.median_daily_revenue_per_venue,
|
||||
vpb.price_currency,
|
||||
s.median_hourly_rate,
|
||||
s.median_peak_rate,
|
||||
s.median_offpeak_rate,
|
||||
s.median_occupancy_rate,
|
||||
s.median_daily_revenue_per_venue,
|
||||
s.price_currency,
|
||||
CURRENT_DATE AS refreshed_date
|
||||
FROM scored s
|
||||
LEFT JOIN serving.venue_pricing_benchmarks vpb
|
||||
ON s.country_code = vpb.country_code
|
||||
AND LOWER(TRIM(s.city_name)) = LOWER(TRIM(vpb.city))
|
||||
ORDER BY s.market_score DESC
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
-- Eurostat SDMX city codelist: city_code → city_name mapping.
|
||||
-- Maps coded identifiers (e.g. DE001C) to human-readable names (e.g. Berlin).
|
||||
-- This is the bridge table that lets stg_population join to dim_cities.
|
||||
--
|
||||
-- Source: data/landing/eurostat_city_labels/{year}/{month}/cities_codelist.json.gz
|
||||
|
||||
MODEL (
|
||||
name staging.stg_city_labels,
|
||||
kind FULL,
|
||||
cron '@daily',
|
||||
grain city_code
|
||||
);
|
||||
|
||||
WITH raw AS (
|
||||
SELECT unnest(rows) AS r
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/eurostat_city_labels/*/*/cities_codelist.json.gz',
|
||||
auto_detect = true
|
||||
)
|
||||
)
|
||||
SELECT
|
||||
UPPER(TRIM(r ->> 'city_code')) AS city_code,
|
||||
TRIM(r ->> 'city_name') AS city_name,
|
||||
-- Country code is always the first two letters of the city code (e.g. DE001C → DE)
|
||||
UPPER(LEFT(TRIM(r ->> 'city_code'), 2)) AS country_code,
|
||||
CURRENT_DATE AS extracted_date
|
||||
FROM raw
|
||||
WHERE (r ->> 'city_code') IS NOT NULL
|
||||
AND (r ->> 'city_name') IS NOT NULL
|
||||
AND LENGTH(TRIM(r ->> 'city_code')) > 0
|
||||
AND LENGTH(TRIM(r ->> 'city_name')) > 0
|
||||
@@ -0,0 +1,42 @@
|
||||
-- GeoNames global city population (cities15000 bulk dataset, filtered to ≥50K).
|
||||
-- Global fallback for countries not covered by Eurostat, Census, or ONS.
|
||||
-- One row per geoname_id (GeoNames stable numeric identifier).
|
||||
--
|
||||
-- Source: data/landing/geonames/{year}/{month}/cities_global.json.gz
|
||||
|
||||
MODEL (
|
||||
name staging.stg_population_geonames,
|
||||
kind FULL,
|
||||
cron '@daily',
|
||||
grain geoname_id
|
||||
);
|
||||
|
||||
WITH parsed AS (
|
||||
SELECT
|
||||
TRY_CAST(row ->> 'geoname_id' AS INTEGER) AS geoname_id,
|
||||
row ->> 'city_name' AS city_name,
|
||||
row ->> 'country_code' AS country_code,
|
||||
TRY_CAST(row ->> 'population' AS BIGINT) AS population,
|
||||
TRY_CAST(row ->> 'ref_year' AS INTEGER) AS ref_year,
|
||||
CURRENT_DATE AS extracted_date
|
||||
FROM (
|
||||
SELECT UNNEST(rows) AS row
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/geonames/*/*/cities_global.json.gz',
|
||||
auto_detect = true
|
||||
)
|
||||
)
|
||||
WHERE (row ->> 'geoname_id') IS NOT NULL
|
||||
)
|
||||
SELECT
|
||||
geoname_id,
|
||||
TRIM(city_name) AS city_name,
|
||||
UPPER(country_code) AS country_code,
|
||||
population,
|
||||
ref_year,
|
||||
extracted_date
|
||||
FROM parsed
|
||||
WHERE population IS NOT NULL
|
||||
AND population > 0
|
||||
AND geoname_id IS NOT NULL
|
||||
AND city_name IS NOT NULL
|
||||
@@ -0,0 +1,41 @@
|
||||
-- ONS 2021 Census population by Local Authority District (LAD).
|
||||
-- Reads pre-processed landing zone JSON from ons_uk extractor.
|
||||
-- One row per (lad_code, ref_year) — LAD code is the ONS area identifier.
|
||||
--
|
||||
-- Source: data/landing/ons_uk/{year}/{month}/lad_population.json.gz
|
||||
|
||||
MODEL (
|
||||
name staging.stg_population_uk,
|
||||
kind FULL,
|
||||
cron '@daily',
|
||||
grain (lad_code, ref_year)
|
||||
);
|
||||
|
||||
WITH parsed AS (
|
||||
SELECT
|
||||
row ->> 'lad_code' AS lad_code,
|
||||
row ->> 'lad_name' AS lad_name,
|
||||
TRY_CAST(row ->> 'population' AS BIGINT) AS population,
|
||||
TRY_CAST(row ->> 'ref_year' AS INTEGER) AS ref_year,
|
||||
row ->> 'country_code' AS country_code,
|
||||
CURRENT_DATE AS extracted_date
|
||||
FROM (
|
||||
SELECT UNNEST(rows) AS row
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/ons_uk/*/*/lad_population.json.gz',
|
||||
auto_detect = true
|
||||
)
|
||||
)
|
||||
WHERE (row ->> 'lad_code') IS NOT NULL
|
||||
)
|
||||
SELECT
|
||||
UPPER(TRIM(lad_code)) AS lad_code,
|
||||
TRIM(lad_name) AS lad_name,
|
||||
population,
|
||||
ref_year,
|
||||
UPPER(country_code) AS country_code,
|
||||
extracted_date
|
||||
FROM parsed
|
||||
WHERE population IS NOT NULL
|
||||
AND population > 0
|
||||
AND lad_code IS NOT NULL
|
||||
@@ -0,0 +1,43 @@
|
||||
-- US Census ACS 5-year place-level population.
|
||||
-- Reads pre-processed landing zone JSON from census_usa extractor.
|
||||
-- One row per (place_fips, ref_year) — surrogate key is the Census FIPS code.
|
||||
--
|
||||
-- Source: data/landing/census_usa/{year}/{month}/acs5_places.json.gz
|
||||
|
||||
MODEL (
|
||||
name staging.stg_population_usa,
|
||||
kind FULL,
|
||||
cron '@daily',
|
||||
grain (place_fips, ref_year)
|
||||
);
|
||||
|
||||
WITH parsed AS (
|
||||
SELECT
|
||||
row ->> 'city_name' AS city_name,
|
||||
row ->> 'state_fips' AS state_fips,
|
||||
row ->> 'place_fips' AS place_fips,
|
||||
TRY_CAST(row ->> 'population' AS BIGINT) AS population,
|
||||
TRY_CAST(row ->> 'ref_year' AS INTEGER) AS ref_year,
|
||||
row ->> 'country_code' AS country_code,
|
||||
CURRENT_DATE AS extracted_date
|
||||
FROM (
|
||||
SELECT UNNEST(rows) AS row
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/census_usa/*/*/acs5_places.json.gz',
|
||||
auto_detect = true
|
||||
)
|
||||
)
|
||||
WHERE (row ->> 'place_fips') IS NOT NULL
|
||||
)
|
||||
SELECT
|
||||
TRIM(city_name) AS city_name,
|
||||
state_fips,
|
||||
place_fips,
|
||||
population,
|
||||
ref_year,
|
||||
UPPER(country_code) AS country_code,
|
||||
extracted_date
|
||||
FROM parsed
|
||||
WHERE population IS NOT NULL
|
||||
AND population > 0
|
||||
AND place_fips IS NOT NULL
|
||||
Reference in New Issue
Block a user