feat(sqlmesh): add transform workspace member with 4-layer DuckDB pipeline
Adds sqlmesh_padelnomics UV workspace member at transform/sqlmesh_padelnomics/.
DuckDB gateway, LANDING_DIR variable, @daily cron on all models.
Raw layer (reads landing zone gzip JSON):
raw_overpass_courts — OSM padel court elements (nodes with lat/lon/tags)
raw_playtomic_tenants — Playtomic venue records (tenant_id, location, name)
raw_eurostat_population — Eurostat urb_cpop1 city population (unpivoted)
Staging layer (typed, deduped, country-resolved):
stg_padel_courts — OSM nodes only, ~100m bbox country approximation
stg_playtomic_venues — deduplicated Playtomic venues
stg_population — city population by year with integer types
Foundation layer:
dim_venues — deduped union of OSM + Playtomic (~100m grid)
dim_cities — Eurostat cities with population + venue counts
Serving layer (consumed by web app and SEO generation):
city_market_profile — OBT: market score, venue density, population per city
planner_defaults — per-city calculator pre-fill values with country median
fallbacks and competitive-pressure rate adjustments
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
- `transform/sqlmesh_padelnomics` workspace member: SQLMesh 4-layer model pipeline over DuckDB
|
||||||
|
- Raw: `raw_overpass_courts`, `raw_playtomic_tenants`, `raw_eurostat_population`
|
||||||
|
- Staging: `stg_padel_courts`, `stg_playtomic_venues`, `stg_population`
|
||||||
|
- Foundation: `dim_venues` (OSM + Playtomic deduped), `dim_cities` (with Eurostat population)
|
||||||
|
- Serving: `city_market_profile` (market score OBT), `planner_defaults` (per-city calculator pre-fill)
|
||||||
- `extract/padelnomics_extract` workspace member: Overpass API (padel courts via OSM), Eurostat city demographics (`urb_cpop1`, `ilc_di03`), and Playtomic unauthenticated tenant search extractors
|
- `extract/padelnomics_extract` workspace member: Overpass API (padel courts via OSM), Eurostat city demographics (`urb_cpop1`, `ilc_di03`), and Playtomic unauthenticated tenant search extractors
|
||||||
- Landing zone structure at `data/landing/` with per-source subdirectories: `overpass/`, `eurostat/`, `playtomic/`
|
- Landing zone structure at `data/landing/` with per-source subdirectories: `overpass/`, `eurostat/`, `playtomic/`
|
||||||
- `.env.example` entries for `DUCKDB_PATH` and `LANDING_DIR`
|
- `.env.example` entries for `DUCKDB_PATH` and `LANDING_DIR`
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
members = [
|
members = [
|
||||||
"web",
|
"web",
|
||||||
"extract/padelnomics_extract",
|
"extract/padelnomics_extract",
|
||||||
|
"transform/sqlmesh_padelnomics",
|
||||||
]
|
]
|
||||||
|
|
||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
|
|||||||
18
padelnomics/transform/sqlmesh_padelnomics/config.yaml
Normal file
18
padelnomics/transform/sqlmesh_padelnomics/config.yaml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
gateways:
|
||||||
|
duckdb:
|
||||||
|
connection:
|
||||||
|
type: duckdb
|
||||||
|
catalogs:
|
||||||
|
local: "{{ env_var('DUCKDB_PATH', 'data/lakehouse.duckdb') }}"
|
||||||
|
|
||||||
|
default_gateway: duckdb
|
||||||
|
|
||||||
|
variables:
|
||||||
|
LANDING_DIR: "{{ env_var('LANDING_DIR', 'data/landing') }}"
|
||||||
|
|
||||||
|
model_defaults:
|
||||||
|
dialect: duckdb
|
||||||
|
start: 2025-01-01
|
||||||
|
cron: '@daily'
|
||||||
|
|
||||||
|
default_target_environment: "dev_{{ user() }}"
|
||||||
@@ -0,0 +1,59 @@
|
|||||||
|
-- City dimension: canonical city records with population and venue count.
|
||||||
|
-- Built from Eurostat Urban Audit codes joined to venue locations.
|
||||||
|
-- Cities without Eurostat coverage (US, non-EU) are derived from venue clusters.
|
||||||
|
|
||||||
|
MODEL (
|
||||||
|
name padelnomics.dim_cities,
|
||||||
|
kind FULL,
|
||||||
|
cron '@daily',
|
||||||
|
grain city_code
|
||||||
|
);
|
||||||
|
|
||||||
|
WITH -- Eurostat cities: latest population per city code
|
||||||
|
eurostat_cities AS (
|
||||||
|
SELECT
|
||||||
|
city_code,
|
||||||
|
country_code,
|
||||||
|
population,
|
||||||
|
ref_year
|
||||||
|
FROM padelnomics.stg_population
|
||||||
|
QUALIFY ROW_NUMBER() OVER (PARTITION BY city_code ORDER BY ref_year DESC) = 1
|
||||||
|
),
|
||||||
|
-- Venue counts per (country_code, city) from dim_venues
|
||||||
|
venue_counts AS (
|
||||||
|
SELECT
|
||||||
|
country_code,
|
||||||
|
city,
|
||||||
|
COUNT(*) AS venue_count,
|
||||||
|
AVG(lat) AS centroid_lat,
|
||||||
|
AVG(lon) AS centroid_lon
|
||||||
|
FROM padelnomics.dim_venues
|
||||||
|
WHERE city IS NOT NULL AND city != ''
|
||||||
|
GROUP BY country_code, city
|
||||||
|
),
|
||||||
|
-- Eurostat city label mapping to canonical city names
|
||||||
|
-- (Eurostat uses codes like DE001C → Berlin; we keep both)
|
||||||
|
eurostat_labels AS (
|
||||||
|
SELECT DISTINCT
|
||||||
|
city_code,
|
||||||
|
country_code,
|
||||||
|
-- Derive a slug-friendly city name from the code as fallback
|
||||||
|
LOWER(REPLACE(city_code, country_code, '')) AS city_slug_raw
|
||||||
|
FROM eurostat_cities
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
ec.city_code,
|
||||||
|
ec.country_code,
|
||||||
|
COALESCE(vc.city, ec.city_code) AS city_name,
|
||||||
|
LOWER(REGEXP_REPLACE(
|
||||||
|
COALESCE(vc.city, ec.city_slug_raw), '[^a-z0-9]+', '-'
|
||||||
|
)) AS city_slug,
|
||||||
|
COALESCE(vc.centroid_lat, 0::DOUBLE) AS lat,
|
||||||
|
COALESCE(vc.centroid_lon, 0::DOUBLE) AS lon,
|
||||||
|
ec.population,
|
||||||
|
ec.ref_year AS population_year,
|
||||||
|
COALESCE(vc.venue_count, 0) AS padel_venue_count
|
||||||
|
FROM eurostat_cities ec
|
||||||
|
LEFT JOIN venue_counts vc
|
||||||
|
ON ec.country_code = vc.country_code
|
||||||
|
AND LOWER(TRIM(vc.city)) LIKE '%' || LOWER(LEFT(ec.city_code, 2)) || '%'
|
||||||
@@ -0,0 +1,75 @@
|
|||||||
|
-- Deduplicated venue dimension combining OSM courts and Playtomic tenants.
|
||||||
|
-- Venues from both sources are unioned; near-duplicates (within ~100m) are
|
||||||
|
-- collapsed to a single record preferring Playtomic data (richer metadata).
|
||||||
|
-- Proximity dedup uses haversine approximation: 1 degree lat ≈ 111 km.
|
||||||
|
|
||||||
|
MODEL (
|
||||||
|
name padelnomics.dim_venues,
|
||||||
|
kind FULL,
|
||||||
|
cron '@daily',
|
||||||
|
grain venue_id
|
||||||
|
);
|
||||||
|
|
||||||
|
WITH all_venues AS (
|
||||||
|
SELECT
|
||||||
|
'osm:' || osm_id::TEXT AS venue_id,
|
||||||
|
source,
|
||||||
|
lat,
|
||||||
|
lon,
|
||||||
|
country_code,
|
||||||
|
name,
|
||||||
|
city,
|
||||||
|
postcode,
|
||||||
|
NULL AS tenant_type,
|
||||||
|
extracted_date
|
||||||
|
FROM padelnomics.stg_padel_courts
|
||||||
|
WHERE country_code IS NOT NULL
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'pt:' || tenant_id AS venue_id,
|
||||||
|
source,
|
||||||
|
lat,
|
||||||
|
lon,
|
||||||
|
country_code,
|
||||||
|
name,
|
||||||
|
city,
|
||||||
|
postcode,
|
||||||
|
tenant_type,
|
||||||
|
extracted_date
|
||||||
|
FROM padelnomics.stg_playtomic_venues
|
||||||
|
WHERE country_code IS NOT NULL
|
||||||
|
),
|
||||||
|
-- Rank venues so Playtomic records win ties in proximity dedup
|
||||||
|
ranked AS (
|
||||||
|
SELECT *,
|
||||||
|
CASE source WHEN 'playtomic' THEN 1 ELSE 2 END AS source_rank
|
||||||
|
FROM all_venues
|
||||||
|
)
|
||||||
|
-- Note: full proximity dedup (haversine clustering) is expensive in SQL.
|
||||||
|
-- For now, deduplicate on exact (country_code, ROUND(lat,3), ROUND(lon,3))
|
||||||
|
-- — ≈111m grid cells. Refine with spatial index if volumes grow.
|
||||||
|
SELECT
|
||||||
|
MIN(venue_id) OVER (
|
||||||
|
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
|
||||||
|
ORDER BY source_rank
|
||||||
|
) AS venue_id,
|
||||||
|
country_code,
|
||||||
|
lat,
|
||||||
|
lon,
|
||||||
|
COALESCE(
|
||||||
|
MAX(CASE WHEN source = 'playtomic' THEN name END)
|
||||||
|
OVER (PARTITION BY country_code, ROUND(lat,3)::TEXT, ROUND(lon,3)::TEXT),
|
||||||
|
name
|
||||||
|
) AS name,
|
||||||
|
COALESCE(city, '') AS city,
|
||||||
|
postcode,
|
||||||
|
source,
|
||||||
|
tenant_type,
|
||||||
|
extracted_date
|
||||||
|
FROM ranked
|
||||||
|
QUALIFY ROW_NUMBER() OVER (
|
||||||
|
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
|
||||||
|
ORDER BY source_rank
|
||||||
|
) = 1
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
-- Raw Eurostat Urban Audit city population (dataset: urb_cpop1).
|
||||||
|
-- Source: data/landing/eurostat/{year}/{month}/urb_cpop1.json.gz
|
||||||
|
-- Format: Eurostat JSON Statistics API (dimensions + flat value array).
|
||||||
|
--
|
||||||
|
-- The Eurostat JSON format encodes dimensions separately from values:
|
||||||
|
-- dimension.cities.category.index → maps city code to flat array position
|
||||||
|
-- dimension.time.category.index → maps year to flat array position
|
||||||
|
-- values → flat object {position_str: value}
|
||||||
|
--
|
||||||
|
-- This model stores one row per (city_code, year) by computing positions.
|
||||||
|
-- Reference: https://wikis.ec.europa.eu/display/EUROSTATHELP/API+Statistics
|
||||||
|
|
||||||
|
MODEL (
|
||||||
|
name padelnomics.raw_eurostat_population,
|
||||||
|
kind FULL,
|
||||||
|
cron '@daily',
|
||||||
|
grain (city_code, ref_year)
|
||||||
|
);
|
||||||
|
|
||||||
|
WITH raw AS (
|
||||||
|
SELECT
|
||||||
|
raw_json,
|
||||||
|
filename
|
||||||
|
FROM read_json(
|
||||||
|
@LANDING_DIR || '/eurostat/*/*/urb_cpop1.json.gz',
|
||||||
|
format = 'auto',
|
||||||
|
filename = true,
|
||||||
|
columns = { 'raw_json': 'JSON' }
|
||||||
|
)
|
||||||
|
),
|
||||||
|
-- Unnest city codes with their ordinal positions
|
||||||
|
cities AS (
|
||||||
|
SELECT
|
||||||
|
city_code,
|
||||||
|
(city_pos)::INTEGER AS city_pos,
|
||||||
|
filename,
|
||||||
|
raw_json,
|
||||||
|
(json_extract(raw_json, '$.size[1]'))::INTEGER AS n_times
|
||||||
|
FROM raw,
|
||||||
|
LATERAL (
|
||||||
|
SELECT key AS city_code, value::INTEGER AS city_pos
|
||||||
|
FROM json_each(json_extract(raw_json, '$.dimension.cities.category.index'))
|
||||||
|
)
|
||||||
|
),
|
||||||
|
-- Unnest time (year) values with positions
|
||||||
|
times AS (
|
||||||
|
SELECT key AS ref_year, value::INTEGER AS time_pos
|
||||||
|
FROM (SELECT raw_json FROM raw LIMIT 1),
|
||||||
|
LATERAL (
|
||||||
|
SELECT key, value
|
||||||
|
FROM json_each(json_extract(raw_json, '$.dimension.time.category.index'))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
c.city_code,
|
||||||
|
t.ref_year,
|
||||||
|
TRY_CAST(
|
||||||
|
json_extract(c.raw_json, '$.' || (c.city_pos * c.n_times + t.time_pos)::TEXT)
|
||||||
|
AS DOUBLE
|
||||||
|
) AS population,
|
||||||
|
c.filename AS source_file,
|
||||||
|
CURRENT_DATE AS extracted_date
|
||||||
|
FROM cities c
|
||||||
|
CROSS JOIN times t
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
-- Raw OpenStreetMap padel courts from Overpass API landing files.
|
||||||
|
-- Source: data/landing/overpass/{year}/{month}/courts.json.gz
|
||||||
|
-- Format: {"version": ..., "elements": [{type, id, lat, lon, tags}, ...]}
|
||||||
|
--
|
||||||
|
-- Only node elements carry direct lat/lon. Way and relation elements need
|
||||||
|
-- centroid calculation from member nodes (not done here — filter to node only
|
||||||
|
-- for the initial raw layer; ways/relations retained as-is for future enrichment).
|
||||||
|
|
||||||
|
MODEL (
|
||||||
|
name padelnomics.raw_overpass_courts,
|
||||||
|
kind FULL,
|
||||||
|
cron '@daily',
|
||||||
|
grain (osm_type, osm_id)
|
||||||
|
);
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
elem ->> 'type' AS osm_type,
|
||||||
|
(elem ->> 'id')::BIGINT AS osm_id,
|
||||||
|
TRY_CAST(elem ->> 'lat' AS DOUBLE) AS lat,
|
||||||
|
TRY_CAST(elem ->> 'lon' AS DOUBLE) AS lon,
|
||||||
|
elem -> 'tags' ->> 'name' AS name,
|
||||||
|
elem -> 'tags' ->> 'sport' AS sport,
|
||||||
|
elem -> 'tags' ->> 'leisure' AS leisure,
|
||||||
|
elem -> 'tags' ->> 'addr:country' AS country_code,
|
||||||
|
elem -> 'tags' ->> 'addr:city' AS city_tag,
|
||||||
|
elem -> 'tags' ->> 'addr:postcode' AS postcode,
|
||||||
|
elem -> 'tags' ->> 'operator' AS operator_name,
|
||||||
|
elem -> 'tags' ->> 'opening_hours' AS opening_hours,
|
||||||
|
elem -> 'tags' ->> 'fee' AS fee,
|
||||||
|
filename AS source_file,
|
||||||
|
CURRENT_DATE AS extracted_date
|
||||||
|
FROM (
|
||||||
|
SELECT
|
||||||
|
UNNEST(elements) AS elem,
|
||||||
|
filename
|
||||||
|
FROM read_json(
|
||||||
|
@LANDING_DIR || '/overpass/*/*/courts.json.gz',
|
||||||
|
format = 'auto',
|
||||||
|
filename = true
|
||||||
|
)
|
||||||
|
)
|
||||||
|
WHERE (elem ->> 'type') IS NOT NULL
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
-- Raw Playtomic venue (tenant) listings from unauthenticated tenant search API.
|
||||||
|
-- Source: data/landing/playtomic/{year}/{month}/tenants.json.gz
|
||||||
|
-- Format: {"tenants": [{tenant_id, name, address, sport_ids, ...}], "count": N}
|
||||||
|
|
||||||
|
MODEL (
|
||||||
|
name padelnomics.raw_playtomic_tenants,
|
||||||
|
kind FULL,
|
||||||
|
cron '@daily',
|
||||||
|
grain tenant_id
|
||||||
|
);
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
tenant ->> 'tenant_id' AS tenant_id,
|
||||||
|
tenant ->> 'tenant_name' AS tenant_name,
|
||||||
|
tenant -> 'address' ->> 'street' AS street,
|
||||||
|
tenant -> 'address' ->> 'city' AS city,
|
||||||
|
tenant -> 'address' ->> 'postal_code' AS postal_code,
|
||||||
|
tenant -> 'address' ->> 'country_code' AS country_code,
|
||||||
|
TRY_CAST(tenant -> 'address' ->> 'coordinate_lat' AS DOUBLE) AS lat,
|
||||||
|
TRY_CAST(tenant -> 'address' ->> 'coordinate_lon' AS DOUBLE) AS lon,
|
||||||
|
tenant ->> 'sport_ids' AS sport_ids_raw,
|
||||||
|
tenant ->> 'tenant_type' AS tenant_type,
|
||||||
|
filename AS source_file,
|
||||||
|
CURRENT_DATE AS extracted_date
|
||||||
|
FROM (
|
||||||
|
SELECT
|
||||||
|
UNNEST(tenants) AS tenant,
|
||||||
|
filename
|
||||||
|
FROM read_json(
|
||||||
|
@LANDING_DIR || '/playtomic/*/*/tenants.json.gz',
|
||||||
|
format = 'auto',
|
||||||
|
filename = true
|
||||||
|
)
|
||||||
|
)
|
||||||
|
WHERE (tenant ->> 'tenant_id') IS NOT NULL
|
||||||
@@ -0,0 +1,68 @@
|
|||||||
|
-- One Big Table: per-city padel market intelligence.
|
||||||
|
-- Consumed by: SEO article generation, planner city-select pre-fill, API endpoints.
|
||||||
|
--
|
||||||
|
-- Market score (0–100) is a simple composite:
|
||||||
|
-- 40% population (log-scaled, city > 500K = max)
|
||||||
|
-- 40% venue density (courts per 100K residents)
|
||||||
|
-- 20% data confidence (completeness of both population + venue data)
|
||||||
|
|
||||||
|
MODEL (
|
||||||
|
name padelnomics.city_market_profile,
|
||||||
|
kind FULL,
|
||||||
|
cron '@daily',
|
||||||
|
grain city_slug
|
||||||
|
);
|
||||||
|
|
||||||
|
WITH base AS (
|
||||||
|
SELECT
|
||||||
|
c.city_code,
|
||||||
|
c.country_code,
|
||||||
|
c.city_name,
|
||||||
|
c.city_slug,
|
||||||
|
c.lat,
|
||||||
|
c.lon,
|
||||||
|
c.population,
|
||||||
|
c.population_year,
|
||||||
|
c.padel_venue_count,
|
||||||
|
-- Venue density: padel venues per 100K residents
|
||||||
|
CASE WHEN c.population > 0
|
||||||
|
THEN ROUND(c.padel_venue_count::DOUBLE / c.population * 100000, 2)
|
||||||
|
ELSE NULL
|
||||||
|
END AS venues_per_100k,
|
||||||
|
-- Data confidence: 1.0 if both population and venues are present
|
||||||
|
CASE
|
||||||
|
WHEN c.population > 0 AND c.padel_venue_count > 0 THEN 1.0
|
||||||
|
WHEN c.population > 0 OR c.padel_venue_count > 0 THEN 0.5
|
||||||
|
ELSE 0.0
|
||||||
|
END AS data_confidence
|
||||||
|
FROM padelnomics.dim_cities c
|
||||||
|
WHERE c.padel_venue_count > 0
|
||||||
|
),
|
||||||
|
scored AS (
|
||||||
|
SELECT *,
|
||||||
|
ROUND(
|
||||||
|
-- Population component (log scale, 500K+ city → 40 pts)
|
||||||
|
40.0 * LEAST(1.0, LN(GREATEST(population, 1)) / LN(500000))
|
||||||
|
-- Density component (5 courts/100K → 40 pts)
|
||||||
|
+ 40.0 * LEAST(1.0, COALESCE(venues_per_100k, 0) / 5.0)
|
||||||
|
-- Confidence component
|
||||||
|
+ 20.0 * data_confidence
|
||||||
|
, 1) AS market_score
|
||||||
|
FROM base
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
city_code,
|
||||||
|
country_code,
|
||||||
|
city_name,
|
||||||
|
city_slug,
|
||||||
|
lat,
|
||||||
|
lon,
|
||||||
|
population,
|
||||||
|
population_year,
|
||||||
|
padel_venue_count,
|
||||||
|
venues_per_100k,
|
||||||
|
data_confidence,
|
||||||
|
market_score,
|
||||||
|
CURRENT_DATE AS refreshed_date
|
||||||
|
FROM scored
|
||||||
|
ORDER BY market_score DESC
|
||||||
@@ -0,0 +1,72 @@
|
|||||||
|
-- Per-city planner defaults for the financial calculator.
|
||||||
|
-- When a user selects a city in the planner, these values pre-fill the inputs.
|
||||||
|
-- Consumed by: padelnomics.planner.routes — city_defaults(city_slug) lookup.
|
||||||
|
--
|
||||||
|
-- Values are derived from market data where available, otherwise fall back to
|
||||||
|
-- country-level medians, then to global fallbacks from market research report.
|
||||||
|
--
|
||||||
|
-- Units are explicit in column names (EUR, %, h). All monetary values in EUR.
|
||||||
|
|
||||||
|
MODEL (
|
||||||
|
name padelnomics.planner_defaults,
|
||||||
|
kind FULL,
|
||||||
|
cron '@daily',
|
||||||
|
grain city_slug
|
||||||
|
);
|
||||||
|
|
||||||
|
WITH country_medians AS (
|
||||||
|
-- Country-level fallback values from market research (hardcoded until we
|
||||||
|
-- have richer pricing data from Playtomic or direct scraping).
|
||||||
|
SELECT * FROM (VALUES
|
||||||
|
-- (country_code, hourly_rate_peak_eur, monthly_rent_eur_sqm, capex_court_eur,
|
||||||
|
-- avg_utilisation_pct, courts_typical)
|
||||||
|
('DE', 22.0, 14.0, 42000.0, 0.55, 4),
|
||||||
|
('ES', 16.0, 9.0, 32000.0, 0.62, 6),
|
||||||
|
('GB', 24.0, 18.0, 48000.0, 0.52, 4),
|
||||||
|
('FR', 18.0, 12.0, 36000.0, 0.58, 5),
|
||||||
|
('IT', 15.0, 10.0, 30000.0, 0.60, 6),
|
||||||
|
('PT', 12.0, 8.0, 28000.0, 0.65, 6),
|
||||||
|
('AT', 20.0, 13.0, 40000.0, 0.54, 4),
|
||||||
|
('CH', 28.0, 22.0, 55000.0, 0.50, 4),
|
||||||
|
('NL', 20.0, 15.0, 40000.0, 0.56, 4),
|
||||||
|
('BE', 18.0, 13.0, 36000.0, 0.57, 4),
|
||||||
|
('SE', 22.0, 14.0, 42000.0, 0.50, 4),
|
||||||
|
('US', 20.0, 12.0, 38000.0, 0.58, 6)
|
||||||
|
) AS t(country_code, hourly_rate_peak_eur, monthly_rent_eur_sqm, capex_court_eur,
|
||||||
|
avg_utilisation_pct, courts_typical)
|
||||||
|
),
|
||||||
|
city_venue_density AS (
|
||||||
|
SELECT
|
||||||
|
city_slug,
|
||||||
|
country_code,
|
||||||
|
padel_venue_count,
|
||||||
|
population,
|
||||||
|
venues_per_100k,
|
||||||
|
market_score
|
||||||
|
FROM padelnomics.city_market_profile
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
cvd.city_slug,
|
||||||
|
cvd.country_code,
|
||||||
|
cvd.padel_venue_count,
|
||||||
|
cvd.population,
|
||||||
|
cvd.market_score,
|
||||||
|
-- Hourly rate: adjust country median by market maturity
|
||||||
|
-- (high-density markets → slightly lower rates from competition)
|
||||||
|
ROUND(
|
||||||
|
cm.hourly_rate_peak_eur
|
||||||
|
* CASE
|
||||||
|
WHEN cvd.venues_per_100k > 4 THEN 0.90 -- very competitive
|
||||||
|
WHEN cvd.venues_per_100k > 2 THEN 0.95 -- competitive
|
||||||
|
WHEN cvd.venues_per_100k < 0.5 THEN 1.10 -- underserved premium
|
||||||
|
ELSE 1.0
|
||||||
|
END
|
||||||
|
, 2) AS hourly_rate_peak_eur,
|
||||||
|
ROUND(cm.hourly_rate_peak_eur * 0.75, 2) AS hourly_rate_offpeak_eur,
|
||||||
|
cm.monthly_rent_eur_sqm,
|
||||||
|
cm.capex_court_eur,
|
||||||
|
cm.avg_utilisation_pct,
|
||||||
|
cm.courts_typical,
|
||||||
|
CURRENT_DATE AS refreshed_date
|
||||||
|
FROM city_venue_density cvd
|
||||||
|
LEFT JOIN country_medians cm ON cvd.country_code = cm.country_code
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
-- Cleaned OSM padel courts — node elements only (direct lat/lon available).
|
||||||
|
-- Deduplicates on osm_id, keeps most recently extracted record.
|
||||||
|
-- Country code resolved from addr:country tag or approximated by lat/lon bbox.
|
||||||
|
|
||||||
|
MODEL (
|
||||||
|
name padelnomics.stg_padel_courts,
|
||||||
|
kind FULL,
|
||||||
|
cron '@daily',
|
||||||
|
grain osm_id
|
||||||
|
);
|
||||||
|
|
||||||
|
WITH deduped AS (
|
||||||
|
SELECT *,
|
||||||
|
ROW_NUMBER() OVER (PARTITION BY osm_id ORDER BY extracted_date DESC) AS rn
|
||||||
|
FROM padelnomics.raw_overpass_courts
|
||||||
|
WHERE osm_type = 'node'
|
||||||
|
AND lat IS NOT NULL
|
||||||
|
AND lon IS NOT NULL
|
||||||
|
AND lat BETWEEN -90 AND 90
|
||||||
|
AND lon BETWEEN -180 AND 180
|
||||||
|
),
|
||||||
|
-- Approximate country from lat/lon when addr:country tag is absent
|
||||||
|
with_country AS (
|
||||||
|
SELECT
|
||||||
|
osm_id,
|
||||||
|
lat,
|
||||||
|
lon,
|
||||||
|
COALESCE(NULLIF(TRIM(UPPER(country_code)), ''), CASE
|
||||||
|
WHEN lat BETWEEN 47.27 AND 55.06 AND lon BETWEEN 5.87 AND 15.04 THEN 'DE'
|
||||||
|
WHEN lat BETWEEN 35.95 AND 43.79 AND lon BETWEEN -9.39 AND 4.33 THEN 'ES'
|
||||||
|
WHEN lat BETWEEN 49.90 AND 60.85 AND lon BETWEEN -8.62 AND 1.77 THEN 'GB'
|
||||||
|
WHEN lat BETWEEN 41.36 AND 51.09 AND lon BETWEEN -5.14 AND 9.56 THEN 'FR'
|
||||||
|
WHEN lat BETWEEN 45.46 AND 47.80 AND lon BETWEEN 5.96 AND 10.49 THEN 'CH'
|
||||||
|
WHEN lat BETWEEN 46.37 AND 49.02 AND lon BETWEEN 9.53 AND 17.16 THEN 'AT'
|
||||||
|
WHEN lat BETWEEN 36.35 AND 47.09 AND lon BETWEEN 6.62 AND 18.51 THEN 'IT'
|
||||||
|
WHEN lat BETWEEN 37.00 AND 42.15 AND lon BETWEEN -9.50 AND -6.19 THEN 'PT'
|
||||||
|
ELSE NULL
|
||||||
|
END) AS country_code,
|
||||||
|
NULLIF(TRIM(name), '') AS name,
|
||||||
|
NULLIF(TRIM(city_tag), '') AS city_tag,
|
||||||
|
postcode,
|
||||||
|
operator_name,
|
||||||
|
opening_hours,
|
||||||
|
fee,
|
||||||
|
extracted_date
|
||||||
|
FROM deduped
|
||||||
|
WHERE rn = 1
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
osm_id,
|
||||||
|
'osm' AS source,
|
||||||
|
lat,
|
||||||
|
lon,
|
||||||
|
country_code,
|
||||||
|
name,
|
||||||
|
city_tag AS city,
|
||||||
|
postcode,
|
||||||
|
operator_name,
|
||||||
|
opening_hours,
|
||||||
|
CASE LOWER(fee) WHEN 'yes' THEN TRUE WHEN 'no' THEN FALSE ELSE NULL END AS is_paid,
|
||||||
|
extracted_date
|
||||||
|
FROM with_country
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
-- Cleaned Playtomic padel venue records. One row per venue, deduped on tenant_id.
|
||||||
|
|
||||||
|
MODEL (
|
||||||
|
name padelnomics.stg_playtomic_venues,
|
||||||
|
kind FULL,
|
||||||
|
cron '@daily',
|
||||||
|
grain tenant_id
|
||||||
|
);
|
||||||
|
|
||||||
|
WITH deduped AS (
|
||||||
|
SELECT *,
|
||||||
|
ROW_NUMBER() OVER (PARTITION BY tenant_id ORDER BY extracted_date DESC) AS rn
|
||||||
|
FROM padelnomics.raw_playtomic_tenants
|
||||||
|
WHERE tenant_id IS NOT NULL
|
||||||
|
AND lat IS NOT NULL
|
||||||
|
AND lon IS NOT NULL
|
||||||
|
AND lat BETWEEN -90 AND 90
|
||||||
|
AND lon BETWEEN -180 AND 180
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
tenant_id,
|
||||||
|
'playtomic' AS source,
|
||||||
|
lat,
|
||||||
|
lon,
|
||||||
|
UPPER(country_code) AS country_code,
|
||||||
|
NULLIF(TRIM(tenant_name), '') AS name,
|
||||||
|
NULLIF(TRIM(city), '') AS city,
|
||||||
|
postal_code AS postcode,
|
||||||
|
tenant_type,
|
||||||
|
extracted_date
|
||||||
|
FROM deduped
|
||||||
|
WHERE rn = 1
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
-- Eurostat Urban Audit city population, cleaned and typed.
|
||||||
|
-- Eurostat city codes follow the NUTS Urban Audit convention (e.g. DE001C).
|
||||||
|
-- Country code is the first two characters of the city code.
|
||||||
|
|
||||||
|
MODEL (
|
||||||
|
name padelnomics.stg_population,
|
||||||
|
kind FULL,
|
||||||
|
cron '@daily',
|
||||||
|
grain (city_code, ref_year)
|
||||||
|
);
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
UPPER(city_code) AS city_code,
|
||||||
|
UPPER(LEFT(city_code, 2)) AS country_code,
|
||||||
|
ref_year::INTEGER AS ref_year,
|
||||||
|
population::BIGINT AS population,
|
||||||
|
extracted_date
|
||||||
|
FROM padelnomics.raw_eurostat_population
|
||||||
|
WHERE population IS NOT NULL
|
||||||
|
AND population > 0
|
||||||
|
AND ref_year ~ '^\d{4}$'
|
||||||
18
padelnomics/transform/sqlmesh_padelnomics/pyproject.toml
Normal file
18
padelnomics/transform/sqlmesh_padelnomics/pyproject.toml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
[project]
|
||||||
|
name = "sqlmesh_padelnomics"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "SQLMesh transformation models for padelnomics analytics"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"sqlmesh>=0.130.0",
|
||||||
|
"duckdb>=1.0.0",
|
||||||
|
"python-dotenv>=1.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["hatchling"]
|
||||||
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
[tool.hatch.build.targets.wheel]
|
||||||
|
# No Python packages — this workspace member contains only SQLMesh SQL models.
|
||||||
|
packages = ["models"]
|
||||||
1039
padelnomics/uv.lock
generated
1039
padelnomics/uv.lock
generated
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user