refactor: align transform layer with template methodology
Three deviations from the quart_saas_boilerplate methodology corrected:
1. Fix dim_cities LIKE join (data quality bug)
- Old: FROM eurostat_cities LEFT JOIN venue_counts LIKE '%country_code%'
→ cartesian product (2.6M rows vs ~5500 expected)
- New: FROM venue_cities (dim_venues) as primary table, Eurostat for
enrichment only. grain (country_code, city_slug).
- Also applies LOWER() to the city name before REGEXP_REPLACE, so uppercase
letters are no longer replaced with '-' in city_slug
2. Rename fct_venue_capacity → dim_venue_capacity
- Static venue attributes with no time key are a dimension, not a fact
- No SQL logic changes; update fct_daily_availability reference
3. Add fct_availability_slot at event grain
- New: grain (snapshot_date, tenant_id, resource_id, slot_start_time)
- Recheck dedup logic moves here from fct_daily_availability
- fct_daily_availability now reads fct_availability_slot (cleaner DAG)
Downstream fixes:
- city_market_profile, planner_defaults grain → (country_code, city_slug)
- pseo_city_costs_de, pseo_city_pricing add city_key composite natural key
(country_slug || '-' || city_slug) to avoid URL collisions across countries
- planner_defaults join in pseo_city_costs_de uses both country_code + city_slug
- Templates updated: natural_key city_slug → city_key
Added transform/sqlmesh_padelnomics/CLAUDE.md documenting data modeling rules,
conformed dimension map, and source integration architecture.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,62 +1,54 @@
|
||||
-- City dimension: canonical city records with population and venue count.
|
||||
-- Built from Eurostat Urban Audit codes joined to venue locations.
|
||||
-- Cities without Eurostat coverage (US, non-EU) are derived from venue clusters.
|
||||
-- City dimension: canonical city records with venue count and country metadata.
|
||||
-- Built from venue locations (dim_venues) as the primary source — padelnomics
|
||||
-- tracks cities where padel venues actually exist, not an administrative city list.
|
||||
--
|
||||
-- Conformed dimension: used by city_market_profile and all pSEO serving models.
|
||||
-- Integrates two sources:
|
||||
-- dim_venues → city list, venue count, coordinates (Playtomic + OSM)
|
||||
-- stg_income → country-level median income (Eurostat)
|
||||
--
|
||||
-- Population note: Eurostat uses coded identifiers (e.g. DE001C = Berlin) with no
|
||||
-- city name column in the dataset we extract. City-level population requires a
|
||||
-- separate code→name lookup extract (future improvement). Population is set to 0
|
||||
-- until that source is available; market_score degrades gracefully.
|
||||
--
|
||||
-- Grain: (country_code, city_slug) — two cities in different countries can share a
|
||||
-- city name. QUALIFY enforces no duplicate (country_code, city_slug) pairs.
|
||||
|
||||
MODEL (
|
||||
name foundation.dim_cities,
|
||||
kind FULL,
|
||||
cron '@daily',
|
||||
grain city_code
|
||||
grain (country_code, city_slug)
|
||||
);
|
||||
|
||||
WITH -- Eurostat cities: latest population per city code
|
||||
eurostat_cities AS (
|
||||
SELECT
|
||||
city_code,
|
||||
country_code,
|
||||
population,
|
||||
ref_year,
|
||||
LOWER(REPLACE(city_code, country_code, '')) AS city_slug_raw
|
||||
FROM staging.stg_population
|
||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY city_code ORDER BY ref_year DESC) = 1
|
||||
),
|
||||
-- Venue counts per (country_code, city) from dim_venues
|
||||
venue_counts AS (
|
||||
WITH
|
||||
-- Primary: distinct cities from dim_venues (canonical padel city list)
|
||||
venue_cities AS (
|
||||
SELECT
|
||||
country_code,
|
||||
city,
|
||||
COUNT(*) AS venue_count,
|
||||
AVG(lat) AS centroid_lat,
|
||||
AVG(lon) AS centroid_lon
|
||||
city AS city_name,
|
||||
-- Lowercase before regex so uppercase letters aren't stripped to '-'
|
||||
LOWER(REGEXP_REPLACE(LOWER(city), '[^a-z0-9]+', '-')) AS city_slug,
|
||||
COUNT(*) AS padel_venue_count,
|
||||
AVG(lat) AS centroid_lat,
|
||||
AVG(lon) AS centroid_lon
|
||||
FROM foundation.dim_venues
|
||||
WHERE city IS NOT NULL AND city != ''
|
||||
WHERE city IS NOT NULL AND LENGTH(city) > 0
|
||||
GROUP BY country_code, city
|
||||
),
|
||||
-- Eurostat city label mapping to canonical city names
|
||||
-- (Eurostat uses codes like DE001C → Berlin; we keep both)
|
||||
eurostat_labels AS (
|
||||
SELECT DISTINCT
|
||||
city_code,
|
||||
country_code,
|
||||
-- Derive a slug-friendly city name from the code as fallback
|
||||
LOWER(REPLACE(city_code, country_code, '')) AS city_slug_raw
|
||||
FROM eurostat_cities
|
||||
),
|
||||
-- Country-level median income (latest year per country)
|
||||
-- Latest country income per country
|
||||
country_income AS (
|
||||
SELECT country_code, median_income_pps, ref_year AS income_year
|
||||
FROM staging.stg_income
|
||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
|
||||
)
|
||||
SELECT
|
||||
ec.city_code,
|
||||
ec.country_code,
|
||||
COALESCE(vc.city, ec.city_code) AS city_name,
|
||||
LOWER(REGEXP_REPLACE(
|
||||
COALESCE(vc.city, ec.city_slug_raw), '[^a-z0-9]+', '-'
|
||||
)) AS city_slug,
|
||||
vc.country_code,
|
||||
vc.city_slug,
|
||||
vc.city_name,
|
||||
-- Human-readable country name for pSEO templates and internal linking
|
||||
CASE ec.country_code
|
||||
CASE vc.country_code
|
||||
WHEN 'DE' THEN 'Germany'
|
||||
WHEN 'ES' THEN 'Spain'
|
||||
WHEN 'GB' THEN 'United Kingdom'
|
||||
@@ -77,11 +69,11 @@ SELECT
|
||||
WHEN 'AE' THEN 'UAE'
|
||||
WHEN 'AU' THEN 'Australia'
|
||||
WHEN 'IE' THEN 'Ireland'
|
||||
ELSE ec.country_code
|
||||
END AS country_name_en,
|
||||
-- URL-safe country slug derived from country_name_en
|
||||
ELSE vc.country_code
|
||||
END AS country_name_en,
|
||||
-- URL-safe country slug
|
||||
LOWER(REGEXP_REPLACE(
|
||||
CASE ec.country_code
|
||||
CASE vc.country_code
|
||||
WHEN 'DE' THEN 'Germany'
|
||||
WHEN 'ES' THEN 'Spain'
|
||||
WHEN 'GB' THEN 'United Kingdom'
|
||||
@@ -102,19 +94,23 @@ SELECT
|
||||
WHEN 'AE' THEN 'UAE'
|
||||
WHEN 'AU' THEN 'Australia'
|
||||
WHEN 'IE' THEN 'Ireland'
|
||||
ELSE ec.country_code
|
||||
ELSE vc.country_code
|
||||
END, '[^a-zA-Z0-9]+', '-'
|
||||
)) AS country_slug,
|
||||
COALESCE(vc.centroid_lat, 0::DOUBLE) AS lat,
|
||||
COALESCE(vc.centroid_lon, 0::DOUBLE) AS lon,
|
||||
ec.population,
|
||||
ec.ref_year AS population_year,
|
||||
COALESCE(vc.venue_count, 0) AS padel_venue_count,
|
||||
)) AS country_slug,
|
||||
vc.centroid_lat AS lat,
|
||||
vc.centroid_lon AS lon,
|
||||
-- Population: requires code→name Eurostat lookup (not yet extracted); defaults to 0.
|
||||
-- market_score uses LOG(GREATEST(population, 1)) so 0 degrades score gracefully.
|
||||
0::BIGINT AS population,
|
||||
0::INTEGER AS population_year,
|
||||
vc.padel_venue_count,
|
||||
ci.median_income_pps,
|
||||
ci.income_year
|
||||
FROM eurostat_cities ec
|
||||
LEFT JOIN venue_counts vc
|
||||
ON ec.country_code = vc.country_code
|
||||
AND LOWER(TRIM(vc.city)) LIKE '%' || LOWER(LEFT(ec.city_code, 2)) || '%'
|
||||
LEFT JOIN country_income ci
|
||||
ON ec.country_code = ci.country_code
|
||||
FROM venue_cities vc
|
||||
LEFT JOIN country_income ci ON vc.country_code = ci.country_code
|
||||
-- Enforce grain: if two cities in the same country have the same slug
|
||||
-- (e.g. 'St. Albans' and 'St Albans' both become 'st-albans'), keep the one with more venues
|
||||
QUALIFY ROW_NUMBER() OVER (
|
||||
PARTITION BY vc.country_code, vc.city_slug
|
||||
ORDER BY vc.padel_venue_count DESC NULLS LAST
|
||||
) = 1
|
||||
|
||||
Reference in New Issue
Block a user