refactor: align transform layer with template methodology

Three deviations from the quart_saas_boilerplate methodology corrected:

1. Fix dim_cities LIKE join (data quality bug)
   - Old: FROM eurostat_cities LEFT JOIN venue_counts LIKE '%country_code%'
     → cartesian product (2.6M rows vs ~5500 expected)
   - New: FROM venue_cities (dim_venues) as primary table, Eurostat for
     enrichment only. grain (country_code, city_slug).
   - Also fixes REGEXP_REPLACE to LOWER() before regex so uppercase city
     names aren't stripped to '-'

2. Rename fct_venue_capacity → dim_venue_capacity
   - Static venue attributes with no time key are a dimension, not a fact
   - No SQL logic changes; update fct_daily_availability reference

3. Add fct_availability_slot at event grain
   - New: grain (snapshot_date, tenant_id, resource_id, slot_start_time)
   - Recheck dedup logic moves here from fct_daily_availability
   - fct_daily_availability now reads fct_availability_slot (cleaner DAG)

Downstream fixes:
- city_market_profile, planner_defaults grain → (country_code, city_slug)
- pseo_city_costs_de, pseo_city_pricing add city_key composite natural key
  (country_slug || '-' || city_slug) to avoid URL collisions across countries
- planner_defaults join in pseo_city_costs_de uses both country_code + city_slug
- Templates updated: natural_key city_slug → city_key

Added transform/sqlmesh_padelnomics/CLAUDE.md documenting data modeling rules,
conformed dimension map, and source integration architecture.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-23 21:17:04 +01:00
parent 4006d47a79
commit ebba46f700
12 changed files with 248 additions and 112 deletions

View File

@@ -1,62 +1,54 @@
-- City dimension: canonical city records with population and venue count.
-- Built from Eurostat Urban Audit codes joined to venue locations.
-- Cities without Eurostat coverage (US, non-EU) are derived from venue clusters.
-- City dimension: canonical city records with venue count and country metadata.
-- Built from venue locations (dim_venues) as the primary source — padelnomics
-- tracks cities where padel venues actually exist, not an administrative city list.
--
-- Conformed dimension: used by city_market_profile and all pSEO serving models.
-- Integrates two sources:
-- dim_venues → city list, venue count, coordinates (Playtomic + OSM)
-- stg_income → country-level median income (Eurostat)
--
-- Population note: Eurostat uses coded identifiers (e.g. DE001C = Berlin) with no
-- city name column in the dataset we extract. City-level population requires a
-- separate code→name lookup extract (future improvement). Population is set to 0
-- until that source is available; market_score degrades gracefully.
--
-- Grain: (country_code, city_slug) — two cities in different countries can share a
-- city name. QUALIFY enforces no duplicate (country_code, city_slug) pairs.
MODEL (
name foundation.dim_cities,
kind FULL,
cron '@daily',
grain city_code
grain (country_code, city_slug)
);
WITH -- Eurostat cities: latest population per city code
eurostat_cities AS (
SELECT
city_code,
country_code,
population,
ref_year,
LOWER(REPLACE(city_code, country_code, '')) AS city_slug_raw
FROM staging.stg_population
QUALIFY ROW_NUMBER() OVER (PARTITION BY city_code ORDER BY ref_year DESC) = 1
),
-- Venue counts per (country_code, city) from dim_venues
venue_counts AS (
WITH
-- Primary: distinct cities from dim_venues (canonical padel city list)
venue_cities AS (
SELECT
country_code,
city,
COUNT(*) AS venue_count,
AVG(lat) AS centroid_lat,
AVG(lon) AS centroid_lon
city AS city_name,
-- Lowercase before regex so uppercase letters aren't stripped to '-'
LOWER(REGEXP_REPLACE(LOWER(city), '[^a-z0-9]+', '-')) AS city_slug,
COUNT(*) AS padel_venue_count,
AVG(lat) AS centroid_lat,
AVG(lon) AS centroid_lon
FROM foundation.dim_venues
WHERE city IS NOT NULL AND city != ''
WHERE city IS NOT NULL AND LENGTH(city) > 0
GROUP BY country_code, city
),
-- Eurostat city label mapping to canonical city names
-- (Eurostat uses codes like DE001C → Berlin; we keep both)
eurostat_labels AS (
SELECT DISTINCT
city_code,
country_code,
-- Derive a slug-friendly city name from the code as fallback
LOWER(REPLACE(city_code, country_code, '')) AS city_slug_raw
FROM eurostat_cities
),
-- Country-level median income (latest year per country)
-- Latest country income per country
country_income AS (
SELECT country_code, median_income_pps, ref_year AS income_year
FROM staging.stg_income
QUALIFY ROW_NUMBER() OVER (PARTITION BY country_code ORDER BY ref_year DESC) = 1
)
SELECT
ec.city_code,
ec.country_code,
COALESCE(vc.city, ec.city_code) AS city_name,
LOWER(REGEXP_REPLACE(
COALESCE(vc.city, ec.city_slug_raw), '[^a-z0-9]+', '-'
)) AS city_slug,
vc.country_code,
vc.city_slug,
vc.city_name,
-- Human-readable country name for pSEO templates and internal linking
CASE ec.country_code
CASE vc.country_code
WHEN 'DE' THEN 'Germany'
WHEN 'ES' THEN 'Spain'
WHEN 'GB' THEN 'United Kingdom'
@@ -77,11 +69,11 @@ SELECT
WHEN 'AE' THEN 'UAE'
WHEN 'AU' THEN 'Australia'
WHEN 'IE' THEN 'Ireland'
ELSE ec.country_code
END AS country_name_en,
-- URL-safe country slug derived from country_name_en
ELSE vc.country_code
END AS country_name_en,
-- URL-safe country slug
LOWER(REGEXP_REPLACE(
CASE ec.country_code
CASE vc.country_code
WHEN 'DE' THEN 'Germany'
WHEN 'ES' THEN 'Spain'
WHEN 'GB' THEN 'United Kingdom'
@@ -102,19 +94,23 @@ SELECT
WHEN 'AE' THEN 'UAE'
WHEN 'AU' THEN 'Australia'
WHEN 'IE' THEN 'Ireland'
ELSE ec.country_code
ELSE vc.country_code
END, '[^a-zA-Z0-9]+', '-'
)) AS country_slug,
COALESCE(vc.centroid_lat, 0::DOUBLE) AS lat,
COALESCE(vc.centroid_lon, 0::DOUBLE) AS lon,
ec.population,
ec.ref_year AS population_year,
COALESCE(vc.venue_count, 0) AS padel_venue_count,
)) AS country_slug,
vc.centroid_lat AS lat,
vc.centroid_lon AS lon,
-- Population: requires code→name Eurostat lookup (not yet extracted); defaults to 0.
-- market_score uses LOG(GREATEST(population, 1)) so 0 degrades score gracefully.
0::BIGINT AS population,
0::INTEGER AS population_year,
vc.padel_venue_count,
ci.median_income_pps,
ci.income_year
FROM eurostat_cities ec
LEFT JOIN venue_counts vc
ON ec.country_code = vc.country_code
AND LOWER(TRIM(vc.city)) LIKE '%' || LOWER(LEFT(ec.city_code, 2)) || '%'
LEFT JOIN country_income ci
ON ec.country_code = ci.country_code
FROM venue_cities vc
LEFT JOIN country_income ci ON vc.country_code = ci.country_code
-- Enforce grain: if two cities in the same country have the same slug
-- (e.g. 'São Paulo' and 'Sao Paulo'), keep the one with more venues
QUALIFY ROW_NUMBER() OVER (
PARTITION BY vc.country_code, vc.city_slug
ORDER BY vc.padel_venue_count DESC NULLS LAST
) = 1

View File

@@ -3,9 +3,11 @@
-- Used as the denominator for occupancy rate in fct_daily_availability.
--
-- One row per venue (Playtomic tenant).
-- Named dim_* because these are static venue attributes with no time key,
-- not events or measurements.
MODEL (
name foundation.fct_venue_capacity,
name foundation.dim_venue_capacity,
kind FULL,
cron '@daily',
grain tenant_id

View File

@@ -0,0 +1,56 @@
-- Slot-level availability fact: one row per deduplicated available slot.
-- Event grain: (snapshot_date, tenant_id, resource_id, slot_start_time).
--
-- "Available" means the slot was NOT booked at capture time.
-- Recheck-aware: for each (date, tenant, resource, start_time), prefer the
-- latest recheck snapshot over the morning snapshot. If a slot was present
-- in the morning but absent in the recheck, that means it was booked between
-- snapshots — and it will simply not appear in this model (correct behaviour:
-- unavailable slots are not in the available-slots fact).
--
-- is_peak: convenience flag for 17:0021:00 slots (main evening rush).
-- Downstream models (fct_daily_availability) use this to avoid re-computing
-- the peak window condition on every aggregation.
MODEL (
name foundation.fct_availability_slot,
kind FULL,
cron '@daily',
grain (snapshot_date, tenant_id, resource_id, slot_start_time)
);
WITH deduped AS (
SELECT
snapshot_date,
tenant_id,
resource_id,
slot_start_time,
price_amount,
price_currency,
snapshot_type,
captured_at_utc,
-- Prefer recheck over morning; within same snapshot_type prefer latest capture
ROW_NUMBER() OVER (
PARTITION BY snapshot_date, tenant_id, resource_id, slot_start_time
ORDER BY
CASE WHEN snapshot_type = 'recheck' THEN 1 ELSE 2 END,
captured_at_utc DESC
) AS rn
FROM staging.stg_playtomic_availability
WHERE price_amount IS NOT NULL
AND price_amount > 0
)
SELECT
snapshot_date,
tenant_id,
resource_id,
slot_start_time,
price_amount,
price_currency,
snapshot_type,
captured_at_utc,
( slot_start_time::TIME >= '17:00:00'
AND slot_start_time::TIME < '21:00:00'
) AS is_peak
FROM deduped
WHERE rn = 1

View File

@@ -1,11 +1,9 @@
-- Daily venue-level availability, pricing, occupancy, and revenue estimates.
-- Aggregates slot-level data from stg_playtomic_availability into per-venue
-- per-day statistics, then calculates occupancy by comparing available hours
-- against total capacity from fct_venue_capacity.
-- Aggregates fct_availability_slot (event-grain fact) into per-venue per-day
-- statistics, then calculates occupancy against capacity from dim_venue_capacity.
--
-- Recheck-aware occupancy: for each (tenant, resource, slot_start_time),
-- prefer the latest snapshot (recheck > morning). A slot present in morning
-- but absent in the recheck = booked between snapshots → more accurate occupancy.
-- Recheck-aware deduplication lives in fct_availability_slot — this model only
-- reads the already-deduplicated best-snapshot slots.
--
-- Occupancy = 1 - (available_court_hours / capacity_court_hours_per_day)
-- Revenue estimate = booked_court_hours × avg_price_of_available_slots
@@ -19,26 +17,7 @@ MODEL (
grain (snapshot_date, tenant_id)
);
-- Prefer the latest snapshot for each slot:
-- If a recheck exists for a (date, tenant, resource, start_time), use it.
-- Otherwise fall back to the morning snapshot.
WITH ranked_slots AS (
SELECT
a.*,
ROW_NUMBER() OVER (
PARTITION BY a.snapshot_date, a.tenant_id, a.resource_id, a.slot_start_time
ORDER BY
CASE WHEN a.snapshot_type = 'recheck' THEN 1 ELSE 2 END,
a.captured_at_utc DESC
) AS rn
FROM staging.stg_playtomic_availability a
WHERE a.price_amount IS NOT NULL
AND a.price_amount > 0
),
latest_slots AS (
SELECT * FROM ranked_slots WHERE rn = 1
),
slot_agg AS (
WITH slot_agg AS (
SELECT
a.snapshot_date,
a.tenant_id,
@@ -51,19 +30,13 @@ slot_agg AS (
ROUND(AVG(a.price_amount), 2) AS avg_price,
MIN(a.price_amount) AS min_price,
MAX(a.price_amount) AS max_price,
-- Peak: 17:0021:00
ROUND(MEDIAN(a.price_amount) FILTER (
WHERE a.slot_start_time::TIME >= '17:00:00'
AND a.slot_start_time::TIME < '21:00:00'
), 2) AS median_price_peak,
-- Peak: 17:0021:00 (is_peak flag computed once in fct_availability_slot)
ROUND(MEDIAN(a.price_amount) FILTER (WHERE a.is_peak), 2) AS median_price_peak,
-- Off-peak: everything outside 17:0021:00
ROUND(MEDIAN(a.price_amount) FILTER (
WHERE a.slot_start_time::TIME < '17:00:00'
OR a.slot_start_time::TIME >= '21:00:00'
), 2) AS median_price_offpeak,
ROUND(MEDIAN(a.price_amount) FILTER (WHERE NOT a.is_peak), 2) AS median_price_offpeak,
MAX(a.price_currency) AS price_currency,
MAX(a.captured_at_utc) AS captured_at_utc
FROM latest_slots a
FROM foundation.fct_availability_slot a
GROUP BY a.snapshot_date, a.tenant_id
)
SELECT
@@ -106,4 +79,4 @@ SELECT
sa.price_currency,
sa.captured_at_utc
FROM slot_agg sa
JOIN foundation.fct_venue_capacity cap ON sa.tenant_id = cap.tenant_id
JOIN foundation.dim_venue_capacity cap ON sa.tenant_id = cap.tenant_id