Files
padelnomics/transform/sqlmesh_padelnomics/models/foundation/dim_venues.sql
Deeman 4e82907a70 refactor(transform): conform geographic dimension hierarchy via city_slug
Propagates the conformed city key (city_slug) from dim_venues through the
full pricing pipeline, eliminating 3 fragile LOWER(TRIM(...)) fuzzy string
joins with deterministic key joins.

Changes (cascading, task-by-task):
- dim_venues: add city_slug computed column (REGEXP_REPLACE slug derivation)
- dim_venue_capacity: join foundation.dim_venues instead of stg_playtomic_venues;
  carry city_slug alongside country_code/city
- fct_daily_availability: carry city_slug from dim_venue_capacity
- venue_pricing_benchmarks: carry city_slug from fct_daily_availability;
  add to venue_stats GROUP BY and final SELECT/GROUP BY
- city_market_profile: join vpb on city_slug = city_slug (was LOWER(TRIM))
- planner_defaults: add city_slug to city_benchmarks CTE; join on city_slug
- pseo_city_pricing: join city_market_profile on city_slug (was LOWER(TRIM))
- pipeline_routes._DAG: dim_venue_capacity now depends on dim_venues, not stg_playtomic_venues

Result: dim_venues.city_slug → dim_cities.(country_code, city_slug) forms a
fully conformed geographic hierarchy with no fuzzy string comparisons.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-27 13:23:03 +01:00

109 lines
3.5 KiB
SQL

-- Deduplicated venue dimension combining OSM courts and Playtomic tenants.
-- Venues from both sources are unioned; near-duplicates (within ~100m) are
-- collapsed to a single record preferring Playtomic data (richer metadata).
-- Proximity dedup uses haversine approximation: 1 degree lat ≈ 111 km.
--
-- Playtomic venues include court counts, indoor/outdoor split, currency, and timezone.
MODEL (
name foundation.dim_venues,
kind FULL,
cron '@daily',
grain venue_id
);
WITH playtomic_venues AS (
SELECT
'pt:' || v.tenant_id AS venue_id,
v.tenant_id,
'playtomic' AS source,
v.lat,
v.lon,
v.country_code,
v.name,
v.city,
v.postcode,
v.tenant_type,
v.timezone,
v.vat_rate,
v.default_currency,
-- Court counts from resources
COUNT(r.resource_id) AS court_count,
COUNT(r.resource_id) FILTER (WHERE r.resource_type = 'indoor') AS indoor_court_count,
COUNT(r.resource_id) FILTER (WHERE r.resource_type = 'outdoor') AS outdoor_court_count,
v.extracted_date
FROM staging.stg_playtomic_venues v
LEFT JOIN staging.stg_playtomic_resources r
ON v.tenant_id = r.tenant_id AND r.is_active = TRUE
WHERE v.country_code IS NOT NULL
GROUP BY
v.tenant_id, v.lat, v.lon, v.country_code, v.name, v.city,
v.postcode, v.tenant_type, v.timezone, v.vat_rate,
v.default_currency, v.extracted_date
),
osm_venues AS (
SELECT
'osm:' || osm_id::TEXT AS venue_id,
NULL AS tenant_id,
'osm' AS source,
lat,
lon,
country_code,
name,
city,
postcode,
NULL AS tenant_type,
NULL AS timezone,
NULL AS vat_rate,
NULL AS default_currency,
NULL AS court_count,
NULL AS indoor_court_count,
NULL AS outdoor_court_count,
extracted_date
FROM staging.stg_padel_courts
WHERE country_code IS NOT NULL
),
all_venues AS (
SELECT * FROM playtomic_venues
UNION ALL
SELECT * FROM osm_venues
),
ranked AS (
SELECT *,
CASE source WHEN 'playtomic' THEN 1 ELSE 2 END AS source_rank
FROM all_venues
)
-- Deduplicate on ~111m grid cells, preferring Playtomic
SELECT
MIN(venue_id) OVER (
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
ORDER BY source_rank
) AS venue_id,
tenant_id,
country_code,
lat,
lon,
COALESCE(
MAX(CASE WHEN source = 'playtomic' THEN name END)
OVER (PARTITION BY country_code, ROUND(lat,3)::TEXT, ROUND(lon,3)::TEXT),
name
) AS name,
COALESCE(city, '') AS city,
postcode,
source,
tenant_type,
timezone,
vat_rate,
default_currency,
court_count,
indoor_court_count,
outdoor_court_count,
-- Conformed city key: enables deterministic joins to dim_cities / venue_pricing_benchmarks
LOWER(REGEXP_REPLACE(LOWER(COALESCE(city, '')), '[^a-z0-9]+', '-')) AS city_slug,
extracted_date
FROM ranked
QUALIFY ROW_NUMBER() OVER (
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
ORDER BY source_rank
) = 1