Propagates the conformed city key (city_slug) from dim_venues through the full pricing pipeline, eliminating 3 fragile LOWER(TRIM(...)) fuzzy string joins with deterministic key joins. Changes (cascading, task-by-task): - dim_venues: add city_slug computed column (REGEXP_REPLACE slug derivation) - dim_venue_capacity: join foundation.dim_venues instead of stg_playtomic_venues; carry city_slug alongside country_code/city - fct_daily_availability: carry city_slug from dim_venue_capacity - venue_pricing_benchmarks: carry city_slug from fct_daily_availability; add to venue_stats GROUP BY and final SELECT/GROUP BY - city_market_profile: join vpb on city_slug = city_slug (was LOWER(TRIM)) - planner_defaults: add city_slug to city_benchmarks CTE; join on city_slug - pseo_city_pricing: join city_market_profile on city_slug (was LOWER(TRIM)) - pipeline_routes._DAG: dim_venue_capacity now depends on dim_venues, not stg_playtomic_venues Result: dim_venues.city_slug → dim_cities.(country_code, city_slug) forms a fully conformed geographic hierarchy with no fuzzy string comparisons. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
109 lines
3.5 KiB
SQL
109 lines
3.5 KiB
SQL
-- Deduplicated venue dimension combining OSM courts and Playtomic tenants.
|
|
-- Venues from both sources are unioned; near-duplicates (within ~100m) are
|
|
-- collapsed to a single record preferring Playtomic data (richer metadata).
|
|
-- Proximity dedup uses haversine approximation: 1 degree lat ≈ 111 km.
|
|
--
|
|
-- Playtomic venues include court counts, indoor/outdoor split, currency, and timezone.
|
|
|
|
MODEL (
|
|
name foundation.dim_venues,
|
|
kind FULL,
|
|
cron '@daily',
|
|
grain venue_id
|
|
);
|
|
|
|
WITH playtomic_venues AS (
|
|
SELECT
|
|
'pt:' || v.tenant_id AS venue_id,
|
|
v.tenant_id,
|
|
'playtomic' AS source,
|
|
v.lat,
|
|
v.lon,
|
|
v.country_code,
|
|
v.name,
|
|
v.city,
|
|
v.postcode,
|
|
v.tenant_type,
|
|
v.timezone,
|
|
v.vat_rate,
|
|
v.default_currency,
|
|
-- Court counts from resources
|
|
COUNT(r.resource_id) AS court_count,
|
|
COUNT(r.resource_id) FILTER (WHERE r.resource_type = 'indoor') AS indoor_court_count,
|
|
COUNT(r.resource_id) FILTER (WHERE r.resource_type = 'outdoor') AS outdoor_court_count,
|
|
v.extracted_date
|
|
FROM staging.stg_playtomic_venues v
|
|
LEFT JOIN staging.stg_playtomic_resources r
|
|
ON v.tenant_id = r.tenant_id AND r.is_active = TRUE
|
|
WHERE v.country_code IS NOT NULL
|
|
GROUP BY
|
|
v.tenant_id, v.lat, v.lon, v.country_code, v.name, v.city,
|
|
v.postcode, v.tenant_type, v.timezone, v.vat_rate,
|
|
v.default_currency, v.extracted_date
|
|
),
|
|
osm_venues AS (
|
|
SELECT
|
|
'osm:' || osm_id::TEXT AS venue_id,
|
|
NULL AS tenant_id,
|
|
'osm' AS source,
|
|
lat,
|
|
lon,
|
|
country_code,
|
|
name,
|
|
city,
|
|
postcode,
|
|
NULL AS tenant_type,
|
|
NULL AS timezone,
|
|
NULL AS vat_rate,
|
|
NULL AS default_currency,
|
|
NULL AS court_count,
|
|
NULL AS indoor_court_count,
|
|
NULL AS outdoor_court_count,
|
|
extracted_date
|
|
FROM staging.stg_padel_courts
|
|
WHERE country_code IS NOT NULL
|
|
),
|
|
all_venues AS (
|
|
SELECT * FROM playtomic_venues
|
|
UNION ALL
|
|
SELECT * FROM osm_venues
|
|
),
|
|
ranked AS (
|
|
SELECT *,
|
|
CASE source WHEN 'playtomic' THEN 1 ELSE 2 END AS source_rank
|
|
FROM all_venues
|
|
)
|
|
-- Deduplicate on ~111m grid cells, preferring Playtomic
|
|
SELECT
|
|
MIN(venue_id) OVER (
|
|
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
|
|
ORDER BY source_rank
|
|
) AS venue_id,
|
|
tenant_id,
|
|
country_code,
|
|
lat,
|
|
lon,
|
|
COALESCE(
|
|
MAX(CASE WHEN source = 'playtomic' THEN name END)
|
|
OVER (PARTITION BY country_code, ROUND(lat,3)::TEXT, ROUND(lon,3)::TEXT),
|
|
name
|
|
) AS name,
|
|
COALESCE(city, '') AS city,
|
|
postcode,
|
|
source,
|
|
tenant_type,
|
|
timezone,
|
|
vat_rate,
|
|
default_currency,
|
|
court_count,
|
|
indoor_court_count,
|
|
outdoor_court_count,
|
|
-- Conformed city key: enables deterministic joins to dim_cities / venue_pricing_benchmarks
|
|
LOWER(REGEXP_REPLACE(LOWER(COALESCE(city, '')), '[^a-z0-9]+', '-')) AS city_slug,
|
|
extracted_date
|
|
FROM ranked
|
|
QUALIFY ROW_NUMBER() OVER (
|
|
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
|
|
ORDER BY source_rank
|
|
) = 1
|