-- Deduplicated venue dimension combining OSM courts and Playtomic tenants. -- Venues from both sources are unioned; near-duplicates (within ~100m) are -- collapsed to a single record preferring Playtomic data (richer metadata). -- Proximity dedup uses haversine approximation: 1 degree lat ≈ 111 km. MODEL ( name padelnomics.dim_venues, kind FULL, cron '@daily', grain venue_id ); WITH all_venues AS ( SELECT 'osm:' || osm_id::TEXT AS venue_id, source, lat, lon, country_code, name, city, postcode, NULL AS tenant_type, extracted_date FROM padelnomics.stg_padel_courts WHERE country_code IS NOT NULL UNION ALL SELECT 'pt:' || tenant_id AS venue_id, source, lat, lon, country_code, name, city, postcode, tenant_type, extracted_date FROM padelnomics.stg_playtomic_venues WHERE country_code IS NOT NULL ), -- Rank venues so Playtomic records win ties in proximity dedup ranked AS ( SELECT *, CASE source WHEN 'playtomic' THEN 1 ELSE 2 END AS source_rank FROM all_venues ) -- Note: full proximity dedup (haversine clustering) is expensive in SQL. -- For now, deduplicate on exact (country_code, ROUND(lat,3), ROUND(lon,3)) -- — ≈111m grid cells. Refine with spatial index if volumes grow. SELECT MIN(venue_id) OVER ( PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT ORDER BY source_rank ) AS venue_id, country_code, lat, lon, COALESCE( MAX(CASE WHEN source = 'playtomic' THEN name END) OVER (PARTITION BY country_code, ROUND(lat,3)::TEXT, ROUND(lon,3)::TEXT), name ) AS name, COALESCE(city, '') AS city, postcode, source, tenant_type, extracted_date FROM ranked QUALIFY ROW_NUMBER() OVER ( PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT ORDER BY source_rank ) = 1