-- Deduplicated venue dimension combining OSM courts and Playtomic tenants.
-- Venues from both sources are unioned; near-duplicates (within ~100m) are
-- collapsed to a single record preferring Playtomic data (richer metadata).
-- Proximity dedup uses haversine approximation: 1 degree lat ≈ 111 km.

MODEL (
  name padelnomics.dim_venues,
  kind FULL,
  cron '@daily',
  grain venue_id
);

WITH all_venues AS (
  SELECT
    'osm:' || osm_id::TEXT AS venue_id,
    source,
    lat,
    lon,
    country_code,
    name,
    city,
    postcode,
    NULL AS tenant_type,
    extracted_date
  FROM padelnomics.stg_padel_courts
  WHERE country_code IS NOT NULL

  UNION ALL

  SELECT
    'pt:' || tenant_id      AS venue_id,
    source,
    lat,
    lon,
    country_code,
    name,
    city,
    postcode,
    tenant_type,
    extracted_date
  FROM padelnomics.stg_playtomic_venues
  WHERE country_code IS NOT NULL
),
-- Rank venues so Playtomic records win ties in proximity dedup
ranked AS (
  SELECT *,
    CASE source WHEN 'playtomic' THEN 1 ELSE 2 END AS source_rank
  FROM all_venues
)
-- Note: full proximity dedup (haversine clustering) is expensive in SQL.
-- For now, deduplicate on exact (country_code, ROUND(lat,3), ROUND(lon,3))
-- — ≈111m grid cells. Refine with spatial index if volumes grow.
SELECT
  MIN(venue_id) OVER (
    PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
    ORDER BY source_rank
  )                                                         AS venue_id,
  country_code,
  lat,
  lon,
  COALESCE(
    MAX(CASE WHEN source = 'playtomic' THEN name END)
      OVER (PARTITION BY country_code, ROUND(lat,3)::TEXT, ROUND(lon,3)::TEXT),
    name
  )                                                         AS name,
  COALESCE(city, '')                                        AS city,
  postcode,
  source,
  tenant_type,
  extracted_date
FROM ranked
QUALIFY ROW_NUMBER() OVER (
  PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
  ORDER BY source_rank
) = 1