Files
padelnomics/transform/sqlmesh_padelnomics/models/foundation/dim_venues.sql
Deeman 2db66efe77 feat: migrate transform to 3-layer architecture with per-layer schemas
Remove raw/ layer — staging models now read landing JSON directly.
Rename all model schemas from padelnomics.* to staging.*/foundation.*/serving.*.
Web app queries updated to serving.planner_defaults via SERVING_DUCKDB_PATH.
Supervisor gets daily sleep interval between pipeline runs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 19:04:40 +01:00

76 lines
2.0 KiB
SQL

-- Deduplicated venue dimension combining OSM courts and Playtomic tenants.
-- Venues from both sources are unioned; near-duplicates (within ~100m) are
-- collapsed to a single record preferring Playtomic data (richer metadata).
-- Proximity dedup uses haversine approximation: 1 degree lat ≈ 111 km.
MODEL (
name foundation.dim_venues,
kind FULL,
cron '@daily',
grain venue_id
);
WITH all_venues AS (
SELECT
'osm:' || osm_id::TEXT AS venue_id,
source,
lat,
lon,
country_code,
name,
city,
postcode,
NULL AS tenant_type,
extracted_date
FROM staging.stg_padel_courts
WHERE country_code IS NOT NULL
UNION ALL
SELECT
'pt:' || tenant_id AS venue_id,
source,
lat,
lon,
country_code,
name,
city,
postcode,
tenant_type,
extracted_date
FROM staging.stg_playtomic_venues
WHERE country_code IS NOT NULL
),
-- Rank venues so Playtomic records win ties in proximity dedup
ranked AS (
SELECT *,
CASE source WHEN 'playtomic' THEN 1 ELSE 2 END AS source_rank
FROM all_venues
)
-- Note: full proximity dedup (haversine clustering) is expensive in SQL.
-- For now, deduplicate on exact (country_code, ROUND(lat,3), ROUND(lon,3))
-- — ≈111m grid cells. Refine with spatial index if volumes grow.
SELECT
MIN(venue_id) OVER (
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
ORDER BY source_rank
) AS venue_id,
country_code,
lat,
lon,
COALESCE(
MAX(CASE WHEN source = 'playtomic' THEN name END)
OVER (PARTITION BY country_code, ROUND(lat,3)::TEXT, ROUND(lon,3)::TEXT),
name
) AS name,
COALESCE(city, '') AS city,
postcode,
source,
tenant_type,
extracted_date
FROM ranked
QUALIFY ROW_NUMBER() OVER (
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
ORDER BY source_rank
) = 1