git mv all tracked files from the nested padelnomics/ workspace directory to the git repo root. Merged .gitignore files. No code changes — pure path rename. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
76 lines
2.0 KiB
SQL
76 lines
2.0 KiB
SQL
-- Deduplicated venue dimension combining OSM courts and Playtomic tenants.
|
|
-- Venues from both sources are unioned; near-duplicates (within ~100m) are
|
|
-- collapsed to a single record preferring Playtomic data (richer metadata).
|
|
-- Proximity dedup uses haversine approximation: 1 degree lat ≈ 111 km.
|
|
|
|
MODEL (
|
|
name padelnomics.dim_venues,
|
|
kind FULL,
|
|
cron '@daily',
|
|
grain venue_id
|
|
);
|
|
|
|
WITH all_venues AS (
|
|
SELECT
|
|
'osm:' || osm_id::TEXT AS venue_id,
|
|
source,
|
|
lat,
|
|
lon,
|
|
country_code,
|
|
name,
|
|
city,
|
|
postcode,
|
|
NULL AS tenant_type,
|
|
extracted_date
|
|
FROM padelnomics.stg_padel_courts
|
|
WHERE country_code IS NOT NULL
|
|
|
|
UNION ALL
|
|
|
|
SELECT
|
|
'pt:' || tenant_id AS venue_id,
|
|
source,
|
|
lat,
|
|
lon,
|
|
country_code,
|
|
name,
|
|
city,
|
|
postcode,
|
|
tenant_type,
|
|
extracted_date
|
|
FROM padelnomics.stg_playtomic_venues
|
|
WHERE country_code IS NOT NULL
|
|
),
|
|
-- Rank venues so Playtomic records win ties in proximity dedup
|
|
ranked AS (
|
|
SELECT *,
|
|
CASE source WHEN 'playtomic' THEN 1 ELSE 2 END AS source_rank
|
|
FROM all_venues
|
|
)
|
|
-- Note: full proximity dedup (haversine clustering) is expensive in SQL.
|
|
-- For now, deduplicate on exact (country_code, ROUND(lat,3), ROUND(lon,3))
|
|
-- — ≈111m grid cells. Refine with spatial index if volumes grow.
|
|
SELECT
|
|
MIN(venue_id) OVER (
|
|
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
|
|
ORDER BY source_rank
|
|
) AS venue_id,
|
|
country_code,
|
|
lat,
|
|
lon,
|
|
COALESCE(
|
|
MAX(CASE WHEN source = 'playtomic' THEN name END)
|
|
OVER (PARTITION BY country_code, ROUND(lat,3)::TEXT, ROUND(lon,3)::TEXT),
|
|
name
|
|
) AS name,
|
|
COALESCE(city, '') AS city,
|
|
postcode,
|
|
source,
|
|
tenant_type,
|
|
extracted_date
|
|
FROM ranked
|
|
QUALIFY ROW_NUMBER() OVER (
|
|
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
|
|
ORDER BY source_rank
|
|
) = 1
|