Files
padelnomics/transform/sqlmesh_padelnomics/models/foundation/dim_venues.sql
Deeman a00c8727d7 fix(content): slugify transliteration + article links + country overview ranking
- Add @slugify SQLMesh macro (STRIP_ACCENTS + ß→ss) replacing broken
  inline REGEXP_REPLACE that dropped non-ASCII chars (Düsseldorf → d-sseldorf)
- Apply @slugify to dim_venues, dim_cities, dim_locations
- Fix Python slugify() to pre-replace ß→ss before NFKD normalization
- Add language prefix to B2B article market links (/markets/germany → /de/markets/germany)
- Change country overview top-5 ranking: venue count (not raw market_score)
  for top cities, population for top opportunity cities

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-03 10:46:30 +01:00

109 lines
3.5 KiB
SQL

-- Deduplicated venue dimension combining OSM courts and Playtomic tenants.
-- Venues from both sources are unioned; near-duplicates (within ~100m) are
-- collapsed to a single record preferring Playtomic data (richer metadata).
-- Proximity dedup uses haversine approximation: 1 degree lat ≈ 111 km.
--
-- Playtomic venues include court counts, indoor/outdoor split, currency, and timezone.
MODEL (
name foundation.dim_venues,
kind FULL,
cron '@daily',
grain venue_id
);
WITH playtomic_venues AS (
SELECT
'pt:' || v.tenant_id AS venue_id,
v.tenant_id,
'playtomic' AS source,
v.lat,
v.lon,
v.country_code,
v.name,
v.city,
v.postcode,
v.tenant_type,
v.timezone,
v.vat_rate,
v.default_currency,
-- Court counts from resources
COUNT(r.resource_id) AS court_count,
COUNT(r.resource_id) FILTER (WHERE r.resource_type = 'indoor') AS indoor_court_count,
COUNT(r.resource_id) FILTER (WHERE r.resource_type = 'outdoor') AS outdoor_court_count,
v.extracted_date
FROM staging.stg_playtomic_venues v
LEFT JOIN staging.stg_playtomic_resources r
ON v.tenant_id = r.tenant_id AND r.is_active = TRUE
WHERE v.country_code IS NOT NULL
GROUP BY
v.tenant_id, v.lat, v.lon, v.country_code, v.name, v.city,
v.postcode, v.tenant_type, v.timezone, v.vat_rate,
v.default_currency, v.extracted_date
),
osm_venues AS (
SELECT
'osm:' || osm_id::TEXT AS venue_id,
NULL AS tenant_id,
'osm' AS source,
lat,
lon,
country_code,
name,
city,
postcode,
NULL AS tenant_type,
NULL AS timezone,
NULL AS vat_rate,
NULL AS default_currency,
NULL AS court_count,
NULL AS indoor_court_count,
NULL AS outdoor_court_count,
extracted_date
FROM staging.stg_padel_courts
WHERE country_code IS NOT NULL
),
all_venues AS (
SELECT * FROM playtomic_venues
UNION ALL
SELECT * FROM osm_venues
),
ranked AS (
SELECT *,
CASE source WHEN 'playtomic' THEN 1 ELSE 2 END AS source_rank
FROM all_venues
)
-- Deduplicate on ~111m grid cells, preferring Playtomic
SELECT
MIN(venue_id) OVER (
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
ORDER BY source_rank
) AS venue_id,
tenant_id,
country_code,
lat,
lon,
COALESCE(
MAX(CASE WHEN source = 'playtomic' THEN name END)
OVER (PARTITION BY country_code, ROUND(lat,3)::TEXT, ROUND(lon,3)::TEXT),
name
) AS name,
COALESCE(city, '') AS city,
postcode,
source,
tenant_type,
timezone,
vat_rate,
default_currency,
court_count,
indoor_court_count,
outdoor_court_count,
-- Conformed city key: enables deterministic joins to dim_cities / venue_pricing_benchmarks
@slugify(COALESCE(city, '')) AS city_slug,
extracted_date
FROM ranked
QUALIFY ROW_NUMBER() OVER (
PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
ORDER BY source_rank
) = 1