perf(transform): optimize dim_locations spatial joins via IEJoin + country filters
All checks were successful
CI / test (push) Successful in 51s
CI / tag (push) Successful in 2s

Replace ABS() bbox predicates with BETWEEN in all three spatial CTEs
(nearest_padel, padel_local, tennis_nearby). BETWEEN enables DuckDB's
IEJoin (interval join) which is O((N+M) log M) vs the previous O(N×M)
nested-loop cross-join.

Add country pre-filters to restrict the left side from ~140K global
locations to ~20K rows for padel/tennis CTEs (~8 countries each).

Expected: ~50-200x speedup on the spatial CTE portion of the model.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-03-01 02:57:05 +01:00
parent 2590020014
commit fea4f85da3

View File

@@ -19,8 +19,10 @@
-- 4. Country-level income (global fallback from stg_income / ilc_di03)
--
-- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension).
-- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the
-- exact sphere distance is computed.
-- Spatial joins use BETWEEN predicates (not ABS()) to enable DuckDB's IEJoin
-- (interval join) optimization: O((N+M) log M) vs O(N×M) nested-loop.
-- Country pre-filters restrict the left side to ~20K rows for padel/tennis CTEs
-- (~8 countries each), down from ~140K global locations.
MODEL (
name foundation.dim_locations,
@@ -147,6 +149,8 @@ padel_courts AS (
WHERE lat IS NOT NULL AND lon IS NOT NULL
),
-- Nearest padel court distance per location (bbox pre-filter → exact sphere distance)
-- BETWEEN enables DuckDB IEJoin (O((N+M) log M)) vs ABS() nested-loop (O(N×M)).
-- Country pre-filter reduces left side from ~140K to ~20K rows (padel is ~8 countries).
nearest_padel AS (
SELECT
l.geoname_id,
@@ -158,9 +162,12 @@ nearest_padel AS (
) AS nearest_padel_court_km
FROM locations l
JOIN padel_courts p
-- ~55km bounding box pre-filter to limit cross-join before sphere calc
ON ABS(l.lat - p.lat) < 0.5
AND ABS(l.lon - p.lon) < 0.5
-- ~55km bounding box pre-filter; BETWEEN triggers IEJoin optimization
ON l.lat BETWEEN p.lat - 0.5 AND p.lat + 0.5
AND l.lon BETWEEN p.lon - 0.5 AND p.lon + 0.5
WHERE l.country_code IN (
SELECT DISTINCT country_code FROM padel_courts WHERE country_code IS NOT NULL
)
GROUP BY l.geoname_id
),
-- Padel venues within 5km of each location (counts as "local padel supply")
@@ -170,24 +177,35 @@ padel_local AS (
COUNT(*) AS padel_venue_count
FROM locations l
JOIN padel_courts p
ON ABS(l.lat - p.lat) < 0.05 -- ~5km bbox pre-filter
AND ABS(l.lon - p.lon) < 0.05
WHERE ST_Distance_Sphere(
-- ~5km bbox pre-filter; BETWEEN triggers IEJoin optimization
ON l.lat BETWEEN p.lat - 0.05 AND p.lat + 0.05
AND l.lon BETWEEN p.lon - 0.05 AND p.lon + 0.05
WHERE l.country_code IN (
SELECT DISTINCT country_code FROM padel_courts WHERE country_code IS NOT NULL
)
AND ST_Distance_Sphere(
ST_Point(l.lon, l.lat),
ST_Point(p.lon, p.lat)
) / 1000.0 <= 5.0
GROUP BY l.geoname_id
),
-- Tennis courts within 25km of each location (sports culture proxy)
-- Country pre-filter reduces left side from ~140K to ~20K rows (tennis courts are European only).
tennis_nearby AS (
SELECT
l.geoname_id,
COUNT(*) AS tennis_courts_within_25km
FROM locations l
JOIN staging.stg_tennis_courts t
ON ABS(l.lat - t.lat) < 0.23 -- ~25km bbox pre-filter
AND ABS(l.lon - t.lon) < 0.23
WHERE ST_Distance_Sphere(
-- ~25km bbox pre-filter; BETWEEN triggers IEJoin optimization
ON l.lat BETWEEN t.lat - 0.23 AND t.lat + 0.23
AND l.lon BETWEEN t.lon - 0.23 AND t.lon + 0.23
WHERE l.country_code IN (
SELECT DISTINCT country_code
FROM staging.stg_tennis_courts
WHERE country_code IS NOT NULL
)
AND ST_Distance_Sphere(
ST_Point(l.lon, l.lat),
ST_Point(t.lon, t.lat)
) / 1000.0 <= 25.0