diff --git a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql index cad8730..2a77577 100644 --- a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql +++ b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql @@ -19,8 +19,10 @@ -- 4. Country-level income (global fallback from stg_income / ilc_di03) -- -- Distance calculations use ST_Distance_Sphere (DuckDB spatial extension). --- A bounding-box pre-filter (~0.5°, ≈55km) reduces the cross-join before the --- exact sphere distance is computed. +-- Spatial joins use BETWEEN predicates (not ABS()) to enable DuckDB's IEJoin +-- (interval join) optimization: O((N+M) log M) vs O(N×M) nested-loop. +-- Country pre-filters restrict the left side to ~20K rows for padel/tennis CTEs +-- (~8 countries each), down from ~140K global locations. MODEL ( name foundation.dim_locations, @@ -147,6 +149,8 @@ padel_courts AS ( WHERE lat IS NOT NULL AND lon IS NOT NULL ), -- Nearest padel court distance per location (bbox pre-filter → exact sphere distance) +-- BETWEEN enables DuckDB IEJoin (O((N+M) log M)) vs ABS() nested-loop (O(N×M)). +-- Country pre-filter reduces left side from ~140K to ~20K rows (padel is ~8 countries). nearest_padel AS ( SELECT l.geoname_id, @@ -158,9 +162,12 @@ nearest_padel AS ( ) AS nearest_padel_court_km FROM locations l JOIN padel_courts p - -- ~55km bounding box pre-filter to limit cross-join before sphere calc - ON ABS(l.lat - p.lat) < 0.5 - AND ABS(l.lon - p.lon) < 0.5 + -- ~55km bounding box pre-filter; BETWEEN triggers IEJoin optimization + ON l.lat BETWEEN p.lat - 0.5 AND p.lat + 0.5 + AND l.lon BETWEEN p.lon - 0.5 AND p.lon + 0.5 + WHERE l.country_code IN ( + SELECT DISTINCT country_code FROM padel_courts WHERE country_code IS NOT NULL + ) GROUP BY l.geoname_id ), -- Padel venues within 5km of each location (counts as "local padel supply") @@ -170,24 +177,35 @@ padel_local AS ( COUNT(*) AS padel_venue_count FROM locations l JOIN padel_courts p - ON ABS(l.lat - p.lat) < 0.05 -- ~5km bbox pre-filter - AND ABS(l.lon - p.lon) < 0.05 - WHERE ST_Distance_Sphere( + -- ~5km bbox pre-filter; BETWEEN triggers IEJoin optimization + ON l.lat BETWEEN p.lat - 0.05 AND p.lat + 0.05 + AND l.lon BETWEEN p.lon - 0.05 AND p.lon + 0.05 + WHERE l.country_code IN ( + SELECT DISTINCT country_code FROM padel_courts WHERE country_code IS NOT NULL + ) + AND ST_Distance_Sphere( ST_Point(l.lon, l.lat), ST_Point(p.lon, p.lat) ) / 1000.0 <= 5.0 GROUP BY l.geoname_id ), -- Tennis courts within 25km of each location (sports culture proxy) +-- Country pre-filter reduces left side from ~140K to ~20K rows (tennis courts are European only). tennis_nearby AS ( SELECT l.geoname_id, COUNT(*) AS tennis_courts_within_25km FROM locations l JOIN staging.stg_tennis_courts t - ON ABS(l.lat - t.lat) < 0.23 -- ~25km bbox pre-filter - AND ABS(l.lon - t.lon) < 0.23 - WHERE ST_Distance_Sphere( + -- ~25km bbox pre-filter; BETWEEN triggers IEJoin optimization + ON l.lat BETWEEN t.lat - 0.23 AND t.lat + 0.23 + AND l.lon BETWEEN t.lon - 0.23 AND t.lon + 0.23 + WHERE l.country_code IN ( + SELECT DISTINCT country_code + FROM staging.stg_tennis_courts + WHERE country_code IS NOT NULL + ) + AND ST_Distance_Sphere( ST_Point(l.lon, l.lat), ST_Point(t.lon, t.lat) ) / 1000.0 <= 25.0