refactor: flatten padelnomics/padelnomics/ → repo root

git mv all tracked files from the nested padelnomics/ workspace directory to the git repo root. Merged .gitignore files. No code changes — pure path rename. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-22 00:44:40 +01:00
parent 5e471567b9
commit 4ae00b35d1
235 changed files with 45 additions and 42 deletions
--- a/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql
+++ b/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql
@@ -0,0 +1,59 @@
+-- City dimension: canonical city records with population and venue count.
+-- Built from Eurostat Urban Audit codes joined to venue locations.
+-- Cities without Eurostat coverage (US, non-EU) are derived from venue clusters.
+
+MODEL (
+  name padelnomics.dim_cities,
+  kind FULL,
+  cron '@daily',
+  grain city_code
+);
+
+WITH -- Eurostat cities: latest population per city code
+eurostat_cities AS (
+  SELECT
+    city_code,
+    country_code,
+    population,
+    ref_year
+  FROM padelnomics.stg_population
+  QUALIFY ROW_NUMBER() OVER (PARTITION BY city_code ORDER BY ref_year DESC) = 1
+),
+-- Venue counts per (country_code, city) from dim_venues
+venue_counts AS (
+  SELECT
+    country_code,
+    city,
+    COUNT(*)  AS venue_count,
+    AVG(lat)  AS centroid_lat,
+    AVG(lon)  AS centroid_lon
+  FROM padelnomics.dim_venues
+  WHERE city IS NOT NULL AND city != ''
+  GROUP BY country_code, city
+),
+-- Eurostat city label mapping to canonical city names
+-- (Eurostat uses codes like DE001C → Berlin; we keep both)
+eurostat_labels AS (
+  SELECT DISTINCT
+    city_code,
+    country_code,
+    -- Derive a slug-friendly city name from the code as fallback
+    LOWER(REPLACE(city_code, country_code, ''))  AS city_slug_raw
+  FROM eurostat_cities
+)
+SELECT
+  ec.city_code,
+  ec.country_code,
+  COALESCE(vc.city, ec.city_code)               AS city_name,
+  LOWER(REGEXP_REPLACE(
+    COALESCE(vc.city, ec.city_slug_raw), '[^a-z0-9]+', '-'
+  ))                                             AS city_slug,
+  COALESCE(vc.centroid_lat, 0::DOUBLE)           AS lat,
+  COALESCE(vc.centroid_lon, 0::DOUBLE)           AS lon,
+  ec.population,
+  ec.ref_year                                    AS population_year,
+  COALESCE(vc.venue_count, 0)                    AS padel_venue_count
+FROM eurostat_cities ec
+LEFT JOIN venue_counts vc
+  ON  ec.country_code = vc.country_code
+  AND LOWER(TRIM(vc.city)) LIKE '%' || LOWER(LEFT(ec.city_code, 2)) || '%'
--- a/transform/sqlmesh_padelnomics/models/foundation/dim_venues.sql
+++ b/transform/sqlmesh_padelnomics/models/foundation/dim_venues.sql
@@ -0,0 +1,75 @@
+-- Deduplicated venue dimension combining OSM courts and Playtomic tenants.
+-- Venues from both sources are unioned; near-duplicates (within ~100m) are
+-- collapsed to a single record preferring Playtomic data (richer metadata).
+-- Proximity dedup uses haversine approximation: 1 degree lat ≈ 111 km.
+
+MODEL (
+  name padelnomics.dim_venues,
+  kind FULL,
+  cron '@daily',
+  grain venue_id
+);
+
+WITH all_venues AS (
+  SELECT
+    'osm:' || osm_id::TEXT AS venue_id,
+    source,
+    lat,
+    lon,
+    country_code,
+    name,
+    city,
+    postcode,
+    NULL AS tenant_type,
+    extracted_date
+  FROM padelnomics.stg_padel_courts
+  WHERE country_code IS NOT NULL
+
+  UNION ALL
+
+  SELECT
+    'pt:' || tenant_id      AS venue_id,
+    source,
+    lat,
+    lon,
+    country_code,
+    name,
+    city,
+    postcode,
+    tenant_type,
+    extracted_date
+  FROM padelnomics.stg_playtomic_venues
+  WHERE country_code IS NOT NULL
+),
+-- Rank venues so Playtomic records win ties in proximity dedup
+ranked AS (
+  SELECT *,
+    CASE source WHEN 'playtomic' THEN 1 ELSE 2 END AS source_rank
+  FROM all_venues
+)
+-- Note: full proximity dedup (haversine clustering) is expensive in SQL.
+-- For now, deduplicate on exact (country_code, ROUND(lat,3), ROUND(lon,3))
+-- — ≈111m grid cells. Refine with spatial index if volumes grow.
+SELECT
+  MIN(venue_id) OVER (
+    PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
+    ORDER BY source_rank
+  )                                                         AS venue_id,
+  country_code,
+  lat,
+  lon,
+  COALESCE(
+    MAX(CASE WHEN source = 'playtomic' THEN name END)
+      OVER (PARTITION BY country_code, ROUND(lat,3)::TEXT, ROUND(lon,3)::TEXT),
+    name
+  )                                                         AS name,
+  COALESCE(city, '')                                        AS city,
+  postcode,
+  source,
+  tenant_type,
+  extracted_date
+FROM ranked
+QUALIFY ROW_NUMBER() OVER (
+  PARTITION BY country_code, ROUND(lat, 3)::TEXT, ROUND(lon, 3)::TEXT
+  ORDER BY source_rank
+) = 1