diff --git a/transform/sqlmesh_padelnomics/macros/__init__.py b/transform/sqlmesh_padelnomics/macros/__init__.py index b4b675c..860bef5 100644 --- a/transform/sqlmesh_padelnomics/macros/__init__.py +++ b/transform/sqlmesh_padelnomics/macros/__init__.py @@ -16,5 +16,92 @@ def padelnomics_glob(evaluator) -> str: return f"'{landing_dir}/padelnomics/**/*.csv.gz'" -# Add one macro per landing zone subdirectory you create. -# Pattern: def {source}_glob(evaluator) → f"'{landing_dir}/{source}/**/*.csv.gz'" +# ── Country code helpers ───────────────────────────────────────────────────── +# Shared lookup used by dim_cities and dim_locations. + +_COUNTRY_NAMES = { + "DE": "Germany", "ES": "Spain", "GB": "United Kingdom", + "FR": "France", "IT": "Italy", "PT": "Portugal", + "AT": "Austria", "CH": "Switzerland", "NL": "Netherlands", + "BE": "Belgium", "SE": "Sweden", "NO": "Norway", + "DK": "Denmark", "FI": "Finland", "US": "United States", + "AR": "Argentina", "MX": "Mexico", "AE": "UAE", + "AU": "Australia", "IE": "Ireland", +} + + +def _country_case(col: str) -> str: + """Build a CASE expression mapping ISO 3166-1 alpha-2 → English name.""" + whens = "\n ".join( + f"WHEN '{code}' THEN '{name}'" for code, name in _COUNTRY_NAMES.items() + ) + return f"CASE {col}\n {whens}\n ELSE {col}\n END" + + +@macro() +def country_name(evaluator, code_col) -> str: + """CASE expression: country code → English name. + + Usage in SQL: @country_name(vc.country_code) AS country_name_en + """ + return _country_case(str(code_col)) + + +@macro() +def country_slug(evaluator, code_col) -> str: + """CASE expression: country code → URL-safe slug (lowercased, spaces → dashes). + + Usage in SQL: @country_slug(vc.country_code) AS country_slug + """ + return f"LOWER(REGEXP_REPLACE({_country_case(str(code_col))}, '[^a-zA-Z0-9]+', '-'))" + + +@macro() +def normalize_eurostat_country(evaluator, code_col) -> str: + """Normalize Eurostat country codes to ISO 3166-1 alpha-2: EL→GR, UK→GB. + + Usage in SQL: @normalize_eurostat_country(geo_code) AS country_code + """ + col = str(code_col) + return f"CASE {col} WHEN 'EL' THEN 'GR' WHEN 'UK' THEN 'GB' ELSE {col} END" + + +@macro() +def normalize_eurostat_nuts(evaluator, code_col) -> str: + """Normalize NUTS code prefix: EL→GR, UK→GB, preserving the suffix. + + Usage in SQL: @normalize_eurostat_nuts(geo_code) AS nuts_code + """ + col = str(code_col) + return ( + f"CASE" + f" WHEN {col} LIKE 'EL%' THEN 'GR' || SUBSTR({col}, 3)" + f" WHEN {col} LIKE 'UK%' THEN 'GB' || SUBSTR({col}, 3)" + f" ELSE {col}" + f" END" + ) + + +@macro() +def infer_country_from_coords(evaluator, lat_col, lon_col) -> str: + """Infer ISO country code from lat/lon using bounding boxes for 8 European markets. + + Usage in SQL: + COALESCE(NULLIF(TRIM(UPPER(country_code)), ''), + @infer_country_from_coords(lat, lon)) AS country_code + """ + lat = str(lat_col) + lon = str(lon_col) + return ( + f"CASE" + f" WHEN {lat} BETWEEN 47.27 AND 55.06 AND {lon} BETWEEN 5.87 AND 15.04 THEN 'DE'" + f" WHEN {lat} BETWEEN 35.95 AND 43.79 AND {lon} BETWEEN -9.39 AND 4.33 THEN 'ES'" + f" WHEN {lat} BETWEEN 49.90 AND 60.85 AND {lon} BETWEEN -8.62 AND 1.77 THEN 'GB'" + f" WHEN {lat} BETWEEN 41.36 AND 51.09 AND {lon} BETWEEN -5.14 AND 9.56 THEN 'FR'" + f" WHEN {lat} BETWEEN 45.46 AND 47.80 AND {lon} BETWEEN 5.96 AND 10.49 THEN 'CH'" + f" WHEN {lat} BETWEEN 46.37 AND 49.02 AND {lon} BETWEEN 9.53 AND 17.16 THEN 'AT'" + f" WHEN {lat} BETWEEN 36.35 AND 47.09 AND {lon} BETWEEN 6.62 AND 18.51 THEN 'IT'" + f" WHEN {lat} BETWEEN 37.00 AND 42.15 AND {lon} BETWEEN -9.50 AND -6.19 THEN 'PT'" + f" ELSE NULL" + f" END" + ) diff --git a/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql b/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql index b1b1067..ba9a51a 100644 --- a/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql +++ b/transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql @@ -110,55 +110,9 @@ SELECT vc.city_slug, vc.city_name, -- Human-readable country name for pSEO templates and internal linking - CASE vc.country_code - WHEN 'DE' THEN 'Germany' - WHEN 'ES' THEN 'Spain' - WHEN 'GB' THEN 'United Kingdom' - WHEN 'FR' THEN 'France' - WHEN 'IT' THEN 'Italy' - WHEN 'PT' THEN 'Portugal' - WHEN 'AT' THEN 'Austria' - WHEN 'CH' THEN 'Switzerland' - WHEN 'NL' THEN 'Netherlands' - WHEN 'BE' THEN 'Belgium' - WHEN 'SE' THEN 'Sweden' - WHEN 'NO' THEN 'Norway' - WHEN 'DK' THEN 'Denmark' - WHEN 'FI' THEN 'Finland' - WHEN 'US' THEN 'United States' - WHEN 'AR' THEN 'Argentina' - WHEN 'MX' THEN 'Mexico' - WHEN 'AE' THEN 'UAE' - WHEN 'AU' THEN 'Australia' - WHEN 'IE' THEN 'Ireland' - ELSE vc.country_code - END AS country_name_en, + @country_name(vc.country_code) AS country_name_en, -- URL-safe country slug - LOWER(REGEXP_REPLACE( - CASE vc.country_code - WHEN 'DE' THEN 'Germany' - WHEN 'ES' THEN 'Spain' - WHEN 'GB' THEN 'United Kingdom' - WHEN 'FR' THEN 'France' - WHEN 'IT' THEN 'Italy' - WHEN 'PT' THEN 'Portugal' - WHEN 'AT' THEN 'Austria' - WHEN 'CH' THEN 'Switzerland' - WHEN 'NL' THEN 'Netherlands' - WHEN 'BE' THEN 'Belgium' - WHEN 'SE' THEN 'Sweden' - WHEN 'NO' THEN 'Norway' - WHEN 'DK' THEN 'Denmark' - WHEN 'FI' THEN 'Finland' - WHEN 'US' THEN 'United States' - WHEN 'AR' THEN 'Argentina' - WHEN 'MX' THEN 'Mexico' - WHEN 'AE' THEN 'UAE' - WHEN 'AU' THEN 'Australia' - WHEN 'IE' THEN 'Ireland' - ELSE vc.country_code - END, '[^a-zA-Z0-9]+', '-' - )) AS country_slug, + @country_slug(vc.country_code) AS country_slug, vc.centroid_lat AS lat, vc.centroid_lon AS lon, -- Population cascade: Eurostat EU > US Census > ONS UK > GeoNames string > GeoNames spatial > 0. diff --git a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql index 2a77577..f86673a 100644 --- a/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql +++ b/transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql @@ -215,55 +215,9 @@ SELECT l.geoname_id, l.country_code, -- Human-readable country name (consistent with dim_cities) - CASE l.country_code - WHEN 'DE' THEN 'Germany' - WHEN 'ES' THEN 'Spain' - WHEN 'GB' THEN 'United Kingdom' - WHEN 'FR' THEN 'France' - WHEN 'IT' THEN 'Italy' - WHEN 'PT' THEN 'Portugal' - WHEN 'AT' THEN 'Austria' - WHEN 'CH' THEN 'Switzerland' - WHEN 'NL' THEN 'Netherlands' - WHEN 'BE' THEN 'Belgium' - WHEN 'SE' THEN 'Sweden' - WHEN 'NO' THEN 'Norway' - WHEN 'DK' THEN 'Denmark' - WHEN 'FI' THEN 'Finland' - WHEN 'US' THEN 'United States' - WHEN 'AR' THEN 'Argentina' - WHEN 'MX' THEN 'Mexico' - WHEN 'AE' THEN 'UAE' - WHEN 'AU' THEN 'Australia' - WHEN 'IE' THEN 'Ireland' - ELSE l.country_code - END AS country_name_en, + @country_name(l.country_code) AS country_name_en, -- URL-safe country slug - LOWER(REGEXP_REPLACE( - CASE l.country_code - WHEN 'DE' THEN 'Germany' - WHEN 'ES' THEN 'Spain' - WHEN 'GB' THEN 'United Kingdom' - WHEN 'FR' THEN 'France' - WHEN 'IT' THEN 'Italy' - WHEN 'PT' THEN 'Portugal' - WHEN 'AT' THEN 'Austria' - WHEN 'CH' THEN 'Switzerland' - WHEN 'NL' THEN 'Netherlands' - WHEN 'BE' THEN 'Belgium' - WHEN 'SE' THEN 'Sweden' - WHEN 'NO' THEN 'Norway' - WHEN 'DK' THEN 'Denmark' - WHEN 'FI' THEN 'Finland' - WHEN 'US' THEN 'United States' - WHEN 'AR' THEN 'Argentina' - WHEN 'MX' THEN 'Mexico' - WHEN 'AE' THEN 'UAE' - WHEN 'AU' THEN 'Australia' - WHEN 'IE' THEN 'Ireland' - ELSE l.country_code - END, '[^a-zA-Z0-9]+', '-' - )) AS country_slug, + @country_slug(l.country_code) AS country_slug, l.location_name, l.location_slug, l.lat, diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_income.sql b/transform/sqlmesh_padelnomics/models/staging/stg_income.sql index 5e660c4..f5a9df6 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_income.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_income.sql @@ -30,11 +30,7 @@ parsed AS ( ) SELECT -- Normalise to ISO 3166-1 alpha-2: EL→GR, UK→GB - CASE geo_code - WHEN 'EL' THEN 'GR' - WHEN 'UK' THEN 'GB' - ELSE geo_code - END AS country_code, + @normalize_eurostat_country(geo_code) AS country_code, ref_year, median_income_pps, extracted_date diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_nuts2_boundaries.sql b/transform/sqlmesh_padelnomics/models/staging/stg_nuts2_boundaries.sql index 32481d5..b4bb789 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_nuts2_boundaries.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_nuts2_boundaries.sql @@ -28,11 +28,7 @@ WITH raw AS ( SELECT NUTS_ID AS nuts2_code, -- Normalise country prefix to ISO 3166-1 alpha-2: EL→GR, UK→GB - CASE CNTR_CODE - WHEN 'EL' THEN 'GR' - WHEN 'UK' THEN 'GB' - ELSE CNTR_CODE - END AS country_code, + @normalize_eurostat_country(CNTR_CODE) AS country_code, NAME_LATN AS region_name, geom AS geometry, -- Pre-compute bounding box for efficient spatial pre-filter in dim_locations. diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_padel_courts.sql b/transform/sqlmesh_padelnomics/models/staging/stg_padel_courts.sql index 5a21831..d287cb8 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_padel_courts.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_padel_courts.sql @@ -48,17 +48,8 @@ deduped AS ( with_country AS ( SELECT osm_id, lat, lon, - COALESCE(NULLIF(TRIM(UPPER(country_code)), ''), CASE - WHEN lat BETWEEN 47.27 AND 55.06 AND lon BETWEEN 5.87 AND 15.04 THEN 'DE' - WHEN lat BETWEEN 35.95 AND 43.79 AND lon BETWEEN -9.39 AND 4.33 THEN 'ES' - WHEN lat BETWEEN 49.90 AND 60.85 AND lon BETWEEN -8.62 AND 1.77 THEN 'GB' - WHEN lat BETWEEN 41.36 AND 51.09 AND lon BETWEEN -5.14 AND 9.56 THEN 'FR' - WHEN lat BETWEEN 45.46 AND 47.80 AND lon BETWEEN 5.96 AND 10.49 THEN 'CH' - WHEN lat BETWEEN 46.37 AND 49.02 AND lon BETWEEN 9.53 AND 17.16 THEN 'AT' - WHEN lat BETWEEN 36.35 AND 47.09 AND lon BETWEEN 6.62 AND 18.51 THEN 'IT' - WHEN lat BETWEEN 37.00 AND 42.15 AND lon BETWEEN -9.50 AND -6.19 THEN 'PT' - ELSE NULL - END) AS country_code, + COALESCE(NULLIF(TRIM(UPPER(country_code)), ''), + @infer_country_from_coords(lat, lon)) AS country_code, NULLIF(TRIM(name), '') AS name, NULLIF(TRIM(city_tag), '') AS city, postcode, operator_name, opening_hours, fee, extracted_date diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql b/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql index e5f7db5..592f958 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_regional_income.sql @@ -30,11 +30,7 @@ parsed AS ( ) SELECT -- Normalise to ISO 3166-1 alpha-2 prefix: EL→GR, UK→GB - CASE - WHEN geo_code LIKE 'EL%' THEN 'GR' || SUBSTR(geo_code, 3) - WHEN geo_code LIKE 'UK%' THEN 'GB' || SUBSTR(geo_code, 3) - ELSE geo_code - END AS nuts_code, + @normalize_eurostat_nuts(geo_code) AS nuts_code, -- NUTS level: 3-char = NUTS-1, 4-char = NUTS-2 LENGTH(geo_code) - 2 AS nuts_level, ref_year, diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_tennis_courts.sql b/transform/sqlmesh_padelnomics/models/staging/stg_tennis_courts.sql index 7d75851..342e410 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_tennis_courts.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_tennis_courts.sql @@ -54,17 +54,8 @@ deduped AS ( with_country AS ( SELECT osm_id, lat, lon, - COALESCE(NULLIF(TRIM(UPPER(country_code)), ''), CASE - WHEN lat BETWEEN 47.27 AND 55.06 AND lon BETWEEN 5.87 AND 15.04 THEN 'DE' - WHEN lat BETWEEN 35.95 AND 43.79 AND lon BETWEEN -9.39 AND 4.33 THEN 'ES' - WHEN lat BETWEEN 49.90 AND 60.85 AND lon BETWEEN -8.62 AND 1.77 THEN 'GB' - WHEN lat BETWEEN 41.36 AND 51.09 AND lon BETWEEN -5.14 AND 9.56 THEN 'FR' - WHEN lat BETWEEN 45.46 AND 47.80 AND lon BETWEEN 5.96 AND 10.49 THEN 'CH' - WHEN lat BETWEEN 46.37 AND 49.02 AND lon BETWEEN 9.53 AND 17.16 THEN 'AT' - WHEN lat BETWEEN 36.35 AND 47.09 AND lon BETWEEN 6.62 AND 18.51 THEN 'IT' - WHEN lat BETWEEN 37.00 AND 42.15 AND lon BETWEEN -9.50 AND -6.19 THEN 'PT' - ELSE NULL - END) AS country_code, + COALESCE(NULLIF(TRIM(UPPER(country_code)), ''), + @infer_country_from_coords(lat, lon)) AS country_code, NULLIF(TRIM(name), '') AS name, NULLIF(TRIM(city_tag), '') AS city, extracted_date