From e62aad148bf16574742bda961e03c5e6731e97e1 Mon Sep 17 00:00:00 2001 From: Deeman Date: Sat, 28 Feb 2026 18:40:15 +0100 Subject: [PATCH] fix(transform): remove blob CTE from stg_population_geonames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Server has cities_global.jsonl.gz (JSONL), not cities_global.json.gz (blob). TigerStyle clean break — removed blob_rows CTE and UNION ALL. Simplified to a single SELECT directly from read_json. Co-Authored-By: Claude Sonnet 4.6 --- .../staging/stg_population_geonames.sql | 93 +++++-------------- 1 file changed, 22 insertions(+), 71 deletions(-) diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql b/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql index 82f4826..6c6404b 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_population_geonames.sql @@ -3,11 +3,7 @@ -- Broad coverage (140K+ locations) enables Gemeinde-level market intelligence. -- One row per geoname_id (GeoNames stable numeric identifier). -- --- Supports two landing formats (UNION ALL during migration): --- New: cities_global.jsonl.gz — one city per line, columns directly accessible --- Old: cities_global.json.gz — {"rows": [...]} blob (UNNEST required) --- --- Source: data/landing/geonames/{year}/{month}/cities_global.{jsonl,json}.gz +-- Source: data/landing/geonames/{year}/{month}/cities_global.jsonl.gz MODEL ( name staging.stg_population_geonames, @@ -16,74 +12,29 @@ MODEL ( grain geoname_id ); -WITH --- New format: one city per JSONL line -jsonl_rows AS ( - SELECT - TRY_CAST(geoname_id AS INTEGER) AS geoname_id, - city_name, - country_code, - TRY_CAST(lat AS DOUBLE) AS lat, - TRY_CAST(lon AS DOUBLE) AS lon, - admin1_code, - admin2_code, - TRY_CAST(population AS BIGINT) AS population, - TRY_CAST(ref_year AS INTEGER) AS ref_year, - CURRENT_DATE AS extracted_date - FROM read_json( - @LANDING_DIR || '/geonames/*/*/cities_global.jsonl.gz', - format = 'newline_delimited', - columns = { - geoname_id: 'INTEGER', city_name: 'VARCHAR', country_code: 'VARCHAR', - lat: 'DOUBLE', lon: 'DOUBLE', admin1_code: 'VARCHAR', admin2_code: 'VARCHAR', - population: 'BIGINT', ref_year: 'INTEGER' - } - ) - WHERE geoname_id IS NOT NULL -), --- Old format: {"rows": [...]} blob — kept for transition -blob_rows AS ( - SELECT - TRY_CAST(row ->> 'geoname_id' AS INTEGER) AS geoname_id, - row ->> 'city_name' AS city_name, - row ->> 'country_code' AS country_code, - TRY_CAST(row ->> 'lat' AS DOUBLE) AS lat, - TRY_CAST(row ->> 'lon' AS DOUBLE) AS lon, - row ->> 'admin1_code' AS admin1_code, - row ->> 'admin2_code' AS admin2_code, - TRY_CAST(row ->> 'population' AS BIGINT) AS population, - TRY_CAST(row ->> 'ref_year' AS INTEGER) AS ref_year, - CURRENT_DATE AS extracted_date - FROM ( - SELECT UNNEST(rows) AS row - FROM read_json( - @LANDING_DIR || '/geonames/*/*/cities_global.json.gz', - auto_detect = true, - maximum_object_size = 40000000 - ) - ) - WHERE (row ->> 'geoname_id') IS NOT NULL -), -all_rows AS ( - SELECT * FROM jsonl_rows - UNION ALL - SELECT * FROM blob_rows -) SELECT - geoname_id, - TRIM(city_name) AS city_name, - UPPER(country_code) AS country_code, - lat, - lon, - NULLIF(TRIM(admin1_code), '') AS admin1_code, - NULLIF(TRIM(admin2_code), '') AS admin2_code, - population, - ref_year, - extracted_date -FROM all_rows -WHERE population IS NOT NULL + TRY_CAST(geoname_id AS INTEGER) AS geoname_id, + TRIM(city_name) AS city_name, + UPPER(country_code) AS country_code, + TRY_CAST(lat AS DOUBLE) AS lat, + TRY_CAST(lon AS DOUBLE) AS lon, + NULLIF(TRIM(admin1_code), '') AS admin1_code, + NULLIF(TRIM(admin2_code), '') AS admin2_code, + TRY_CAST(population AS BIGINT) AS population, + TRY_CAST(ref_year AS INTEGER) AS ref_year, + CURRENT_DATE AS extracted_date +FROM read_json( + @LANDING_DIR || '/geonames/*/*/cities_global.jsonl.gz', + format = 'newline_delimited', + columns = { + geoname_id: 'INTEGER', city_name: 'VARCHAR', country_code: 'VARCHAR', + lat: 'DOUBLE', lon: 'DOUBLE', admin1_code: 'VARCHAR', admin2_code: 'VARCHAR', + population: 'BIGINT', ref_year: 'INTEGER' + } +) +WHERE geoname_id IS NOT NULL + AND population IS NOT NULL AND population > 0 - AND geoname_id IS NOT NULL AND city_name IS NOT NULL AND lat IS NOT NULL AND lon IS NOT NULL