1 Commits
v10 ... v11

Author SHA1 Message Date
Deeman
e62aad148b fix(transform): remove blob CTE from stg_population_geonames
All checks were successful
CI / test (push) Successful in 49s
CI / tag (push) Successful in 2s
Server has cities_global.jsonl.gz (JSONL), not cities_global.json.gz (blob).
TigerStyle clean break — removed blob_rows CTE and UNION ALL.
Simplified to a single SELECT directly from read_json.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-28 18:40:15 +01:00

View File

@@ -3,11 +3,7 @@
-- Broad coverage (140K+ locations) enables Gemeinde-level market intelligence.
-- One row per geoname_id (GeoNames stable numeric identifier).
--
-- Supports two landing formats (UNION ALL during migration):
-- New: cities_global.jsonl.gz — one city per line, columns directly accessible
-- Old: cities_global.json.gz — {"rows": [...]} blob (UNNEST required)
--
-- Source: data/landing/geonames/{year}/{month}/cities_global.{jsonl,json}.gz
-- Source: data/landing/geonames/{year}/{month}/cities_global.jsonl.gz
MODEL (
name staging.stg_population_geonames,
@@ -16,21 +12,18 @@ MODEL (
grain geoname_id
);
WITH
-- New format: one city per JSONL line
jsonl_rows AS (
SELECT
SELECT
TRY_CAST(geoname_id AS INTEGER) AS geoname_id,
city_name,
country_code,
TRIM(city_name) AS city_name,
UPPER(country_code) AS country_code,
TRY_CAST(lat AS DOUBLE) AS lat,
TRY_CAST(lon AS DOUBLE) AS lon,
admin1_code,
admin2_code,
NULLIF(TRIM(admin1_code), '') AS admin1_code,
NULLIF(TRIM(admin2_code), '') AS admin2_code,
TRY_CAST(population AS BIGINT) AS population,
TRY_CAST(ref_year AS INTEGER) AS ref_year,
CURRENT_DATE AS extracted_date
FROM read_json(
FROM read_json(
@LANDING_DIR || '/geonames/*/*/cities_global.jsonl.gz',
format = 'newline_delimited',
columns = {
@@ -38,52 +31,10 @@ jsonl_rows AS (
lat: 'DOUBLE', lon: 'DOUBLE', admin1_code: 'VARCHAR', admin2_code: 'VARCHAR',
population: 'BIGINT', ref_year: 'INTEGER'
}
)
WHERE geoname_id IS NOT NULL
),
-- Old format: {"rows": [...]} blob — kept for transition
blob_rows AS (
SELECT
TRY_CAST(row ->> 'geoname_id' AS INTEGER) AS geoname_id,
row ->> 'city_name' AS city_name,
row ->> 'country_code' AS country_code,
TRY_CAST(row ->> 'lat' AS DOUBLE) AS lat,
TRY_CAST(row ->> 'lon' AS DOUBLE) AS lon,
row ->> 'admin1_code' AS admin1_code,
row ->> 'admin2_code' AS admin2_code,
TRY_CAST(row ->> 'population' AS BIGINT) AS population,
TRY_CAST(row ->> 'ref_year' AS INTEGER) AS ref_year,
CURRENT_DATE AS extracted_date
FROM (
SELECT UNNEST(rows) AS row
FROM read_json(
@LANDING_DIR || '/geonames/*/*/cities_global.json.gz',
auto_detect = true,
maximum_object_size = 40000000
)
)
WHERE (row ->> 'geoname_id') IS NOT NULL
),
all_rows AS (
SELECT * FROM jsonl_rows
UNION ALL
SELECT * FROM blob_rows
)
SELECT
geoname_id,
TRIM(city_name) AS city_name,
UPPER(country_code) AS country_code,
lat,
lon,
NULLIF(TRIM(admin1_code), '') AS admin1_code,
NULLIF(TRIM(admin2_code), '') AS admin2_code,
population,
ref_year,
extracted_date
FROM all_rows
WHERE population IS NOT NULL
WHERE geoname_id IS NOT NULL
AND population IS NOT NULL
AND population > 0
AND geoname_id IS NOT NULL
AND city_name IS NOT NULL
AND lat IS NOT NULL
AND lon IS NOT NULL