feat(extract): convert geonames to JSONL output
- cities_global.jsonl.gz replaces .json.gz (one city object per line)
- Empty placeholder writes a minimal .jsonl.gz (null row, filtered in staging)
- Eliminates the {"rows": [...]} blob wrapper and maximum_object_size workaround
stg_population_geonames: UNION ALL transition (jsonl_rows + blob_rows)
- jsonl_rows: read_json JSONL, explicit columns, no UNNEST
- blob_rows: existing UNNEST(rows) pattern with 40MB size limit retained
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,11 @@
|
||||
-- Broad coverage (140K+ locations) enables Gemeinde-level market intelligence.
|
||||
-- One row per geoname_id (GeoNames stable numeric identifier).
|
||||
--
|
||||
-- Source: data/landing/geonames/{year}/{month}/cities_global.json.gz
|
||||
-- Supports two landing formats (UNION ALL during migration):
|
||||
-- New: cities_global.jsonl.gz — one city per line, columns directly accessible
|
||||
-- Old: cities_global.json.gz — {"rows": [...]} blob (UNNEST required)
|
||||
--
|
||||
-- Source: data/landing/geonames/{year}/{month}/cities_global.{jsonl,json}.gz
|
||||
|
||||
MODEL (
|
||||
name staging.stg_population_geonames,
|
||||
@@ -12,7 +16,33 @@ MODEL (
|
||||
grain geoname_id
|
||||
);
|
||||
|
||||
WITH parsed AS (
|
||||
WITH
|
||||
-- New format: one city per JSONL line
|
||||
jsonl_rows AS (
|
||||
SELECT
|
||||
TRY_CAST(geoname_id AS INTEGER) AS geoname_id,
|
||||
city_name,
|
||||
country_code,
|
||||
TRY_CAST(lat AS DOUBLE) AS lat,
|
||||
TRY_CAST(lon AS DOUBLE) AS lon,
|
||||
admin1_code,
|
||||
admin2_code,
|
||||
TRY_CAST(population AS BIGINT) AS population,
|
||||
TRY_CAST(ref_year AS INTEGER) AS ref_year,
|
||||
CURRENT_DATE AS extracted_date
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/geonames/*/*/cities_global.jsonl.gz',
|
||||
format = 'newline_delimited',
|
||||
columns = {
|
||||
geoname_id: 'INTEGER', city_name: 'VARCHAR', country_code: 'VARCHAR',
|
||||
lat: 'DOUBLE', lon: 'DOUBLE', admin1_code: 'VARCHAR', admin2_code: 'VARCHAR',
|
||||
population: 'BIGINT', ref_year: 'INTEGER'
|
||||
}
|
||||
)
|
||||
WHERE geoname_id IS NOT NULL
|
||||
),
|
||||
-- Old format: {"rows": [...]} blob — kept for transition
|
||||
blob_rows AS (
|
||||
SELECT
|
||||
TRY_CAST(row ->> 'geoname_id' AS INTEGER) AS geoname_id,
|
||||
row ->> 'city_name' AS city_name,
|
||||
@@ -33,11 +63,16 @@ WITH parsed AS (
|
||||
)
|
||||
)
|
||||
WHERE (row ->> 'geoname_id') IS NOT NULL
|
||||
),
|
||||
all_rows AS (
|
||||
SELECT * FROM jsonl_rows
|
||||
UNION ALL
|
||||
SELECT * FROM blob_rows
|
||||
)
|
||||
SELECT
|
||||
geoname_id,
|
||||
TRIM(city_name) AS city_name,
|
||||
UPPER(country_code) AS country_code,
|
||||
TRIM(city_name) AS city_name,
|
||||
UPPER(country_code) AS country_code,
|
||||
lat,
|
||||
lon,
|
||||
NULLIF(TRIM(admin1_code), '') AS admin1_code,
|
||||
@@ -45,7 +80,7 @@ SELECT
|
||||
population,
|
||||
ref_year,
|
||||
extracted_date
|
||||
FROM parsed
|
||||
FROM all_rows
|
||||
WHERE population IS NOT NULL
|
||||
AND population > 0
|
||||
AND geoname_id IS NOT NULL
|
||||
|
||||
Reference in New Issue
Block a user