fix: eurostat JSON-stat parsing + staging model corrections
Eurostat JSON-stat format (4-7 dimension sparse dict with 583K values) causes DuckDB OOM — pre-process in extractor to flat records. Also fix dim_cities unused CTE bug and playtomic venue lat/lon path. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -15,7 +15,8 @@ eurostat_cities AS (
|
||||
city_code,
|
||||
country_code,
|
||||
population,
|
||||
ref_year
|
||||
ref_year,
|
||||
LOWER(REPLACE(city_code, country_code, '')) AS city_slug_raw
|
||||
FROM staging.stg_population
|
||||
QUALIFY ROW_NUMBER() OVER (PARTITION BY city_code ORDER BY ref_year DESC) = 1
|
||||
),
|
||||
@@ -30,16 +31,6 @@ venue_counts AS (
|
||||
FROM foundation.dim_venues
|
||||
WHERE city IS NOT NULL AND city != ''
|
||||
GROUP BY country_code, city
|
||||
),
|
||||
-- Eurostat city label mapping to canonical city names
|
||||
-- (Eurostat uses codes like DE001C → Berlin; we keep both)
|
||||
eurostat_labels AS (
|
||||
SELECT DISTINCT
|
||||
city_code,
|
||||
country_code,
|
||||
-- Derive a slug-friendly city name from the code as fallback
|
||||
LOWER(REPLACE(city_code, country_code, '')) AS city_slug_raw
|
||||
FROM eurostat_cities
|
||||
)
|
||||
SELECT
|
||||
ec.city_code,
|
||||
|
||||
@@ -19,8 +19,8 @@ WITH parsed AS (
|
||||
tenant -> 'address' ->> 'city' AS city,
|
||||
tenant -> 'address' ->> 'postal_code' AS postal_code,
|
||||
tenant -> 'address' ->> 'country_code' AS country_code,
|
||||
TRY_CAST(tenant -> 'address' ->> 'coordinate_lat' AS DOUBLE) AS lat,
|
||||
TRY_CAST(tenant -> 'address' ->> 'coordinate_lon' AS DOUBLE) AS lon,
|
||||
TRY_CAST(tenant -> 'address' -> 'coordinate' ->> 'lat' AS DOUBLE) AS lat,
|
||||
TRY_CAST(tenant -> 'address' -> 'coordinate' ->> 'lon' AS DOUBLE) AS lon,
|
||||
tenant ->> 'sport_ids' AS sport_ids_raw,
|
||||
tenant ->> 'tenant_type' AS tenant_type,
|
||||
filename AS source_file,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
-- Eurostat Urban Audit city population (dataset: urb_cpop1).
|
||||
-- Reads landing zone JSON directly and parses the Eurostat multidimensional format.
|
||||
-- Reads pre-processed landing zone JSON (extractor normalizes JSON-stat to flat rows).
|
||||
-- One row per (city_code, year) with validated population values.
|
||||
--
|
||||
-- Source: data/landing/eurostat/{year}/{month}/urb_cpop1.json.gz
|
||||
@@ -11,51 +11,26 @@ MODEL (
|
||||
grain (city_code, ref_year)
|
||||
);
|
||||
|
||||
WITH raw AS (
|
||||
SELECT raw_json, filename
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/eurostat/*/*/urb_cpop1.json.gz',
|
||||
format = 'auto',
|
||||
filename = true,
|
||||
columns = { 'raw_json': 'JSON' }
|
||||
)
|
||||
),
|
||||
cities AS (
|
||||
WITH parsed AS (
|
||||
SELECT
|
||||
city_code,
|
||||
(city_pos)::INTEGER AS city_pos,
|
||||
filename, raw_json,
|
||||
(json_extract(raw_json, '$.size[1]'))::INTEGER AS n_times
|
||||
FROM raw,
|
||||
LATERAL (
|
||||
SELECT key AS city_code, value::INTEGER AS city_pos
|
||||
FROM json_each(json_extract(raw_json, '$.dimension.cities.category.index'))
|
||||
row ->> 'geo_code' AS geo_code,
|
||||
row ->> 'ref_year' AS ref_year,
|
||||
TRY_CAST(row ->> 'value' AS DOUBLE) AS population,
|
||||
filename AS source_file,
|
||||
CURRENT_DATE AS extracted_date
|
||||
FROM (
|
||||
SELECT UNNEST(rows) AS row, filename
|
||||
FROM read_json(
|
||||
@LANDING_DIR || '/eurostat/*/*/urb_cpop1.json.gz',
|
||||
format = 'auto',
|
||||
filename = true
|
||||
)
|
||||
)
|
||||
),
|
||||
times AS (
|
||||
SELECT key AS ref_year, value::INTEGER AS time_pos
|
||||
FROM (SELECT raw_json FROM raw LIMIT 1),
|
||||
LATERAL (
|
||||
SELECT key, value
|
||||
FROM json_each(json_extract(raw_json, '$.dimension.time.category.index'))
|
||||
)
|
||||
),
|
||||
parsed AS (
|
||||
SELECT
|
||||
c.city_code,
|
||||
t.ref_year,
|
||||
TRY_CAST(
|
||||
json_extract(c.raw_json, '$.' || (c.city_pos * c.n_times + t.time_pos)::TEXT)
|
||||
AS DOUBLE
|
||||
) AS population,
|
||||
c.filename AS source_file,
|
||||
CURRENT_DATE AS extracted_date
|
||||
FROM cities c
|
||||
CROSS JOIN times t
|
||||
WHERE (row ->> 'geo_code') IS NOT NULL
|
||||
)
|
||||
SELECT
|
||||
UPPER(city_code) AS city_code,
|
||||
UPPER(LEFT(city_code, 2)) AS country_code,
|
||||
UPPER(geo_code) AS city_code,
|
||||
UPPER(LEFT(geo_code, 2)) AS country_code,
|
||||
ref_year::INTEGER AS ref_year,
|
||||
population::BIGINT AS population,
|
||||
extracted_date
|
||||
|
||||
Reference in New Issue
Block a user