fix: eurostat JSON-stat parsing + staging model corrections

Eurostat JSON-stat format (4-7 dimension sparse dict with 583K values)
causes DuckDB OOM — pre-process in extractor to flat records.
Also fix dim_cities unused CTE bug and playtomic venue lat/lon path.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-22 20:52:25 +01:00
parent c25e20f83a
commit 5a1bb21624
4 changed files with 146 additions and 65 deletions

View File

@@ -1,11 +1,16 @@
"""Eurostat extractor — city-level demographic datasets.
Fetches Eurostat Statistics API JSON datasets using etag-based deduplication.
Fetches Eurostat Statistics API JSON-stat datasets with etag-based deduplication.
Data only changes ~twice a year so most runs skip with 304 Not Modified.
The raw Eurostat JSON-stat format is a sparse dictionary over 4-7 dimensions
(e.g. freq × indicator × city × time) which DuckDB cannot efficiently parse.
This extractor normalizes the response into simple JSON arrays that the
staging SQL can UNNEST directly.
Landing: {LANDING_DIR}/eurostat/{year}/{month}/{dataset_code}.json.gz
"""
import json
import sqlite3
from pathlib import Path
@@ -19,11 +24,103 @@ logger = setup_logging("padelnomics.extract.eurostat")
EXTRACTOR_NAME = "eurostat"
EUROSTAT_BASE_URL = "https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data"
# Datasets to fetch
DATASETS = [
"urb_cpop1", # Urban Audit — city population
"ilc_di03", # Median equivalised net income by NUTS2
]
# Dataset configs: filters fix dimension values, geo_dim/time_dim are iterated.
# All other dimensions must either be in filters or have size=1.
DATASETS: dict[str, dict] = {
    # Urban Audit — city population (geo dimension is named "cities")
    "urb_cpop1": {
        "filters": {"indic_ur": "DE1001V"},  # Population on 1 January, total
        "geo_dim": "cities",
        "time_dim": "time",
    },
    # Median equivalised net income by NUTS2 region
    "ilc_di03": {
        "filters": {  # Median equivalised net income
            "age": "TOTAL",      # all age groups
            "sex": "T",          # total (both sexes)
            "indic_il": "MED_E", # median equivalised income indicator
            "unit": "PPS",       # purchasing power standard
        },
        "geo_dim": "geo",
        "time_dim": "time",
    },
}
def _parse_jsonstat(
data: dict,
filters: dict[str, str],
geo_dim: str,
time_dim: str,
) -> list[dict]:
"""Parse a Eurostat JSON-stat response into flat records.
JSON-stat stores values in a sparse flat dict keyed by linear index
computed from dimension positions:
index = sum(pos_i * stride_i) where stride_i = product(sizes[i+1:])
We fix all dimensions except geo and time using ``filters``, then
iterate geo × time to extract every non-null value.
"""
dims = data["dimension"]
sizes = data["size"]
values = data["value"]
dim_names = data["id"]
assert len(dim_names) == len(sizes)
# Compute strides (row-major): stride[i] = product of sizes[i+1:]
strides = [1] * len(sizes)
for i in range(len(sizes) - 2, -1, -1):
strides[i] = strides[i + 1] * sizes[i + 1]
# Resolve fixed dimension positions → compute base offset
offset = 0
geo_idx = None
time_idx = None
for i, name in enumerate(dim_names):
if name == geo_dim:
geo_idx = i
elif name == time_dim:
time_idx = i
elif name in filters:
cat_index = dims[name]["category"]["index"]
code = filters[name]
assert code in cat_index, (
f"Filter value {code!r} not in dimension {name!r}. "
f"Available: {list(cat_index.keys())[:10]}..."
)
offset += cat_index[code] * strides[i]
else:
# Dimension not filtered and not geo/time — must have size 1
assert sizes[i] == 1, f"Dimension {name!r} has size {sizes[i]} but no filter provided"
assert geo_idx is not None, f"geo_dim {geo_dim!r} not found in {dim_names}"
assert time_idx is not None, f"time_dim {time_dim!r} not found in {dim_names}"
geo_index = dims[geo_dim]["category"]["index"]
time_index = dims[time_dim]["category"]["index"]
geo_by_pos = {pos: code for code, pos in geo_index.items()}
time_by_pos = {pos: code for code, pos in time_index.items()}
geo_stride = strides[geo_idx]
time_stride = strides[time_idx]
rows: list[dict] = []
for geo_pos in range(sizes[geo_idx]):
geo_code = geo_by_pos[geo_pos]
for time_pos in range(sizes[time_idx]):
idx = offset + geo_pos * geo_stride + time_pos * time_stride
val = values.get(str(idx))
if val is not None:
rows.append(
{
"geo_code": geo_code,
"ref_year": time_by_pos[time_pos],
"value": val,
}
)
return rows
def _etag_path(dest: Path) -> Path:
@@ -35,8 +132,12 @@ def _fetch_with_etag(
url: str,
dest: Path,
session: niquests.Session,
dataset_config: dict | None = None,
) -> int:
"""GET url with If-None-Match etag. Returns bytes_written (0 if 304)."""
"""GET url with If-None-Match etag. Pre-processes JSON-stat if config given.
Returns bytes_written (0 if 304).
"""
etag_file = _etag_path(dest)
headers: dict[str, str] = {}
@@ -49,7 +150,21 @@ def _fetch_with_etag(
return 0
resp.raise_for_status()
bytes_written = write_gzip_atomic(dest, resp.content)
if dataset_config:
raw = resp.json()
rows = _parse_jsonstat(
raw,
filters=dataset_config["filters"],
geo_dim=dataset_config["geo_dim"],
time_dim=dataset_config["time_dim"],
)
payload = json.dumps({"rows": rows, "count": len(rows)}).encode()
logger.info("parsed %d records", len(rows))
else:
payload = resp.content
bytes_written = write_gzip_atomic(dest, payload)
if etag := resp.headers.get("etag"):
etag_file.parent.mkdir(parents=True, exist_ok=True)
@@ -70,13 +185,13 @@ def extract(
files_skipped = 0
bytes_written_total = 0
for dataset_code in DATASETS:
for dataset_code, config in DATASETS.items():
url = f"{EUROSTAT_BASE_URL}/{dataset_code}?format=JSON&lang=EN"
dest_dir = landing_path(landing_dir, "eurostat", year, month)
dest = dest_dir / f"{dataset_code}.json.gz"
logger.info("GET %s", dataset_code)
bytes_written = _fetch_with_etag(url, dest, session)
bytes_written = _fetch_with_etag(url, dest, session, config)
if bytes_written > 0:
logger.info("%s updated — %s bytes compressed", dataset_code, f"{bytes_written:,}")