feat(transform): individualise article costs with per-country Eurostat data
Add real per-country cost data to ~30 calculator fields so pSEO articles show country-specific CAPEX/OPEX instead of hardcoded DE defaults.

Extractor:
- eurostat.py: add 8 new datasets (nrg_pc_205, nrg_pc_203, lc_lci_lev, 5×prc_ppp_ind variants); add optional `dataset_code` field so multiple dict entries can share one Eurostat API endpoint

Staging (4 new models):
- stg_electricity_prices — EUR/kWh by country, semi-annual
- stg_gas_prices — EUR/GJ by country, semi-annual
- stg_labour_costs — EUR/hour by country, annual (future staffed scenario)
- stg_price_levels — PLI indices (EU27=100) for 5 categories, annual

Foundation:
- dim_countries (new) — conformed country dimension; eliminates ~50-line CASE blocks duplicated in dim_cities/dim_locations; computes ~29 calculator cost override columns from PLI ratios and energy price ratios vs DE baseline; NULL for DE so calculator falls through to DEFAULTS unchanged
- dim_cities — replace country_name/slug CASE blocks + country_income CTE with JOIN dim_countries
- dim_locations — same refactor as dim_cities

Serving:
- pseo_city_costs_de — JOIN dim_countries; add 29 camelCase override columns auto-applied by calculator (electricity, heating, rentSqm, hallCostSqm, …)
- planner_defaults — JOIN dim_countries; same 29 cost columns flow through to /api/market-data endpoint

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -26,6 +26,10 @@ EUROSTAT_BASE_URL = "https://ec.europa.eu/eurostat/api/dissemination/statistics/
|
||||
|
||||
# Dataset configs: filters fix dimension values, geo_dim/time_dim are iterated.
|
||||
# All other dimensions must either be in filters or have size=1.
|
||||
#
|
||||
# Optional `dataset_code` field: when present, used for the API URL instead of the dict key.
|
||||
# This allows multiple entries to share the same Eurostat dataset with different filters
|
||||
# (e.g. five prc_ppp_ind entries with different ppp_cat values).
|
||||
DATASETS: dict[str, dict] = {
|
||||
"urb_cpop1": {
|
||||
"filters": {"indic_ur": "DE1001V"}, # Population on 1 January, total
|
||||
@@ -51,6 +55,59 @@ DATASETS: dict[str, dict] = {
|
||||
"geo_dim": "geo",
|
||||
"time_dim": "time",
|
||||
},
|
||||
# ── Direct-value datasets (actual EUR figures) ───────────────────────────
|
||||
"nrg_pc_205": {
|
||||
# Electricity prices for non-household consumers, EUR/kWh, incl. all taxes and levies
# (tax=I_TAX; Eurostat's code for prices excluding taxes and levies is X_TAX).
|
||||
"filters": {"freq": "S", "nrg_cons": "MWH500-1999", "currency": "EUR", "tax": "I_TAX"},
|
||||
"geo_dim": "geo",
|
||||
"time_dim": "time",
|
||||
},
|
||||
"nrg_pc_203": {
|
||||
# Gas prices for non-household consumers, EUR/GJ, incl. all taxes and levies
# (tax=I_TAX; Eurostat's code for prices excluding taxes and levies is X_TAX).
|
||||
"filters": {"freq": "S", "nrg_cons": "GJ1000-9999", "currency": "EUR", "tax": "I_TAX"},
|
||||
"geo_dim": "geo",
|
||||
"time_dim": "time",
|
||||
},
|
||||
"lc_lci_lev": {
|
||||
# Labour cost levels EUR/hour — NACE N (administrative/support services)
|
||||
# Stored in dim_countries for future staffed-scenario calculations.
|
||||
"filters": {"lcstruct": "D1_D2_A_HW", "nace_r2": "N", "currency": "EUR"},
|
||||
"geo_dim": "geo",
|
||||
"time_dim": "time",
|
||||
},
|
||||
# ── Price level indices (relative scaling, EU27=100) ─────────────────────
|
||||
# Five entries share the prc_ppp_ind dataset with different ppp_cat filters.
|
||||
# dataset_code points to the real API endpoint; the dict key is the landing filename.
|
||||
"prc_ppp_ind_construction": {
|
||||
"dataset_code": "prc_ppp_ind",
|
||||
"filters": {"ppp_cat": "A050202", "na_item": "PLI_EU27_2020"},
|
||||
"geo_dim": "geo",
|
||||
"time_dim": "time",
|
||||
},
|
||||
"prc_ppp_ind_housing": {
|
||||
"dataset_code": "prc_ppp_ind",
|
||||
"filters": {"ppp_cat": "A0104", "na_item": "PLI_EU27_2020"},
|
||||
"geo_dim": "geo",
|
||||
"time_dim": "time",
|
||||
},
|
||||
"prc_ppp_ind_services": {
|
||||
"dataset_code": "prc_ppp_ind",
|
||||
"filters": {"ppp_cat": "P0201", "na_item": "PLI_EU27_2020"},
|
||||
"geo_dim": "geo",
|
||||
"time_dim": "time",
|
||||
},
|
||||
"prc_ppp_ind_misc": {
|
||||
"dataset_code": "prc_ppp_ind",
|
||||
"filters": {"ppp_cat": "A0112", "na_item": "PLI_EU27_2020"},
|
||||
"geo_dim": "geo",
|
||||
"time_dim": "time",
|
||||
},
|
||||
"prc_ppp_ind_government": {
|
||||
"dataset_code": "prc_ppp_ind",
|
||||
"filters": {"ppp_cat": "P0202", "na_item": "PLI_EU27_2020"},
|
||||
"geo_dim": "geo",
|
||||
"time_dim": "time",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -196,22 +253,25 @@ def extract(
|
||||
files_skipped = 0
|
||||
bytes_written_total = 0
|
||||
|
||||
for dataset_code, config in DATASETS.items():
|
||||
url = f"{EUROSTAT_BASE_URL}/{dataset_code}?format=JSON&lang=EN"
|
||||
for dataset_key, config in DATASETS.items():
|
||||
# Use dataset_code (if set) for the API URL; fall back to the dict key.
|
||||
# This lets multiple entries share one Eurostat dataset with different filters.
|
||||
api_code = config.get("dataset_code", dataset_key)
|
||||
url = f"{EUROSTAT_BASE_URL}/{api_code}?format=JSON&lang=EN"
|
||||
for key, val in config.get("filters", {}).items():
|
||||
url += f"&{key}={val}"
|
||||
dest_dir = landing_path(landing_dir, "eurostat", year, month)
|
||||
dest = dest_dir / f"{dataset_code}.json.gz"
|
||||
dest = dest_dir / f"{dataset_key}.json.gz"
|
||||
|
||||
logger.info("GET %s", dataset_code)
|
||||
logger.info("GET %s", dataset_key)
|
||||
bytes_written = _fetch_with_etag(url, dest, session, config)
|
||||
|
||||
if bytes_written > 0:
|
||||
logger.info("%s updated — %s bytes compressed", dataset_code, f"{bytes_written:,}")
|
||||
logger.info("%s updated — %s bytes compressed", dataset_key, f"{bytes_written:,}")
|
||||
files_written += 1
|
||||
bytes_written_total += bytes_written
|
||||
else:
|
||||
logger.info("%s not modified (304)", dataset_code)
|
||||
logger.info("%s not modified (304)", dataset_key)
|
||||
files_skipped += 1
|
||||
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user