merge(worktree): individualise article costs with per-country Eurostat data + tiered proxy tenant work

# Conflicts:
#	CHANGELOG.md
#	transform/sqlmesh_padelnomics/models/foundation/dim_cities.sql
#	transform/sqlmesh_padelnomics/models/foundation/dim_locations.sql
This commit is contained in:
Deeman
2026-03-04 12:44:56 +01:00
12 changed files with 679 additions and 36 deletions

View File

@@ -26,6 +26,10 @@ EUROSTAT_BASE_URL = "https://ec.europa.eu/eurostat/api/dissemination/statistics/
# Dataset configs: filters fix dimension values, geo_dim/time_dim are iterated.
# All other dimensions must either be in filters or have size=1.
#
# Optional `dataset_code` field: when present, used for the API URL instead of the dict key.
# This allows multiple entries to share the same Eurostat dataset with different filters
# (e.g. five prc_ppp_ind entries with different ppp_cat values).
DATASETS: dict[str, dict] = {
"urb_cpop1": {
"filters": {"indic_ur": "DE1001V"}, # Population on 1 January, total
@@ -51,6 +55,59 @@ DATASETS: dict[str, dict] = {
"geo_dim": "geo",
"time_dim": "time",
},
# ── Direct-value datasets (actual EUR figures) ───────────────────────────
"nrg_pc_205": {
# Electricity prices for non-household consumers, EUR/kWh, incl. all taxes and levies (I_TAX — NOTE: use X_TAX if excl.-tax prices were intended)
"filters": {"freq": "S", "nrg_cons": "MWH500-1999", "currency": "EUR", "tax": "I_TAX"},
"geo_dim": "geo",
"time_dim": "time",
},
"nrg_pc_203": {
# Gas prices for non-household consumers, EUR/GJ, incl. all taxes and levies (I_TAX — NOTE: use X_TAX if excl.-tax prices were intended)
"filters": {"freq": "S", "nrg_cons": "GJ1000-9999", "currency": "EUR", "tax": "I_TAX"},
"geo_dim": "geo",
"time_dim": "time",
},
"lc_lci_lev": {
# Labour cost levels EUR/hour — NACE N (administrative/support services)
# Stored in dim_countries for future staffed-scenario calculations.
"filters": {"lcstruct": "D1_D2_A_HW", "nace_r2": "N", "currency": "EUR"},
"geo_dim": "geo",
"time_dim": "time",
},
# ── Price level indices (relative scaling, EU27=100) ─────────────────────
# Five entries share the prc_ppp_ind dataset with different ppp_cat filters.
# dataset_code points to the real API endpoint; the dict key is the landing filename.
"prc_ppp_ind_construction": {
"dataset_code": "prc_ppp_ind",
"filters": {"ppp_cat": "A050202", "na_item": "PLI_EU27_2020"},
"geo_dim": "geo",
"time_dim": "time",
},
"prc_ppp_ind_housing": {
"dataset_code": "prc_ppp_ind",
"filters": {"ppp_cat": "A0104", "na_item": "PLI_EU27_2020"},
"geo_dim": "geo",
"time_dim": "time",
},
"prc_ppp_ind_services": {
"dataset_code": "prc_ppp_ind",
"filters": {"ppp_cat": "P0201", "na_item": "PLI_EU27_2020"},
"geo_dim": "geo",
"time_dim": "time",
},
"prc_ppp_ind_misc": {
"dataset_code": "prc_ppp_ind",
"filters": {"ppp_cat": "A0112", "na_item": "PLI_EU27_2020"},
"geo_dim": "geo",
"time_dim": "time",
},
"prc_ppp_ind_government": {
"dataset_code": "prc_ppp_ind",
"filters": {"ppp_cat": "P0202", "na_item": "PLI_EU27_2020"},
"geo_dim": "geo",
"time_dim": "time",
},
}
@@ -196,22 +253,25 @@ def extract(
files_skipped = 0
bytes_written_total = 0
for dataset_code, config in DATASETS.items():
url = f"{EUROSTAT_BASE_URL}/{dataset_code}?format=JSON&lang=EN"
for dataset_key, config in DATASETS.items():
# Use dataset_code (if set) for the API URL; fall back to the dict key.
# This lets multiple entries share one Eurostat dataset with different filters.
api_code = config.get("dataset_code", dataset_key)
url = f"{EUROSTAT_BASE_URL}/{api_code}?format=JSON&lang=EN"
for key, val in config.get("filters", {}).items():
url += f"&{key}={val}"
dest_dir = landing_path(landing_dir, "eurostat", year, month)
dest = dest_dir / f"{dataset_code}.json.gz"
dest = dest_dir / f"{dataset_key}.json.gz"
logger.info("GET %s", dataset_code)
logger.info("GET %s", dataset_key)
bytes_written = _fetch_with_etag(url, dest, session, config)
if bytes_written > 0:
logger.info("%s updated — %s bytes compressed", dataset_code, f"{bytes_written:,}")
logger.info("%s updated — %s bytes compressed", dataset_key, f"{bytes_written:,}")
files_written += 1
bytes_written_total += bytes_written
else:
logger.info("%s not modified (304)", dataset_code)
logger.info("%s not modified (304)", dataset_key)
files_skipped += 1
return {