From 0a993579a7fa5f5656b0800e2addebf28c9968eb Mon Sep 17 00:00:00 2001
From: Simon <simon.dreesmann@gmail.com>
Date: Wed, 11 Jun 2025 10:25:50 +0100
Subject: [PATCH] Add extraction notebook and ignore data/ directory

---
 .gitignore                    |   1 +
 notebooks/03_Extraction.ipynb | 151 ++++++++++++++++++++++++++++++++++
 2 files changed, 152 insertions(+)
 create mode 100644 notebooks/03_Extraction.ipynb

diff --git a/.gitignore b/.gitignore
index 0a19790..26b30df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+data/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/notebooks/03_Extraction.ipynb b/notebooks/03_Extraction.ipynb
new file mode 100644
index 0000000..9f2ec50
--- /dev/null
+++ b/notebooks/03_Extraction.ipynb
@@ -0,0 +1,151 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'\\ndf.rename(columns={\\n    \\'Commodity_Description\\': \\'commodity\\',\\n    \\'Country_Name\\': \\'country\\',\\n    \\'Attribute_Description\\': \\'attribute\\',\\n    \\'Value\\': \\'value\\',\\n    \\'Market_Year\\': \\'market_year\\'\\n}, inplace=True)\\n\\ndf[\\'report_date\\'] = pd.to_datetime(df[\\'market_year\\'].astype(int).astype(str) + \\'-01-01\\')\\ndf = df.dropna(subset=[\\'commodity\\', \\'country\\', \\'attribute\\', \\'value\\'])\\n\\n# === 2. Create Pivot Tables Per Commodity (Silver Layer) ===\\nsilver_dir = \"silver_layer_pivots\"\\nos.makedirs(silver_dir, exist_ok=True)\\ncommodity_pivots = {}\\n\\nfor commodity, group in df.groupby(\\'commodity\\'):\\n    pivot = group.pivot_table(\\n        index=[\\'report_date\\', \\'country\\'],\\n        columns=\\'attribute\\',\\n        values=\\'value\\',\\n        aggfunc=\\'sum\\',\\n        fill_value=0  # fill missing combinations with 0\\n    ).reset_index()\\n\\n    # Save each silver-layer file\\n    safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\\n    filepath = os.path.join(silver_dir, f\"{safe_name}.csv\")\\n    pivot.to_csv(filepath, index=False)\\n    commodity_pivots[commodity] = filepath\\n\\nprint(\"✅ Silver layer saved to \\'silver_layer_pivots/\\'\")\\n\\n# === 3. Create Gold Layer with Derived Metrics ===\\ngold_dir = \"gold_layer_pivots\"\\nos.makedirs(gold_dir, exist_ok=True)\\n\\nall_global_summaries = []\\n\\nfor commodity, filepath in commodity_pivots.items():\\n    df = pd.read_csv(filepath)\\n\\n    # Ensure numeric types\\n    required_cols = [\\'Production\\', \\'Imports\\', \\'Exports\\', \\'Total Distribution\\', \\'Ending Stocks\\']\\n    for col in required_cols:\\n        if col not in df.columns:\\n            df[col] = 0  # Fill missing columns with 0\\n        else:\\n            df[col] = pd.to_numeric(df[col], errors=\\'coerce\\').fillna(0)\\n\\n    # Derived metrics per country\\n    df[\\'Net Supply\\'] = df[\\'Production\\'] + df[\\'Imports\\'] - df[\\'Exports\\']\\n    df[\\'Trade Balance\\'] = df[\\'Exports\\'] - df[\\'Imports\\']\\n    df[\\'Supply-Demand Balance\\'] = df[\\'Net Supply\\'] - df[\\'Total Distribution\\']\\n    df[\\'Stock-to-Use Ratio (%)\\'] = df[\\'Ending Stocks\\'] / df[\\'Total Distribution\\'].replace(0, pd.NA) * 100\\n\\n    # Global aggregates\\n    global_summary = df.groupby(\\'report_date\\').agg({\\n        \\'Production\\': \\'sum\\',\\n        \\'Imports\\': \\'sum\\',\\n        \\'Exports\\': \\'sum\\',\\n        \\'Total Distribution\\': \\'sum\\',\\n        \\'Ending Stocks\\': \\'sum\\'\\n    }).reset_index()\\n\\n    global_summary[\\'Net Supply\\'] = global_summary[\\'Production\\'] + global_summary[\\'Imports\\'] - global_summary[\\'Exports\\']\\n    global_summary[\\'Supply-Demand Balance\\'] = global_summary[\\'Net Supply\\'] - global_summary[\\'Total Distribution\\']\\n    global_summary[\\'Stock-to-Use Ratio (%)\\'] = global_summary[\\'Ending Stocks\\'] / global_summary[\\'Total Distribution\\'].replace(0, pd.NA) * 100\\n    global_summary[\\'country\\'] = \\'Global\\'\\n    global_summary[\\'commodity\\'] = commodity\\n\\n    all_global_summaries.append(global_summary)\\n\\n    # Add YoY Production change\\n    df.sort_values([\\'country\\', \\'report_date\\'], inplace=True)\\n    df[\\'Production_YoY (%)\\'] = df.groupby(\\'country\\')[\\'Production\\'].transform(lambda x: x.pct_change() * 100)\\n\\n    # Save gold-layer file\\n    safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\\n    filepath = os.path.join(gold_dir, f\"{safe_name}.csv\")\\n    df.to_csv(filepath, index=False)\\n\\nprint(\"🌟 Gold layer saved to \\'gold_layer_pivots/\\'\")\\n\\n# === 4. Combined Global Summary\\ncombined_global = pd.concat(all_global_summaries, ignore_index=True)\\ncombined_global = combined_global[[\\'commodity\\', \\'report_date\\', \\'Production\\', \\'Imports\\', \\'Exports\\',\\n                                   \\'Total Distribution\\', \\'Ending Stocks\\', \\'Net Supply\\',\\n                                   \\'Supply-Demand Balance\\', \\'Stock-to-Use Ratio (%)\\']]\\ncombined_global.to_csv(\"global_summary_all.csv\", index=False)\\nprint(\"🌐 Combined global summary saved as \\'global_summary_all.csv\\'\")\\n'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import os\n",
+    "\n",
+    "# === 1. Load and Clean the Dataset ===\n",
+    "\n",
+    "data = \"../data/\"\n",
+    "df = pd.read_csv(\"../data/psd_alldata.csv\", encoding=\"latin1\")\n",
+    "\"\"\"\n",
+    "df.rename(columns={\n",
+    "    'Commodity_Description': 'commodity',\n",
+    "    'Country_Name': 'country',\n",
+    "    'Attribute_Description': 'attribute',\n",
+    "    'Value': 'value',\n",
+    "    'Market_Year': 'market_year'\n",
+    "}, inplace=True)\n",
+    "\n",
+    "df['report_date'] = pd.to_datetime(df['market_year'].astype(int).astype(str) + '-01-01')\n",
+    "df = df.dropna(subset=['commodity', 'country', 'attribute', 'value'])\n",
+    "\n",
+    "# === 2. Create Pivot Tables Per Commodity (Silver Layer) ===\n",
+    "silver_dir = \"silver_layer_pivots\"\n",
+    "os.makedirs(silver_dir, exist_ok=True)\n",
+    "commodity_pivots = {}\n",
+    "\n",
+    "for commodity, group in df.groupby('commodity'):\n",
+    "    pivot = group.pivot_table(\n",
+    "        index=['report_date', 'country'],\n",
+    "        columns='attribute',\n",
+    "        values='value',\n",
+    "        aggfunc='sum',\n",
+    "        fill_value=0  # fill missing combinations with 0\n",
+    "    ).reset_index()\n",
+    "\n",
+    "    # Save each silver-layer file\n",
+    "    safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\n",
+    "    filepath = os.path.join(silver_dir, f\"{safe_name}.csv\")\n",
+    "    pivot.to_csv(filepath, index=False)\n",
+    "    commodity_pivots[commodity] = filepath\n",
+    "\n",
+    "print(\"✅ Silver layer saved to 'silver_layer_pivots/'\")\n",
+    "\n",
+    "# === 3. Create Gold Layer with Derived Metrics ===\n",
+    "gold_dir = \"gold_layer_pivots\"\n",
+    "os.makedirs(gold_dir, exist_ok=True)\n",
+    "\n",
+    "all_global_summaries = []\n",
+    "\n",
+    "for commodity, filepath in commodity_pivots.items():\n",
+    "    df = pd.read_csv(filepath)\n",
+    "\n",
+    "    # Ensure numeric types\n",
+    "    required_cols = ['Production', 'Imports', 'Exports', 'Total Distribution', 'Ending Stocks']\n",
+    "    for col in required_cols:\n",
+    "        if col not in df.columns:\n",
+    "            df[col] = 0  # Fill missing columns with 0\n",
+    "        else:\n",
+    "            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)\n",
+    "\n",
+    "    # Derived metrics per country\n",
+    "    df['Net Supply'] = df['Production'] + df['Imports'] - df['Exports']\n",
+    "    df['Trade Balance'] = df['Exports'] - df['Imports']\n",
+    "    df['Supply-Demand Balance'] = df['Net Supply'] - df['Total Distribution']\n",
+    "    df['Stock-to-Use Ratio (%)'] = df['Ending Stocks'] / df['Total Distribution'].replace(0, pd.NA) * 100\n",
+    "\n",
+    "    # Global aggregates\n",
+    "    global_summary = df.groupby('report_date').agg({\n",
+    "        'Production': 'sum',\n",
+    "        'Imports': 'sum',\n",
+    "        'Exports': 'sum',\n",
+    "        'Total Distribution': 'sum',\n",
+    "        'Ending Stocks': 'sum'\n",
+    "    }).reset_index()\n",
+    "\n",
+    "    global_summary['Net Supply'] = global_summary['Production'] + global_summary['Imports'] - global_summary['Exports']\n",
+    "    global_summary['Supply-Demand Balance'] = global_summary['Net Supply'] - global_summary['Total Distribution']\n",
+    "    global_summary['Stock-to-Use Ratio (%)'] = global_summary['Ending Stocks'] / global_summary['Total Distribution'].replace(0, pd.NA) * 100\n",
+    "    global_summary['country'] = 'Global'\n",
+    "    global_summary['commodity'] = commodity\n",
+    "\n",
+    "    all_global_summaries.append(global_summary)\n",
+    "\n",
+    "    # Add YoY Production change\n",
+    "    df.sort_values(['country', 'report_date'], inplace=True)\n",
+    "    df['Production_YoY (%)'] = df.groupby('country')['Production'].transform(lambda x: x.pct_change() * 100)\n",
+    "\n",
+    "    # Save gold-layer file\n",
+    "    safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\n",
+    "    filepath = os.path.join(gold_dir, f\"{safe_name}.csv\")\n",
+    "    df.to_csv(filepath, index=False)\n",
+    "\n",
+    "print(\"🌟 Gold layer saved to 'gold_layer_pivots/'\")\n",
+    "\n",
+    "# === 4. Combined Global Summary\n",
+    "combined_global = pd.concat(all_global_summaries, ignore_index=True)\n",
+    "combined_global = combined_global[['commodity', 'report_date', 'Production', 'Imports', 'Exports',\n",
+    "                                   'Total Distribution', 'Ending Stocks', 'Net Supply',\n",
+    "                                   'Supply-Demand Balance', 'Stock-to-Use Ratio (%)']]\n",
+    "combined_global.to_csv(\"global_summary_all.csv\", index=False)\n",
+    "print(\"🌐 Combined global summary saved as 'global_summary_all.csv'\")\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}