From 0a993579a7fa5f5656b0800e2addebf28c9968eb Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 11 Jun 2025 10:25:50 +0100 Subject: [PATCH] third versuch --- .gitignore | 1 + notebooks/03_Extraction.ipynb | 151 ++++++++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 notebooks/03_Extraction.ipynb diff --git a/.gitignore b/.gitignore index 0a19790..26b30df 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +data/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/notebooks/03_Extraction.ipynb b/notebooks/03_Extraction.ipynb new file mode 100644 index 0000000..9f2ec50 --- /dev/null +++ b/notebooks/03_Extraction.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\ndf.rename(columns={\\n \\'Commodity_Description\\': \\'commodity\\',\\n \\'Country_Name\\': \\'country\\',\\n \\'Attribute_Description\\': \\'attribute\\',\\n \\'Value\\': \\'value\\',\\n \\'Market_Year\\': \\'market_year\\'\\n}, inplace=True)\\n\\ndf[\\'report_date\\'] = pd.to_datetime(df[\\'market_year\\'].astype(int).astype(str) + \\'-01-01\\')\\ndf = df.dropna(subset=[\\'commodity\\', \\'country\\', \\'attribute\\', \\'value\\'])\\n\\n# === 2. Create Pivot Tables Per Commodity (Silver Layer) ===\\nsilver_dir = \"silver_layer_pivots\"\\nos.makedirs(silver_dir, exist_ok=True)\\ncommodity_pivots = {}\\n\\nfor commodity, group in df.groupby(\\'commodity\\'):\\n pivot = group.pivot_table(\\n index=[\\'report_date\\', \\'country\\'],\\n columns=\\'attribute\\',\\n values=\\'value\\',\\n aggfunc=\\'sum\\',\\n fill_value=0 # fill missing combinations with 0\\n ).reset_index()\\n\\n # Save each silver-layer file\\n safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\\n filepath = os.path.join(silver_dir, f\"{safe_name}.csv\")\\n pivot.to_csv(filepath, index=False)\\n commodity_pivots[commodity] = filepath\\n\\nprint(\"✅ Silver layer saved to \\'silver_layer_pivots/\\'\")\\n\\n# === 3. Create Gold Layer with Derived Metrics ===\\ngold_dir = \"gold_layer_pivots\"\\nos.makedirs(gold_dir, exist_ok=True)\\n\\nall_global_summaries = []\\n\\nfor commodity, filepath in commodity_pivots.items():\\n df = pd.read_csv(filepath)\\n\\n # Ensure numeric types\\n required_cols = [\\'Production\\', \\'Imports\\', \\'Exports\\', \\'Total Distribution\\', \\'Ending Stocks\\']\\n for col in required_cols:\\n if col not in df.columns:\\n df[col] = 0 # Fill missing columns with 0\\n else:\\n df[col] = pd.to_numeric(df[col], errors=\\'coerce\\').fillna(0)\\n\\n # Derived metrics per country\\n df[\\'Net Supply\\'] = df[\\'Production\\'] + df[\\'Imports\\'] - df[\\'Exports\\']\\n df[\\'Trade Balance\\'] = df[\\'Exports\\'] - df[\\'Imports\\']\\n df[\\'Supply-Demand Balance\\'] = df[\\'Net Supply\\'] - df[\\'Total Distribution\\']\\n df[\\'Stock-to-Use Ratio (%)\\'] = df[\\'Ending Stocks\\'] / df[\\'Total Distribution\\'].replace(0, pd.NA) * 100\\n\\n # Global aggregates\\n global_summary = df.groupby(\\'report_date\\').agg({\\n \\'Production\\': \\'sum\\',\\n \\'Imports\\': \\'sum\\',\\n \\'Exports\\': \\'sum\\',\\n \\'Total Distribution\\': \\'sum\\',\\n \\'Ending Stocks\\': \\'sum\\'\\n }).reset_index()\\n\\n global_summary[\\'Net Supply\\'] = global_summary[\\'Production\\'] + global_summary[\\'Imports\\'] - global_summary[\\'Exports\\']\\n global_summary[\\'Supply-Demand Balance\\'] = global_summary[\\'Net Supply\\'] - global_summary[\\'Total Distribution\\']\\n global_summary[\\'Stock-to-Use Ratio (%)\\'] = global_summary[\\'Ending Stocks\\'] / global_summary[\\'Total Distribution\\'].replace(0, pd.NA) * 100\\n global_summary[\\'country\\'] = \\'Global\\'\\n global_summary[\\'commodity\\'] = commodity\\n\\n all_global_summaries.append(global_summary)\\n\\n # Add YoY Production change\\n df.sort_values([\\'country\\', \\'report_date\\'], inplace=True)\\n df[\\'Production_YoY (%)\\'] = df.groupby(\\'country\\')[\\'Production\\'].transform(lambda x: x.pct_change() * 100)\\n\\n # Save gold-layer file\\n safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\\n filepath = os.path.join(gold_dir, f\"{safe_name}.csv\")\\n df.to_csv(filepath, index=False)\\n\\nprint(\"🌟 Gold layer saved to \\'gold_layer_pivots/\\'\")\\n\\n# === 4. Combined Global Summary\\ncombined_global = pd.concat(all_global_summaries, ignore_index=True)\\ncombined_global = combined_global[[\\'commodity\\', \\'report_date\\', \\'Production\\', \\'Imports\\', \\'Exports\\',\\n \\'Total Distribution\\', \\'Ending Stocks\\', \\'Net Supply\\',\\n \\'Supply-Demand Balance\\', \\'Stock-to-Use Ratio (%)\\']]\\ncombined_global.to_csv(\"global_summary_all.csv\", index=False)\\nprint(\"🌐 Combined global summary saved as \\'global_summary_all.csv\\'\")\\n'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import os\n", + "\n", + "# === 1. Load and Clean the Dataset ===\n", + "\n", + "data = \"../data/\"\n", + "df = pd.read_csv(\"../data/psd_alldata.csv\", encoding=\"latin1\")\n", + "\"\"\"\n", + "df.rename(columns={\n", + " 'Commodity_Description': 'commodity',\n", + " 'Country_Name': 'country',\n", + " 'Attribute_Description': 'attribute',\n", + " 'Value': 'value',\n", + " 'Market_Year': 'market_year'\n", + "}, inplace=True)\n", + "\n", + "df['report_date'] = pd.to_datetime(df['market_year'].astype(int).astype(str) + '-01-01')\n", + "df = df.dropna(subset=['commodity', 'country', 'attribute', 'value'])\n", + "\n", + "# === 2. Create Pivot Tables Per Commodity (Silver Layer) ===\n", + "silver_dir = \"silver_layer_pivots\"\n", + "os.makedirs(silver_dir, exist_ok=True)\n", + "commodity_pivots = {}\n", + "\n", + "for commodity, group in df.groupby('commodity'):\n", + " pivot = group.pivot_table(\n", + " index=['report_date', 'country'],\n", + " columns='attribute',\n", + " values='value',\n", + " aggfunc='sum',\n", + " fill_value=0 # fill missing combinations with 0\n", + " ).reset_index()\n", + "\n", + " # Save each silver-layer file\n", + " safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\n", + " filepath = os.path.join(silver_dir, f\"{safe_name}.csv\")\n", + " pivot.to_csv(filepath, index=False)\n", + " commodity_pivots[commodity] = filepath\n", + "\n", + "print(\"✅ Silver layer saved to 'silver_layer_pivots/'\")\n", + "\n", + "# === 3. Create Gold Layer with Derived Metrics ===\n", + "gold_dir = \"gold_layer_pivots\"\n", + "os.makedirs(gold_dir, exist_ok=True)\n", + "\n", + "all_global_summaries = []\n", + "\n", + "for commodity, filepath in commodity_pivots.items():\n", + " df = pd.read_csv(filepath)\n", + "\n", + " # Ensure numeric types\n", + " required_cols = ['Production', 'Imports', 'Exports', 'Total Distribution', 'Ending Stocks']\n", + " for col in required_cols:\n", + " if col not in df.columns:\n", + " df[col] = 0 # Fill missing columns with 0\n", + " else:\n", + " df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)\n", + "\n", + " # Derived metrics per country\n", + " df['Net Supply'] = df['Production'] + df['Imports'] - df['Exports']\n", + " df['Trade Balance'] = df['Exports'] - df['Imports']\n", + " df['Supply-Demand Balance'] = df['Net Supply'] - df['Total Distribution']\n", + " df['Stock-to-Use Ratio (%)'] = df['Ending Stocks'] / df['Total Distribution'].replace(0, pd.NA) * 100\n", + "\n", + " # Global aggregates\n", + " global_summary = df.groupby('report_date').agg({\n", + " 'Production': 'sum',\n", + " 'Imports': 'sum',\n", + " 'Exports': 'sum',\n", + " 'Total Distribution': 'sum',\n", + " 'Ending Stocks': 'sum'\n", + " }).reset_index()\n", + "\n", + " global_summary['Net Supply'] = global_summary['Production'] + global_summary['Imports'] - global_summary['Exports']\n", + " global_summary['Supply-Demand Balance'] = global_summary['Net Supply'] - global_summary['Total Distribution']\n", + " global_summary['Stock-to-Use Ratio (%)'] = global_summary['Ending Stocks'] / global_summary['Total Distribution'].replace(0, pd.NA) * 100\n", + " global_summary['country'] = 'Global'\n", + " global_summary['commodity'] = commodity\n", + "\n", + " all_global_summaries.append(global_summary)\n", + "\n", + " # Add YoY Production change\n", + " df.sort_values(['country', 'report_date'], inplace=True)\n", + " df['Production_YoY (%)'] = df.groupby('country')['Production'].transform(lambda x: x.pct_change() * 100)\n", + "\n", + " # Save gold-layer file\n", + " safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\n", + " filepath = os.path.join(gold_dir, f\"{safe_name}.csv\")\n", + " df.to_csv(filepath, index=False)\n", + "\n", + "print(\"🌟 Gold layer saved to 'gold_layer_pivots/'\")\n", + "\n", + "# === 4. Combined Global Summary\n", + "combined_global = pd.concat(all_global_summaries, ignore_index=True)\n", + "combined_global = combined_global[['commodity', 'report_date', 'Production', 'Imports', 'Exports',\n", + " 'Total Distribution', 'Ending Stocks', 'Net Supply',\n", + " 'Supply-Demand Balance', 'Stock-to-Use Ratio (%)']]\n", + "combined_global.to_csv(\"global_summary_all.csv\", index=False)\n", + "print(\"🌐 Combined global summary saved as 'global_summary_all.csv'\")\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}