beanflows/notebooks/03_Extraction.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\ndf.rename(columns={\\n    \\'Commodity_Description\\': \\'commodity\\',\\n    \\'Country_Name\\': \\'country\\',\\n    \\'Attribute_Description\\': \\'attribute\\',\\n    \\'Value\\': \\'value\\',\\n    \\'Market_Year\\': \\'market_year\\'\\n}, inplace=True)\\n\\ndf[\\'report_date\\'] = pd.to_datetime(df[\\'market_year\\'].astype(int).astype(str) + \\'-01-01\\')\\ndf = df.dropna(subset=[\\'commodity\\', \\'country\\', \\'attribute\\', \\'value\\'])\\n\\n# === 2. Create Pivot Tables Per Commodity (Silver Layer) ===\\nsilver_dir = \"silver_layer_pivots\"\\nos.makedirs(silver_dir, exist_ok=True)\\ncommodity_pivots = {}\\n\\nfor commodity, group in df.groupby(\\'commodity\\'):\\n    pivot = group.pivot_table(\\n        index=[\\'report_date\\', \\'country\\'],\\n        columns=\\'attribute\\',\\n        values=\\'value\\',\\n        aggfunc=\\'sum\\',\\n        fill_value=0  # fill missing combinations with 0\\n    ).reset_index()\\n\\n    # Save each silver-layer file\\n    safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\\n    filepath = os.path.join(silver_dir, f\"{safe_name}.csv\")\\n    pivot.to_csv(filepath, index=False)\\n    commodity_pivots[commodity] = filepath\\n\\nprint(\"✅ Silver layer saved to \\'silver_layer_pivots/\\'\")\\n\\n# === 3. 
Create Gold Layer with Derived Metrics ===\\ngold_dir = \"gold_layer_pivots\"\\nos.makedirs(gold_dir, exist_ok=True)\\n\\nall_global_summaries = []\\n\\nfor commodity, filepath in commodity_pivots.items():\\n    df = pd.read_csv(filepath)\\n\\n    # Ensure numeric types\\n    required_cols = [\\'Production\\', \\'Imports\\', \\'Exports\\', \\'Total Distribution\\', \\'Ending Stocks\\']\\n    for col in required_cols:\\n        if col not in df.columns:\\n            df[col] = 0  # Fill missing columns with 0\\n        else:\\n            df[col] = pd.to_numeric(df[col], errors=\\'coerce\\').fillna(0)\\n\\n    # Derived metrics per country\\n    df[\\'Net Supply\\'] = df[\\'Production\\'] + df[\\'Imports\\'] - df[\\'Exports\\']\\n    df[\\'Trade Balance\\'] = df[\\'Exports\\'] - df[\\'Imports\\']\\n    df[\\'Supply-Demand Balance\\'] = df[\\'Net Supply\\'] - df[\\'Total Distribution\\']\\n    df[\\'Stock-to-Use Ratio (%)\\'] = df[\\'Ending Stocks\\'] / df[\\'Total Distribution\\'].replace(0, pd.NA) * 100\\n\\n    # Global aggregates\\n    global_summary = df.groupby(\\'report_date\\').agg({\\n        \\'Production\\': \\'sum\\',\\n        \\'Imports\\': \\'sum\\',\\n        \\'Exports\\': \\'sum\\',\\n        \\'Total Distribution\\': \\'sum\\',\\n        \\'Ending Stocks\\': \\'sum\\'\\n    }).reset_index()\\n\\n    global_summary[\\'Net Supply\\'] = global_summary[\\'Production\\'] + global_summary[\\'Imports\\'] - global_summary[\\'Exports\\']\\n    global_summary[\\'Supply-Demand Balance\\'] = global_summary[\\'Net Supply\\'] - global_summary[\\'Total Distribution\\']\\n    global_summary[\\'Stock-to-Use Ratio (%)\\'] = global_summary[\\'Ending Stocks\\'] / global_summary[\\'Total Distribution\\'].replace(0, pd.NA) * 100\\n    global_summary[\\'country\\'] = \\'Global\\'\\n    global_summary[\\'commodity\\'] = commodity\\n\\n    all_global_summaries.append(global_summary)\\n\\n    # Add YoY Production change\\n    df.sort_values([\\'country\\', 
\\'report_date\\'], inplace=True)\\n    df[\\'Production_YoY (%)\\'] = df.groupby(\\'country\\')[\\'Production\\'].transform(lambda x: x.pct_change() * 100)\\n\\n    # Save gold-layer file\\n    safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\\n    filepath = os.path.join(gold_dir, f\"{safe_name}.csv\")\\n    df.to_csv(filepath, index=False)\\n\\nprint(\"🌟 Gold layer saved to \\'gold_layer_pivots/\\'\")\\n\\n# === 4. Combined Global Summary\\ncombined_global = pd.concat(all_global_summaries, ignore_index=True)\\ncombined_global = combined_global[[\\'commodity\\', \\'report_date\\', \\'Production\\', \\'Imports\\', \\'Exports\\',\\n                                   \\'Total Distribution\\', \\'Ending Stocks\\', \\'Net Supply\\',\\n                                   \\'Supply-Demand Balance\\', \\'Stock-to-Use Ratio (%)\\']]\\ncombined_global.to_csv(\"global_summary_all.csv\", index=False)\\nprint(\"🌐 Combined global summary saved as \\'global_summary_all.csv\\'\")\\n'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "# === 1. Load and Clean the Dataset ===\n",
    "# Reads the PSD export, writes one pivoted CSV per commodity (silver layer),\n",
    "# derives balance metrics per commodity (gold layer) and finally writes a\n",
    "# combined per-year global summary. All outputs are relative to the CWD.\n",
    "\n",
    "data_dir = \"../data/\"\n",
    "psd_raw = pd.read_csv(os.path.join(data_dir, \"psd_alldata.csv\"), encoding=\"latin1\")\n",
    "\n",
    "# Build the cleaned long-format frame without mutating psd_raw, so the raw\n",
    "# data stays inspectable and re-running the cell always gives the same result.\n",
    "# Rows missing any key field are dropped BEFORE the integer cast: astype(int)\n",
    "# raises on NaN, so a single missing market_year previously aborted the cell.\n",
    "psd_long = (\n",
    "    psd_raw\n",
    "    .rename(columns={\n",
    "        'Commodity_Description': 'commodity',\n",
    "        'Country_Name': 'country',\n",
    "        'Attribute_Description': 'attribute',\n",
    "        'Value': 'value',\n",
    "        'Market_Year': 'market_year'\n",
    "    })\n",
    "    .dropna(subset=['commodity', 'country', 'attribute', 'value', 'market_year'])\n",
    "    .assign(report_date=lambda d: pd.to_datetime(d['market_year'].astype(int).astype(str) + '-01-01'))\n",
    ")\n",
    "\n",
    "\n",
    "def safe_filename(commodity):\n",
    "    \"\"\"Return the CSV file name used for `commodity` in the silver and gold layers.\"\"\"\n",
    "    return commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower() + \".csv\"\n",
    "\n",
    "\n",
    "def stock_to_use_pct(frame):\n",
    "    \"\"\"Return 'Ending Stocks' as a percentage of 'Total Distribution' (NaN where it is 0).\"\"\"\n",
    "    total = frame['Total Distribution']\n",
    "    # mask() blanks zero denominators with NaN and keeps a float dtype;\n",
    "    # replace(0, pd.NA) would turn the column, and the ratio, into object dtype.\n",
    "    return frame['Ending Stocks'] / total.mask(total == 0) * 100\n",
    "\n",
    "\n",
    "# === 2. Create Pivot Tables Per Commodity (Silver Layer) ===\n",
    "silver_dir = \"silver_layer_pivots\"\n",
    "os.makedirs(silver_dir, exist_ok=True)\n",
    "commodity_pivots = {}  # commodity name -> path of its silver-layer CSV\n",
    "\n",
    "for commodity, group in psd_long.groupby('commodity'):\n",
    "    pivot = group.pivot_table(\n",
    "        index=['report_date', 'country'],\n",
    "        columns='attribute',\n",
    "        values='value',\n",
    "        aggfunc='sum',\n",
    "        fill_value=0  # fill missing combinations with 0\n",
    "    ).reset_index()\n",
    "\n",
    "    # Save each silver-layer file\n",
    "    silver_path = os.path.join(silver_dir, safe_filename(commodity))\n",
    "    pivot.to_csv(silver_path, index=False)\n",
    "    commodity_pivots[commodity] = silver_path\n",
    "\n",
    "print(\"✅ Silver layer saved to 'silver_layer_pivots/'\")\n",
    "\n",
    "# === 3. Create Gold Layer with Derived Metrics ===\n",
    "gold_dir = \"gold_layer_pivots\"\n",
    "os.makedirs(gold_dir, exist_ok=True)\n",
    "\n",
    "# Attributes every gold frame must expose; loop-invariant, so defined once.\n",
    "required_cols = ['Production', 'Imports', 'Exports', 'Total Distribution', 'Ending Stocks']\n",
    "all_global_summaries = []\n",
    "\n",
    "for commodity, silver_path in commodity_pivots.items():\n",
    "    gold = pd.read_csv(silver_path)\n",
    "\n",
    "    # Ensure numeric types; attributes a commodity never reports become 0\n",
    "    for col in required_cols:\n",
    "        if col not in gold.columns:\n",
    "            gold[col] = 0\n",
    "        else:\n",
    "            gold[col] = pd.to_numeric(gold[col], errors='coerce').fillna(0)\n",
    "\n",
    "    # Derived metrics per country (column order is part of the CSV layout)\n",
    "    gold['Net Supply'] = gold['Production'] + gold['Imports'] - gold['Exports']\n",
    "    gold['Trade Balance'] = gold['Exports'] - gold['Imports']\n",
    "    gold['Supply-Demand Balance'] = gold['Net Supply'] - gold['Total Distribution']\n",
    "    gold['Stock-to-Use Ratio (%)'] = stock_to_use_pct(gold)\n",
    "\n",
    "    # Global aggregates: sum every country per report_date\n",
    "    global_summary = gold.groupby('report_date')[required_cols].sum().reset_index()\n",
    "    global_summary['Net Supply'] = global_summary['Production'] + global_summary['Imports'] - global_summary['Exports']\n",
    "    global_summary['Supply-Demand Balance'] = global_summary['Net Supply'] - global_summary['Total Distribution']\n",
    "    global_summary['Stock-to-Use Ratio (%)'] = stock_to_use_pct(global_summary)\n",
    "    global_summary['country'] = 'Global'\n",
    "    global_summary['commodity'] = commodity\n",
    "    all_global_summaries.append(global_summary)\n",
    "\n",
    "    # Add YoY Production change. Rows must be in date order within each country.\n",
    "    # Gaps were filled with 0 above, so a 0 -> x step yields inf and 0 -> 0 yields NaN.\n",
    "    gold = gold.sort_values(['country', 'report_date'])\n",
    "    gold['Production_YoY (%)'] = gold.groupby('country')['Production'].transform(lambda s: s.pct_change() * 100)\n",
    "\n",
    "    # Save gold-layer file\n",
    "    gold_path = os.path.join(gold_dir, safe_filename(commodity))\n",
    "    gold.to_csv(gold_path, index=False)\n",
    "\n",
    "print(\"🌟 Gold layer saved to 'gold_layer_pivots/'\")\n",
    "\n",
    "# === 4. Combined Global Summary\n",
    "combined_global = pd.concat(all_global_summaries, ignore_index=True)\n",
    "combined_global = combined_global[['commodity', 'report_date', 'Production', 'Imports', 'Exports',\n",
    "                                   'Total Distribution', 'Ending Stocks', 'Net Supply',\n",
    "                                   'Supply-Demand Balance', 'Stock-to-Use Ratio (%)']]\n",
    "combined_global.to_csv(\"global_summary_all.csv\", index=False)\n",
    "print(\"🌐 Combined global summary saved as 'global_summary_all.csv'\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}