Files
beanflows/notebooks/03_Extraction.ipynb
Simon Dmsn 5588be152b Update 3 files
- /notebooks/03_Extraction.ipynb
- /transform/sqlmesh_materia/models/staging/stg_psd_alldata_1_filter_silver_layer.sql
- /transform/sqlmesh_materia/models/staging/stg_psd_alldata_2_filter_gold_layer.sql
2025-08-01 14:52:55 +00:00

151 lines
9.7 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\ndf.rename(columns={\\n \\'Commodity_Description\\': \\'commodity\\',\\n \\'Country_Name\\': \\'country\\',\\n \\'Attribute_Description\\': \\'attribute\\',\\n \\'Value\\': \\'value\\',\\n \\'Market_Year\\': \\'market_year\\'\\n}, inplace=True)\\n\\ndf[\\'report_date\\'] = pd.to_datetime(df[\\'market_year\\'].astype(int).astype(str) + \\'-01-01\\')\\ndf = df.dropna(subset=[\\'commodity\\', \\'country\\', \\'attribute\\', \\'value\\'])\\n\\n# === 2. Create Pivot Tables Per Commodity (Silver Layer) ===\\nsilver_dir = \"silver_layer_pivots\"\\nos.makedirs(silver_dir, exist_ok=True)\\ncommodity_pivots = {}\\n\\nfor commodity, group in df.groupby(\\'commodity\\'):\\n pivot = group.pivot_table(\\n index=[\\'report_date\\', \\'country\\'],\\n columns=\\'attribute\\',\\n values=\\'value\\',\\n aggfunc=\\'sum\\',\\n fill_value=0 # fill missing combinations with 0\\n ).reset_index()\\n\\n # Save each silver-layer file\\n safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\\n filepath = os.path.join(silver_dir, f\"{safe_name}.csv\")\\n pivot.to_csv(filepath, index=False)\\n commodity_pivots[commodity] = filepath\\n\\nprint(\"✅ Silver layer saved to \\'silver_layer_pivots/\\'\")\\n\\n# === 3. Create Gold Layer with Derived Metrics ===\\ngold_dir = \"gold_layer_pivots\"\\nos.makedirs(gold_dir, exist_ok=True)\\n\\nall_global_summaries = []\\n\\nfor commodity, filepath in commodity_pivots.items():\\n df = pd.read_csv(filepath)\\n\\n # Ensure numeric types\\n required_cols = [\\'Production\\', \\'Imports\\', \\'Exports\\', \\'Total Distribution\\', \\'Ending Stocks\\']\\n for col in required_cols:\\n if col not in df.columns:\\n df[col] = 0 # Fill missing columns with 0\\n else:\\n df[col] = pd.to_numeric(df[col], errors=\\'coerce\\').fillna(0)\\n\\n # Derived metrics per country\\n df[\\'Net Supply\\'] = df[\\'Production\\'] + df[\\'Imports\\'] - df[\\'Exports\\']\\n df[\\'Trade Balance\\'] = df[\\'Exports\\'] - df[\\'Imports\\']\\n df[\\'Supply-Demand Balance\\'] = df[\\'Net Supply\\'] - df[\\'Total Distribution\\']\\n df[\\'Stock-to-Use Ratio (%)\\'] = df[\\'Ending Stocks\\'] / df[\\'Total Distribution\\'].replace(0, pd.NA) * 100\\n\\n # Global aggregates\\n global_summary = df.groupby(\\'report_date\\').agg({\\n \\'Production\\': \\'sum\\',\\n \\'Imports\\': \\'sum\\',\\n \\'Exports\\': \\'sum\\',\\n \\'Total Distribution\\': \\'sum\\',\\n \\'Ending Stocks\\': \\'sum\\'\\n }).reset_index()\\n\\n global_summary[\\'Net Supply\\'] = global_summary[\\'Production\\'] + global_summary[\\'Imports\\'] - global_summary[\\'Exports\\']\\n global_summary[\\'Supply-Demand Balance\\'] = global_summary[\\'Net Supply\\'] - global_summary[\\'Total Distribution\\']\\n global_summary[\\'Stock-to-Use Ratio (%)\\'] = global_summary[\\'Ending Stocks\\'] / global_summary[\\'Total Distribution\\'].replace(0, pd.NA) * 100\\n global_summary[\\'country\\'] = \\'Global\\'\\n global_summary[\\'commodity\\'] = commodity\\n\\n all_global_summaries.append(global_summary)\\n\\n # Add YoY Production change\\n df.sort_values([\\'country\\', \\'report_date\\'], inplace=True)\\n df[\\'Production_YoY (%)\\'] = df.groupby(\\'country\\')[\\'Production\\'].transform(lambda x: x.pct_change() * 100)\\n\\n # Save gold-layer file\\n safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\\n filepath = os.path.join(gold_dir, f\"{safe_name}.csv\")\\n df.to_csv(filepath, index=False)\\n\\nprint(\"🌟 Gold layer saved to \\'gold_layer_pivots/\\'\")\\n\\n# === 4. Combined Global Summary\\ncombined_global = pd.concat(all_global_summaries, ignore_index=True)\\ncombined_global = combined_global[[\\'commodity\\', \\'report_date\\', \\'Production\\', \\'Imports\\', \\'Exports\\',\\n \\'Total Distribution\\', \\'Ending Stocks\\', \\'Net Supply\\',\\n \\'Supply-Demand Balance\\', \\'Stock-to-Use Ratio (%)\\']]\\ncombined_global.to_csv(\"global_summary_all.csv\", index=False)\\nprint(\"🌐 Combined global summary saved as \\'global_summary_all.csv\\'\")\\n'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import os\n",
"\n",
"# === 1. Load and Clean the Dataset ===\n",
"\n",
"data = \"../data/\"\n",
"df = pd.read_csv(\"../data/psd_alldata.csv\", encoding=\"latin1\")\n",
"\n",
"df.rename(columns={\n",
" 'Commodity_Description': 'commodity',\n",
" 'Country_Name': 'country',\n",
" 'Attribute_Description': 'attribute',\n",
" 'Value': 'value',\n",
" 'Market_Year': 'market_year'\n",
"}, inplace=True)\n",
"\n",
"df['report_date'] = pd.to_datetime(df['market_year'].astype(int).astype(str) + '-01-01')\n",
"df = df.dropna(subset=['commodity', 'country', 'attribute', 'value'])\n",
"\n",
"# === 2. Create Pivot Tables Per Commodity (Silver Layer) ===\n",
"silver_dir = \"silver_layer_pivots\"\n",
"os.makedirs(silver_dir, exist_ok=True)\n",
"commodity_pivots = {}\n",
"\n",
"for commodity, group in df.groupby('commodity'):\n",
" pivot = group.pivot_table(\n",
" index=['report_date', 'country'],\n",
" columns='attribute',\n",
" values='value',\n",
" aggfunc='sum',\n",
" fill_value=0 # fill missing combinations with 0\n",
" ).reset_index()\n",
"\n",
" # Save each silver-layer file\n",
" safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\n",
" filepath = os.path.join(silver_dir, f\"{safe_name}.csv\")\n",
" pivot.to_csv(filepath, index=False)\n",
" commodity_pivots[commodity] = filepath\n",
"\n",
"print(\"✅ Silver layer saved to 'silver_layer_pivots/'\")\n",
"\n",
"# === 3. Create Gold Layer with Derived Metrics ===\n",
"gold_dir = \"gold_layer_pivots\"\n",
"os.makedirs(gold_dir, exist_ok=True)\n",
"\n",
"all_global_summaries = []\n",
"\n",
"for commodity, filepath in commodity_pivots.items():\n",
" df = pd.read_csv(filepath)\n",
"\n",
" # Ensure numeric types\n",
" required_cols = ['Production', 'Imports', 'Exports', 'Total Distribution', 'Ending Stocks']\n",
" for col in required_cols:\n",
" if col not in df.columns:\n",
" df[col] = 0 # Fill missing columns with 0\n",
" else:\n",
" df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)\n",
"\n",
" # Derived metrics per country\n",
" df['Net Supply'] = df['Production'] + df['Imports'] - df['Exports']\n",
" df['Trade Balance'] = df['Exports'] - df['Imports']\n",
" df['Supply-Demand Balance'] = df['Net Supply'] - df['Total Distribution']\n",
" df['Stock-to-Use Ratio (%)'] = df['Ending Stocks'] / df['Total Distribution'].replace(0, pd.NA) * 100\n",
"\n",
" # Global aggregates\n",
" global_summary = df.groupby('report_date').agg({\n",
" 'Production': 'sum',\n",
" 'Imports': 'sum',\n",
" 'Exports': 'sum',\n",
" 'Total Distribution': 'sum',\n",
" 'Ending Stocks': 'sum'\n",
" }).reset_index()\n",
"\n",
" global_summary['Net Supply'] = global_summary['Production'] + global_summary['Imports'] - global_summary['Exports']\n",
" global_summary['Supply-Demand Balance'] = global_summary['Net Supply'] - global_summary['Total Distribution']\n",
" global_summary['Stock-to-Use Ratio (%)'] = global_summary['Ending Stocks'] / global_summary['Total Distribution'].replace(0, pd.NA) * 100\n",
" global_summary['country'] = 'Global'\n",
" global_summary['commodity'] = commodity\n",
"\n",
" all_global_summaries.append(global_summary)\n",
"\n",
" # Add YoY Production change\n",
" df.sort_values(['country', 'report_date'], inplace=True)\n",
" df['Production_YoY (%)'] = df.groupby('country')['Production'].transform(lambda x: x.pct_change() * 100)\n",
"\n",
" # Save gold-layer file\n",
" safe_name = commodity.replace(\" \", \"_\").replace(\"/\", \"_\").lower()\n",
" filepath = os.path.join(gold_dir, f\"{safe_name}.csv\")\n",
" df.to_csv(filepath, index=False)\n",
"\n",
"print(\"🌟 Gold layer saved to 'gold_layer_pivots/'\")\n",
"\n",
"# === 4. Combined Global Summary\n",
"combined_global = pd.concat(all_global_summaries, ignore_index=True)\n",
"combined_global = combined_global[['commodity', 'report_date', 'Production', 'Imports', 'Exports',\n",
" 'Total Distribution', 'Ending Stocks', 'Net Supply',\n",
" 'Supply-Demand Balance', 'Stock-to-Use Ratio (%)']]\n",
"combined_global.to_csv(\"global_summary_all.csv\", index=False)\n",
"print(\"🌐 Combined global summary saved as 'global_summary_all.csv'\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}