diff --git a/notebooks/03_Extraction.ipynb b/notebooks/03_Extraction.ipynb
index 9f2ec50..cebf785 100644
--- a/notebooks/03_Extraction.ipynb
+++ b/notebooks/03_Extraction.ipynb
@@ -24,7 +24,7 @@
     "\n",
     "data = \"../data/\"\n",
     "df = pd.read_csv(\"../data/psd_alldata.csv\", encoding=\"latin1\")\n",
-    "\"\"\"\n",
+    "\n",
     "df.rename(columns={\n",
     " 'Commodity_Description': 'commodity',\n",
     " 'Country_Name': 'country',\n",
@@ -115,8 +115,7 @@
     " 'Total Distribution', 'Ending Stocks', 'Net Supply',\n",
     " 'Supply-Demand Balance', 'Stock-to-Use Ratio (%)']]\n",
     "combined_global.to_csv(\"global_summary_all.csv\", index=False)\n",
-    "print(\"🌐 Combined global summary saved as 'global_summary_all.csv'\")\n",
-    "\"\"\""
+    "print(\"🌐 Combined global summary saved as 'global_summary_all.csv'\")\n"
    ]
   },
   {
diff --git a/transform/sqlmesh_materia/models/staging/stg_psd_alldata.sql b/transform/sqlmesh_materia/models/staging/stg_psd_alldata_0.sql
similarity index 100%
rename from transform/sqlmesh_materia/models/staging/stg_psd_alldata.sql
rename to transform/sqlmesh_materia/models/staging/stg_psd_alldata_0.sql
diff --git a/transform/sqlmesh_materia/models/staging/stg_psd_alldata_1_filter_silver_layer.sql b/transform/sqlmesh_materia/models/staging/stg_psd_alldata_1_filter_silver_layer.sql
new file mode 100644
index 0000000..0d77482
--- /dev/null
+++ b/transform/sqlmesh_materia/models/staging/stg_psd_alldata_1_filter_silver_layer.sql
@@ -0,0 +1,77 @@
+/*
+ * Silver layer: Pivots the raw PSD data into a wide format,
+ * with key attributes ('Production', 'Imports', etc.) as columns.
+ * This is equivalent to step 2 of the Python script 03_Extraction.
+ *
+ * Output grain: one row per commodity, country and ingest_date. A pivoted
+ * attribute with no upstream rows in a group is reported as 0, not NULL.
+ *
+ * NOTE(review): the upstream file was renamed to stg_psd_alldata_0.sql with
+ * 100% similarity -- confirm its MODEL name matches the FROM reference below.
+ * NOTE(review): if the upstream also carries a market-year column, values of
+ * different years are summed into one row here -- confirm the intended grain.
+ */
+MODEL (
+  name transform.sqlmesh_materia.models.staging.stg_psd_alldata_1_filter_silver_layer,
+  kind INCREMENTAL_BY_TIME_RANGE (
+    time_column ingest_date
+  ),
+  start '2006-08-01',
+  cron '@daily'
+);
+
+SELECT
+    commodity_code,
+    commodity_name,
+    country_code,
+    country_name,
+    ingest_date,
+    -- Replicate the Python script's pivot by using conditional aggregation
+    -- This creates a single row for each commodity-country-date combination
+    COALESCE(SUM(CASE WHEN attribute_name = 'Production' THEN value END), 0) AS Production,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Imports' THEN value END), 0) AS Imports,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Exports' THEN value END), 0) AS Exports,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Total Distribution' THEN value END), 0) AS Total_Distribution,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Ending Stocks' THEN value END), 0) AS Ending_Stocks,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Beginning Stocks' THEN value END), 0) AS Beginning_Stocks,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Total Supply' THEN value END), 0) AS Total_Supply,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Domestic Consumption' THEN value END), 0) AS Domestic_Consumption,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Domestic Demand' THEN value END), 0) AS Domestic_Demand,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Food Use' THEN value END), 0) AS Food_Use,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Industrial Use' THEN value END), 0) AS Industrial_Use,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Seed Use' THEN value END), 0) AS Seed_Use,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Waste' THEN value END), 0) AS Waste,
+    COALESCE(SUM(CASE WHEN attribute_name = 'Feed Use' THEN value END), 0) AS Feed_Use
+FROM transform.sqlmesh_materia.models.staging.stg_psd_alldata_0
+WHERE
+    -- Read only the interval being processed, as incremental-by-time-range
+    -- models require. Assumes ingest_date is a DATE (@start_ds/@end_ds are
+    -- inclusive date strings); switch to @start_ts/@end_ts for a timestamp.
+    ingest_date BETWEEN @start_ds AND @end_ds
+    -- Filter for the specific attributes used in the pivot table for efficiency
+    AND attribute_name IN (
+        'Production',
+        'Imports',
+        'Exports',
+        'Total Distribution',
+        'Ending Stocks',
+        'Beginning Stocks',
+        'Total Supply',
+        'Domestic Consumption',
+        'Domestic Demand',
+        'Food Use',
+        'Industrial Use',
+        'Seed Use',
+        'Waste',
+        'Feed Use'
+    )
+GROUP BY
+    commodity_code,
+    commodity_name,
+    country_code,
+    country_name,
+    ingest_date
+ORDER BY
+    commodity_name,
+    country_name,
+    ingest_date;
\ No newline at end of file
diff --git a/transform/sqlmesh_materia/models/staging/stg_psd_alldata_2_filter_gold_layer.sql b/transform/sqlmesh_materia/models/staging/stg_psd_alldata_2_filter_gold_layer.sql
new file mode 100644
index 0000000..ccf5ca7
--- /dev/null
+++ b/transform/sqlmesh_materia/models/staging/stg_psd_alldata_2_filter_gold_layer.sql
@@ -0,0 +1,123 @@
+/*
+ * Gold layer: Calculates derived metrics like Net Supply, Trade Balance,
+ * and Stock-to-Use Ratio based on the pivoted silver layer data.
+ * This also includes the global aggregates, mimicking steps 3 and 4
+ * of the Python script 03_Extraction.
+ *
+ * The silver layer is read in full rather than per interval because the LAG
+ * window needs the row that precedes the interval; only the final output is
+ * restricted to the interval being processed.
+ *
+ * NOTE(review): LAG is ordered by ingest_date, so Production_YoY_pct is the
+ * change versus the previous ingest_date -- confirm this is year-over-year.
+ */
+MODEL (
+  name transform.sqlmesh_materia.models.staging.stg_psd_alldata_2_filter_gold_layer,
+  kind INCREMENTAL_BY_TIME_RANGE (
+    time_column ingest_date
+  ),
+  start '2006-08-01',
+  cron '@daily'
+);
+
+-- CTE to calculate country-level derived metrics
+WITH country_metrics AS (
+    SELECT
+        commodity_code,
+        commodity_name,
+        country_code,
+        country_name,
+        ingest_date,
+        Production,
+        Imports,
+        Exports,
+        Total_Distribution,
+        Ending_Stocks,
+        -- Derived metrics per country, mirroring Python script
+        (Production + Imports - Exports) AS Net_Supply,
+        (Exports - Imports) AS Trade_Balance,
+        (Production + Imports - Exports) - Total_Distribution AS Supply_Demand_Balance,
+        -- Handle division by zero for Stock-to-Use Ratio
+        (Ending_Stocks / NULLIF(Total_Distribution, 0)) * 100 AS Stock_to_Use_Ratio_pct,
+        -- Calculate Production YoY percentage change using a window function
+        -- (a missing or zero previous value makes the NULLIF base NULL, so the result is NULL)
+        (Production - LAG(Production, 1, 0) OVER (PARTITION BY commodity_code, country_code ORDER BY ingest_date)) / NULLIF(LAG(Production, 1, 0) OVER (PARTITION BY commodity_code, country_code ORDER BY ingest_date), 0) * 100 AS Production_YoY_pct
+    FROM transform.sqlmesh_materia.models.staging.stg_psd_alldata_1_filter_silver_layer
+),
+-- CTE to calculate global aggregates by summing up country-level data
+global_aggregates AS (
+    SELECT
+        commodity_code,
+        commodity_name,
+        NULL::TEXT AS country_code, -- Use NULL for global aggregates
+        'Global' AS country_name,
+        ingest_date,
+        SUM(Production) AS Production,
+        SUM(Imports) AS Imports,
+        SUM(Exports) AS Exports,
+        SUM(Total_Distribution) AS Total_Distribution,
+        SUM(Ending_Stocks) AS Ending_Stocks
+    FROM transform.sqlmesh_materia.models.staging.stg_psd_alldata_1_filter_silver_layer
+    GROUP BY
+        commodity_code,
+        commodity_name,
+        ingest_date
+),
+-- CTE to calculate derived metrics for global aggregates
+global_metrics AS (
+    SELECT
+        commodity_code,
+        commodity_name,
+        country_code,
+        country_name,
+        ingest_date,
+        Production,
+        Imports,
+        Exports,
+        Total_Distribution,
+        Ending_Stocks,
+        (Production + Imports - Exports) AS Net_Supply,
+        (Exports - Imports) AS Trade_Balance,
+        (Production + Imports - Exports) - Total_Distribution AS Supply_Demand_Balance,
+        (Ending_Stocks / NULLIF(Total_Distribution, 0)) * 100 AS Stock_to_Use_Ratio_pct,
+        (Production - LAG(Production, 1, 0) OVER (PARTITION BY commodity_code ORDER BY ingest_date)) / NULLIF(LAG(Production, 1, 0) OVER (PARTITION BY commodity_code ORDER BY ingest_date), 0) * 100 AS Production_YoY_pct
+    FROM global_aggregates
+)
+-- Combine country-level and global-level data into a single output
+SELECT
+    hkey,
+    commodity_code,
+    commodity_name,
+    country_code,
+    country_name,
+    ingest_date,
+    Production,
+    Imports,
+    Exports,
+    Total_Distribution,
+    Ending_Stocks,
+    Net_Supply,
+    Trade_Balance,
+    Supply_Demand_Balance,
+    Stock_to_Use_Ratio_pct,
+    Production_YoY_pct
+FROM (
+    -- Both branches expose the same 15 columns in the same order, as UNION ALL needs
+    SELECT
+        @GENERATE_SURROGATE_KEY(commodity_code, country_code, ingest_date) AS hkey,
+        *
+    FROM country_metrics
+    UNION ALL
+    SELECT
+        -- country_code is NULL for global rows, so the key is built from country_name
+        @GENERATE_SURROGATE_KEY(commodity_code, country_name, ingest_date) AS hkey,
+        *
+    FROM global_metrics
+) AS combined_data
+-- Emit only the interval being processed. Assumes ingest_date is a DATE;
+-- use @start_ts/@end_ts instead if it is a timestamp.
+WHERE ingest_date BETWEEN @start_ds AND @end_ds
+ORDER BY
+    commodity_name,
+    country_name,
+    ingest_date;
\ No newline at end of file
diff --git a/transform/sqlmesh_materia/seeds/commodity_exchange_codes.csv b/transform/sqlmesh_materia/seeds/commodity_exchange_codes.csv
new file mode 100644
index 0000000..30fbb84
--- /dev/null
+++ b/transform/sqlmesh_materia/seeds/commodity_exchange_codes.csv
@@ -0,0 +1,57 @@
+commodity_name,exchange_code,exchange
+Crude Oil WTI,CL,CME
+Crude Oil Brent,BZ,ICE
+Gasoline RBOB,RB,CME
+Heating Oil,HO,CME
+Natural Gas,NG,CME
+Ethanol,CU,CME
+Cocoa,CC,ICE
+Cotton,CT,ICE
+Orange Juice,FCOJ-A,ICE
+Coffee,KC,ICE
+Lumber,LBR,ICE
+Sugar,SB,ICE
+European Gas TTF,TTF,ICE
+European Union Emissions Allowance,ECF,ICE
+Gold,GC,CME
+Silver,SI,CME
+Platinum,PL,CME
+Copper,HG,CME
+Palladium,PA,CME
+Live Cattle,LE,CME
+Feeder Cattle,GF,CME
+Lean Hogs,HE,CME
+Corn,ZC,CME
+Soybean Oil,ZL,CME
+Soybean meal,ZM,CME
+Oats,ZO,CME
+Rough Rice,ZR,CME
+Soybeans,ZS,CME
+Wheat,ZW,CME
+Canola,RS,ICE
+Rebar,RB,SHFE
+Hot-Rolled Coil,HC,SHFE
+Nickel,NI,SHFE
+Tin,SN,SHFE
+Aluminum,AL,SHFE
+Zinc,ZN,SHFE
+Natural Rubber,RU,SHFE
+Bitumen,BU,SHFE
+Iron Ore,I,DCE
+Palm Oil,P,DCE
+Eggs,JD,DCE
+Coking Coal,JM,DCE
+Polyvinyl Chloride (PVC),V,DCE
+White Sugar,SR,ZCE
+Cotton,CF,ZCE
+Apple,AP,ZCE
+PTA,TA,ZCE
+Methanol,MA,ZCE
+LME Aluminum,AH,LME
+LME Copper,CA,LME
+LME Lead,PB,LME
+LME Nickel,NI,LME
+LME Tin,SN,LME
+LME Zinc,ZS,LME
+Iron Ore,TIO,SGX
+Rubber,TSR,SGX
\ No newline at end of file
diff --git a/transform/sqlmesh_materia/seeds/psd_codes_exchange_codes_merge.csv b/transform/sqlmesh_materia/seeds/psd_codes_exchange_codes_merge.csv
new file mode 100644
index 0000000..625c6cf
--- /dev/null
+++ b/transform/sqlmesh_materia/seeds/psd_codes_exchange_codes_merge.csv
@@ -0,0 +1,57 @@
+commodity_name,exchange_code,exchange,commodity_code
+Crude Oil WTI,CL,CME,NA
+Crude Oil Brent,BZ,ICE,NA
+Gasoline RBOB,RB,CME,NA
+Heating Oil,HO,CME,NA
+Natural Gas,NG,CME,NA
+Ethanol,CU,CME,NA
+Cocoa,CC,ICE,NA
+Cotton,CT,ICE,2631000
+Orange Juice,FCOJ-A,ICE,0585100
+Coffee,KC,ICE,0711100
+Lumber,LBR,ICE,NA
+Sugar,SB,ICE,0612000
+European Gas TTF,TTF,ICE,NA
+European Union Emissions Allowance,ECF,ICE,NA
+Gold,GC,CME,NA
+Silver,SI,CME,NA
+Platinum,PL,CME,NA
+Copper,HG,CME,NA
+Palladium,PA,CME,NA
+Live Cattle,LE,CME,0011000
+Feeder Cattle,GF,CME,0011000
+Lean Hogs,HE,CME,NA
+Corn,ZC,CME,0440000
+Soybean Oil,ZL,CME,4232000
+Soybean meal,ZM,CME,0813100
+Oats,ZO,CME,0452000
+Rough Rice,ZR,CME,0422110
+Soybeans,ZS,CME,NA
+Wheat,ZW,CME,0410000
+Canola,RS,ICE,2226000
+Rebar,RB,SHFE,NA
+Hot-Rolled Coil,HC,SHFE,NA
+Nickel,NI,SHFE,NA
+Tin,SN,SHFE,NA
+Aluminum,AL,SHFE,NA
+Zinc,ZN,SHFE,NA
+Natural Rubber,RU,SHFE,NA
+Bitumen,BU,SHFE,NA
+Iron Ore,I,DCE,NA
+Palm Oil,P,DCE,4243000
+Eggs,JD,DCE,NA
+Coking Coal,JM,DCE,NA
+Polyvinyl Chloride (PVC),V,DCE,NA
+White Sugar,SR,ZCE,0612000
+Cotton,CF,ZCE,2631000
+Apple,AP,ZCE,0574000
+PTA,TA,ZCE,NA
+Methanol,MA,ZCE,NA
+LME Aluminum,AH,LME,NA
+LME Copper,CA,LME,NA
+LME Lead,PB,LME,NA
+LME Nickel,NI,LME,NA
+LME Tin,SN,LME,NA
+LME Zinc,ZS,LME,NA
+Iron Ore,TIO,SGX,NA
+Rubber,TSR,SGX,NA
\ No newline at end of file