Fix extract and SQLMesh pipeline to build DuckDB lakehouse

extract: wrap response.content in BytesIO before passing it to normalize_zipped_csv, and call .read() on the returned BytesIO before write_bytes (two bugs: wrong type in, wrong type out). sqlmesh: {{ var() }} inside SQL string literals is not substituted by SQLMesh's Jinja (the SQL parser treats string literals as opaque strings). Replace it with a @psd_glob() macro that evaluates LANDING_DIR at render time and returns a quoted glob path string. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-20 17:02:59 +01:00
parent d05e522c88
commit 423fb8c619
3 changed files with 14 additions and 3 deletions
--- a/transform/sqlmesh_materia/macros/init.py
+++ b/transform/sqlmesh_materia/macros/init.py
@@ -0,0 +1,10 @@
+import os
+
+from sqlmesh import macro
+
+
+@macro()
+def psd_glob(evaluator) -> str:
+    """Return the single-quoted glob '<landing_dir>/psd/**/*.csv.gzip' for the PSD CSV gzip files."""
+    landing_dir = evaluator.var("LANDING_DIR") or os.environ.get("LANDING_DIR", "data/landing")  # evaluator variable first; if falsy, the LANDING_DIR env var; else "data/landing"
+    return f"'{landing_dir}/psd/**/*.csv.gzip'"  # the single quotes are part of the returned text; NOTE(review): landing_dir is interpolated unescaped
--- a/transform/sqlmesh_materia/models/raw/psd_data.sql
+++ b/transform/sqlmesh_materia/models/raw/psd_data.sql
@@ -21,4 +21,4 @@ MODEL (
 )
 );
 select *
-FROM read_csv('{{ var("LANDING_DIR") }}/psd/**/*.csv.gzip', delim=',', encoding='utf-8', compression='gzip', max_line_size=10000000, header=true, union_by_name=true, filename=true, names = ['commodity_code', 'commodity_description', 'country_code', 'country_name', 'market_year', 'calendar_year', 'month', 'attribute_id', 'attribute_description', 'unit_id', 'unit_description', 'value'], all_varchar=true)
+FROM read_csv(@psd_glob(), delim=',', encoding='utf-8', compression='gzip', max_line_size=10000000, header=true, union_by_name=true, filename=true, names = ['commodity_code', 'commodity_description', 'country_code', 'country_name', 'market_year', 'calendar_year', 'month', 'attribute_id', 'attribute_description', 'unit_id', 'unit_description', 'value'], all_varchar=true)