diff --git a/extract/psdonline/src/psdonline/execute.py b/extract/psdonline/src/psdonline/execute.py index b5e97f8..56104b3 100644 --- a/extract/psdonline/src/psdonline/execute.py +++ b/extract/psdonline/src/psdonline/execute.py @@ -1,11 +1,8 @@ import niquests -import io -import zipfile import pathlib import logging import sys from datetime import datetime -import asyncio logging.basicConfig( level=logging.INFO, @@ -15,42 +12,45 @@ logging.basicConfig( logging.StreamHandler(sys.stdout) ] ) - +logger = logging.getLogger("PSD Extraction") output_dir = pathlib.Path(__file__).parent / "data" output_dir.mkdir(parents=True, exist_ok=True) -logging.info(f"Output dir: {output_dir}") - -PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month}/psd_alldata_csv.zip" +logger.info(f"Output dir: {output_dir}") +#TODO: adapt to environment values, so this writes to s3 in prod +PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month:02d}/psd_alldata_csv.zip" PSD_LATEST_URL = "https://apps.fas.usda.gov/psdonline/downloads/psd_alldata_csv.zip" FIRST_YEAR = 2006 FIRST_MONTH = 8 -async def extract_psd_file(url:str, http_session: niquests.AsyncSession): - logging.info(f"Start downloading {url} ...") - latest_data = await http_session.get(url) - logging.info("Download done.") - if latest_data.status_code != 200: - logging.info(f"Status code not ok, STATUS={latest_data.status_code}") +def extract_psd_file(url:str, extract_to_path: pathlib.Path, http_session: niquests.Session): + logger.info(f"Start downloading {url} ...") + response = http_session.get(url) + if response.status_code != 200: + logger.error(f"Status code not ok, STATUS={response.status_code}") return - latest_buf=io.BytesIO() - latest_buf.write(latest_data.content) - latest_buf.seek(0) + logger.info(f"Storing file to {extract_to_path}") + extract_to_path.parent.mkdir(parents=True, exist_ok=True) + extract_to_path.write_bytes(response.content) + 
logger.info("Download done.") - logging.info("Extracting Zipfile ...") - zipfile.ZipFile(latest_buf).extract('psd_alldata.csv', output_dir) - logging.info("Extracting Zipfile done.") -async def extract_historical_psd_dataset(): +def extract_historical_psd_dataset(): today = datetime.now() years = list(range(FIRST_YEAR, today.year)) months = list(range(1,13)) - historical_data_extraction_urls = [PSD_HISTORICAL_URL.format(year=year, month=month) for year in years for month in months] - logging.info(f"Downloading {len(historical_data_extraction_urls)} urls") - async with niquests.AsyncSession() as session: - async with asyncio.TaskGroup() as tg: - for url in historical_data_extraction_urls: - tg.create_task(extract_psd_file(url, session)) + logger.info(f"Downloading {len(years) * len(months)} urls") + for year in years: + for month in months: + url = PSD_HISTORICAL_URL.format(year=year, month=month) + target_path = output_dir / f"{year}"/f"{month:02d}" / "psd_alldata_csv.zip" + with niquests.Session() as session: + logger.info(f"Downloading psd_alldata_csv.zip for {year}/{month:02d}") + try: + extract_psd_file(url=url, http_session=session, extract_to_path=target_path) + except Exception as e: + logger.error("Error trying to download file. 
Likely the file does not exist: %s", e) + if __name__ == "__main__": - asyncio.run(extract_historical_psd_dataset()) + extract_historical_psd_dataset() diff --git a/pyproject.toml b/pyproject.toml index fb4ba74..74ed528 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ authors = [ ] requires-python = ">=3.13" dependencies = [ + "pyarrow>=20.0.0", "python-dotenv>=1.1.0", ] diff --git a/transform/sqlmesh-duckdb/README.md b/transform/sqlmesh-duckdb/README.md deleted file mode 100644 index e69de29..0000000 diff --git a/transform/sqlmesh-duckdb/audits/.gitkeep b/transform/sqlmesh-duckdb/audits/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/transform/sqlmesh-duckdb/audits/assert_positive_order_ids.sql b/transform/sqlmesh-duckdb/audits/assert_positive_order_ids.sql deleted file mode 100644 index 4b66f40..0000000 --- a/transform/sqlmesh-duckdb/audits/assert_positive_order_ids.sql +++ /dev/null @@ -1,9 +0,0 @@ -AUDIT ( - name assert_positive_order_ids, -); - -SELECT * -FROM @this_model -WHERE - item_id < 0 - \ No newline at end of file diff --git a/transform/sqlmesh-duckdb/config.yaml b/transform/sqlmesh-duckdb/config.yaml deleted file mode 100644 index feb572b..0000000 --- a/transform/sqlmesh-duckdb/config.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# --- Gateway Connection --- -gateways: - duckdb: - connection: - # For more information on configuring the connection to your execution engine, visit: - # https://sqlmesh.readthedocs.io/en/stable/reference/configuration/#connection - # https://sqlmesh.readthedocs.io/en/stable/integrations/engines/duckdb/#connection-options - type: duckdb - database: db.db - # concurrent_tasks: 1 - # register_comments: True - # pre_ping: False - # pretty_sql: False - # catalogs: - # extensions: - # connector_config: - # secrets: - # filesystems: - # token: - -default_gateway: duckdb - -# --- Model Defaults --- -# https://sqlmesh.readthedocs.io/en/stable/reference/model_configuration/#model-defaults - -model_defaults: 
- dialect: duckdb - start: 2025-07-07 # Start date for backfill history - cron: '@daily' # Run models daily at 12am UTC (can override per model) - -# --- Linting Rules --- -# Enforce standards for your team -# https://sqlmesh.readthedocs.io/en/stable/guides/linter/ - -linter: - enabled: true - rules: - - ambiguousorinvalidcolumn - - invalidselectstarexpansion - -# FLOW: Minimal prompts, automatic changes, summary output -# https://sqlmesh.readthedocs.io/en/stable/reference/configuration/#plan - -plan: - no_diff: true # Hide detailed text differences for changed models - no_prompts: true # No interactive prompts - auto_apply: true # Apply changes automatically - -# --- Optional: Set a default target environment --- -# This is intended for local development to prevent users from accidentally applying plans to the prod environment. -# It is a development only config and should NOT be committed to your git repo. -# https://sqlmesh.readthedocs.io/en/stable/guides/configuration/#default-target-environment - -# Uncomment the following line to use a default target environment derived from the logged in user's name. 
-# default_target_environment: dev_{{ user() }} - -# Example usage: -# sqlmesh plan # Automatically resolves to: sqlmesh plan dev_yourname -# sqlmesh plan prod # Specify `prod` to apply changes to production diff --git a/transform/sqlmesh-duckdb/db.db b/transform/sqlmesh-duckdb/db.db deleted file mode 100644 index e7635a5..0000000 Binary files a/transform/sqlmesh-duckdb/db.db and /dev/null differ diff --git a/transform/sqlmesh-duckdb/macros/.gitkeep b/transform/sqlmesh-duckdb/macros/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/transform/sqlmesh-duckdb/macros/__init__.py b/transform/sqlmesh-duckdb/macros/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/transform/sqlmesh-duckdb/models/.gitkeep b/transform/sqlmesh-duckdb/models/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/transform/sqlmesh-duckdb/models/example/full_model.sql b/transform/sqlmesh-duckdb/models/example/full_model.sql deleted file mode 100644 index 362b049..0000000 --- a/transform/sqlmesh-duckdb/models/example/full_model.sql +++ /dev/null @@ -1,15 +0,0 @@ -MODEL ( - name sqlmesh_example.full_model, - kind FULL, - cron '@daily', - grain item_id, - audits (assert_positive_order_ids), -); - -SELECT - item_id, - COUNT(DISTINCT id) AS num_orders, -FROM - sqlmesh_example.incremental_model -GROUP BY item_id - \ No newline at end of file diff --git a/transform/sqlmesh-duckdb/models/example/incremental_model.sql b/transform/sqlmesh-duckdb/models/example/incremental_model.sql deleted file mode 100644 index d2db527..0000000 --- a/transform/sqlmesh-duckdb/models/example/incremental_model.sql +++ /dev/null @@ -1,19 +0,0 @@ -MODEL ( - name sqlmesh_example.incremental_model, - kind INCREMENTAL_BY_TIME_RANGE ( - time_column event_date - ), - start '2020-01-01', - cron '@daily', - grain (id, event_date) -); - -SELECT - id, - item_id, - event_date, -FROM - sqlmesh_example.seed_model -WHERE - event_date BETWEEN @start_date AND @end_date - \ No 
newline at end of file diff --git a/transform/sqlmesh-duckdb/models/example/seed_model.sql b/transform/sqlmesh-duckdb/models/example/seed_model.sql deleted file mode 100644 index 192d2df..0000000 --- a/transform/sqlmesh-duckdb/models/example/seed_model.sql +++ /dev/null @@ -1,13 +0,0 @@ -MODEL ( - name sqlmesh_example.seed_model, - kind SEED ( - path '../seeds/seed_data.csv' - ), - columns ( - id INTEGER, - item_id INTEGER, - event_date DATE - ), - grain (id, event_date) -); - \ No newline at end of file diff --git a/transform/sqlmesh-duckdb/pyproject.toml b/transform/sqlmesh-duckdb/pyproject.toml deleted file mode 100644 index 809df83..0000000 --- a/transform/sqlmesh-duckdb/pyproject.toml +++ /dev/null @@ -1,16 +0,0 @@ -[project] -name = "sqlmesh-duckdb" -version = "0.1.0" -description = "Add your description here" -readme = "README.md" -authors = [ - { name = "Deeman", email = "hendriknote@gmail.com" } -] -requires-python = ">=3.13" -dependencies = [ - "sqlmesh>=0.200.0", -] - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" diff --git a/transform/sqlmesh-duckdb/seeds/.gitkeep b/transform/sqlmesh-duckdb/seeds/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/transform/sqlmesh-duckdb/seeds/seed_data.csv b/transform/sqlmesh-duckdb/seeds/seed_data.csv deleted file mode 100644 index 2e3902e..0000000 --- a/transform/sqlmesh-duckdb/seeds/seed_data.csv +++ /dev/null @@ -1,8 +0,0 @@ -id,item_id,event_date -1,2,2020-01-01 -2,1,2020-01-01 -3,3,2020-01-03 -4,1,2020-01-04 -5,1,2020-01-05 -6,1,2020-01-06 -7,1,2020-01-07 diff --git a/transform/sqlmesh-duckdb/tests/.gitkeep b/transform/sqlmesh-duckdb/tests/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/transform/sqlmesh-duckdb/tests/test_full_model.yaml b/transform/sqlmesh-duckdb/tests/test_full_model.yaml deleted file mode 100644 index 390a80e..0000000 --- a/transform/sqlmesh-duckdb/tests/test_full_model.yaml +++ /dev/null @@ -1,19 +0,0 @@ 
-test_example_full_model: - model: sqlmesh_example.full_model - inputs: - sqlmesh_example.incremental_model: - rows: - - id: 1 - item_id: 1 - - id: 2 - item_id: 1 - - id: 3 - item_id: 2 - outputs: - query: - rows: - - item_id: 1 - num_orders: 2 - - item_id: 2 - num_orders: 1 - \ No newline at end of file diff --git a/uv.lock b/uv.lock index 9f2af24..7237a88 100644 --- a/uv.lock +++ b/uv.lock @@ -6,7 +6,7 @@ requires-python = ">=3.13" members = [ "materia", "psdonline", - "sqlmesh-duckdb", + "sqlmesh-materia", ] [[package]] @@ -516,6 +516,7 @@ name = "materia" version = "0.1.0" source = { editable = "." } dependencies = [ + { name = "pyarrow" }, { name = "python-dotenv" }, ] @@ -529,7 +530,10 @@ exploration = [ ] [package.metadata] -requires-dist = [{ name = "python-dotenv", specifier = ">=1.1.0" }] +requires-dist = [ + { name = "pyarrow", specifier = ">=20.0.0" }, + { name = "python-dotenv", specifier = ">=1.1.0" }, +] [package.metadata.requires-dev] dev = [ @@ -759,6 +763,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, ] +[[package]] +name = "pyarrow" +version = "20.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/ee/a7810cb9f3d6e9238e61d312076a9859bf3668fd21c69744de9532383912/pyarrow-20.0.0.tar.gz", hash = "sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1", size = 1125187, upload-time = "2025-04-27T12:34:23.264Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/aa/daa413b81446d20d4dad2944110dcf4cf4f4179ef7f685dd5a6d7570dc8e/pyarrow-20.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:a15532e77b94c61efadde86d10957950392999503b3616b2ffcef7621a002893", size = 30798501, upload-time = 
"2025-04-27T12:30:48.351Z" }, + { url = "https://files.pythonhosted.org/packages/ff/75/2303d1caa410925de902d32ac215dc80a7ce7dd8dfe95358c165f2adf107/pyarrow-20.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:dd43f58037443af715f34f1322c782ec463a3c8a94a85fdb2d987ceb5658e061", size = 32277895, upload-time = "2025-04-27T12:30:55.238Z" }, + { url = "https://files.pythonhosted.org/packages/92/41/fe18c7c0b38b20811b73d1bdd54b1fccba0dab0e51d2048878042d84afa8/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa0d288143a8585806e3cc7c39566407aab646fb9ece164609dac1cfff45f6ae", size = 41327322, upload-time = "2025-04-27T12:31:05.587Z" }, + { url = "https://files.pythonhosted.org/packages/da/ab/7dbf3d11db67c72dbf36ae63dcbc9f30b866c153b3a22ef728523943eee6/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6953f0114f8d6f3d905d98e987d0924dabce59c3cda380bdfaa25a6201563b4", size = 42411441, upload-time = "2025-04-27T12:31:15.675Z" }, + { url = "https://files.pythonhosted.org/packages/90/c3/0c7da7b6dac863af75b64e2f827e4742161128c350bfe7955b426484e226/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:991f85b48a8a5e839b2128590ce07611fae48a904cae6cab1f089c5955b57eb5", size = 40677027, upload-time = "2025-04-27T12:31:24.631Z" }, + { url = "https://files.pythonhosted.org/packages/be/27/43a47fa0ff9053ab5203bb3faeec435d43c0d8bfa40179bfd076cdbd4e1c/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:97c8dc984ed09cb07d618d57d8d4b67a5100a30c3818c2fb0b04599f0da2de7b", size = 42281473, upload-time = "2025-04-27T12:31:31.311Z" }, + { url = "https://files.pythonhosted.org/packages/bc/0b/d56c63b078876da81bbb9ba695a596eabee9b085555ed12bf6eb3b7cab0e/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9b71daf534f4745818f96c214dbc1e6124d7daf059167330b610fc69b6f3d3e3", size = 42893897, upload-time = "2025-04-27T12:31:39.406Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/ac/7d4bd020ba9145f354012838692d48300c1b8fe5634bfda886abcada67ed/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e8b88758f9303fa5a83d6c90e176714b2fd3852e776fc2d7e42a22dd6c2fb368", size = 44543847, upload-time = "2025-04-27T12:31:45.997Z" }, + { url = "https://files.pythonhosted.org/packages/9d/07/290f4abf9ca702c5df7b47739c1b2c83588641ddfa2cc75e34a301d42e55/pyarrow-20.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:30b3051b7975801c1e1d387e17c588d8ab05ced9b1e14eec57915f79869b5031", size = 25653219, upload-time = "2025-04-27T12:31:54.11Z" }, + { url = "https://files.pythonhosted.org/packages/95/df/720bb17704b10bd69dde086e1400b8eefb8f58df3f8ac9cff6c425bf57f1/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:ca151afa4f9b7bc45bcc791eb9a89e90a9eb2772767d0b1e5389609c7d03db63", size = 30853957, upload-time = "2025-04-27T12:31:59.215Z" }, + { url = "https://files.pythonhosted.org/packages/d9/72/0d5f875efc31baef742ba55a00a25213a19ea64d7176e0fe001c5d8b6e9a/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:4680f01ecd86e0dd63e39eb5cd59ef9ff24a9d166db328679e36c108dc993d4c", size = 32247972, upload-time = "2025-04-27T12:32:05.369Z" }, + { url = "https://files.pythonhosted.org/packages/d5/bc/e48b4fa544d2eea72f7844180eb77f83f2030b84c8dad860f199f94307ed/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f4c8534e2ff059765647aa69b75d6543f9fef59e2cd4c6d18015192565d2b70", size = 41256434, upload-time = "2025-04-27T12:32:11.814Z" }, + { url = "https://files.pythonhosted.org/packages/c3/01/974043a29874aa2cf4f87fb07fd108828fc7362300265a2a64a94965e35b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e1f8a47f4b4ae4c69c4d702cfbdfe4d41e18e5c7ef6f1bb1c50918c1e81c57b", size = 42353648, upload-time = "2025-04-27T12:32:20.766Z" }, + { url = 
"https://files.pythonhosted.org/packages/68/95/cc0d3634cde9ca69b0e51cbe830d8915ea32dda2157560dda27ff3b3337b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a1f60dc14658efaa927f8214734f6a01a806d7690be4b3232ba526836d216122", size = 40619853, upload-time = "2025-04-27T12:32:28.1Z" }, + { url = "https://files.pythonhosted.org/packages/29/c2/3ad40e07e96a3e74e7ed7cc8285aadfa84eb848a798c98ec0ad009eb6bcc/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:204a846dca751428991346976b914d6d2a82ae5b8316a6ed99789ebf976551e6", size = 42241743, upload-time = "2025-04-27T12:32:35.792Z" }, + { url = "https://files.pythonhosted.org/packages/eb/cb/65fa110b483339add6a9bc7b6373614166b14e20375d4daa73483755f830/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f3b117b922af5e4c6b9a9115825726cac7d8b1421c37c2b5e24fbacc8930612c", size = 42839441, upload-time = "2025-04-27T12:32:46.64Z" }, + { url = "https://files.pythonhosted.org/packages/98/7b/f30b1954589243207d7a0fbc9997401044bf9a033eec78f6cb50da3f304a/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e724a3fd23ae5b9c010e7be857f4405ed5e679db5c93e66204db1a69f733936a", size = 44503279, upload-time = "2025-04-27T12:32:56.503Z" }, + { url = "https://files.pythonhosted.org/packages/37/40/ad395740cd641869a13bcf60851296c89624662575621968dcfafabaa7f6/pyarrow-20.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:82f1ee5133bd8f49d31be1299dc07f585136679666b502540db854968576faf9", size = 25944982, upload-time = "2025-04-27T12:33:04.72Z" }, +] + [[package]] name = "pycparser" version = "2.22" @@ -1126,9 +1156,9 @@ wheels = [ ] [[package]] -name = "sqlmesh-duckdb" +name = "sqlmesh-materia" version = "0.1.0" -source = { editable = "transform/sqlmesh-duckdb" } +source = { editable = "transform/sqlmesh-materia" } dependencies = [ { name = "sqlmesh" }, ]