finish historical extraction

This commit is contained in:
Deeman
2025-07-13 23:20:50 +02:00
parent 70bd8a52db
commit b8ad73202c
19 changed files with 62 additions and 189 deletions

View File

@@ -1,11 +1,8 @@
import niquests import niquests
import io
import zipfile
import pathlib import pathlib
import logging import logging
import sys import sys
from datetime import datetime from datetime import datetime
import asyncio
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
@@ -15,42 +12,45 @@ logging.basicConfig(
logging.StreamHandler(sys.stdout) logging.StreamHandler(sys.stdout)
] ]
) )
logger = logging.getLogger("PSD Extraction")
output_dir = pathlib.Path(__file__).parent / "data" output_dir = pathlib.Path(__file__).parent / "data"
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
logging.info(f"Output dir: {output_dir}") logger.info(f"Output dir: {output_dir}")
#TODO: adapt to environment values, so this writes to s3 in prod
PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month}/psd_alldata_csv.zip" PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month:02d}/psd_alldata_csv.zip"
PSD_LATEST_URL = "https://apps.fas.usda.gov/psdonline/downloads/psd_alldata_csv.zip" PSD_LATEST_URL = "https://apps.fas.usda.gov/psdonline/downloads/psd_alldata_csv.zip"
FIRST_YEAR = 2006 FIRST_YEAR = 2006
FIRST_MONTH = 8 FIRST_MONTH = 8
async def extract_psd_file(url:str, http_session: niquests.AsyncSession): def extract_psd_file(url:str, extract_to_path: pathlib.Path, http_session: niquests.Session):
logging.info(f"Start downloading {url} ...") logger.info(f"Start downloading {url} ...")
latest_data = await http_session.get(url) response = http_session.get(url)
logging.info("Download done.") if response.status_code != 200:
if latest_data.status_code != 200: logger.error(f"Status code not ok, STATUS={response.status_code}")
logging.info(f"Status code not ok, STATUS={latest_data.status_code}")
return return
latest_buf=io.BytesIO() logger.info(f"Storing file to {extract_to_path}")
latest_buf.write(latest_data.content) extract_to_path.parent.mkdir(parents=True, exist_ok=True)
latest_buf.seek(0) extract_to_path.write_bytes(response.content)
logger.info("Download done.")
logging.info("Extracting Zipfile ...")
zipfile.ZipFile(latest_buf).extract('psd_alldata.csv', output_dir)
logging.info("Extracting Zipfile done.")
async def extract_historical_psd_dataset(): def extract_historical_psd_dataset():
today = datetime.now() today = datetime.now()
years = list(range(FIRST_YEAR, today.year)) years = list(range(FIRST_YEAR, today.year))
months = list(range(1,13)) months = list(range(1,13))
historical_data_extraction_urls = [PSD_HISTORICAL_URL.format(year=year, month=month) for year in years for month in months] logger.info(f"Downloading {len(years) * len(months)} urls")
logging.info(f"Downloading {len(historical_data_extraction_urls)} urls") for year in years:
async with niquests.AsyncSession() as session: for month in months:
async with asyncio.TaskGroup() as tg: url = PSD_HISTORICAL_URL.format(year=year, month=month)
for url in historical_data_extraction_urls: target_path = output_dir / f"{year}"/f"{month:02d}" / "psd_alldata_csv.zip"
tg.create_task(extract_psd_file(url, session)) with niquests.Session() as session:
logger.info(f"Downloading psd_alldata_csv.zip for {year}/{month:02d}")
try:
extract_psd_file(url=url, http_session=session, extract_to_path=target_path)
except Exception as e:
logger.error("Error trying to download file. Likely the file does not exist", e)
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(extract_historical_psd_dataset()) extract_historical_psd_dataset()

View File

@@ -8,6 +8,7 @@ authors = [
] ]
requires-python = ">=3.13" requires-python = ">=3.13"
dependencies = [ dependencies = [
"pyarrow>=20.0.0",
"python-dotenv>=1.1.0", "python-dotenv>=1.1.0",
] ]

View File

@@ -1,9 +0,0 @@
AUDIT (
name assert_positive_order_ids,
);
SELECT *
FROM @this_model
WHERE
item_id < 0

View File

@@ -1,59 +0,0 @@
# --- Gateway Connection ---
gateways:
duckdb:
connection:
# For more information on configuring the connection to your execution engine, visit:
# https://sqlmesh.readthedocs.io/en/stable/reference/configuration/#connection
# https://sqlmesh.readthedocs.io/en/stable/integrations/engines/duckdb/#connection-options
type: duckdb
database: db.db
# concurrent_tasks: 1
# register_comments: True
# pre_ping: False
# pretty_sql: False
# catalogs:
# extensions:
# connector_config:
# secrets:
# filesystems:
# token:
default_gateway: duckdb
# --- Model Defaults ---
# https://sqlmesh.readthedocs.io/en/stable/reference/model_configuration/#model-defaults
model_defaults:
dialect: duckdb
start: 2025-07-07 # Start date for backfill history
cron: '@daily' # Run models daily at 12am UTC (can override per model)
# --- Linting Rules ---
# Enforce standards for your team
# https://sqlmesh.readthedocs.io/en/stable/guides/linter/
linter:
enabled: true
rules:
- ambiguousorinvalidcolumn
- invalidselectstarexpansion
# FLOW: Minimal prompts, automatic changes, summary output
# https://sqlmesh.readthedocs.io/en/stable/reference/configuration/#plan
plan:
no_diff: true # Hide detailed text differences for changed models
no_prompts: true # No interactive prompts
auto_apply: true # Apply changes automatically
# --- Optional: Set a default target environment ---
# This is intended for local development to prevent users from accidentally applying plans to the prod environment.
# It is a development only config and should NOT be committed to your git repo.
# https://sqlmesh.readthedocs.io/en/stable/guides/configuration/#default-target-environment
# Uncomment the following line to use a default target environment derived from the logged in user's name.
# default_target_environment: dev_{{ user() }}
# Example usage:
# sqlmesh plan # Automatically resolves to: sqlmesh plan dev_yourname
# sqlmesh plan prod # Specify `prod` to apply changes to production

Binary file not shown.

View File

@@ -1,15 +0,0 @@
MODEL (
name sqlmesh_example.full_model,
kind FULL,
cron '@daily',
grain item_id,
audits (assert_positive_order_ids),
);
SELECT
item_id,
COUNT(DISTINCT id) AS num_orders,
FROM
sqlmesh_example.incremental_model
GROUP BY item_id

View File

@@ -1,19 +0,0 @@
MODEL (
name sqlmesh_example.incremental_model,
kind INCREMENTAL_BY_TIME_RANGE (
time_column event_date
),
start '2020-01-01',
cron '@daily',
grain (id, event_date)
);
SELECT
id,
item_id,
event_date,
FROM
sqlmesh_example.seed_model
WHERE
event_date BETWEEN @start_date AND @end_date

View File

@@ -1,13 +0,0 @@
MODEL (
name sqlmesh_example.seed_model,
kind SEED (
path '../seeds/seed_data.csv'
),
columns (
id INTEGER,
item_id INTEGER,
event_date DATE
),
grain (id, event_date)
);

View File

@@ -1,16 +0,0 @@
[project]
name = "sqlmesh-duckdb"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
authors = [
{ name = "Deeman", email = "hendriknote@gmail.com" }
]
requires-python = ">=3.13"
dependencies = [
"sqlmesh>=0.200.0",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

View File

@@ -1,8 +0,0 @@
id,item_id,event_date
1,2,2020-01-01
2,1,2020-01-01
3,3,2020-01-03
4,1,2020-01-04
5,1,2020-01-05
6,1,2020-01-06
7,1,2020-01-07
1 id item_id event_date
2 1 2 2020-01-01
3 2 1 2020-01-01
4 3 3 2020-01-03
5 4 1 2020-01-04
6 5 1 2020-01-05
7 6 1 2020-01-06
8 7 1 2020-01-07

View File

@@ -1,19 +0,0 @@
test_example_full_model:
model: sqlmesh_example.full_model
inputs:
sqlmesh_example.incremental_model:
rows:
- id: 1
item_id: 1
- id: 2
item_id: 1
- id: 3
item_id: 2
outputs:
query:
rows:
- item_id: 1
num_orders: 2
- item_id: 2
num_orders: 1

38
uv.lock generated
View File

@@ -6,7 +6,7 @@ requires-python = ">=3.13"
members = [ members = [
"materia", "materia",
"psdonline", "psdonline",
"sqlmesh-duckdb", "sqlmesh-materia",
] ]
[[package]] [[package]]
@@ -516,6 +516,7 @@ name = "materia"
version = "0.1.0" version = "0.1.0"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "pyarrow" },
{ name = "python-dotenv" }, { name = "python-dotenv" },
] ]
@@ -529,7 +530,10 @@ exploration = [
] ]
[package.metadata] [package.metadata]
requires-dist = [{ name = "python-dotenv", specifier = ">=1.1.0" }] requires-dist = [
{ name = "pyarrow", specifier = ">=20.0.0" },
{ name = "python-dotenv", specifier = ">=1.1.0" },
]
[package.metadata.requires-dev] [package.metadata.requires-dev]
dev = [ dev = [
@@ -759,6 +763,32 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" },
] ]
[[package]]
name = "pyarrow"
version = "20.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a2/ee/a7810cb9f3d6e9238e61d312076a9859bf3668fd21c69744de9532383912/pyarrow-20.0.0.tar.gz", hash = "sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1", size = 1125187, upload-time = "2025-04-27T12:34:23.264Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/9b/aa/daa413b81446d20d4dad2944110dcf4cf4f4179ef7f685dd5a6d7570dc8e/pyarrow-20.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:a15532e77b94c61efadde86d10957950392999503b3616b2ffcef7621a002893", size = 30798501, upload-time = "2025-04-27T12:30:48.351Z" },
{ url = "https://files.pythonhosted.org/packages/ff/75/2303d1caa410925de902d32ac215dc80a7ce7dd8dfe95358c165f2adf107/pyarrow-20.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:dd43f58037443af715f34f1322c782ec463a3c8a94a85fdb2d987ceb5658e061", size = 32277895, upload-time = "2025-04-27T12:30:55.238Z" },
{ url = "https://files.pythonhosted.org/packages/92/41/fe18c7c0b38b20811b73d1bdd54b1fccba0dab0e51d2048878042d84afa8/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa0d288143a8585806e3cc7c39566407aab646fb9ece164609dac1cfff45f6ae", size = 41327322, upload-time = "2025-04-27T12:31:05.587Z" },
{ url = "https://files.pythonhosted.org/packages/da/ab/7dbf3d11db67c72dbf36ae63dcbc9f30b866c153b3a22ef728523943eee6/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6953f0114f8d6f3d905d98e987d0924dabce59c3cda380bdfaa25a6201563b4", size = 42411441, upload-time = "2025-04-27T12:31:15.675Z" },
{ url = "https://files.pythonhosted.org/packages/90/c3/0c7da7b6dac863af75b64e2f827e4742161128c350bfe7955b426484e226/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:991f85b48a8a5e839b2128590ce07611fae48a904cae6cab1f089c5955b57eb5", size = 40677027, upload-time = "2025-04-27T12:31:24.631Z" },
{ url = "https://files.pythonhosted.org/packages/be/27/43a47fa0ff9053ab5203bb3faeec435d43c0d8bfa40179bfd076cdbd4e1c/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:97c8dc984ed09cb07d618d57d8d4b67a5100a30c3818c2fb0b04599f0da2de7b", size = 42281473, upload-time = "2025-04-27T12:31:31.311Z" },
{ url = "https://files.pythonhosted.org/packages/bc/0b/d56c63b078876da81bbb9ba695a596eabee9b085555ed12bf6eb3b7cab0e/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9b71daf534f4745818f96c214dbc1e6124d7daf059167330b610fc69b6f3d3e3", size = 42893897, upload-time = "2025-04-27T12:31:39.406Z" },
{ url = "https://files.pythonhosted.org/packages/92/ac/7d4bd020ba9145f354012838692d48300c1b8fe5634bfda886abcada67ed/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e8b88758f9303fa5a83d6c90e176714b2fd3852e776fc2d7e42a22dd6c2fb368", size = 44543847, upload-time = "2025-04-27T12:31:45.997Z" },
{ url = "https://files.pythonhosted.org/packages/9d/07/290f4abf9ca702c5df7b47739c1b2c83588641ddfa2cc75e34a301d42e55/pyarrow-20.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:30b3051b7975801c1e1d387e17c588d8ab05ced9b1e14eec57915f79869b5031", size = 25653219, upload-time = "2025-04-27T12:31:54.11Z" },
{ url = "https://files.pythonhosted.org/packages/95/df/720bb17704b10bd69dde086e1400b8eefb8f58df3f8ac9cff6c425bf57f1/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:ca151afa4f9b7bc45bcc791eb9a89e90a9eb2772767d0b1e5389609c7d03db63", size = 30853957, upload-time = "2025-04-27T12:31:59.215Z" },
{ url = "https://files.pythonhosted.org/packages/d9/72/0d5f875efc31baef742ba55a00a25213a19ea64d7176e0fe001c5d8b6e9a/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:4680f01ecd86e0dd63e39eb5cd59ef9ff24a9d166db328679e36c108dc993d4c", size = 32247972, upload-time = "2025-04-27T12:32:05.369Z" },
{ url = "https://files.pythonhosted.org/packages/d5/bc/e48b4fa544d2eea72f7844180eb77f83f2030b84c8dad860f199f94307ed/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f4c8534e2ff059765647aa69b75d6543f9fef59e2cd4c6d18015192565d2b70", size = 41256434, upload-time = "2025-04-27T12:32:11.814Z" },
{ url = "https://files.pythonhosted.org/packages/c3/01/974043a29874aa2cf4f87fb07fd108828fc7362300265a2a64a94965e35b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e1f8a47f4b4ae4c69c4d702cfbdfe4d41e18e5c7ef6f1bb1c50918c1e81c57b", size = 42353648, upload-time = "2025-04-27T12:32:20.766Z" },
{ url = "https://files.pythonhosted.org/packages/68/95/cc0d3634cde9ca69b0e51cbe830d8915ea32dda2157560dda27ff3b3337b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a1f60dc14658efaa927f8214734f6a01a806d7690be4b3232ba526836d216122", size = 40619853, upload-time = "2025-04-27T12:32:28.1Z" },
{ url = "https://files.pythonhosted.org/packages/29/c2/3ad40e07e96a3e74e7ed7cc8285aadfa84eb848a798c98ec0ad009eb6bcc/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:204a846dca751428991346976b914d6d2a82ae5b8316a6ed99789ebf976551e6", size = 42241743, upload-time = "2025-04-27T12:32:35.792Z" },
{ url = "https://files.pythonhosted.org/packages/eb/cb/65fa110b483339add6a9bc7b6373614166b14e20375d4daa73483755f830/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f3b117b922af5e4c6b9a9115825726cac7d8b1421c37c2b5e24fbacc8930612c", size = 42839441, upload-time = "2025-04-27T12:32:46.64Z" },
{ url = "https://files.pythonhosted.org/packages/98/7b/f30b1954589243207d7a0fbc9997401044bf9a033eec78f6cb50da3f304a/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e724a3fd23ae5b9c010e7be857f4405ed5e679db5c93e66204db1a69f733936a", size = 44503279, upload-time = "2025-04-27T12:32:56.503Z" },
{ url = "https://files.pythonhosted.org/packages/37/40/ad395740cd641869a13bcf60851296c89624662575621968dcfafabaa7f6/pyarrow-20.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:82f1ee5133bd8f49d31be1299dc07f585136679666b502540db854968576faf9", size = 25944982, upload-time = "2025-04-27T12:33:04.72Z" },
]
[[package]] [[package]]
name = "pycparser" name = "pycparser"
version = "2.22" version = "2.22"
@@ -1126,9 +1156,9 @@ wheels = [
] ]
[[package]] [[package]]
name = "sqlmesh-duckdb" name = "sqlmesh-materia"
version = "0.1.0" version = "0.1.0"
source = { editable = "transform/sqlmesh-duckdb" } source = { editable = "transform/sqlmesh-materia" }
dependencies = [ dependencies = [
{ name = "sqlmesh" }, { name = "sqlmesh" },
] ]