finish historical extraction
This commit is contained in:
@@ -1,11 +1,8 @@
|
||||
import niquests
|
||||
import io
|
||||
import zipfile
|
||||
import pathlib
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
import asyncio
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -15,42 +12,45 @@ logging.basicConfig(
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
|
||||
logger = logging.getLogger("PSD Extraction")
|
||||
output_dir = pathlib.Path(__file__).parent / "data"
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
logging.info(f"Output dir: {output_dir}")
|
||||
|
||||
PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month}/psd_alldata_csv.zip"
|
||||
logger.info(f"Output dir: {output_dir}")
|
||||
#TODO: adapt to environment values, so this writes to s3 in prod
|
||||
PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month:02d}/psd_alldata_csv.zip"
|
||||
PSD_LATEST_URL = "https://apps.fas.usda.gov/psdonline/downloads/psd_alldata_csv.zip"
|
||||
FIRST_YEAR = 2006
|
||||
FIRST_MONTH = 8
|
||||
|
||||
async def extract_psd_file(url:str, http_session: niquests.AsyncSession):
|
||||
logging.info(f"Start downloading {url} ...")
|
||||
latest_data = await http_session.get(url)
|
||||
logging.info("Download done.")
|
||||
if latest_data.status_code != 200:
|
||||
logging.info(f"Status code not ok, STATUS={latest_data.status_code}")
|
||||
def extract_psd_file(url:str, extract_to_path: pathlib.Path, http_session: niquests.Session):
|
||||
logger.info(f"Start downloading {url} ...")
|
||||
response = http_session.get(url)
|
||||
if response.status_code != 200:
|
||||
logger.error(f"Status code not ok, STATUS={response.status_code}")
|
||||
return
|
||||
latest_buf=io.BytesIO()
|
||||
latest_buf.write(latest_data.content)
|
||||
latest_buf.seek(0)
|
||||
logger.info(f"Storing file to {extract_to_path}")
|
||||
extract_to_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
extract_to_path.write_bytes(response.content)
|
||||
logger.info("Download done.")
|
||||
|
||||
logging.info("Extracting Zipfile ...")
|
||||
zipfile.ZipFile(latest_buf).extract('psd_alldata.csv', output_dir)
|
||||
logging.info("Extracting Zipfile done.")
|
||||
|
||||
async def extract_historical_psd_dataset():
|
||||
def extract_historical_psd_dataset():
|
||||
today = datetime.now()
|
||||
years = list(range(FIRST_YEAR, today.year))
|
||||
months = list(range(1,13))
|
||||
historical_data_extraction_urls = [PSD_HISTORICAL_URL.format(year=year, month=month) for year in years for month in months]
|
||||
logging.info(f"Downloading {len(historical_data_extraction_urls)} urls")
|
||||
async with niquests.AsyncSession() as session:
|
||||
async with asyncio.TaskGroup() as tg:
|
||||
for url in historical_data_extraction_urls:
|
||||
tg.create_task(extract_psd_file(url, session))
|
||||
logger.info(f"Downloading {len(years) * len(months)} urls")
|
||||
for year in years:
|
||||
for month in months:
|
||||
url = PSD_HISTORICAL_URL.format(year=year, month=month)
|
||||
target_path = output_dir / f"{year}"/f"{month:02d}" / "psd_alldata_csv.zip"
|
||||
with niquests.Session() as session:
|
||||
logger.info(f"Downloading psd_alldata_csv.zip for {year}/{month:02d}")
|
||||
try:
|
||||
extract_psd_file(url=url, http_session=session, extract_to_path=target_path)
|
||||
except Exception as e:
|
||||
logger.error("Error trying to download file. Likely the file does not exist", e)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(extract_historical_psd_dataset())
|
||||
extract_historical_psd_dataset()
|
||||
|
||||
@@ -8,6 +8,7 @@ authors = [
|
||||
]
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"pyarrow>=20.0.0",
|
||||
"python-dotenv>=1.1.0",
|
||||
]
|
||||
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
AUDIT (
|
||||
name assert_positive_order_ids,
|
||||
);
|
||||
|
||||
SELECT *
|
||||
FROM @this_model
|
||||
WHERE
|
||||
item_id < 0
|
||||
|
||||
@@ -1,59 +0,0 @@
|
||||
# --- Gateway Connection ---
|
||||
gateways:
|
||||
duckdb:
|
||||
connection:
|
||||
# For more information on configuring the connection to your execution engine, visit:
|
||||
# https://sqlmesh.readthedocs.io/en/stable/reference/configuration/#connection
|
||||
# https://sqlmesh.readthedocs.io/en/stable/integrations/engines/duckdb/#connection-options
|
||||
type: duckdb
|
||||
database: db.db
|
||||
# concurrent_tasks: 1
|
||||
# register_comments: True
|
||||
# pre_ping: False
|
||||
# pretty_sql: False
|
||||
# catalogs:
|
||||
# extensions:
|
||||
# connector_config:
|
||||
# secrets:
|
||||
# filesystems:
|
||||
# token:
|
||||
|
||||
default_gateway: duckdb
|
||||
|
||||
# --- Model Defaults ---
|
||||
# https://sqlmesh.readthedocs.io/en/stable/reference/model_configuration/#model-defaults
|
||||
|
||||
model_defaults:
|
||||
dialect: duckdb
|
||||
start: 2025-07-07 # Start date for backfill history
|
||||
cron: '@daily' # Run models daily at 12am UTC (can override per model)
|
||||
|
||||
# --- Linting Rules ---
|
||||
# Enforce standards for your team
|
||||
# https://sqlmesh.readthedocs.io/en/stable/guides/linter/
|
||||
|
||||
linter:
|
||||
enabled: true
|
||||
rules:
|
||||
- ambiguousorinvalidcolumn
|
||||
- invalidselectstarexpansion
|
||||
|
||||
# FLOW: Minimal prompts, automatic changes, summary output
|
||||
# https://sqlmesh.readthedocs.io/en/stable/reference/configuration/#plan
|
||||
|
||||
plan:
|
||||
no_diff: true # Hide detailed text differences for changed models
|
||||
no_prompts: true # No interactive prompts
|
||||
auto_apply: true # Apply changes automatically
|
||||
|
||||
# --- Optional: Set a default target environment ---
|
||||
# This is intended for local development to prevent users from accidentally applying plans to the prod environment.
|
||||
# It is a development only config and should NOT be committed to your git repo.
|
||||
# https://sqlmesh.readthedocs.io/en/stable/guides/configuration/#default-target-environment
|
||||
|
||||
# Uncomment the following line to use a default target environment derived from the logged in user's name.
|
||||
# default_target_environment: dev_{{ user() }}
|
||||
|
||||
# Example usage:
|
||||
# sqlmesh plan # Automatically resolves to: sqlmesh plan dev_yourname
|
||||
# sqlmesh plan prod # Specify `prod` to apply changes to production
|
||||
Binary file not shown.
@@ -1,15 +0,0 @@
|
||||
MODEL (
|
||||
name sqlmesh_example.full_model,
|
||||
kind FULL,
|
||||
cron '@daily',
|
||||
grain item_id,
|
||||
audits (assert_positive_order_ids),
|
||||
);
|
||||
|
||||
SELECT
|
||||
item_id,
|
||||
COUNT(DISTINCT id) AS num_orders,
|
||||
FROM
|
||||
sqlmesh_example.incremental_model
|
||||
GROUP BY item_id
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
MODEL (
|
||||
name sqlmesh_example.incremental_model,
|
||||
kind INCREMENTAL_BY_TIME_RANGE (
|
||||
time_column event_date
|
||||
),
|
||||
start '2020-01-01',
|
||||
cron '@daily',
|
||||
grain (id, event_date)
|
||||
);
|
||||
|
||||
SELECT
|
||||
id,
|
||||
item_id,
|
||||
event_date,
|
||||
FROM
|
||||
sqlmesh_example.seed_model
|
||||
WHERE
|
||||
event_date BETWEEN @start_date AND @end_date
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
MODEL (
|
||||
name sqlmesh_example.seed_model,
|
||||
kind SEED (
|
||||
path '../seeds/seed_data.csv'
|
||||
),
|
||||
columns (
|
||||
id INTEGER,
|
||||
item_id INTEGER,
|
||||
event_date DATE
|
||||
),
|
||||
grain (id, event_date)
|
||||
);
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
[project]
|
||||
name = "sqlmesh-duckdb"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
authors = [
|
||||
{ name = "Deeman", email = "hendriknote@gmail.com" }
|
||||
]
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"sqlmesh>=0.200.0",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
@@ -1,8 +0,0 @@
|
||||
id,item_id,event_date
|
||||
1,2,2020-01-01
|
||||
2,1,2020-01-01
|
||||
3,3,2020-01-03
|
||||
4,1,2020-01-04
|
||||
5,1,2020-01-05
|
||||
6,1,2020-01-06
|
||||
7,1,2020-01-07
|
||||
|
@@ -1,19 +0,0 @@
|
||||
test_example_full_model:
|
||||
model: sqlmesh_example.full_model
|
||||
inputs:
|
||||
sqlmesh_example.incremental_model:
|
||||
rows:
|
||||
- id: 1
|
||||
item_id: 1
|
||||
- id: 2
|
||||
item_id: 1
|
||||
- id: 3
|
||||
item_id: 2
|
||||
outputs:
|
||||
query:
|
||||
rows:
|
||||
- item_id: 1
|
||||
num_orders: 2
|
||||
- item_id: 2
|
||||
num_orders: 1
|
||||
|
||||
38
uv.lock
generated
38
uv.lock
generated
@@ -6,7 +6,7 @@ requires-python = ">=3.13"
|
||||
members = [
|
||||
"materia",
|
||||
"psdonline",
|
||||
"sqlmesh-duckdb",
|
||||
"sqlmesh-materia",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -516,6 +516,7 @@ name = "materia"
|
||||
version = "0.1.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "pyarrow" },
|
||||
{ name = "python-dotenv" },
|
||||
]
|
||||
|
||||
@@ -529,7 +530,10 @@ exploration = [
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [{ name = "python-dotenv", specifier = ">=1.1.0" }]
|
||||
requires-dist = [
|
||||
{ name = "pyarrow", specifier = ">=20.0.0" },
|
||||
{ name = "python-dotenv", specifier = ">=1.1.0" },
|
||||
]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [
|
||||
@@ -759,6 +763,32 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyarrow"
|
||||
version = "20.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a2/ee/a7810cb9f3d6e9238e61d312076a9859bf3668fd21c69744de9532383912/pyarrow-20.0.0.tar.gz", hash = "sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1", size = 1125187, upload-time = "2025-04-27T12:34:23.264Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/9b/aa/daa413b81446d20d4dad2944110dcf4cf4f4179ef7f685dd5a6d7570dc8e/pyarrow-20.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:a15532e77b94c61efadde86d10957950392999503b3616b2ffcef7621a002893", size = 30798501, upload-time = "2025-04-27T12:30:48.351Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ff/75/2303d1caa410925de902d32ac215dc80a7ce7dd8dfe95358c165f2adf107/pyarrow-20.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:dd43f58037443af715f34f1322c782ec463a3c8a94a85fdb2d987ceb5658e061", size = 32277895, upload-time = "2025-04-27T12:30:55.238Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/92/41/fe18c7c0b38b20811b73d1bdd54b1fccba0dab0e51d2048878042d84afa8/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa0d288143a8585806e3cc7c39566407aab646fb9ece164609dac1cfff45f6ae", size = 41327322, upload-time = "2025-04-27T12:31:05.587Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/da/ab/7dbf3d11db67c72dbf36ae63dcbc9f30b866c153b3a22ef728523943eee6/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6953f0114f8d6f3d905d98e987d0924dabce59c3cda380bdfaa25a6201563b4", size = 42411441, upload-time = "2025-04-27T12:31:15.675Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/90/c3/0c7da7b6dac863af75b64e2f827e4742161128c350bfe7955b426484e226/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:991f85b48a8a5e839b2128590ce07611fae48a904cae6cab1f089c5955b57eb5", size = 40677027, upload-time = "2025-04-27T12:31:24.631Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/be/27/43a47fa0ff9053ab5203bb3faeec435d43c0d8bfa40179bfd076cdbd4e1c/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:97c8dc984ed09cb07d618d57d8d4b67a5100a30c3818c2fb0b04599f0da2de7b", size = 42281473, upload-time = "2025-04-27T12:31:31.311Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/bc/0b/d56c63b078876da81bbb9ba695a596eabee9b085555ed12bf6eb3b7cab0e/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9b71daf534f4745818f96c214dbc1e6124d7daf059167330b610fc69b6f3d3e3", size = 42893897, upload-time = "2025-04-27T12:31:39.406Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/92/ac/7d4bd020ba9145f354012838692d48300c1b8fe5634bfda886abcada67ed/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e8b88758f9303fa5a83d6c90e176714b2fd3852e776fc2d7e42a22dd6c2fb368", size = 44543847, upload-time = "2025-04-27T12:31:45.997Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9d/07/290f4abf9ca702c5df7b47739c1b2c83588641ddfa2cc75e34a301d42e55/pyarrow-20.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:30b3051b7975801c1e1d387e17c588d8ab05ced9b1e14eec57915f79869b5031", size = 25653219, upload-time = "2025-04-27T12:31:54.11Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/95/df/720bb17704b10bd69dde086e1400b8eefb8f58df3f8ac9cff6c425bf57f1/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:ca151afa4f9b7bc45bcc791eb9a89e90a9eb2772767d0b1e5389609c7d03db63", size = 30853957, upload-time = "2025-04-27T12:31:59.215Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d9/72/0d5f875efc31baef742ba55a00a25213a19ea64d7176e0fe001c5d8b6e9a/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:4680f01ecd86e0dd63e39eb5cd59ef9ff24a9d166db328679e36c108dc993d4c", size = 32247972, upload-time = "2025-04-27T12:32:05.369Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/bc/e48b4fa544d2eea72f7844180eb77f83f2030b84c8dad860f199f94307ed/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f4c8534e2ff059765647aa69b75d6543f9fef59e2cd4c6d18015192565d2b70", size = 41256434, upload-time = "2025-04-27T12:32:11.814Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c3/01/974043a29874aa2cf4f87fb07fd108828fc7362300265a2a64a94965e35b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e1f8a47f4b4ae4c69c4d702cfbdfe4d41e18e5c7ef6f1bb1c50918c1e81c57b", size = 42353648, upload-time = "2025-04-27T12:32:20.766Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/68/95/cc0d3634cde9ca69b0e51cbe830d8915ea32dda2157560dda27ff3b3337b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a1f60dc14658efaa927f8214734f6a01a806d7690be4b3232ba526836d216122", size = 40619853, upload-time = "2025-04-27T12:32:28.1Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/29/c2/3ad40e07e96a3e74e7ed7cc8285aadfa84eb848a798c98ec0ad009eb6bcc/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:204a846dca751428991346976b914d6d2a82ae5b8316a6ed99789ebf976551e6", size = 42241743, upload-time = "2025-04-27T12:32:35.792Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/eb/cb/65fa110b483339add6a9bc7b6373614166b14e20375d4daa73483755f830/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f3b117b922af5e4c6b9a9115825726cac7d8b1421c37c2b5e24fbacc8930612c", size = 42839441, upload-time = "2025-04-27T12:32:46.64Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/98/7b/f30b1954589243207d7a0fbc9997401044bf9a033eec78f6cb50da3f304a/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e724a3fd23ae5b9c010e7be857f4405ed5e679db5c93e66204db1a69f733936a", size = 44503279, upload-time = "2025-04-27T12:32:56.503Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/37/40/ad395740cd641869a13bcf60851296c89624662575621968dcfafabaa7f6/pyarrow-20.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:82f1ee5133bd8f49d31be1299dc07f585136679666b502540db854968576faf9", size = 25944982, upload-time = "2025-04-27T12:33:04.72Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pycparser"
|
||||
version = "2.22"
|
||||
@@ -1126,9 +1156,9 @@ wheels = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sqlmesh-duckdb"
|
||||
name = "sqlmesh-materia"
|
||||
version = "0.1.0"
|
||||
source = { editable = "transform/sqlmesh-duckdb" }
|
||||
source = { editable = "transform/sqlmesh-materia" }
|
||||
dependencies = [
|
||||
{ name = "sqlmesh" },
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user