simplify using etags

This commit is contained in:
Deeman
2025-07-26 22:08:35 +02:00
parent bd65ddcac8
commit 4fd1b96114
3 changed files with 55 additions and 22 deletions

View File

@@ -9,6 +9,7 @@ authors = [
requires-python = ">=3.13" requires-python = ">=3.13"
dependencies = [ dependencies = [
"niquests>=3.14.1", "niquests>=3.14.1",
"pendulum>=3.1.0",
] ]
[build-system] [build-system]

View File

@@ -3,6 +3,7 @@ import pathlib
import logging import logging
import sys import sys
from datetime import datetime from datetime import datetime
import pendulum
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
@@ -12,37 +13,39 @@ logging.basicConfig(
logging.StreamHandler(sys.stdout) logging.StreamHandler(sys.stdout)
] ]
) )
logger = logging.getLogger("PSD Extraction") logger = logging.getLogger("PSDOnline Extractor")
output_dir = pathlib.Path(__file__).parent / "data" OUTPUT_DIR = pathlib.Path(__file__).parent / "data"
output_dir.mkdir(parents=True, exist_ok=True) OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Output dir: {output_dir}") logger.info(f"Output dir: {OUTPUT_DIR}")
#TODO: adapt to environment values, so this writes to s3 in prod #TODO: adapt to environment values, so this writes to s3 in prod
PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month:02d}/psd_alldata_csv.zip" PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month:02d}/psd_alldata_csv.zip"
PSD_LATEST_URL = "https://apps.fas.usda.gov/psdonline/downloads/psd_alldata_csv.zip"
FIRST_YEAR = 2006 FIRST_YEAR = 2006
FIRST_MONTH = 8 FIRST_MONTH = 8
def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niquests.Session) -> None:
    """Download the archive at ``url`` unless a copy with the same ETag is already stored.

    A HEAD request is sent first so the server's ETag can be compared with the
    ``<etag>.zip`` files found anywhere below ``OUTPUT_DIR``. The body is only
    fetched with a GET request when no file with that name exists yet.

    Args:
        url: Location of the zip archive to download.
        extract_to_path: Directory in which the archive is stored as
            ``<etag>.zip``; it is created if missing.
        http_session: Session used for both the HEAD and the GET request.

    Returns:
        None. A non-200 status or a missing ETag header is logged and the
        function returns without writing anything.
    """
    logger.info(f"Requesting file {url} ...")
    head_response = http_session.head(url)
    if head_response.status_code == 404:
        logger.error("File doesn't exist on server, received status code 404 Not Found")
        return
    if head_response.status_code != 200:
        logger.error(f"Status code not ok, STATUS={head_response.status_code}")
        return

    raw_etag = head_response.headers.get("etag")
    if not raw_etag:
        # Without an ETag there is no identity to deduplicate on or to name the file after.
        logger.error(f"No ETag header received for {url}, skipping download")
        return

    # Turn the ETag into a single, safe file name: drop the weak-validator
    # prefix and the surrounding quotes, and replace characters that would
    # otherwise act as path separators (or are illegal on some filesystems).
    # A separator inside the name would nest the file in a sub-directory and
    # its stem would never match the ETag again, defeating the check below.
    etag = raw_etag.removeprefix("W/").translate(
        str.maketrans({'"': None, ":": "_", "/": "_", "\\": "_"})
    )

    if any(archive.stem == etag for archive in OUTPUT_DIR.rglob("*.zip")):
        logger.info(f"File with etag {etag} already extracted, skipping download")
        return

    response = http_session.get(url)
    if response.status_code != 200:
        # Never persist an error body as if it were the archive.
        logger.error(f"Status code not ok, STATUS={response.status_code}")
        return

    target_file = extract_to_path / f"{etag}.zip"
    logger.info(f"Storing file to {target_file}")
    target_file.parent.mkdir(parents=True, exist_ok=True)
    target_file.write_bytes(response.content)
    logger.info("Download done.")
def extraction_status() -> list[tuple[int, ...]]:
    """Return the (year, month) pairs for which a zip archive is already stored.

    Every ``*.zip`` file below ``output_dir`` is looked at, and the names of
    the two directories directly above it are converted to integers.

    Returns:
        One tuple per zip file found, built from its two parent directory names.

    Raises:
        ValueError: If one of those two directory names is not an integer.
    """
    # Path.parts is used instead of splitting str(path) on "/" so the lookup
    # does not depend on the platform's path separator.
    return [
        tuple(int(part) for part in archive.parts[-3:-1])
        for archive in output_dir.rglob("*.zip")
    ]
def extract_historical_psd_dataset(): def extract_psd_dataset():
status = extraction_status()
today = datetime.now() today = datetime.now()
years = list(range(FIRST_YEAR, today.year+1)) years = list(range(FIRST_YEAR, today.year+1))
for year in years: for year in years:
@@ -54,18 +57,20 @@ def extract_historical_psd_dataset():
months = list(range(1,13)) months = list(range(1,13))
for month in months: for month in months:
if (year, month) in status:
continue
url = PSD_HISTORICAL_URL.format(year=year, month=month) url = PSD_HISTORICAL_URL.format(year=year, month=month)
target_path = output_dir / f"{year}"/f"{month:02d}" / "psd_alldata_csv.zip" target_dir = OUTPUT_DIR / f"{year}"/f"{month:02d}"
with niquests.Session() as session: with niquests.Session() as session:
logger.info(f"Downloading psd_alldata_csv.zip for {year}/{month:02d}") extract_psd_file(url=url, http_session=session, extract_to_path=target_dir)
try:
extract_psd_file(url=url, http_session=session, extract_to_path=target_path)
except Exception as e:
logger.error("Error trying to download file. Likely the file does not exist", e)
def parse_last_modified(last_modified: str) -> pendulum.DateTime:
    """Parse a ``Last-Modified`` style header value into a pendulum ``DateTime``.

    The value is expected to look like ``"<weekday>, <day> <month> <year>
    <HH:mm:ss> <timezone>"``, e.g. ``"Sat, 26 Jul 2025 20:08:35 GMT"``.

    Args:
        last_modified: Header value to parse.

    Returns:
        The parsed timestamp in the timezone named by the last field.

    Raises:
        IndexError: If the value contains no comma.
        ValueError: If the part after the comma does not consist of exactly
            five whitespace-separated fields, or the fields cannot be parsed.
    """
    # Drop the leading weekday name; everything needed follows the comma.
    date_part = last_modified.split(",")[1]
    # split() without an argument tolerates repeated or surrounding whitespace.
    day, month, year, time, timezone = date_part.split()
    return pendulum.from_format(
        f"{year}-{month}-{day}T{time}",
        fmt="YYYY-MMM-DDTHH:mm:ss",
        tz=timezone,
    )
if __name__ == "__main__": if __name__ == "__main__":
extract_historical_psd_dataset() extract_psd_dataset()

29
uv.lock generated
View File

@@ -689,6 +689,29 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" }, { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" },
] ]
[[package]]
name = "pendulum"
version = "3.1.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "python-dateutil" },
{ name = "tzdata" },
]
sdist = { url = "https://files.pythonhosted.org/packages/23/7c/009c12b86c7cc6c403aec80f8a4308598dfc5995e5c523a5491faaa3952e/pendulum-3.1.0.tar.gz", hash = "sha256:66f96303560f41d097bee7d2dc98ffca716fbb3a832c4b3062034c2d45865015", size = 85930, upload-time = "2025-04-19T14:30:01.675Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8e/1f/af928ba4aa403dac9569f787adcf024005e7654433d71f7a84e608716837/pendulum-3.1.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:28658b0baf4b30eb31d096a375983cfed033e60c0a7bbe94fa23f06cd779b50b", size = 336209, upload-time = "2025-04-19T14:01:42.775Z" },
{ url = "https://files.pythonhosted.org/packages/b6/16/b010643007ba964c397da7fa622924423883c1bbff1a53f9d1022cd7f024/pendulum-3.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b114dcb99ce511cb8f5495c7b6f0056b2c3dba444ef1ea6e48030d7371bd531a", size = 323132, upload-time = "2025-04-19T14:01:44.577Z" },
{ url = "https://files.pythonhosted.org/packages/64/19/c3c47aeecb5d9bceb0e89faafd800d39809b696c5b7bba8ec8370ad5052c/pendulum-3.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2404a6a54c80252ea393291f0b7f35525a61abae3d795407f34e118a8f133a18", size = 341509, upload-time = "2025-04-19T14:01:46.084Z" },
{ url = "https://files.pythonhosted.org/packages/38/cf/c06921ff6b860ff7e62e70b8e5d4dc70e36f5abb66d168bd64d51760bc4e/pendulum-3.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d06999790d9ee9962a1627e469f98568bf7ad1085553fa3c30ed08b3944a14d7", size = 378674, upload-time = "2025-04-19T14:01:47.727Z" },
{ url = "https://files.pythonhosted.org/packages/62/0b/a43953b9eba11e82612b033ac5133f716f1b76b6108a65da6f408b3cc016/pendulum-3.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94751c52f6b7c306734d1044c2c6067a474237e1e5afa2f665d1fbcbbbcf24b3", size = 436133, upload-time = "2025-04-19T14:01:49.126Z" },
{ url = "https://files.pythonhosted.org/packages/eb/a0/ec3d70b3b96e23ae1d039f132af35e17704c22a8250d1887aaefea4d78a6/pendulum-3.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5553ac27be05e997ec26d7f004cf72788f4ce11fe60bb80dda604a64055b29d0", size = 351232, upload-time = "2025-04-19T14:01:50.575Z" },
{ url = "https://files.pythonhosted.org/packages/f4/97/aba23f1716b82f6951ba2b1c9178a2d107d1e66c102762a9bf19988547ea/pendulum-3.1.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:f8dee234ca6142bf0514368d01a72945a44685aaa2fc4c14c98d09da9437b620", size = 521563, upload-time = "2025-04-19T14:01:51.9Z" },
{ url = "https://files.pythonhosted.org/packages/01/33/2c0d5216cc53d16db0c4b3d510f141ee0a540937f8675948541190fbd48b/pendulum-3.1.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7378084fe54faab4ee481897a00b710876f2e901ded6221671e827a253e643f2", size = 523221, upload-time = "2025-04-19T14:01:53.275Z" },
{ url = "https://files.pythonhosted.org/packages/51/89/8de955c339c31aeae77fd86d3225509b998c81875e9dba28cb88b8cbf4b3/pendulum-3.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:8539db7ae2c8da430ac2515079e288948c8ebf7eb1edd3e8281b5cdf433040d6", size = 260501, upload-time = "2025-04-19T14:01:54.749Z" },
{ url = "https://files.pythonhosted.org/packages/15/c3/226a3837363e94f8722461848feec18bfdd7d5172564d53aa3c3397ff01e/pendulum-3.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:1ce26a608e1f7387cd393fba2a129507c4900958d4f47b90757ec17656856571", size = 253087, upload-time = "2025-04-19T14:01:55.998Z" },
{ url = "https://files.pythonhosted.org/packages/6e/23/e98758924d1b3aac11a626268eabf7f3cf177e7837c28d47bf84c64532d0/pendulum-3.1.0-py3-none-any.whl", hash = "sha256:f9178c2a8e291758ade1e8dd6371b1d26d08371b4c7730a6e9a3ef8b16ebae0f", size = 111799, upload-time = "2025-04-19T14:02:34.739Z" },
]
[[package]] [[package]]
name = "pexpect" name = "pexpect"
version = "4.9.0" version = "4.9.0"
@@ -744,10 +767,14 @@ version = "0.1.0"
source = { editable = "extract/psdonline" } source = { editable = "extract/psdonline" }
dependencies = [ dependencies = [
{ name = "niquests" }, { name = "niquests" },
{ name = "pendulum" },
] ]
[package.metadata] [package.metadata]
requires-dist = [{ name = "niquests", specifier = ">=3.14.1" }] requires-dist = [
{ name = "niquests", specifier = ">=3.14.1" },
{ name = "pendulum", specifier = ">=3.1.0" },
]
[[package]] [[package]]
name = "psutil" name = "psutil"