From 4fd1b96114a2d94cccb2b917001ba42c3480d5de Mon Sep 17 00:00:00 2001 From: Deeman Date: Sat, 26 Jul 2025 22:08:35 +0200 Subject: [PATCH] simplify using etags --- extract/psdonline/pyproject.toml | 1 + extract/psdonline/src/psdonline/execute.py | 47 ++++++++++++---------- uv.lock | 29 ++++++++++++- 3 files changed, 55 insertions(+), 22 deletions(-) diff --git a/extract/psdonline/pyproject.toml b/extract/psdonline/pyproject.toml index 309147c..9b371de 100644 --- a/extract/psdonline/pyproject.toml +++ b/extract/psdonline/pyproject.toml @@ -9,6 +9,7 @@ authors = [ requires-python = ">=3.13" dependencies = [ "niquests>=3.14.1", + "pendulum>=3.1.0", ] [build-system] diff --git a/extract/psdonline/src/psdonline/execute.py b/extract/psdonline/src/psdonline/execute.py index c41bcd2..7425621 100644 --- a/extract/psdonline/src/psdonline/execute.py +++ b/extract/psdonline/src/psdonline/execute.py @@ -3,6 +3,7 @@ import pathlib import logging import sys from datetime import datetime +import pendulum logging.basicConfig( level=logging.INFO, @@ -12,37 +13,39 @@ logging.basicConfig( logging.StreamHandler(sys.stdout) ] ) -logger = logging.getLogger("PSD Extraction") -output_dir = pathlib.Path(__file__).parent / "data" -output_dir.mkdir(parents=True, exist_ok=True) -logger.info(f"Output dir: {output_dir}") +logger = logging.getLogger("PSDOnline Extractor") +OUTPUT_DIR = pathlib.Path(__file__).parent / "data" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) +logger.info(f"Output dir: {OUTPUT_DIR}") #TODO: adapt to environment values, so this writes to s3 in prod PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month:02d}/psd_alldata_csv.zip" -PSD_LATEST_URL = "https://apps.fas.usda.gov/psdonline/downloads/psd_alldata_csv.zip" FIRST_YEAR = 2006 FIRST_MONTH = 8 def extract_psd_file(url:str, extract_to_path: pathlib.Path, http_session: niquests.Session): logger.info(f"Requesting file {url} ...") - response = http_session.get(url) + extracted_etags = list(map(lambda file: file.stem, OUTPUT_DIR.rglob("*.zip"))) + + response = http_session.head(url) if response.status_code == 404: logger.error("File doesn't exist on server, received status code 404 Not Found") return elif response.status_code != 200: logger.error(f"Status code not ok, STATUS={response.status_code}") return + etag = response.headers.get("etag").replace('"',"").replace(":","_") + if etag in extracted_etags: + return + else: + response = http_session.get(url) + extract_to_path = extract_to_path / f"{etag}.zip" logger.info(f"Storing file to {extract_to_path}") extract_to_path.parent.mkdir(parents=True, exist_ok=True) extract_to_path.write_bytes(response.content) logger.info("Download done.") -def extraction_status(): - extracted_files = list(output_dir.rglob("*.zip")) - extracted_months = [tuple(map(int, str(file).split("/")[-3:-1])) for file in extracted_files] - return extracted_months -def extract_historical_psd_dataset(): - status = extraction_status() +def extract_psd_dataset(): today = datetime.now() years = list(range(FIRST_YEAR, today.year+1)) for year in years: @@ -54,18 +57,20 @@ def extract_historical_psd_dataset(): months = list(range(1,13)) for month in months: - if (year, month) in status: - continue url = PSD_HISTORICAL_URL.format(year=year, month=month) - target_path = output_dir / f"{year}"/f"{month:02d}" / "psd_alldata_csv.zip" + target_dir = OUTPUT_DIR / f"{year}"/f"{month:02d}" with niquests.Session() as session: - logger.info(f"Downloading psd_alldata_csv.zip for {year}/{month:02d}") - try: - extract_psd_file(url=url, http_session=session, extract_to_path=target_path) - except Exception as e: - logger.error("Error trying to download file. Likely the file does not exist", e) + extract_psd_file(url=url, http_session=session, extract_to_path=target_dir) +def parse_last_modified(last_modified:str) -> pendulum.datetime: + last_modified = last_modified.split(",")[1].strip() + day, month, year, time, timezone = last_modified.split(" ") + last_modified = f"{year}-{month}-{day}T{time}" + last_modified = pendulum.from_format(last_modified, fmt="YYYY-MMM-DDTHH:mm:ss", tz=timezone) + return last_modified if __name__ == "__main__": - extract_historical_psd_dataset() + extract_psd_dataset() + + diff --git a/uv.lock b/uv.lock index 79a9ac0..457ee04 100644 --- a/uv.lock +++ b/uv.lock @@ -689,6 +689,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" }, ] +[[package]] +name = "pendulum" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "python-dateutil" }, + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/23/7c/009c12b86c7cc6c403aec80f8a4308598dfc5995e5c523a5491faaa3952e/pendulum-3.1.0.tar.gz", hash = "sha256:66f96303560f41d097bee7d2dc98ffca716fbb3a832c4b3062034c2d45865015", size = 85930, upload-time = "2025-04-19T14:30:01.675Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/1f/af928ba4aa403dac9569f787adcf024005e7654433d71f7a84e608716837/pendulum-3.1.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:28658b0baf4b30eb31d096a375983cfed033e60c0a7bbe94fa23f06cd779b50b", size = 336209, upload-time = "2025-04-19T14:01:42.775Z" }, + { url = "https://files.pythonhosted.org/packages/b6/16/b010643007ba964c397da7fa622924423883c1bbff1a53f9d1022cd7f024/pendulum-3.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b114dcb99ce511cb8f5495c7b6f0056b2c3dba444ef1ea6e48030d7371bd531a", size = 323132, upload-time = "2025-04-19T14:01:44.577Z" }, + { url = "https://files.pythonhosted.org/packages/64/19/c3c47aeecb5d9bceb0e89faafd800d39809b696c5b7bba8ec8370ad5052c/pendulum-3.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2404a6a54c80252ea393291f0b7f35525a61abae3d795407f34e118a8f133a18", size = 341509, upload-time = "2025-04-19T14:01:46.084Z" }, + { url = "https://files.pythonhosted.org/packages/38/cf/c06921ff6b860ff7e62e70b8e5d4dc70e36f5abb66d168bd64d51760bc4e/pendulum-3.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d06999790d9ee9962a1627e469f98568bf7ad1085553fa3c30ed08b3944a14d7", size = 378674, upload-time = "2025-04-19T14:01:47.727Z" }, + { url = "https://files.pythonhosted.org/packages/62/0b/a43953b9eba11e82612b033ac5133f716f1b76b6108a65da6f408b3cc016/pendulum-3.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94751c52f6b7c306734d1044c2c6067a474237e1e5afa2f665d1fbcbbbcf24b3", size = 436133, upload-time = "2025-04-19T14:01:49.126Z" }, + { url = "https://files.pythonhosted.org/packages/eb/a0/ec3d70b3b96e23ae1d039f132af35e17704c22a8250d1887aaefea4d78a6/pendulum-3.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5553ac27be05e997ec26d7f004cf72788f4ce11fe60bb80dda604a64055b29d0", size = 351232, upload-time = "2025-04-19T14:01:50.575Z" }, + { url = "https://files.pythonhosted.org/packages/f4/97/aba23f1716b82f6951ba2b1c9178a2d107d1e66c102762a9bf19988547ea/pendulum-3.1.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:f8dee234ca6142bf0514368d01a72945a44685aaa2fc4c14c98d09da9437b620", size = 521563, upload-time = "2025-04-19T14:01:51.9Z" }, + { url = "https://files.pythonhosted.org/packages/01/33/2c0d5216cc53d16db0c4b3d510f141ee0a540937f8675948541190fbd48b/pendulum-3.1.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7378084fe54faab4ee481897a00b710876f2e901ded6221671e827a253e643f2", size = 523221, upload-time = "2025-04-19T14:01:53.275Z" }, + { url = "https://files.pythonhosted.org/packages/51/89/8de955c339c31aeae77fd86d3225509b998c81875e9dba28cb88b8cbf4b3/pendulum-3.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:8539db7ae2c8da430ac2515079e288948c8ebf7eb1edd3e8281b5cdf433040d6", size = 260501, upload-time = "2025-04-19T14:01:54.749Z" }, + { url = "https://files.pythonhosted.org/packages/15/c3/226a3837363e94f8722461848feec18bfdd7d5172564d53aa3c3397ff01e/pendulum-3.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:1ce26a608e1f7387cd393fba2a129507c4900958d4f47b90757ec17656856571", size = 253087, upload-time = "2025-04-19T14:01:55.998Z" }, + { url = "https://files.pythonhosted.org/packages/6e/23/e98758924d1b3aac11a626268eabf7f3cf177e7837c28d47bf84c64532d0/pendulum-3.1.0-py3-none-any.whl", hash = "sha256:f9178c2a8e291758ade1e8dd6371b1d26d08371b4c7730a6e9a3ef8b16ebae0f", size = 111799, upload-time = "2025-04-19T14:02:34.739Z" }, +] + [[package]] name = "pexpect" version = "4.9.0" @@ -744,10 +767,14 @@ version = "0.1.0" source = { editable = "extract/psdonline" } dependencies = [ { name = "niquests" }, + { name = "pendulum" }, ] [package.metadata] -requires-dist = [{ name = "niquests", specifier = ">=3.14.1" }] +requires-dist = [ + { name = "niquests", specifier = ">=3.14.1" }, + { name = "pendulum", specifier = ">=3.1.0" }, +] [[package]] name = "psutil"