Refactor PSD extraction: simplify to latest-only + add R2 support
## Key Changes
1. **Simplified extraction logic**
- Changed from downloading 220+ historical archives to checking only latest available month
- Tries the current month and falls back up to 3 months earlier (handles USDA publication lag)
- Architecture advisor insight: ETags naturally deduplicate downloads, so the historical year/month directory structure was unnecessary
2. **Flat storage structure**
- Old: `data/{year}/{month}/{etag}.zip`
- New: `data/{etag}.zip` (local) or `psd/{etag}.zip` (R2)
- Migrated 226 existing files to flat structure
3. **Dual storage modes**
- **Local mode**: Downloads to local directory (development)
- **R2 mode**: Uploads to Cloudflare R2 (production)
- Mode determined by presence of R2 environment variables
- Added boto3 dependency for S3-compatible R2 API
4. **Updated raw SQLMesh model**
- Changed pattern from `**/*.zip` to `*.zip` to match flat structure
## Benefits
- Simpler: a single latest-file check instead of 220+ URL attempts
- Efficient: ETag-based deduplication works naturally
- Flexible: Supports both local dev and production R2 storage
- Maintainable: Removed unnecessary complexity
## Testing
- ✅ Local extraction works and respects ETags
- ✅ Falls back correctly when current month unavailable
- ✅ Linting passes
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
15
CLAUDE.md
15
CLAUDE.md
@@ -36,13 +36,24 @@ This is a uv workspace with three main components:
|
|||||||
### 1. Extract Layer (`extract/`)
|
### 1. Extract Layer (`extract/`)
|
||||||
Contains extraction packages for pulling data from external sources.
|
Contains extraction packages for pulling data from external sources.
|
||||||
|
|
||||||
- **`extract/psdonline/`**: Extracts USDA PSD commodity data from archives dating back to 2006
|
- **`extract/psdonline/`**: Extracts USDA PSD commodity data
|
||||||
- Entry point: `extract_psd` CLI command (defined in `extract/psdonline/src/psdonline/execute.py`)
|
- Entry point: `extract_psd` CLI command (defined in `extract/psdonline/src/psdonline/execute.py`)
|
||||||
- Downloads monthly zip archives to `extract/psdonline/src/psdonline/data/`
|
- Checks latest available monthly snapshot (tries current month and 3 months back)
|
||||||
- Uses ETags to avoid re-downloading unchanged files
|
- Uses ETags to avoid re-downloading unchanged files
|
||||||
|
- Storage modes:
|
||||||
|
- **Local mode** (no R2 credentials): Downloads to `extract/psdonline/src/psdonline/data/{etag}.zip`
|
||||||
|
- **R2 mode** (R2 credentials present): Uploads to `s3://bucket/psd/{etag}.zip`
|
||||||
|
- Flat structure: files named by ETag for natural deduplication
|
||||||
|
|
||||||
**Run extraction:**
|
**Run extraction:**
|
||||||
```bash
|
```bash
|
||||||
|
extract_psd # Local mode (default)
|
||||||
|
|
||||||
|
# R2 mode (requires env vars: R2_ENDPOINT, R2_BUCKET, R2_ACCESS_KEY, R2_SECRET_KEY)
|
||||||
|
export R2_ENDPOINT=...
|
||||||
|
export R2_BUCKET=...
|
||||||
|
export R2_ACCESS_KEY=...
|
||||||
|
export R2_SECRET_KEY=...
|
||||||
extract_psd
|
extract_psd
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ authors = [
|
|||||||
requires-python = ">=3.13"
|
requires-python = ">=3.13"
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"boto3>=1.40.55",
|
||||||
"niquests>=3.14.1",
|
"niquests>=3.14.1",
|
||||||
"pendulum>=3.1.0",
|
"pendulum>=3.1.0",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,9 +1,12 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
import sys
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
import boto3
|
||||||
import niquests
|
import niquests
|
||||||
|
from botocore.exceptions import ClientError
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
@@ -17,14 +20,45 @@ logger = logging.getLogger("PSDOnline Extractor")
|
|||||||
OUTPUT_DIR = pathlib.Path(__file__).parent / "data"
|
OUTPUT_DIR = pathlib.Path(__file__).parent / "data"
|
||||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
logger.info(f"Output dir: {OUTPUT_DIR}")
|
logger.info(f"Output dir: {OUTPUT_DIR}")
|
||||||
#TODO: adapt to environment values, so this writes to s3 in prod
|
|
||||||
|
# R2 configuration from environment
|
||||||
|
R2_ENDPOINT = os.getenv('R2_ENDPOINT')
|
||||||
|
R2_BUCKET = os.getenv('R2_BUCKET')
|
||||||
|
R2_ACCESS_KEY = os.getenv('R2_ACCESS_KEY')
|
||||||
|
R2_SECRET_KEY = os.getenv('R2_SECRET_KEY')
|
||||||
|
|
||||||
PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month:02d}/psd_alldata_csv.zip"
|
PSD_HISTORICAL_URL = "https://apps.fas.usda.gov/psdonline/downloads/archives/{year}/{month:02d}/psd_alldata_csv.zip"
|
||||||
FIRST_YEAR = 2006
|
FIRST_YEAR = 2006
|
||||||
FIRST_MONTH = 8
|
FIRST_MONTH = 8
|
||||||
|
|
||||||
def extract_psd_file(url:str, extract_to_path: pathlib.Path, http_session: niquests.Session):
|
def check_r2_file_exists(etag: str, s3_client) -> bool:
|
||||||
|
"""Check if file exists in R2."""
|
||||||
|
r2_key = f"psd/{etag}.zip"
|
||||||
|
try:
|
||||||
|
s3_client.head_object(Bucket=R2_BUCKET, Key=r2_key)
|
||||||
|
logger.info(f"File {r2_key} already exists in R2, skipping")
|
||||||
|
return True
|
||||||
|
except ClientError as e:
|
||||||
|
if e.response['Error']['Code'] == '404':
|
||||||
|
return False
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
def upload_to_r2(content: bytes, etag: str, s3_client):
|
||||||
|
"""Upload file content to R2."""
|
||||||
|
r2_key = f"psd/{etag}.zip"
|
||||||
|
logger.info(f"Uploading to R2: {r2_key}")
|
||||||
|
s3_client.put_object(Bucket=R2_BUCKET, Key=r2_key, Body=content)
|
||||||
|
logger.info("Upload complete")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niquests.Session, s3_client=None):
|
||||||
|
"""
|
||||||
|
Extract PSD file either to local storage or R2.
|
||||||
|
If s3_client is provided, uploads to R2 only (no local storage).
|
||||||
|
If s3_client is None, downloads to local storage.
|
||||||
|
"""
|
||||||
logger.info(f"Requesting file {url} ...")
|
logger.info(f"Requesting file {url} ...")
|
||||||
extracted_etags = [file.stem for file in OUTPUT_DIR.rglob("*.zip")]
|
|
||||||
|
|
||||||
response = http_session.head(url)
|
response = http_session.head(url)
|
||||||
if response.status_code == 404:
|
if response.status_code == 404:
|
||||||
@@ -33,34 +67,73 @@ def extract_psd_file(url:str, extract_to_path: pathlib.Path, http_session: nique
|
|||||||
elif response.status_code != 200:
|
elif response.status_code != 200:
|
||||||
logger.error(f"Status code not ok, STATUS={response.status_code}")
|
logger.error(f"Status code not ok, STATUS={response.status_code}")
|
||||||
return
|
return
|
||||||
|
|
||||||
etag = response.headers.get("etag").replace('"',"").replace(":","_")
|
etag = response.headers.get("etag").replace('"',"").replace(":","_")
|
||||||
if etag in extracted_etags:
|
|
||||||
logger.info("File already extracted, skipping download.")
|
# R2 mode: check R2 and upload if needed
|
||||||
return
|
if s3_client:
|
||||||
else:
|
if check_r2_file_exists(etag, s3_client):
|
||||||
|
return
|
||||||
response = http_session.get(url)
|
response = http_session.get(url)
|
||||||
extract_to_path = extract_to_path / f"{etag}.zip"
|
upload_to_r2(response.content, etag, s3_client)
|
||||||
logger.info(f"Storing file to {extract_to_path}")
|
return
|
||||||
extract_to_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
extract_to_path.write_bytes(response.content)
|
# Local mode: check local and download if needed
|
||||||
logger.info("Download done.")
|
local_file = extract_to_path / f"{etag}.zip"
|
||||||
|
if local_file.exists():
|
||||||
|
logger.info(f"File {etag}.zip already exists locally, skipping")
|
||||||
|
return
|
||||||
|
|
||||||
|
response = http_session.get(url)
|
||||||
|
logger.info(f"Storing file to {local_file}")
|
||||||
|
extract_to_path.mkdir(parents=True, exist_ok=True)
|
||||||
|
local_file.write_bytes(response.content)
|
||||||
|
logger.info("Download complete")
|
||||||
|
|
||||||
|
|
||||||
def extract_psd_dataset():
|
def extract_psd_dataset():
|
||||||
today = datetime.now()
|
today = datetime.now()
|
||||||
years = list(range(FIRST_YEAR, today.year+1))
|
|
||||||
for year in years:
|
# Check if R2 credentials are configured
|
||||||
months = list(range(1,13))
|
use_r2 = all([R2_ENDPOINT, R2_BUCKET, R2_ACCESS_KEY, R2_SECRET_KEY])
|
||||||
if year == FIRST_YEAR:
|
|
||||||
months = list(range(FIRST_MONTH, 13))
|
if use_r2:
|
||||||
if year == years[-1]:
|
logger.info("R2 credentials found, uploading to R2")
|
||||||
months = list(range(1, today.month+1))
|
s3_client = boto3.client(
|
||||||
logger.info(f"Year {year}, extracting months: {months}")
|
's3',
|
||||||
for month in months:
|
endpoint_url=R2_ENDPOINT,
|
||||||
|
aws_access_key_id=R2_ACCESS_KEY,
|
||||||
|
aws_secret_access_key=R2_SECRET_KEY
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info("R2 credentials not found, downloading to local storage")
|
||||||
|
s3_client = None
|
||||||
|
|
||||||
|
# Try current month and previous 3 months (USDA data is published with lag)
|
||||||
|
with niquests.Session() as session:
|
||||||
|
for months_back in range(4):
|
||||||
|
year = today.year
|
||||||
|
month = today.month - months_back
|
||||||
|
# Handle year rollover
|
||||||
|
while month < 1:
|
||||||
|
month += 12
|
||||||
|
year -= 1
|
||||||
|
|
||||||
url = PSD_HISTORICAL_URL.format(year=year, month=month)
|
url = PSD_HISTORICAL_URL.format(year=year, month=month)
|
||||||
target_dir = OUTPUT_DIR / f"{year}"/f"{month:02d}"
|
logger.info(f"Trying {year}-{month:02d}...")
|
||||||
with niquests.Session() as session:
|
|
||||||
extract_psd_file(url=url, http_session=session, extract_to_path=target_dir)
|
# Check if URL exists
|
||||||
|
response = session.head(url)
|
||||||
|
if response.status_code == 200:
|
||||||
|
logger.info(f"Found latest data at {year}-{month:02d}")
|
||||||
|
extract_psd_file(url=url, http_session=session, extract_to_path=OUTPUT_DIR, s3_client=s3_client)
|
||||||
|
return
|
||||||
|
elif response.status_code == 404:
|
||||||
|
logger.info(f"Month {year}-{month:02d} not found, trying earlier...")
|
||||||
|
else:
|
||||||
|
logger.warning(f"Unexpected status code {response.status_code} for {year}-{month:02d}")
|
||||||
|
|
||||||
|
logger.error("Could not find any available data in the last 4 months")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -21,4 +21,4 @@ MODEL (
|
|||||||
)
|
)
|
||||||
);
|
);
|
||||||
SELECT *
|
SELECT *
|
||||||
FROM read_csv('zip://extract/psdonline/src/psdonline/data/**/*.zip/*.csv', header=true, union_by_name=true, filename=true, names = ['commodity_code', 'commodity_description', 'country_code', 'country_name', 'market_year', 'calendar_year', 'month', 'attribute_id', 'attribute_description', 'unit_id', 'unit_description', 'value'], all_varchar=true)
|
FROM read_csv('zip://extract/psdonline/src/psdonline/data/*.zip/*.csv', header=true, union_by_name=true, filename=true, names = ['commodity_code', 'commodity_description', 'country_code', 'country_name', 'market_year', 'calendar_year', 'month', 'attribute_id', 'attribute_description', 'unit_id', 'unit_description', 'value'], all_varchar=true)
|
||||||
|
|||||||
51
uv.lock
generated
51
uv.lock
generated
@@ -147,6 +147,34 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" },
|
{ url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "boto3"
|
||||||
|
version = "1.40.55"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "botocore" },
|
||||||
|
{ name = "jmespath" },
|
||||||
|
{ name = "s3transfer" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/50/d8/a279c054e0c9731172f05b3d118f3ffc9d74806657f84fc0c93c42d1bb5d/boto3-1.40.55.tar.gz", hash = "sha256:27e35b4fa9edd414ce06c1a748bf57cacd8203271847d93fc1053e4a4ec6e1a9", size = 111590, upload-time = "2025-10-17T19:34:56.753Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/42/8c/559c6145d857ed953536a83f3a94915bbd5d3d2d406db1abf8bf40be7645/boto3-1.40.55-py3-none-any.whl", hash = "sha256:2e30f5a0d49e107b8a5c0c487891afd300bfa410e1d918bf187ae45ac3839332", size = 139322, upload-time = "2025-10-17T19:34:55.028Z" },
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "botocore"
|
||||||
|
version = "1.40.55"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "jmespath" },
|
||||||
|
{ name = "python-dateutil" },
|
||||||
|
{ name = "urllib3" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/a4/92/dce4842b2e215d213d34b064fcdd13c6a782c43344e77336bcde586e9229/botocore-1.40.55.tar.gz", hash = "sha256:79b6472e2de92b3519d44fc1eec8c5feced7f99a0d10fdea6dc93133426057c1", size = 14446917, upload-time = "2025-10-17T19:34:47.44Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/21/30/f13bbc36e83b78777ff1abf50a084efcc3336b808e76560d8c5a0c9219e0/botocore-1.40.55-py3-none-any.whl", hash = "sha256:cdc38f7a4ddb30a2cd1cdd4fabde2a5a16e41b5a642292e1c30de5c4e46f5d44", size = 14116107, upload-time = "2025-10-17T19:34:44.398Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cattrs"
|
name = "cattrs"
|
||||||
version = "25.1.1"
|
version = "25.1.1"
|
||||||
@@ -765,6 +793,15 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
|
{ url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "jmespath"
|
||||||
|
version = "1.0.1"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843, upload-time = "2022-06-17T18:00:12.224Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "json-stream"
|
name = "json-stream"
|
||||||
version = "2.3.3"
|
version = "2.3.3"
|
||||||
@@ -1225,12 +1262,14 @@ name = "psdonline"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = { editable = "extract/psdonline" }
|
source = { editable = "extract/psdonline" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
{ name = "boto3" },
|
||||||
{ name = "niquests" },
|
{ name = "niquests" },
|
||||||
{ name = "pendulum" },
|
{ name = "pendulum" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
|
{ name = "boto3", specifier = ">=1.40.55" },
|
||||||
{ name = "niquests", specifier = ">=3.14.1" },
|
{ name = "niquests", specifier = ">=3.14.1" },
|
||||||
{ name = "pendulum", specifier = ">=3.1.0" },
|
{ name = "pendulum", specifier = ">=3.1.0" },
|
||||||
]
|
]
|
||||||
@@ -1740,6 +1779,18 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/24/3c/21cf283d67af33a8e6ed242396863af195a8a6134ec581524fd22b9811b6/ruff-0.12.10-py3-none-win_arm64.whl", hash = "sha256:cc138cc06ed9d4bfa9d667a65af7172b47840e1a98b02ce7011c391e54635ffc", size = 12074225, upload-time = "2025-08-21T18:23:20.137Z" },
|
{ url = "https://files.pythonhosted.org/packages/24/3c/21cf283d67af33a8e6ed242396863af195a8a6134ec581524fd22b9811b6/ruff-0.12.10-py3-none-win_arm64.whl", hash = "sha256:cc138cc06ed9d4bfa9d667a65af7172b47840e1a98b02ce7011c391e54635ffc", size = 12074225, upload-time = "2025-08-21T18:23:20.137Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "s3transfer"
|
||||||
|
version = "0.14.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "botocore" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/62/74/8d69dcb7a9efe8baa2046891735e5dfe433ad558ae23d9e3c14c633d1d58/s3transfer-0.14.0.tar.gz", hash = "sha256:eff12264e7c8b4985074ccce27a3b38a485bb7f7422cc8046fee9be4983e4125", size = 151547, upload-time = "2025-09-09T19:23:31.089Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/48/f0/ae7ca09223a81a1d890b2557186ea015f6e0502e9b8cb8e1813f1d8cfa4e/s3transfer-0.14.0-py3-none-any.whl", hash = "sha256:ea3b790c7077558ed1f02a3072fb3cb992bbbd253392f4b6e9e8976941c7d456", size = 85712, upload-time = "2025-09-09T19:23:30.041Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "semver"
|
name = "semver"
|
||||||
version = "3.0.4"
|
version = "3.0.4"
|
||||||
|
|||||||
Reference in New Issue
Block a user