feat: extraction framework overhaul — extract_core shared package + SQLite state tracking
- Add extract/extract_core/ workspace package with three modules:
- state.py: SQLite run tracking (open_state_db, start_run, end_run, get_last_cursor)
- http.py: niquests session factory + etag normalization helpers
- files.py: landing_path, content_hash, write_bytes_atomic (atomic writes via a .tmp file + rename; callers gzip the payload first)
- State lives at {LANDING_DIR}/.state.sqlite — no extra env var needed
- SQLite chosen over DuckDB: state tracking is OLTP (row inserts/updates), not analytical
- Refactor all 4 extractors (psdonline, cftc_cot, coffee_prices, ice_stocks):
- Replace inline boilerplate with extract_core helpers
- Add start_run/end_run tracking to every extraction entry point
- extract_cot_year returns int (bytes_written) instead of bool
- Update tests: assert result == 0 (not `is False`) for the return type change
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,7 @@ version = "0.1.0"
|
|||||||
description = "CFTC Commitment of Traders data extractor"
|
description = "CFTC Commitment of Traders data extractor"
|
||||||
requires-python = ">=3.13"
|
requires-python = ">=3.13"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"extract_core",
|
||||||
"niquests>=3.14.1",
|
"niquests>=3.14.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -10,12 +10,20 @@ Landing path: LANDING_DIR/cot/{year}/{etag}.csv.gzip
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import pathlib
|
|
||||||
import sys
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import niquests
|
import niquests
|
||||||
|
from extract_core import (
|
||||||
|
end_run,
|
||||||
|
landing_path,
|
||||||
|
normalize_etag,
|
||||||
|
open_state_db,
|
||||||
|
start_run,
|
||||||
|
write_bytes_atomic,
|
||||||
|
)
|
||||||
|
|
||||||
from .normalize import find_csv_inner_filename, normalize_zipped_csv
|
from .normalize import find_csv_inner_filename, normalize_zipped_csv
|
||||||
|
|
||||||
@@ -27,7 +35,7 @@ logging.basicConfig(
|
|||||||
)
|
)
|
||||||
logger = logging.getLogger("CFTC COT Extractor")
|
logger = logging.getLogger("CFTC COT Extractor")
|
||||||
|
|
||||||
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
|
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
|
||||||
|
|
||||||
# CFTC publishes yearly ZIPs for the disaggregated futures-only report.
|
# CFTC publishes yearly ZIPs for the disaggregated futures-only report.
|
||||||
# The file for the current year is updated each Friday at 3:30 PM ET.
|
# The file for the current year is updated each Friday at 3:30 PM ET.
|
||||||
@@ -52,10 +60,10 @@ def _synthetic_etag(year: int, headers: dict) -> str:
|
|||||||
return etag
|
return etag
|
||||||
|
|
||||||
|
|
||||||
def extract_cot_year(year: int, http_session: niquests.Session) -> bool:
|
def extract_cot_year(year: int, http_session: niquests.Session) -> int:
|
||||||
"""Download and store COT data for a single year.
|
"""Download and store COT data for a single year.
|
||||||
|
|
||||||
Returns True if a new file was written, False if skipped or unavailable.
|
Returns bytes_written (0 if skipped or unavailable).
|
||||||
"""
|
"""
|
||||||
url = COT_URL_TEMPLATE.format(year=year)
|
url = COT_URL_TEMPLATE.format(year=year)
|
||||||
logger.info(f"Checking COT data for {year}: {url}")
|
logger.info(f"Checking COT data for {year}: {url}")
|
||||||
@@ -63,20 +71,20 @@ def extract_cot_year(year: int, http_session: niquests.Session) -> bool:
|
|||||||
head = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
head = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||||
if head.status_code == 404:
|
if head.status_code == 404:
|
||||||
logger.info(f"Year {year} not available (404) — skipping")
|
logger.info(f"Year {year} not available (404) — skipping")
|
||||||
return False
|
return 0
|
||||||
assert head.status_code == 200, (
|
assert head.status_code == 200, (
|
||||||
f"Unexpected HEAD status {head.status_code} for {url}"
|
f"Unexpected HEAD status {head.status_code} for {url}"
|
||||||
)
|
)
|
||||||
|
|
||||||
raw_etag = head.headers.get("etag", "")
|
raw_etag = head.headers.get("etag", "")
|
||||||
etag = raw_etag.replace('"', "").replace(":", "_") if raw_etag else _synthetic_etag(year, head.headers)
|
etag = normalize_etag(raw_etag) if raw_etag else _synthetic_etag(year, head.headers)
|
||||||
|
|
||||||
dest_dir = LANDING_DIR / "cot" / str(year)
|
dest_dir = landing_path(LANDING_DIR, "cot", str(year))
|
||||||
local_file = dest_dir / f"{etag}.csv.gzip"
|
local_file = dest_dir / f"{etag}.csv.gzip"
|
||||||
|
|
||||||
if local_file.exists():
|
if local_file.exists():
|
||||||
logger.info(f"Year {year}: {etag}.csv.gzip already exists, skipping")
|
logger.info(f"Year {year}: {etag}.csv.gzip already exists, skipping")
|
||||||
return False
|
return 0
|
||||||
|
|
||||||
logger.info(f"Downloading COT data for {year}...")
|
logger.info(f"Downloading COT data for {year}...")
|
||||||
response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
|
response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||||
@@ -89,14 +97,11 @@ def extract_cot_year(year: int, http_session: niquests.Session) -> bool:
|
|||||||
inner_filename = find_csv_inner_filename(BytesIO(response.content))
|
inner_filename = find_csv_inner_filename(BytesIO(response.content))
|
||||||
normalized = normalize_zipped_csv(zip_buffer, inner_filename)
|
normalized = normalize_zipped_csv(zip_buffer, inner_filename)
|
||||||
|
|
||||||
dest_dir.mkdir(parents=True, exist_ok=True)
|
bytes_written = write_bytes_atomic(local_file, normalized.read())
|
||||||
local_file.write_bytes(normalized.read())
|
|
||||||
|
|
||||||
assert local_file.exists(), f"File was not written: {local_file}"
|
|
||||||
assert local_file.stat().st_size > 0, f"Written file is empty: {local_file}"
|
assert local_file.stat().st_size > 0, f"Written file is empty: {local_file}"
|
||||||
|
|
||||||
logger.info(f"Stored {local_file} ({local_file.stat().st_size:,} bytes)")
|
logger.info(f"Stored {local_file} ({bytes_written:,} bytes)")
|
||||||
return True
|
return bytes_written
|
||||||
|
|
||||||
|
|
||||||
def extract_cot_dataset():
|
def extract_cot_dataset():
|
||||||
@@ -113,16 +118,36 @@ def extract_cot_dataset():
|
|||||||
f"Year range {len(years)} exceeds MAX_YEARS={MAX_YEARS}"
|
f"Year range {len(years)} exceeds MAX_YEARS={MAX_YEARS}"
|
||||||
)
|
)
|
||||||
|
|
||||||
new_count = 0
|
conn = open_state_db(LANDING_DIR)
|
||||||
with niquests.Session() as session:
|
run_id = start_run(conn, "cftc_cot")
|
||||||
for year in years:
|
files_written = 0
|
||||||
try:
|
files_skipped = 0
|
||||||
if extract_cot_year(year, session):
|
bytes_written_total = 0
|
||||||
new_count += 1
|
try:
|
||||||
except Exception:
|
with niquests.Session() as session:
|
||||||
logger.exception(f"Failed to extract COT data for {year}, continuing")
|
for year in years:
|
||||||
|
try:
|
||||||
|
result = extract_cot_year(year, session)
|
||||||
|
if result > 0:
|
||||||
|
files_written += 1
|
||||||
|
bytes_written_total += result
|
||||||
|
else:
|
||||||
|
files_skipped += 1
|
||||||
|
except Exception:
|
||||||
|
logger.exception(f"Failed to extract COT data for {year}, continuing")
|
||||||
|
|
||||||
logger.info(f"COT extraction complete: {new_count} new file(s) downloaded")
|
logger.info(f"COT extraction complete: {files_written} new file(s) downloaded")
|
||||||
|
end_run(
|
||||||
|
conn, run_id, status="success",
|
||||||
|
files_written=files_written, files_skipped=files_skipped,
|
||||||
|
bytes_written=bytes_written_total,
|
||||||
|
cursor_value=str(current_year),
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ version = "0.1.0"
|
|||||||
description = "KC=F Coffee C futures price extractor"
|
description = "KC=F Coffee C futures price extractor"
|
||||||
requires-python = ">=3.13"
|
requires-python = ">=3.13"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"extract_core",
|
||||||
"yfinance>=0.2.55",
|
"yfinance>=0.2.55",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -8,14 +8,15 @@ Landing path: LANDING_DIR/prices/coffee_kc/{hash8}.csv.gzip
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import gzip
|
import gzip
|
||||||
import hashlib
|
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import pathlib
|
|
||||||
import sys
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import yfinance as yf
|
import yfinance as yf
|
||||||
|
from extract_core import content_hash, end_run, landing_path, open_state_db, start_run
|
||||||
|
from extract_core import write_bytes_atomic
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
@@ -25,7 +26,7 @@ logging.basicConfig(
|
|||||||
)
|
)
|
||||||
logger = logging.getLogger("Coffee Prices Extractor")
|
logger = logging.getLogger("Coffee Prices Extractor")
|
||||||
|
|
||||||
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
|
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
|
||||||
TICKER = "KC=F"
|
TICKER = "KC=F"
|
||||||
DEST_SUBDIR = "prices/coffee_kc"
|
DEST_SUBDIR = "prices/coffee_kc"
|
||||||
|
|
||||||
@@ -40,52 +41,54 @@ def extract_coffee_prices() -> None:
|
|||||||
On first run downloads full history (period='max'). On subsequent runs
|
On first run downloads full history (period='max'). On subsequent runs
|
||||||
the hash matches if no new trading days have closed since last run.
|
the hash matches if no new trading days have closed since last run.
|
||||||
"""
|
"""
|
||||||
logger.info(f"Downloading {TICKER} daily OHLCV from Yahoo Finance...")
|
conn = open_state_db(LANDING_DIR)
|
||||||
|
run_id = start_run(conn, "coffee_prices")
|
||||||
|
try:
|
||||||
|
logger.info(f"Downloading {TICKER} daily OHLCV from Yahoo Finance...")
|
||||||
|
|
||||||
ticker = yf.Ticker(TICKER)
|
ticker = yf.Ticker(TICKER)
|
||||||
df = ticker.history(period="max", interval="1d", auto_adjust=False, timeout=DOWNLOAD_TIMEOUT_SECONDS)
|
df = ticker.history(period="max", interval="1d", auto_adjust=False, timeout=DOWNLOAD_TIMEOUT_SECONDS)
|
||||||
|
|
||||||
assert df is not None and len(df) > 0, f"yfinance returned empty DataFrame for {TICKER}"
|
assert df is not None and len(df) > 0, f"yfinance returned empty DataFrame for {TICKER}"
|
||||||
|
|
||||||
# Reset index so Date becomes a plain column
|
# Reset index so Date becomes a plain column
|
||||||
df = df.reset_index()
|
df = df.reset_index()
|
||||||
|
|
||||||
# Keep standard OHLCV columns only; yfinance may return extra columns
|
# Keep standard OHLCV columns only; yfinance may return extra columns
|
||||||
keep_cols = [c for c in ["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume"] if c in df.columns]
|
keep_cols = [c for c in ["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume"] if c in df.columns]
|
||||||
df = df[keep_cols]
|
df = df[keep_cols]
|
||||||
|
|
||||||
# Normalize Date to ISO string for CSV stability across timezones
|
# Normalize Date to ISO string for CSV stability across timezones
|
||||||
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
|
df["Date"] = df["Date"].dt.strftime("%Y-%m-%d")
|
||||||
|
|
||||||
# Serialize to CSV bytes
|
# Serialize to CSV bytes
|
||||||
csv_buf = io.StringIO()
|
csv_buf = io.StringIO()
|
||||||
df.to_csv(csv_buf, index=False)
|
df.to_csv(csv_buf, index=False)
|
||||||
csv_bytes = csv_buf.getvalue().encode("utf-8")
|
csv_bytes = csv_buf.getvalue().encode("utf-8")
|
||||||
|
|
||||||
assert len(csv_bytes) > 0, "CSV serialization produced empty output"
|
assert len(csv_bytes) > 0, "CSV serialization produced empty output"
|
||||||
|
|
||||||
# Hash-based idempotency key (first 8 hex chars of SHA256)
|
etag = content_hash(csv_bytes)
|
||||||
sha256 = hashlib.sha256(csv_bytes).hexdigest()
|
dest_dir = landing_path(LANDING_DIR, DEST_SUBDIR)
|
||||||
etag = sha256[:8]
|
local_file = dest_dir / f"{etag}.csv.gzip"
|
||||||
|
|
||||||
dest_dir = LANDING_DIR / DEST_SUBDIR
|
if local_file.exists():
|
||||||
local_file = dest_dir / f"{etag}.csv.gzip"
|
logger.info(f"File {local_file.name} already exists — no new data, skipping")
|
||||||
|
end_run(conn, run_id, status="success", files_skipped=1)
|
||||||
|
return
|
||||||
|
|
||||||
if local_file.exists():
|
compressed = gzip.compress(csv_bytes)
|
||||||
logger.info(f"File {local_file.name} already exists — no new data, skipping")
|
bytes_written = write_bytes_atomic(local_file, compressed)
|
||||||
return
|
|
||||||
|
|
||||||
# Compress and write
|
logger.info(
|
||||||
dest_dir.mkdir(parents=True, exist_ok=True)
|
f"Stored {local_file} ({bytes_written:,} bytes, {len(df):,} rows)"
|
||||||
compressed = gzip.compress(csv_bytes)
|
)
|
||||||
local_file.write_bytes(compressed)
|
end_run(conn, run_id, status="success", files_written=1, bytes_written=bytes_written)
|
||||||
|
except Exception as e:
|
||||||
assert local_file.exists(), f"File was not written: {local_file}"
|
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||||
assert local_file.stat().st_size > 0, f"Written file is empty: {local_file}"
|
raise
|
||||||
|
finally:
|
||||||
logger.info(
|
conn.close()
|
||||||
f"Stored {local_file} ({local_file.stat().st_size:,} bytes, {len(df):,} rows)"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
15
extract/extract_core/pyproject.toml
Normal file
15
extract/extract_core/pyproject.toml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
[project]
|
||||||
|
name = "extract_core"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Shared extraction utilities: SQLite state tracking, HTTP helpers, file I/O"
|
||||||
|
requires-python = ">=3.13"
|
||||||
|
dependencies = [
|
||||||
|
"niquests>=3.14.1",
|
||||||
|
]
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["hatchling"]
|
||||||
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
[tool.hatch.build.targets.wheel]
|
||||||
|
packages = ["src/extract_core"]
|
||||||
17
extract/extract_core/src/extract_core/__init__.py
Normal file
17
extract/extract_core/src/extract_core/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
from .files import content_hash, landing_path, write_bytes_atomic
|
||||||
|
from .http import create_session, head_etag, normalize_etag
|
||||||
|
from .state import end_run, get_last_cursor, get_recent_runs, open_state_db, start_run
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"open_state_db",
|
||||||
|
"start_run",
|
||||||
|
"end_run",
|
||||||
|
"get_last_cursor",
|
||||||
|
"get_recent_runs",
|
||||||
|
"create_session",
|
||||||
|
"head_etag",
|
||||||
|
"normalize_etag",
|
||||||
|
"landing_path",
|
||||||
|
"content_hash",
|
||||||
|
"write_bytes_atomic",
|
||||||
|
]
|
||||||
53
extract/extract_core/src/extract_core/files.py
Normal file
53
extract/extract_core/src/extract_core/files.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
"""Landing zone file I/O helpers for extraction pipelines."""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def landing_path(landing_dir: str | Path, *parts: str) -> Path:
|
||||||
|
"""Return path to a subdirectory of landing_dir, creating it if absent.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
dest_dir = landing_path("data/landing", "psd", "2025", "07")
|
||||||
|
# Returns Path("data/landing/psd/2025/07"), all directories created.
|
||||||
|
|
||||||
|
Use this instead of manually calling mkdir — it makes the intent clear
|
||||||
|
and ensures the directory always exists before the caller writes to it.
|
||||||
|
"""
|
||||||
|
assert landing_dir, "landing_dir must not be empty"
|
||||||
|
path = Path(landing_dir).joinpath(*parts)
|
||||||
|
path.mkdir(parents=True, exist_ok=True)
|
||||||
|
return path
|
||||||
|
|
||||||
|
|
||||||
|
def content_hash(data: bytes, prefix_bytes: int = 8) -> str:
    """Return a short hex prefix of the SHA256 digest of data.

    Used as a short idempotency key for content-addressed filenames. Two runs
    that produce identical data will produce the same hash, so the file already
    existing on disk is proof the content is unchanged.

    Args:
        data: Payload to hash; must not be empty.
        prefix_bytes: Number of leading *hex characters* of the digest to
            keep (1-64). Despite the name this is a character count, not a
            byte count: the default of 8 keeps 32 bits of the digest, which
            is sufficient for a few thousand files per extractor. Increase
            to 16 (64 bits) for very high-volume sources.

    Returns:
        The first prefix_bytes characters of the lowercase hex digest.

    Raises:
        AssertionError: If data is empty or prefix_bytes is outside 1-64.
    """
    assert data, "data must not be empty"
    assert 1 <= prefix_bytes <= 64, f"prefix_bytes must be 1-64, got {prefix_bytes}"
    return hashlib.sha256(data).hexdigest()[:prefix_bytes]
|
||||||
|
|
||||||
|
|
||||||
|
def write_bytes_atomic(path: Path, data: bytes) -> int:
    """Write data to path atomically via a .tmp sibling file.

    Writes to {path}.tmp first, then moves it over path. The move is atomic
    when source and destination are on the same filesystem, so a reader never
    sees a partial file.

    Callers are responsible for compression (e.g. gzip.compress) before calling
    this function.

    Args:
        path: Final destination. Missing parent directories are created.
        data: Bytes to write; must not be empty.

    Returns:
        The number of bytes written (len(data)).

    Raises:
        AssertionError: If data is empty.
        OSError: If the temporary file cannot be written or moved; the
            temporary file is removed before the error propagates.
    """
    assert data, "data must not be empty"
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        tmp.write_bytes(data)
        # replace() overwrites an existing target on every platform, whereas
        # rename() raises on Windows when the destination already exists.
        tmp.replace(path)
    except BaseException:
        # Never leave a half-written .tmp file behind in the landing zone.
        tmp.unlink(missing_ok=True)
        raise
    assert path.exists(), f"File was not written: {path}"
    return len(data)
|
||||||
43
extract/extract_core/src/extract_core/http.py
Normal file
43
extract/extract_core/src/extract_core/http.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
"""HTTP session factory and etag helpers for extraction pipelines."""
|
||||||
|
|
||||||
|
import niquests
|
||||||
|
|
||||||
|
|
||||||
|
def create_session(timeout_seconds: int = 60, max_retries: int = 3) -> niquests.Session:
    """Return a new niquests Session configured with automatic retries.

    Args:
        timeout_seconds: Stored on the session as ``session.timeout_seconds``
            so callers that need a consistent timeout can reference it. It is
            not applied to requests automatically; callers still pass
            ``timeout=`` per request.
        max_retries: Retry budget handed to the session's transport
            adapters. 0 disables retries.

    Returns:
        A new session; the caller is responsible for closing it.

    Raises:
        AssertionError: If timeout_seconds is not positive or max_retries is
            negative.
    """
    assert timeout_seconds > 0, f"timeout_seconds must be positive, got {timeout_seconds}"
    assert max_retries >= 0, f"max_retries must be non-negative, got {max_retries}"
    # max_retries was previously validated but never used; pass it through so
    # the parameter actually takes effect.
    session = niquests.Session(retries=max_retries)
    session.timeout_seconds = timeout_seconds  # type: ignore[attr-defined]
    return session
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_etag(raw: str) -> str:
    """Normalize an HTTP etag for use as a filename component.

    Drops the weak-validator prefix (``W/``) and the surrounding quotes, then
    replaces every character that is unsafe in a filename on common platforms
    (path separators, colons, quotes, wildcards, ...) with an underscore.

    Examples:
        '"abc:def"'  → 'abc_def'
        'W/"abc"'    → 'abc'

    Args:
        raw: The etag header value as received; must not be empty.

    Returns:
        The sanitized etag. May be empty if raw held nothing but quotes or
        whitespace.

    Raises:
        AssertionError: If raw is empty.
    """
    assert raw, "raw etag must not be empty"
    etag = raw.strip()
    # Weak validators look like W/"..."; left in place, the "/" would be read
    # as a directory separator when the etag is used in a filename.
    if etag[:2] in ("W/", "w/"):
        etag = etag[2:]
    etag = etag.strip('"')
    unsafe = '\\/:*?"<>|'
    return "".join("_" if ch in unsafe else ch for ch in etag)
|
||||||
|
|
||||||
|
|
||||||
|
def head_etag(session: niquests.Session, url: str, timeout_seconds: int = 60) -> str | None:
    """Send a HEAD request and return the normalized etag, or None if absent.

    Returns None (not raises) when the server omits the etag header, so callers
    can fall back to a synthetic key (e.g. content-length + last-modified).

    Args:
        session: Session used to issue the request.
        url: Target URL; must not be empty.
        timeout_seconds: Per-request timeout; must be positive.

    Returns:
        The etag passed through normalize_etag, or None when the response
        carries no etag header.

    Raises:
        AssertionError: If url is empty or timeout_seconds is not positive.
    """
    assert url, "url must not be empty"
    assert timeout_seconds > 0, f"timeout_seconds must be positive, got {timeout_seconds}"
    response = session.head(url, timeout=timeout_seconds)
    # NOTE(review): the response status is not inspected here, so an etag on
    # an error response would be returned as-is — callers that care must check
    # availability themselves.
    raw = response.headers.get("etag", "")
    return normalize_etag(raw) if raw else None
|
||||||
122
extract/extract_core/src/extract_core/state.py
Normal file
122
extract/extract_core/src/extract_core/state.py
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
"""SQLite-backed extraction run state.
|
||||||
|
|
||||||
|
State table lives at {LANDING_DIR}/.state.sqlite — derived from LANDING_DIR,
|
||||||
|
no extra env var required. SQLite is used (not DuckDB) because state tracking
|
||||||
|
is a transactional workload: single-row inserts and point-lookup updates, not
|
||||||
|
analytical scans. WAL mode allows concurrent readers while a run is in progress.
|
||||||
|
|
||||||
|
Schema is generic: extractor is a free-text name, cursor_value is a TEXT field
|
||||||
|
that can hold any cursor type (date string, etag, page number, or JSON for
|
||||||
|
multi-dimensional cursors). The schema should never need changing — add new
|
||||||
|
extractor names by just using them.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
_CREATE_TABLE_SQL = """
|
||||||
|
CREATE TABLE IF NOT EXISTS extraction_runs (
|
||||||
|
run_id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
extractor TEXT NOT NULL,
|
||||||
|
started_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')),
|
||||||
|
finished_at TEXT,
|
||||||
|
status TEXT NOT NULL DEFAULT 'running',
|
||||||
|
files_written INTEGER DEFAULT 0,
|
||||||
|
files_skipped INTEGER DEFAULT 0,
|
||||||
|
bytes_written INTEGER DEFAULT 0,
|
||||||
|
cursor_value TEXT,
|
||||||
|
error_message TEXT
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def open_state_db(landing_dir: str | Path) -> sqlite3.Connection:
|
||||||
|
"""Open (or create) the state DB at {landing_dir}/.state.sqlite.
|
||||||
|
|
||||||
|
Enables WAL mode so concurrent readers (e.g. monitoring scripts) can query
|
||||||
|
the DB while an extraction run is in progress. Caller must close when done.
|
||||||
|
"""
|
||||||
|
assert landing_dir, "landing_dir must not be empty"
|
||||||
|
db_path = Path(landing_dir) / ".state.sqlite"
|
||||||
|
db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
conn = sqlite3.connect(str(db_path))
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
conn.execute("PRAGMA journal_mode=WAL")
|
||||||
|
conn.execute(_CREATE_TABLE_SQL)
|
||||||
|
conn.commit()
|
||||||
|
return conn
|
||||||
|
|
||||||
|
|
||||||
|
def start_run(conn: sqlite3.Connection, extractor: str) -> int:
    """Insert a 'running' row for extractor and return the new run_id.

    Args:
        conn: Open connection to the state DB.
        extractor: Name of the extractor starting a run; must not be empty.

    Returns:
        The run_id of the inserted row, to be passed to end_run later.

    Raises:
        AssertionError: If extractor is empty or no row ID was produced.
    """
    assert extractor, "extractor name must not be empty"
    cur = conn.execute(
        "INSERT INTO extraction_runs (extractor, status) VALUES (?, 'running')",
        (extractor,),
    )
    # Commit immediately so the in-progress run is visible to other readers.
    conn.commit()
    assert cur.lastrowid is not None, "INSERT did not return a row ID"
    return cur.lastrowid
|
||||||
|
|
||||||
|
|
||||||
|
def end_run(
|
||||||
|
conn: sqlite3.Connection,
|
||||||
|
run_id: int,
|
||||||
|
*,
|
||||||
|
status: str,
|
||||||
|
files_written: int = 0,
|
||||||
|
files_skipped: int = 0,
|
||||||
|
bytes_written: int = 0,
|
||||||
|
cursor_value: str | None = None,
|
||||||
|
error_message: str | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""Update the run row to its final state (success or failed)."""
|
||||||
|
assert status in ("success", "failed"), f"status must be 'success' or 'failed', got {status!r}"
|
||||||
|
assert run_id > 0, f"run_id must be positive, got {run_id}"
|
||||||
|
assert files_written >= 0, f"files_written must be non-negative, got {files_written}"
|
||||||
|
assert files_skipped >= 0, f"files_skipped must be non-negative, got {files_skipped}"
|
||||||
|
assert bytes_written >= 0, f"bytes_written must be non-negative, got {bytes_written}"
|
||||||
|
conn.execute(
|
||||||
|
"""
|
||||||
|
UPDATE extraction_runs
|
||||||
|
SET finished_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
|
||||||
|
status = ?,
|
||||||
|
files_written = ?,
|
||||||
|
files_skipped = ?,
|
||||||
|
bytes_written = ?,
|
||||||
|
cursor_value = ?,
|
||||||
|
error_message = ?
|
||||||
|
WHERE run_id = ?
|
||||||
|
""",
|
||||||
|
(status, files_written, files_skipped, bytes_written, cursor_value, error_message, run_id),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_cursor(conn: sqlite3.Connection, extractor: str) -> str | None:
|
||||||
|
"""Return the cursor_value from the most recent successful run, or None."""
|
||||||
|
row = conn.execute(
|
||||||
|
"""
|
||||||
|
SELECT cursor_value FROM extraction_runs
|
||||||
|
WHERE extractor = ? AND status = 'success' AND cursor_value IS NOT NULL
|
||||||
|
ORDER BY run_id DESC
|
||||||
|
LIMIT 1
|
||||||
|
""",
|
||||||
|
(extractor,),
|
||||||
|
).fetchone()
|
||||||
|
return row["cursor_value"] if row else None
|
||||||
|
|
||||||
|
|
||||||
|
def get_recent_runs(conn: sqlite3.Connection, extractor: str, limit: int = 10) -> list[dict]:
    """Return the most recent runs for an extractor, newest first.

    Args:
        conn: Open connection to the state DB.
        extractor: Name of the extractor to look up; must not be empty.
        limit: Maximum number of runs to return; must be positive.

    Returns:
        One dict per run keyed by column name, ordered by run_id descending.

    Raises:
        AssertionError: If extractor is empty or limit is not positive.
    """
    assert extractor, "extractor name must not be empty"
    assert limit > 0, f"limit must be positive, got {limit}"
    cur = conn.execute(
        """
        SELECT * FROM extraction_runs
        WHERE extractor = ?
        ORDER BY run_id DESC
        LIMIT ?
        """,
        (extractor, limit),
    )
    # Build the dicts from the cursor metadata so the result is the same
    # whether or not the connection uses the sqlite3.Row factory.
    columns = [col[0] for col in cur.description]
    return [dict(zip(columns, row)) for row in cur.fetchall()]
|
||||||
@@ -4,6 +4,7 @@ version = "0.1.0"
|
|||||||
description = "ICE certified warehouse stocks extractor"
|
description = "ICE certified warehouse stocks extractor"
|
||||||
requires-python = ">=3.13"
|
requires-python = ">=3.13"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"extract_core",
|
||||||
"niquests>=3.14.1",
|
"niquests>=3.14.1",
|
||||||
"xlrd>=2.0.1",
|
"xlrd>=2.0.1",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -20,16 +20,23 @@ CSV schemas:
|
|||||||
|
|
||||||
import csv
|
import csv
|
||||||
import gzip
|
import gzip
|
||||||
import hashlib
|
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import pathlib
|
|
||||||
import sys
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import niquests
|
import niquests
|
||||||
import xlrd
|
import xlrd
|
||||||
|
from extract_core import (
|
||||||
|
content_hash,
|
||||||
|
end_run,
|
||||||
|
landing_path,
|
||||||
|
open_state_db,
|
||||||
|
start_run,
|
||||||
|
write_bytes_atomic,
|
||||||
|
)
|
||||||
|
|
||||||
from ice_stocks.ice_api import find_all_reports, find_latest_report
|
from ice_stocks.ice_api import find_all_reports, find_latest_report
|
||||||
from ice_stocks.xls_parse import OLE2_MAGIC, detect_file_format, xls_to_rows
|
from ice_stocks.xls_parse import OLE2_MAGIC, detect_file_format, xls_to_rows
|
||||||
@@ -42,7 +49,7 @@ logging.basicConfig(
|
|||||||
)
|
)
|
||||||
logger = logging.getLogger("ICE Stocks Extractor")
|
logger = logging.getLogger("ICE Stocks Extractor")
|
||||||
|
|
||||||
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
|
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
|
||||||
|
|
||||||
# ── ice_stocks (daily rolling) ──────────────────────────────────────────────
|
# ── ice_stocks (daily rolling) ──────────────────────────────────────────────
|
||||||
DEST_SUBDIR = "ice_stocks"
|
DEST_SUBDIR = "ice_stocks"
|
||||||
@@ -105,31 +112,30 @@ HISTORICAL_PORT_COLS = [
|
|||||||
|
|
||||||
# ── shared helpers ───────────────────────────────────────────────────────────
|
# ── shared helpers ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def _write_landing_file(canonical_csv: bytes, dest_subdir: str, date_label: str) -> None:
|
def _write_landing_file(canonical_csv: bytes, dest_subdir: str, date_label: str) -> int:
|
||||||
"""SHA256-hash canonical_csv, skip if exists, else gzip and write."""
|
"""SHA256-hash canonical_csv, skip if exists, else gzip and write atomically.
|
||||||
|
|
||||||
|
Returns bytes_written (0 if skipped).
|
||||||
|
"""
|
||||||
assert canonical_csv, "canonical_csv must not be empty"
|
assert canonical_csv, "canonical_csv must not be empty"
|
||||||
assert dest_subdir, "dest_subdir must not be empty"
|
assert dest_subdir, "dest_subdir must not be empty"
|
||||||
assert date_label, "date_label must not be empty"
|
assert date_label, "date_label must not be empty"
|
||||||
|
|
||||||
sha256 = hashlib.sha256(canonical_csv).hexdigest()
|
etag = content_hash(canonical_csv)
|
||||||
etag = sha256[:8]
|
|
||||||
year = date_label[:4]
|
year = date_label[:4]
|
||||||
|
|
||||||
dest_dir = LANDING_DIR / dest_subdir / year
|
dest_dir = landing_path(LANDING_DIR, dest_subdir, year)
|
||||||
local_file = dest_dir / f"{date_label}_{etag}.csv.gzip"
|
local_file = dest_dir / f"{date_label}_{etag}.csv.gzip"
|
||||||
|
|
||||||
if local_file.exists():
|
if local_file.exists():
|
||||||
logger.info(f"File {local_file.name} already exists — content unchanged, skipping")
|
logger.info(f"File {local_file.name} already exists — content unchanged, skipping")
|
||||||
return
|
return 0
|
||||||
|
|
||||||
dest_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
compressed = gzip.compress(canonical_csv)
|
compressed = gzip.compress(canonical_csv)
|
||||||
local_file.write_bytes(compressed)
|
bytes_written = write_bytes_atomic(local_file, compressed)
|
||||||
|
|
||||||
assert local_file.exists(), f"File was not written: {local_file}"
|
logger.info(f"Stored {local_file} ({bytes_written:,} bytes)")
|
||||||
assert local_file.stat().st_size > 0, f"Written file is empty: {local_file}"
|
return bytes_written
|
||||||
|
|
||||||
logger.info(f"Stored {local_file} ({local_file.stat().st_size:,} bytes)")
|
|
||||||
|
|
||||||
|
|
||||||
def _build_csv_bytes(fieldnames: list[str], rows: list[dict]) -> bytes:
|
def _build_csv_bytes(fieldnames: list[str], rows: list[dict]) -> bytes:
|
||||||
@@ -243,47 +249,66 @@ def extract_ice_stocks() -> None:
|
|||||||
discovery to find the latest 'Daily Warehouse Stocks' report.
|
discovery to find the latest 'Daily Warehouse Stocks' report.
|
||||||
Idempotent: skips if content hash already on disk.
|
Idempotent: skips if content hash already on disk.
|
||||||
"""
|
"""
|
||||||
with niquests.Session() as session:
|
conn = open_state_db(LANDING_DIR)
|
||||||
logger.info(f"Trying ICE rolling CSV: {ICE_ROLLING_CSV_URL}")
|
run_id = start_run(conn, "ice_stocks")
|
||||||
try:
|
try:
|
||||||
response = session.get(ICE_ROLLING_CSV_URL, timeout=HTTP_TIMEOUT_SECONDS)
|
with niquests.Session() as session:
|
||||||
except Exception as e:
|
logger.info(f"Trying ICE rolling CSV: {ICE_ROLLING_CSV_URL}")
|
||||||
logger.warning(f"Rolling CSV fetch failed: {e} — trying API discovery")
|
|
||||||
response = None
|
|
||||||
|
|
||||||
use_api = response is None or response.status_code == 404
|
|
||||||
|
|
||||||
if use_api:
|
|
||||||
logger.info("Falling back to ICE API discovery for Daily Warehouse Stocks")
|
|
||||||
report = find_latest_report(session, ICE_STOCKS_LABEL)
|
|
||||||
if not report:
|
|
||||||
logger.error("ICE API: no 'Daily Warehouse Stocks' report found")
|
|
||||||
return
|
|
||||||
logger.info(f"Found report via API: {report['download_label']} ({report['publish_date']})")
|
|
||||||
try:
|
try:
|
||||||
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
|
response = session.get(ICE_ROLLING_CSV_URL, timeout=HTTP_TIMEOUT_SECONDS)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to download report from API URL: {e}")
|
logger.warning(f"Rolling CSV fetch failed: {e} — trying API discovery")
|
||||||
|
response = None
|
||||||
|
|
||||||
|
use_api = response is None or response.status_code == 404
|
||||||
|
|
||||||
|
if use_api:
|
||||||
|
logger.info("Falling back to ICE API discovery for Daily Warehouse Stocks")
|
||||||
|
report = find_latest_report(session, ICE_STOCKS_LABEL)
|
||||||
|
if not report:
|
||||||
|
logger.error("ICE API: no 'Daily Warehouse Stocks' report found")
|
||||||
|
end_run(conn, run_id, status="failed", error_message="No report found via API")
|
||||||
|
return
|
||||||
|
logger.info(f"Found report via API: {report['download_label']} ({report['publish_date']})")
|
||||||
|
try:
|
||||||
|
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download report from API URL: {e}")
|
||||||
|
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||||
|
return
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
logger.error(f"Unexpected status {response.status_code}")
|
||||||
|
end_run(conn, run_id, status="failed", error_message=f"HTTP {response.status_code}")
|
||||||
return
|
return
|
||||||
|
|
||||||
if response.status_code != 200:
|
assert len(response.content) > 0, "Downloaded empty file from ICE"
|
||||||
logger.error(f"Unexpected status {response.status_code}")
|
|
||||||
|
fmt = detect_file_format(response.content)
|
||||||
|
if fmt == "xls":
|
||||||
|
canonical_csv = _build_canonical_csv_from_xls(response.content)
|
||||||
|
else:
|
||||||
|
canonical_csv = _build_canonical_csv_from_csv(response.content)
|
||||||
|
|
||||||
|
if not canonical_csv:
|
||||||
|
logger.warning("ICE stocks parsed to 0 rows — check column mapping or XLS structure")
|
||||||
|
end_run(conn, run_id, status="failed", error_message="Parsed 0 rows")
|
||||||
return
|
return
|
||||||
|
|
||||||
assert len(response.content) > 0, "Downloaded empty file from ICE"
|
today = datetime.now().strftime("%Y-%m-%d")
|
||||||
|
bytes_written = _write_landing_file(canonical_csv, DEST_SUBDIR, today)
|
||||||
fmt = detect_file_format(response.content)
|
end_run(
|
||||||
if fmt == "xls":
|
conn, run_id, status="success",
|
||||||
canonical_csv = _build_canonical_csv_from_xls(response.content)
|
files_written=1 if bytes_written > 0 else 0,
|
||||||
else:
|
files_skipped=1 if bytes_written == 0 else 0,
|
||||||
canonical_csv = _build_canonical_csv_from_csv(response.content)
|
bytes_written=bytes_written,
|
||||||
|
cursor_value=today,
|
||||||
if not canonical_csv:
|
)
|
||||||
logger.warning("ICE stocks parsed to 0 rows — check column mapping or XLS structure")
|
except Exception as e:
|
||||||
return
|
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||||
|
raise
|
||||||
today = datetime.now().strftime("%Y-%m-%d")
|
finally:
|
||||||
_write_landing_file(canonical_csv, DEST_SUBDIR, today)
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
# ── ice_aging (monthly aging report) ────────────────────────────────────────
|
# ── ice_aging (monthly aging report) ────────────────────────────────────────
|
||||||
@@ -309,65 +334,85 @@ def extract_ice_aging() -> None:
|
|||||||
Monthly report: stock quantities by age bucket × port.
|
Monthly report: stock quantities by age bucket × port.
|
||||||
Idempotent: skips if content hash already on disk.
|
Idempotent: skips if content hash already on disk.
|
||||||
"""
|
"""
|
||||||
with niquests.Session() as session:
|
conn = open_state_db(LANDING_DIR)
|
||||||
logger.info("Fetching latest ICE Aging Report via API")
|
run_id = start_run(conn, "ice_aging")
|
||||||
report = find_latest_report(session, ICE_AGING_LABEL)
|
try:
|
||||||
if not report:
|
with niquests.Session() as session:
|
||||||
logger.error(f"ICE API: no report matching {ICE_AGING_LABEL!r}")
|
logger.info("Fetching latest ICE Aging Report via API")
|
||||||
|
report = find_latest_report(session, ICE_AGING_LABEL)
|
||||||
|
if not report:
|
||||||
|
logger.error(f"ICE API: no report matching {ICE_AGING_LABEL!r}")
|
||||||
|
end_run(conn, run_id, status="failed", error_message="No aging report found via API")
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info(f"Downloading: {report['download_label']} ({report['publish_date']})")
|
||||||
|
try:
|
||||||
|
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download aging report: {e}")
|
||||||
|
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||||
|
return
|
||||||
|
|
||||||
|
assert response.status_code == 200, f"HTTP {response.status_code}"
|
||||||
|
assert response.content[:4] == OLE2_MAGIC, "Aging report is not an XLS file"
|
||||||
|
|
||||||
|
rows = xls_to_rows(response.content)
|
||||||
|
|
||||||
|
report_date = _parse_aging_date(str(rows[0][0]) if rows else "")
|
||||||
|
if not report_date:
|
||||||
|
msg = f"Could not parse aging report date from row 0: {rows[0] if rows else '(empty)'!r}"
|
||||||
|
logger.error(msg)
|
||||||
|
end_run(conn, run_id, status="failed", error_message=msg)
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.info(f"Downloading: {report['download_label']} ({report['publish_date']})")
|
# Row 3+ are data rows; stop at row labelled "Total"
|
||||||
try:
|
fieldnames = ["report_date", "age_bucket"] + AGING_PORT_HEADERS
|
||||||
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
|
data_rows = []
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to download aging report: {e}")
|
|
||||||
return
|
|
||||||
|
|
||||||
assert response.status_code == 200, f"HTTP {response.status_code}"
|
for row in rows[3:]:
|
||||||
assert response.content[:4] == OLE2_MAGIC, "Aging report is not an XLS file"
|
if not row or not str(row[0]).strip():
|
||||||
|
continue
|
||||||
|
label = str(row[0]).strip()
|
||||||
|
if label.lower() == "total":
|
||||||
|
break
|
||||||
|
|
||||||
rows = xls_to_rows(response.content)
|
port_values = []
|
||||||
|
for cell in row[1:]:
|
||||||
|
if isinstance(cell, float):
|
||||||
|
port_values.append(str(int(cell)))
|
||||||
|
elif str(cell).strip() in ("-", ""):
|
||||||
|
port_values.append("0")
|
||||||
|
else:
|
||||||
|
port_values.append(str(cell).replace(",", "").strip())
|
||||||
|
|
||||||
report_date = _parse_aging_date(str(rows[0][0]) if rows else "")
|
while len(port_values) < len(AGING_PORT_HEADERS):
|
||||||
if not report_date:
|
|
||||||
logger.error(f"Could not parse aging report date from row 0: {rows[0] if rows else '(empty)'!r}")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Row 3+ are data rows; stop at row labelled "Total"
|
|
||||||
fieldnames = ["report_date", "age_bucket"] + AGING_PORT_HEADERS
|
|
||||||
data_rows = []
|
|
||||||
|
|
||||||
for row in rows[3:]:
|
|
||||||
if not row or not str(row[0]).strip():
|
|
||||||
continue
|
|
||||||
label = str(row[0]).strip()
|
|
||||||
if label.lower() == "total":
|
|
||||||
break
|
|
||||||
|
|
||||||
port_values = []
|
|
||||||
for cell in row[1:]:
|
|
||||||
if isinstance(cell, float):
|
|
||||||
port_values.append(str(int(cell)))
|
|
||||||
elif str(cell).strip() in ("-", ""):
|
|
||||||
port_values.append("0")
|
port_values.append("0")
|
||||||
else:
|
port_values = port_values[:len(AGING_PORT_HEADERS)]
|
||||||
port_values.append(str(cell).replace(",", "").strip())
|
|
||||||
|
|
||||||
while len(port_values) < len(AGING_PORT_HEADERS):
|
record = {"report_date": report_date, "age_bucket": label}
|
||||||
port_values.append("0")
|
for col, val in zip(AGING_PORT_HEADERS, port_values):
|
||||||
port_values = port_values[:len(AGING_PORT_HEADERS)]
|
record[col] = val
|
||||||
|
data_rows.append(record)
|
||||||
|
|
||||||
record = {"report_date": report_date, "age_bucket": label}
|
if not data_rows:
|
||||||
for col, val in zip(AGING_PORT_HEADERS, port_values):
|
logger.warning("Aging report parsed to 0 data rows")
|
||||||
record[col] = val
|
end_run(conn, run_id, status="failed", error_message="Parsed 0 data rows")
|
||||||
data_rows.append(record)
|
return
|
||||||
|
|
||||||
if not data_rows:
|
canonical_csv = _build_csv_bytes(fieldnames, data_rows)
|
||||||
logger.warning("Aging report parsed to 0 data rows")
|
bytes_written = _write_landing_file(canonical_csv, AGING_DEST_SUBDIR, report_date)
|
||||||
return
|
end_run(
|
||||||
|
conn, run_id, status="success",
|
||||||
canonical_csv = _build_csv_bytes(fieldnames, data_rows)
|
files_written=1 if bytes_written > 0 else 0,
|
||||||
_write_landing_file(canonical_csv, AGING_DEST_SUBDIR, report_date)
|
files_skipped=1 if bytes_written == 0 else 0,
|
||||||
|
bytes_written=bytes_written,
|
||||||
|
cursor_value=report_date,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
# ── ice_stocks_by_port (historical) ─────────────────────────────────────────
|
# ── ice_stocks_by_port (historical) ─────────────────────────────────────────
|
||||||
@@ -387,63 +432,80 @@ def extract_ice_historical() -> None:
|
|||||||
Static URL updated monthly. Covers Nov 1996 to present.
|
Static URL updated monthly. Covers Nov 1996 to present.
|
||||||
Idempotent: skips if content hash already on disk.
|
Idempotent: skips if content hash already on disk.
|
||||||
"""
|
"""
|
||||||
logger.info(f"Downloading ICE historical by-port XLS: {ICE_HISTORICAL_URL}")
|
conn = open_state_db(LANDING_DIR)
|
||||||
|
run_id = start_run(conn, "ice_historical")
|
||||||
|
try:
|
||||||
|
logger.info(f"Downloading ICE historical by-port XLS: {ICE_HISTORICAL_URL}")
|
||||||
|
|
||||||
with niquests.Session() as session:
|
with niquests.Session() as session:
|
||||||
try:
|
try:
|
||||||
response = session.get(ICE_HISTORICAL_URL, timeout=HISTORICAL_HTTP_TIMEOUT_SECONDS)
|
response = session.get(ICE_HISTORICAL_URL, timeout=HISTORICAL_HTTP_TIMEOUT_SECONDS)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to download historical XLS: {e}")
|
logger.error(f"Failed to download historical XLS: {e}")
|
||||||
|
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||||
|
return
|
||||||
|
|
||||||
|
assert response.status_code == 200, f"HTTP {response.status_code}"
|
||||||
|
assert response.content[:4] == OLE2_MAGIC, "Historical file is not an XLS"
|
||||||
|
|
||||||
|
book = xlrd.open_workbook(file_contents=response.content)
|
||||||
|
datemode = book.datemode
|
||||||
|
rows = xls_to_rows(response.content)
|
||||||
|
|
||||||
|
# Data starts at row 8 (0-indexed); rows 0-7 are headers
|
||||||
|
fieldnames = ["report_date"] + HISTORICAL_PORT_COLS
|
||||||
|
data_rows = []
|
||||||
|
|
||||||
|
for row in rows[8:]:
|
||||||
|
if not row or len(row) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
|
serial_cell = row[1]
|
||||||
|
if not isinstance(serial_cell, float) or serial_cell <= 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
report_date = _excel_serial_to_date(serial_cell, datemode)
|
||||||
|
if not report_date:
|
||||||
|
continue
|
||||||
|
|
||||||
|
port_cells = row[2:2 + len(HISTORICAL_PORT_COLS)]
|
||||||
|
port_values = []
|
||||||
|
for cell in port_cells:
|
||||||
|
if cell == "" or str(cell).strip() in ("-", ""):
|
||||||
|
port_values.append("0")
|
||||||
|
elif isinstance(cell, float):
|
||||||
|
port_values.append(str(int(cell)))
|
||||||
|
else:
|
||||||
|
port_values.append(str(cell).replace(",", "").strip())
|
||||||
|
|
||||||
|
while len(port_values) < len(HISTORICAL_PORT_COLS):
|
||||||
|
port_values.append("0")
|
||||||
|
|
||||||
|
record = {"report_date": report_date}
|
||||||
|
for col, val in zip(HISTORICAL_PORT_COLS, port_values):
|
||||||
|
record[col] = val
|
||||||
|
data_rows.append(record)
|
||||||
|
|
||||||
|
if not data_rows:
|
||||||
|
logger.warning("Historical XLS parsed to 0 data rows")
|
||||||
|
end_run(conn, run_id, status="failed", error_message="Parsed 0 data rows")
|
||||||
return
|
return
|
||||||
|
|
||||||
assert response.status_code == 200, f"HTTP {response.status_code}"
|
canonical_csv = _build_csv_bytes(fieldnames, data_rows)
|
||||||
assert response.content[:4] == OLE2_MAGIC, "Historical file is not an XLS"
|
today = datetime.now().strftime("%Y-%m-%d")
|
||||||
|
bytes_written = _write_landing_file(canonical_csv, HISTORICAL_DEST_SUBDIR, today)
|
||||||
book = xlrd.open_workbook(file_contents=response.content)
|
end_run(
|
||||||
datemode = book.datemode
|
conn, run_id, status="success",
|
||||||
rows = xls_to_rows(response.content)
|
files_written=1 if bytes_written > 0 else 0,
|
||||||
|
files_skipped=1 if bytes_written == 0 else 0,
|
||||||
# Data starts at row 8 (0-indexed); rows 0-7 are headers
|
bytes_written=bytes_written,
|
||||||
fieldnames = ["report_date"] + HISTORICAL_PORT_COLS
|
cursor_value=today,
|
||||||
data_rows = []
|
)
|
||||||
|
except Exception as e:
|
||||||
for row in rows[8:]:
|
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||||
if not row or len(row) < 2:
|
raise
|
||||||
continue
|
finally:
|
||||||
|
conn.close()
|
||||||
serial_cell = row[1]
|
|
||||||
if not isinstance(serial_cell, float) or serial_cell <= 0:
|
|
||||||
continue
|
|
||||||
|
|
||||||
report_date = _excel_serial_to_date(serial_cell, datemode)
|
|
||||||
if not report_date:
|
|
||||||
continue
|
|
||||||
|
|
||||||
port_cells = row[2:2 + len(HISTORICAL_PORT_COLS)]
|
|
||||||
port_values = []
|
|
||||||
for cell in port_cells:
|
|
||||||
if cell == "" or str(cell).strip() in ("-", ""):
|
|
||||||
port_values.append("0")
|
|
||||||
elif isinstance(cell, float):
|
|
||||||
port_values.append(str(int(cell)))
|
|
||||||
else:
|
|
||||||
port_values.append(str(cell).replace(",", "").strip())
|
|
||||||
|
|
||||||
while len(port_values) < len(HISTORICAL_PORT_COLS):
|
|
||||||
port_values.append("0")
|
|
||||||
|
|
||||||
record = {"report_date": report_date}
|
|
||||||
for col, val in zip(HISTORICAL_PORT_COLS, port_values):
|
|
||||||
record[col] = val
|
|
||||||
data_rows.append(record)
|
|
||||||
|
|
||||||
if not data_rows:
|
|
||||||
logger.warning("Historical XLS parsed to 0 data rows")
|
|
||||||
return
|
|
||||||
|
|
||||||
canonical_csv = _build_csv_bytes(fieldnames, data_rows)
|
|
||||||
today = datetime.now().strftime("%Y-%m-%d")
|
|
||||||
_write_landing_file(canonical_csv, HISTORICAL_DEST_SUBDIR, today)
|
|
||||||
|
|
||||||
|
|
||||||
def extract_ice_stocks_backfill(max_pages: int = 3) -> None:
|
def extract_ice_stocks_backfill(max_pages: int = 3) -> None:
|
||||||
@@ -458,50 +520,63 @@ def extract_ice_stocks_backfill(max_pages: int = 3) -> None:
|
|||||||
"""
|
"""
|
||||||
assert max_pages > 0, f"max_pages must be positive, got {max_pages}"
|
assert max_pages > 0, f"max_pages must be positive, got {max_pages}"
|
||||||
|
|
||||||
with niquests.Session() as session:
|
conn = open_state_db(LANDING_DIR)
|
||||||
logger.info(f"Fetching all available Daily Warehouse Stocks reports (max {max_pages} pages)...")
|
run_id = start_run(conn, "ice_stocks_backfill")
|
||||||
reports = find_all_reports(session, ICE_STOCKS_LABEL, max_pages=max_pages)
|
files_written = 0
|
||||||
|
files_skipped = 0
|
||||||
|
bytes_written_total = 0
|
||||||
|
try:
|
||||||
|
with niquests.Session() as session:
|
||||||
|
logger.info(f"Fetching all available Daily Warehouse Stocks reports (max {max_pages} pages)...")
|
||||||
|
reports = find_all_reports(session, ICE_STOCKS_LABEL, max_pages=max_pages)
|
||||||
|
|
||||||
if not reports:
|
if not reports:
|
||||||
logger.error("ICE API: no 'Daily Warehouse Stocks' reports found")
|
logger.error("ICE API: no 'Daily Warehouse Stocks' reports found")
|
||||||
return
|
end_run(conn, run_id, status="failed", error_message="No reports found via API")
|
||||||
|
return
|
||||||
|
|
||||||
logger.info(f"Found {len(reports)} reports: {reports[-1]['publish_date']} → {reports[0]['publish_date']}")
|
logger.info(f"Found {len(reports)} reports: {reports[-1]['publish_date']} → {reports[0]['publish_date']}")
|
||||||
downloaded = 0
|
|
||||||
skipped = 0
|
|
||||||
|
|
||||||
for report in reports:
|
for report in reports:
|
||||||
publish_date = report["publish_date"]
|
publish_date = report["publish_date"]
|
||||||
try:
|
try:
|
||||||
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
|
response = session.get(report["download_url"], timeout=HTTP_TIMEOUT_SECONDS)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to download {publish_date}: {e}")
|
logger.warning(f"Failed to download {publish_date}: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
logger.warning(f"HTTP {response.status_code} for {publish_date} — skipping")
|
logger.warning(f"HTTP {response.status_code} for {publish_date} — skipping")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
fmt = detect_file_format(response.content)
|
fmt = detect_file_format(response.content)
|
||||||
if fmt == "xls":
|
if fmt == "xls":
|
||||||
canonical_csv = _build_canonical_csv_from_xls(response.content)
|
canonical_csv = _build_canonical_csv_from_xls(response.content)
|
||||||
else:
|
else:
|
||||||
canonical_csv = _build_canonical_csv_from_csv(response.content)
|
canonical_csv = _build_canonical_csv_from_csv(response.content)
|
||||||
|
|
||||||
if not canonical_csv:
|
if not canonical_csv:
|
||||||
logger.warning(f"Parsed 0 rows for {publish_date} — skipping")
|
logger.warning(f"Parsed 0 rows for {publish_date} — skipping")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Use the report's publish date as the file date label
|
result = _write_landing_file(canonical_csv, DEST_SUBDIR, publish_date)
|
||||||
file_count_before = sum(1 for _ in (LANDING_DIR / DEST_SUBDIR).rglob("*.csv.gzip"))
|
if result > 0:
|
||||||
_write_landing_file(canonical_csv, DEST_SUBDIR, publish_date)
|
files_written += 1
|
||||||
file_count_after = sum(1 for _ in (LANDING_DIR / DEST_SUBDIR).rglob("*.csv.gzip"))
|
bytes_written_total += result
|
||||||
if file_count_after > file_count_before:
|
else:
|
||||||
downloaded += 1
|
files_skipped += 1
|
||||||
else:
|
|
||||||
skipped += 1
|
|
||||||
|
|
||||||
logger.info(f"Backfill complete: {downloaded} new files downloaded, {skipped} already existed")
|
logger.info(f"Backfill complete: {files_written} new files downloaded, {files_skipped} already existed")
|
||||||
|
end_run(
|
||||||
|
conn, run_id, status="success",
|
||||||
|
files_written=files_written, files_skipped=files_skipped,
|
||||||
|
bytes_written=bytes_written_total,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
def extract_ice_all() -> None:
|
def extract_ice_all() -> None:
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ authors = [
|
|||||||
requires-python = ">=3.13"
|
requires-python = ">=3.13"
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"extract_core",
|
||||||
"niquests>=3.14.1",
|
"niquests>=3.14.1",
|
||||||
]
|
]
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
|
|||||||
@@ -1,12 +1,14 @@
|
|||||||
from .normalize import normalize_zipped_csv
|
from .normalize import normalize_zipped_csv
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import pathlib
|
|
||||||
import sys
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import niquests
|
import niquests
|
||||||
|
from extract_core import end_run, landing_path, normalize_etag, open_state_db, start_run
|
||||||
|
from extract_core import write_bytes_atomic
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
@@ -16,7 +18,7 @@ logging.basicConfig(
|
|||||||
)
|
)
|
||||||
logger = logging.getLogger("PSDOnline Extractor")
|
logger = logging.getLogger("PSDOnline Extractor")
|
||||||
|
|
||||||
LANDING_DIR = pathlib.Path(os.getenv("LANDING_DIR", "data/landing"))
|
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))
|
||||||
LANDING_DIR.mkdir(parents=True, exist_ok=True)
|
LANDING_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
logger.info(f"Landing dir: {LANDING_DIR}")
|
logger.info(f"Landing dir: {LANDING_DIR}")
|
||||||
|
|
||||||
@@ -27,61 +29,87 @@ FIRST_MONTH = 8
|
|||||||
HTTP_TIMEOUT_SECONDS = 60
|
HTTP_TIMEOUT_SECONDS = 60
|
||||||
|
|
||||||
|
|
||||||
def extract_psd_file(url: str, year: int, month: int, http_session: niquests.Session):
|
def extract_psd_file(url: str, year: int, month: int, http_session: niquests.Session) -> int:
|
||||||
"""Extract PSD file to local year/month subdirectory."""
|
"""Extract PSD file to local year/month subdirectory.
|
||||||
|
|
||||||
|
Returns bytes_written (0 if the file already existed and was skipped).
|
||||||
|
"""
|
||||||
logger.info(f"Requesting file {url} ...")
|
logger.info(f"Requesting file {url} ...")
|
||||||
|
|
||||||
response = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
response = http_session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||||
if response.status_code == 404:
|
if response.status_code == 404:
|
||||||
logger.error("File doesn't exist on server, received status code 404 Not Found")
|
logger.error("File doesn't exist on server, received status code 404 Not Found")
|
||||||
return
|
return 0
|
||||||
elif response.status_code != 200:
|
elif response.status_code != 200:
|
||||||
logger.error(f"Status code not ok, STATUS={response.status_code}")
|
logger.error(f"Status code not ok, STATUS={response.status_code}")
|
||||||
return
|
return 0
|
||||||
|
|
||||||
etag = response.headers.get("etag", "").replace('"', "").replace(":", "_")
|
raw_etag = response.headers.get("etag", "")
|
||||||
assert etag, "USDA response missing etag header"
|
assert raw_etag, "USDA response missing etag header"
|
||||||
|
etag = normalize_etag(raw_etag)
|
||||||
|
|
||||||
extract_to_path = LANDING_DIR / "psd" / str(year) / f"{month:02d}"
|
dest_dir = landing_path(LANDING_DIR, "psd", str(year), f"{month:02d}")
|
||||||
local_file = extract_to_path / f"{etag}.csv.gzip"
|
local_file = dest_dir / f"{etag}.csv.gzip"
|
||||||
if local_file.exists():
|
if local_file.exists():
|
||||||
logger.info(f"File {etag}.csv.gzip already exists locally, skipping")
|
logger.info(f"File {etag}.csv.gzip already exists locally, skipping")
|
||||||
return
|
return 0
|
||||||
|
|
||||||
response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
|
response = http_session.get(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||||
logger.info(f"Storing file to {local_file}")
|
logger.info(f"Storing file to {local_file}")
|
||||||
extract_to_path.mkdir(parents=True, exist_ok=True)
|
|
||||||
normalized_content = normalize_zipped_csv(BytesIO(response.content))
|
normalized_content = normalize_zipped_csv(BytesIO(response.content))
|
||||||
local_file.write_bytes(normalized_content.read())
|
bytes_written = write_bytes_atomic(local_file, normalized_content.read())
|
||||||
assert local_file.exists(), f"File was not written: {local_file}"
|
|
||||||
logger.info("Download complete")
|
logger.info("Download complete")
|
||||||
|
return bytes_written
|
||||||
|
|
||||||
|
|
||||||
def extract_psd_dataset():
|
def extract_psd_dataset():
|
||||||
today = datetime.now()
|
conn = open_state_db(LANDING_DIR)
|
||||||
|
run_id = start_run(conn, "psdonline")
|
||||||
|
files_written = 0
|
||||||
|
files_skipped = 0
|
||||||
|
bytes_written = 0
|
||||||
|
cursor_value = None
|
||||||
|
try:
|
||||||
|
today = datetime.now()
|
||||||
|
with niquests.Session() as session:
|
||||||
|
for months_back in range(4):
|
||||||
|
year = today.year
|
||||||
|
month = today.month - months_back
|
||||||
|
while month < 1:
|
||||||
|
month += 12
|
||||||
|
year -= 1
|
||||||
|
|
||||||
with niquests.Session() as session:
|
url = PSD_HISTORICAL_URL.format(year=year, month=month)
|
||||||
for months_back in range(4):
|
logger.info(f"Trying {year}-{month:02d}...")
|
||||||
year = today.year
|
|
||||||
month = today.month - months_back
|
|
||||||
while month < 1:
|
|
||||||
month += 12
|
|
||||||
year -= 1
|
|
||||||
|
|
||||||
url = PSD_HISTORICAL_URL.format(year=year, month=month)
|
response = session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
||||||
logger.info(f"Trying {year}-{month:02d}...")
|
if response.status_code == 200:
|
||||||
|
logger.info(f"Found latest data at {year}-{month:02d}")
|
||||||
response = session.head(url, timeout=HTTP_TIMEOUT_SECONDS)
|
result = extract_psd_file(url=url, year=year, month=month, http_session=session)
|
||||||
if response.status_code == 200:
|
if result > 0:
|
||||||
logger.info(f"Found latest data at {year}-{month:02d}")
|
files_written = 1
|
||||||
extract_psd_file(url=url, year=year, month=month, http_session=session)
|
bytes_written = result
|
||||||
return
|
else:
|
||||||
elif response.status_code == 404:
|
files_skipped = 1
|
||||||
logger.info(f"Month {year}-{month:02d} not found, trying earlier...")
|
cursor_value = f"{year}-{month:02d}"
|
||||||
|
break
|
||||||
|
elif response.status_code == 404:
|
||||||
|
logger.info(f"Month {year}-{month:02d} not found, trying earlier...")
|
||||||
|
else:
|
||||||
|
logger.warning(f"Unexpected status code {response.status_code} for {year}-{month:02d}")
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unexpected status code {response.status_code} for {year}-{month:02d}")
|
logger.error("Could not find any available data in the last 4 months")
|
||||||
|
|
||||||
logger.error("Could not find any available data in the last 4 months")
|
end_run(
|
||||||
|
conn, run_id, status="success",
|
||||||
|
files_written=files_written, files_skipped=files_skipped,
|
||||||
|
bytes_written=bytes_written, cursor_value=cursor_value,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
end_run(conn, run_id, status="failed", error_message=str(e))
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ dev = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[tool.uv.sources]
|
[tool.uv.sources]
|
||||||
|
extract_core = {workspace = true }
|
||||||
psdonline = {workspace = true }
|
psdonline = {workspace = true }
|
||||||
sqlmesh_materia = {workspace = true }
|
sqlmesh_materia = {workspace = true }
|
||||||
cftc_cot = {workspace = true }
|
cftc_cot = {workspace = true }
|
||||||
|
|||||||
@@ -123,7 +123,7 @@ def test_extract_cot_year_skips_existing_file(tmp_path, monkeypatch):
|
|||||||
|
|
||||||
result = cot_execute.extract_cot_year(2024, mock_session)
|
result = cot_execute.extract_cot_year(2024, mock_session)
|
||||||
|
|
||||||
assert result is False
|
assert result == 0
|
||||||
mock_session.get.assert_not_called() # No download should occur
|
mock_session.get.assert_not_called() # No download should occur
|
||||||
|
|
||||||
|
|
||||||
@@ -143,5 +143,5 @@ def test_extract_cot_year_returns_false_on_404(tmp_path, monkeypatch):
|
|||||||
|
|
||||||
result = cot_execute.extract_cot_year(2006, mock_session)
|
result = cot_execute.extract_cot_year(2006, mock_session)
|
||||||
|
|
||||||
assert result is False
|
assert result == 0
|
||||||
mock_session.get.assert_not_called()
|
mock_session.get.assert_not_called()
|
||||||
|
|||||||
32
uv.lock
generated
32
uv.lock
generated
@@ -11,6 +11,7 @@ members = [
|
|||||||
"beanflows",
|
"beanflows",
|
||||||
"cftc-cot",
|
"cftc-cot",
|
||||||
"coffee-prices",
|
"coffee-prices",
|
||||||
|
"extract-core",
|
||||||
"ice-stocks",
|
"ice-stocks",
|
||||||
"materia",
|
"materia",
|
||||||
"psdonline",
|
"psdonline",
|
||||||
@@ -365,11 +366,15 @@ name = "cftc-cot"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = { editable = "extract/cftc_cot" }
|
source = { editable = "extract/cftc_cot" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
{ name = "extract-core" },
|
||||||
{ name = "niquests" },
|
{ name = "niquests" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [{ name = "niquests", specifier = ">=3.14.1" }]
|
requires-dist = [
|
||||||
|
{ name = "extract-core", editable = "extract/extract_core" },
|
||||||
|
{ name = "niquests", specifier = ">=3.14.1" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "charset-normalizer"
|
name = "charset-normalizer"
|
||||||
@@ -438,11 +443,15 @@ name = "coffee-prices"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = { editable = "extract/coffee_prices" }
|
source = { editable = "extract/coffee_prices" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
{ name = "extract-core" },
|
||||||
{ name = "yfinance" },
|
{ name = "yfinance" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [{ name = "yfinance", specifier = ">=0.2.55" }]
|
requires-dist = [
|
||||||
|
{ name = "extract-core", editable = "extract/extract_core" },
|
||||||
|
{ name = "yfinance", specifier = ">=0.2.55" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "colorama"
|
name = "colorama"
|
||||||
@@ -740,6 +749,17 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" },
|
{ url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "extract-core"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = { editable = "extract/extract_core" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "niquests" },
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.metadata]
|
||||||
|
requires-dist = [{ name = "niquests", specifier = ">=3.14.1" }]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fakeredis"
|
name = "fakeredis"
|
||||||
version = "2.34.0"
|
version = "2.34.0"
|
||||||
@@ -1059,12 +1079,14 @@ name = "ice-stocks"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = { editable = "extract/ice_stocks" }
|
source = { editable = "extract/ice_stocks" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
{ name = "extract-core" },
|
||||||
{ name = "niquests" },
|
{ name = "niquests" },
|
||||||
{ name = "xlrd" },
|
{ name = "xlrd" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
|
{ name = "extract-core", editable = "extract/extract_core" },
|
||||||
{ name = "niquests", specifier = ">=3.14.1" },
|
{ name = "niquests", specifier = ">=3.14.1" },
|
||||||
{ name = "xlrd", specifier = ">=2.0.1" },
|
{ name = "xlrd", specifier = ">=2.0.1" },
|
||||||
]
|
]
|
||||||
@@ -2067,11 +2089,15 @@ name = "psdonline"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = { editable = "extract/psdonline" }
|
source = { editable = "extract/psdonline" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
{ name = "extract-core" },
|
||||||
{ name = "niquests" },
|
{ name = "niquests" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [{ name = "niquests", specifier = ">=3.14.1" }]
|
requires-dist = [
|
||||||
|
{ name = "extract-core", editable = "extract/extract_core" },
|
||||||
|
{ name = "niquests", specifier = ">=3.14.1" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "psutil"
|
name = "psutil"
|
||||||
|
|||||||
Reference in New Issue
Block a user