Files
beanflows/tests/test_ice_extraction.py
Deeman ff7301d6a8 ICE extraction overhaul: API discovery + aging report + historical backfill
- Replace brittle ICE_STOCKS_URL env var with API-based URL discovery via
  the private ICE Report Center JSON API (no auth required)
- Add rolling CSV → XLS fallback in extract_ice_stocks() using
  find_latest_report() from ice_api.py
- Add ice_api.py: fetch_report_listings(), find_latest_report() with
  pagination up to MAX_API_PAGES
- Add xls_parse.py: detect_file_format() (magic bytes), xls_to_rows()
  using xlrd for OLE2/BIFF XLS files
- Add extract_ice_aging(): monthly certified stock aging report by
  age bucket × port → ice_aging/ landing dir
- Add extract_ice_historical(): 30-year EOM by-port stocks from static
  ICE URL → ice_stocks_by_port/ landing dir
- Add xlrd>=2.0.1 (parse XLS), xlwt>=1.3.0 (dev, test fixtures)
- Add SQLMesh raw + foundation models for both new datasets
- Add ice_aging_glob(), ice_stocks_by_port_glob() macros
- Add extract_ice_aging + extract_ice_historical pipeline entries
- Add 12 unit tests (format detection, XLS roundtrip, API mock, CSV output)

Seed files (data/landing/ice_aging/seed/ and ice_stocks_by_port/seed/)
must be created locally — data/ is gitignored.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-21 21:13:18 +01:00

185 lines
7.2 KiB
Python

"""Tests for ICE extraction: format detection, XLS parsing, API client."""
import csv
import gzip
import io
import struct
from unittest.mock import MagicMock, patch
import pytest
import xlwt # noqa: F401 — needed to create XLS fixtures; skip tests if missing
from ice_stocks.ice_api import fetch_report_listings, find_latest_report
from ice_stocks.xls_parse import OLE2_MAGIC, detect_file_format, xls_to_rows
# ── helpers ──────────────────────────────────────────────────────────────────
def _make_xls_bytes(rows: list[list]) -> bytes:
    """Build a one-sheet XLS workbook in memory and return its raw bytes."""
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet("Sheet1")
    for row_idx, row_values in enumerate(rows):
        for col_idx, cell_value in enumerate(row_values):
            worksheet.write(row_idx, col_idx, cell_value)
    stream = io.BytesIO()
    workbook.save(stream)
    return stream.getvalue()
def _api_response(rows: list[dict]) -> dict:
"""Build a mock ICE API response payload."""
return {"datasets": {"results": {"rows": rows}}}
def _make_api_row(label: str, url: str = "/download/test.xls", publish_date: str = "2026-02-01") -> dict:
return {
"publishDate": publish_date,
"productName": "Coffee C",
"download": {"url": url, "label": label},
}
# ── detect_file_format ───────────────────────────────────────────────────────
def test_detect_file_format_xls():
    """OLE2 magic bytes are classified as legacy XLS."""
    payload = OLE2_MAGIC + bytes(100)
    assert detect_file_format(payload) == "xls"
def test_detect_file_format_xlsx():
    """A ZIP local-file header (PK\\x03\\x04) is classified as XLSX."""
    payload = b"PK\x03\x04" + bytes(100)
    assert detect_file_format(payload) == "xlsx"
def test_detect_file_format_html():
    """An HTML document body is classified as html (e.g. an error page)."""
    assert detect_file_format(b"<html><body>foo</body></html>") == "html"
def test_detect_file_format_csv():
    """Plain comma-separated text falls through to the csv classification."""
    payload = b"report_date,total_certified_bags\n2026-01-01,100000\n"
    assert detect_file_format(payload) == "csv"
# ── xls_to_rows ──────────────────────────────────────────────────────────────
def test_xls_to_rows_roundtrip():
    """Rows written via xlwt come back unchanged through xls_to_rows()."""
    pytest.importorskip("xlwt")
    original = [
        ["header1", "header2", "header3"],
        ["value1", 42.0, "value3"],
        ["", 0.0, ""],
    ]
    blob = _make_xls_bytes(original)
    # Sanity check: xlwt really produced an OLE2 container.
    assert blob[:4] == OLE2_MAGIC
    parsed = xls_to_rows(blob)
    assert len(parsed) == 3
    assert parsed[0][0] == "header1"
    assert parsed[1][1] == 42.0
    # Empty cells round-trip as empty strings.
    assert parsed[2][0] == ""
def test_xls_to_rows_rejects_non_xls():
    """Feeding a ZIP-magic payload must trip the OLE2 guard assertion."""
    zip_like_payload = b"PK\x03\x04" + bytes(100)
    with pytest.raises(AssertionError, match="Not an OLE2"):
        xls_to_rows(zip_like_payload)
# ── fetch_report_listings ────────────────────────────────────────────────────
def test_fetch_report_listings_parses_response():
    """Listing rows are flattened: label and URL pulled out of the nested payload."""
    session = MagicMock()
    session.post.return_value.status_code = 200
    payload = _api_response([
        _make_api_row("Daily Warehouse Stocks", "/dl/stocks.xls"),
        _make_api_row("Certified Stock Aging Report", "/dl/aging.xls"),
    ])
    session.post.return_value.json.return_value = payload
    from ice_stocks.ice_api import ICE_BASE_URL, fetch_report_listings
    listings = fetch_report_listings(session, product_id=2)
    assert len(listings) == 2
    assert listings[0]["download_label"] == "Daily Warehouse Stocks"
    # Relative download paths get the ICE base URL prepended.
    assert listings[0]["download_url"] == ICE_BASE_URL + "/dl/stocks.xls"
    assert listings[1]["download_label"] == "Certified Stock Aging Report"
def test_fetch_report_listings_keeps_absolute_url():
    """An already-absolute (http/https) URL is passed through untouched.

    Renamed from ``test_fetch_report_listings_prepends_base_url_for_absolute``:
    the old name claimed the opposite of what the test (and its docstring)
    actually verify — absolute URLs must NOT be prefixed with the base URL.
    """
    mock_session = MagicMock()
    mock_session.post.return_value.status_code = 200
    mock_session.post.return_value.json.return_value = _api_response([
        _make_api_row("Test", "https://other.example.com/file.xls"),
    ])
    from ice_stocks.ice_api import fetch_report_listings
    rows = fetch_report_listings(mock_session, product_id=2)
    assert rows[0]["download_url"] == "https://other.example.com/file.xls"
# ── find_latest_report ───────────────────────────────────────────────────────
def test_find_latest_report_label_match():
    """find_latest_report picks the row whose label contains the search text."""
    session = MagicMock()
    session.post.return_value.status_code = 200
    session.post.return_value.json.return_value = _api_response([
        _make_api_row("Daily Warehouse Stocks"),
        _make_api_row("Certified Stock Aging Report"),
        _make_api_row("Historical Stocks"),
    ])
    match = find_latest_report(session, "Aging Report")
    assert match is not None
    assert match["download_label"] == "Certified Stock Aging Report"
def test_find_latest_report_no_match_returns_none():
    """With no matching label on any page, find_latest_report yields None."""
    session = MagicMock()
    session.post.return_value.status_code = 200
    # Every page comes back empty, so pagination exhausts without a hit.
    session.post.return_value.json.return_value = _api_response([])
    assert find_latest_report(session, "Nonexistent Label XYZ") is None
# ── canonical CSV output ──────────────────────────────────────────────────────
def test_build_canonical_csv_from_xls_rows():
    """Verify execute._build_canonical_csv_from_xls produces correct schema."""
    pytest.importorskip("xlwt")
    # Mimic the layout of an ICE daily stocks workbook: title row, blank row,
    # "As of" date header, 20 filler rows, then the grand-total row (row 23).
    sheet_rows = [
        ["Coffee C Warehouse Stocks"] + [""] * 9,
        [""] * 10,
        ["As of: 2/14/2026"] + [""] * 9,
    ]
    sheet_rows.extend([""] * 10 for _ in range(20))
    sheet_rows.append(
        ["Total in Bags", 10000.0, 5000.0, 2000.0, 1000.0, 500.0, 0.0, 0.0, 0.0, 18500.0]
    )
    blob = _make_xls_bytes(sheet_rows)
    from ice_stocks.execute import _build_canonical_csv_from_xls
    csv_bytes = _build_canonical_csv_from_xls(blob)
    assert csv_bytes, "Expected non-empty CSV output"
    parsed = list(csv.DictReader(io.StringIO(csv_bytes.decode("utf-8"))))
    assert len(parsed) == 1
    assert parsed[0]["report_date"] == "2026-02-14"
    assert parsed[0]["total_certified_bags"] == "18500"
def test_build_canonical_csv_from_xls_missing_date_returns_empty():
    """If header row has no parseable date, return empty bytes."""
    pytest.importorskip("xlwt")
    header = ["Not a valid date header"] + [""] * 9
    sheet_rows = [header] + [[""] * 10 for _ in range(30)]
    blob = _make_xls_bytes(sheet_rows)
    from ice_stocks.execute import _build_canonical_csv_from_xls
    assert _build_canonical_csv_from_xls(blob) == b""