beanflows/extract/psdonline/src/psdonline/normalize.py
Deeman c8b86569ff
chore: consolidate to single ruff config in root pyproject.toml
- Merge web ruff settings (select E/F/I/UP, line-length 100) into root config
- Remove [tool.ruff] section from web/pyproject.toml
- Remove "web" from root ruff exclude list
- Simplify pre-commit hook to one command: ruff check .
- Update CI to use: uv run ruff check . (from repo root)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-28 12:21:01 +01:00


import gzip
import pathlib
import zipfile
from io import BytesIO


def normalize_zipped_csv(buffer: BytesIO | pathlib.Path) -> BytesIO:
    """Re-compress the CSV inside a PSD Online zip archive as gzip."""
    out = BytesIO()
    # Use context managers so the archive is closed, not leaked.
    with zipfile.ZipFile(buffer, mode="r") as archive:
        with archive.open("psd_alldata.csv", mode="r") as csv:
            with gzip.open(out, "wb") as outfile:
                outfile.write(csv.read())
    out.seek(0)
    return out
def convert_existing() -> None:
    """Convert every .zip under data/ that has no non-empty .csv.gzip yet."""
    data = pathlib.Path(__file__).parent / "data"
    for file in data.glob("*.zip"):
        outfile = data / f"{file.stem}.csv.gzip"
        # Skip archives that were already converted successfully.
        if outfile.exists() and outfile.stat().st_size > 0:
            continue
        print(file)
        gzip_contents = normalize_zipped_csv(file)
        outfile.write_bytes(gzip_contents.read())
if __name__ == "__main__":
    # """Test to make sure file contents are the same"""
    # import hashlib
    #
    # test_file = pathlib.Path(__file__).parent / "data/00d6e992d8c81_0.zip"
    #
    # with zipfile.ZipFile(test_file.open("rb"), mode="r").open("psd_alldata.csv", mode="r") as csv:
    #     raw_hash = hashlib.sha256(csv.read()).hexdigest()
    #
    # normalized = normalize_zipped_csv(test_file.open("rb"))
    # print(raw_hash)
    #
    # with gzip.open(normalized, "rb") as normalized_file:
    #     normalized_hash = hashlib.sha256(normalized_file.read()).hexdigest()
    #     print(normalized_hash)
    #
    # assert raw_hash == normalized_hash
    convert_existing()
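The commented-out check above compares hashes against an on-disk archive. A self-contained variant of the same round-trip idea, using a hypothetical in-memory archive and an inlined copy of the function, could look like:

```python
import gzip
import zipfile
from io import BytesIO


def normalize_zipped_csv(buffer):
    """Inlined copy of the function above, for a self-contained sketch."""
    out = BytesIO()
    with zipfile.ZipFile(buffer, mode="r") as archive:
        with archive.open("psd_alldata.csv", mode="r") as csv:
            with gzip.open(out, "wb") as outfile:
                outfile.write(csv.read())
    out.seek(0)
    return out


# Build a small in-memory zip containing the expected member name.
payload = b"country,commodity,value\nUS,corn,100\n"  # made-up sample rows
zip_buf = BytesIO()
with zipfile.ZipFile(zip_buf, mode="w") as zf:
    zf.writestr("psd_alldata.csv", payload)
zip_buf.seek(0)

# Normalize and confirm the gzip round-trip preserves the CSV bytes.
normalized = normalize_zipped_csv(zip_buf)
with gzip.open(normalized, "rb") as f:
    assert f.read() == payload
print("round-trip OK")
```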