cleanup and prefect service setup
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
from .normalize import normalize_zipped_csv
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
@@ -33,7 +34,7 @@ FIRST_MONTH = 8
|
||||
|
||||
def check_r2_file_exists(etag: str, s3_client) -> bool:
|
||||
"""Check if file exists in R2."""
|
||||
r2_key = f"landing/psd/{etag}.zip"
|
||||
r2_key = f"landing/psd/{etag}.csv.gzip"
|
||||
try:
|
||||
s3_client.head_object(Bucket=R2_BUCKET, Key=r2_key)
|
||||
logger.info(f"File {r2_key} already exists in R2, skipping")
|
||||
@@ -46,7 +47,7 @@ def check_r2_file_exists(etag: str, s3_client) -> bool:
|
||||
|
||||
def upload_to_r2(content: bytes, etag: str, s3_client):
    """Upload raw file content to the R2 bucket.

    Args:
        content: Bytes to store (the gzip-normalized CSV payload).
        etag: Source ETag used to derive a stable, idempotent object key.
        s3_client: A boto3-style S3 client configured for R2.
    """
    # The old ``{etag}.zip`` key assignment was dead code (immediately
    # overwritten); only the normalized ``.csv.gzip`` key is used.
    r2_key = f"landing/psd/{etag}.csv.gzip"
    logger.info(f"Uploading to R2: {r2_key}")
    s3_client.put_object(Bucket=R2_BUCKET, Key=r2_key, Body=content)
    logger.info("Upload complete")
|
||||
@@ -75,11 +76,12 @@ def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niqu
|
||||
if check_r2_file_exists(etag, s3_client):
|
||||
return
|
||||
response = http_session.get(url)
|
||||
upload_to_r2(response.content, etag, s3_client)
|
||||
normalized_content = normalize_zipped_csv(response.content)
|
||||
upload_to_r2(normalized_content, etag, s3_client)
|
||||
return
|
||||
|
||||
# Local mode: check local and download if needed
|
||||
local_file = extract_to_path / f"{etag}.zip"
|
||||
local_file = extract_to_path / f"{etag}.csv.gzip"
|
||||
if local_file.exists():
|
||||
logger.info(f"File {etag}.zip already exists locally, skipping")
|
||||
return
|
||||
@@ -87,7 +89,8 @@ def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niqu
|
||||
response = http_session.get(url)
|
||||
logger.info(f"Storing file to {local_file}")
|
||||
extract_to_path.mkdir(parents=True, exist_ok=True)
|
||||
local_file.write_bytes(response.content)
|
||||
normalized_content = normalize_zipped_csv(response.content)
|
||||
local_file.write_bytes(normalized_content)
|
||||
logger.info("Download complete")
|
||||
|
||||
|
||||
|
||||
55
extract/psdonline/src/psdonline/normalize.py
Normal file
55
extract/psdonline/src/psdonline/normalize.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import zipfile
|
||||
import gzip
|
||||
from io import BytesIO
|
||||
import pathlib
|
||||
|
||||
|
||||
|
||||
def normalize_zipped_csv(buffer: BytesIO) -> BytesIO:
    """Extract ``psd_alldata.csv`` from a zip archive and re-compress it as gzip.

    Args:
        buffer: The zip archive as a binary file-like object, or any path
            accepted by :class:`zipfile.ZipFile`, containing a member named
            ``psd_alldata.csv``.

    Returns:
        A ``BytesIO`` rewound to position 0, holding the gzip-compressed
        CSV bytes.

    Raises:
        KeyError: If the archive has no ``psd_alldata.csv`` member.
        zipfile.BadZipFile: If *buffer* is not a valid zip archive.
    """
    compressed = BytesIO()
    # Close the ZipFile itself, not just the member handle — the original
    # left the archive object unclosed. Also avoid shadowing the stdlib
    # ``csv`` module name with a local variable.
    with zipfile.ZipFile(buffer, mode="r") as archive:
        with archive.open("psd_alldata.csv", mode="r") as member:
            with gzip.open(compressed, "wb") as gz:
                gz.write(member.read())
    compressed.seek(0)
    return compressed
|
||||
|
||||
|
||||
def convert_existing():
    """Gzip-normalize every cached ``*.zip`` in the package ``data`` directory.

    Archives whose ``<stem>.csv.gzip`` output already exists and is non-empty
    are skipped; each remaining zip is printed and its normalized gzip is
    written alongside it.
    """
    data_dir = pathlib.Path(__file__).parent / "data"
    for zip_path in data_dir.glob("*.zip"):
        target = data_dir / f"{zip_path.stem}.csv.gzip"
        if target.exists() and target.stat().st_size > 0:
            continue
        print(zip_path)
        # ZipFile accepts a path directly, so the Path can be handed
        # straight to the normalizer.
        target.write_bytes(normalize_zipped_csv(zip_path).read())
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# NOTE(review): a commented-out round-trip check (sha256 of the raw zip
# member vs sha256 of the gunzipped output) used to live here; it belongs
# in a real test module, so the dead code has been removed.
if __name__ == "__main__":
    convert_existing()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user