cleanup and prefect service setup

This commit is contained in:
Deeman
2026-02-04 22:24:55 +01:00
parent fc27d5f887
commit 6d4377ccf9
41 changed files with 15888 additions and 2591 deletions

View File

@@ -1,3 +1,4 @@
from .normalize import normalize_zipped_csv
import logging
import os
import pathlib
@@ -33,7 +34,7 @@ FIRST_MONTH = 8
def check_r2_file_exists(etag: str, s3_client) -> bool:
"""Check if file exists in R2."""
r2_key = f"landing/psd/{etag}.zip"
r2_key = f"landing/psd/{etag}.csv.gzip"
try:
s3_client.head_object(Bucket=R2_BUCKET, Key=r2_key)
logger.info(f"File {r2_key} already exists in R2, skipping")
@@ -46,7 +47,7 @@ def check_r2_file_exists(etag: str, s3_client) -> bool:
def upload_to_r2(content: bytes, etag: str, s3_client):
"""Upload file content to R2."""
r2_key = f"landing/psd/{etag}.zip"
r2_key = f"landing/psd/{etag}.csv.gzip"
logger.info(f"Uploading to R2: {r2_key}")
s3_client.put_object(Bucket=R2_BUCKET, Key=r2_key, Body=content)
logger.info("Upload complete")
@@ -75,11 +76,12 @@ def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niqu
if check_r2_file_exists(etag, s3_client):
return
response = http_session.get(url)
upload_to_r2(response.content, etag, s3_client)
normalized_content = normalize_zipped_csv(response.content)
upload_to_r2(normalized_content, etag, s3_client)
return
# Local mode: check local and download if needed
local_file = extract_to_path / f"{etag}.zip"
local_file = extract_to_path / f"{etag}.csv.gzip"
if local_file.exists():
logger.info(f"File {etag}.csv.gzip already exists locally, skipping")
return
@@ -87,7 +89,8 @@ def extract_psd_file(url: str, extract_to_path: pathlib.Path, http_session: niqu
response = http_session.get(url)
logger.info(f"Storing file to {local_file}")
extract_to_path.mkdir(parents=True, exist_ok=True)
local_file.write_bytes(response.content)
normalized_content = normalize_zipped_csv(response.content)
local_file.write_bytes(normalized_content)
logger.info("Download complete")

View File

@@ -0,0 +1,55 @@
import zipfile
import gzip
from io import BytesIO
import pathlib
def normalize_zipped_csv(buffer: BytesIO)->BytesIO:
out = BytesIO()
with zipfile.ZipFile(buffer, mode='r').open("psd_alldata.csv", mode='r') as csv:
with gzip.open(out, "wb") as outfile:
outfile.write(csv.read())
out.seek(0)
return out
def convert_existing() -> None:
    """Convert every ``*.zip`` under ``data/`` into ``<stem>.csv.gzip``.

    Idempotent: archives whose output already exists with non-zero size are
    skipped, so the function is safe to re-run after a partial conversion
    (a zero-byte output is treated as incomplete and redone).
    """
    data_dir = pathlib.Path(__file__).parent / "data"
    for zip_path in data_dir.glob("*.zip"):
        target = data_dir / f"{zip_path.stem}.csv.gzip"
        if target.exists() and target.stat().st_size > 0:
            continue
        print(zip_path)
        # Pass an open binary handle so the call matches
        # normalize_zipped_csv's file-like contract, and the source
        # file is closed deterministically.
        with zip_path.open("rb") as fh:
            target.write_bytes(normalize_zipped_csv(fh).read())
if __name__ == "__main__":
    # One-off maintenance entry point: re-pack any *.zip archives already
    # downloaded into data/ as gzip-compressed CSVs.
    convert_existing()