adding incremental load abilities
This commit is contained in:
@@ -23,9 +23,12 @@ FIRST_YEAR = 2006
|
||||
FIRST_MONTH = 8
|
||||
|
||||
def extract_psd_file(url:str, extract_to_path: pathlib.Path, http_session: niquests.Session):
|
||||
logger.info(f"Start downloading {url} ...")
|
||||
logger.info(f"Requesting file {url} ...")
|
||||
response = http_session.get(url)
|
||||
if response.status_code != 200:
|
||||
if response.status_code == 404:
|
||||
logger.error("File doesn't exist on server, received status code 404 Not Found")
|
||||
return
|
||||
elif response.status_code != 200:
|
||||
logger.error(f"Status code not ok, STATUS={response.status_code}")
|
||||
return
|
||||
logger.info(f"Storing file to {extract_to_path}")
|
||||
@@ -33,14 +36,26 @@ def extract_psd_file(url:str, extract_to_path: pathlib.Path, http_session: nique
|
||||
extract_to_path.write_bytes(response.content)
|
||||
logger.info("Download done.")
|
||||
|
||||
def extraction_status(base_dir: "pathlib.Path | None" = None):
    """Return the (year, month) pairs for which an archive already exists.

    Every ``*.zip`` found recursively below the scanned directory is mapped
    to the names of the two directories directly above it, i.e. a file at
    ``<root>/<year>/<month>/<name>.zip`` yields ``(int(year), int(month))``.

    Args:
        base_dir: Directory to scan. When omitted, the module-level
            ``output_dir`` is used, so existing zero-argument callers are
            unaffected.

    Returns:
        A list of ``(year, month)`` integer tuples, one per archive found,
        in the order ``rglob`` yields them.
    """
    root = output_dir if base_dir is None else pathlib.Path(base_dir)

    extracted_months = []
    for archive in root.rglob("*.zip"):
        # Use path components instead of splitting the string on "/", so
        # the lookup does not depend on the platform's path separator.
        month_dir = archive.parent
        year_name = month_dir.parent.name
        month_name = month_dir.name
        # An archive outside the <year>/<month>/ layout must not abort the
        # whole status scan with a ValueError from int(); it is ignored.
        if not (year_name.isdecimal() and month_name.isdecimal()):
            continue
        extracted_months.append((int(year_name), int(month_name)))
    return extracted_months
|
||||
|
||||
def extract_historical_psd_dataset():
|
||||
status = extraction_status()
|
||||
today = datetime.now()
|
||||
years = list(range(FIRST_YEAR, today.year))
|
||||
months = list(range(1,13))
|
||||
logger.info(f"Downloading {len(years) * len(months)} urls")
|
||||
years = list(range(FIRST_YEAR, today.year+1))
|
||||
for year in years:
|
||||
if year == years[0]:
|
||||
months = list(range(8, 13))
|
||||
if year == years[-1]:
|
||||
months = list(range(1, today.month+1))
|
||||
else:
|
||||
months = list(range(1,13))
|
||||
|
||||
for month in months:
|
||||
if (year, month) in status:
|
||||
continue
|
||||
url = PSD_HISTORICAL_URL.format(year=year, month=month)
|
||||
target_path = output_dir / f"{year}"/f"{month:02d}" / "psd_alldata_csv.zip"
|
||||
with niquests.Session() as session:
|
||||
|
||||
Reference in New Issue
Block a user