Commit b2434588 authored by Alois Klink

Handle 404 and other download errors better

Caches 404 errors by default since they remain constant.
This means building the new dataset is much faster.
parent d4d81ab3
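
The cached 404s are written as tiny CSV marker files in place of the real data (see make_error_csv in the diff below), so a rebuild can skip URLs whose output files already exist instead of re-requesting dead links. A minimal sketch of what such a marker holds and how it could be recognised; is_cached_error and the file name are hypothetical, not part of this repository:

import csv
from pathlib import Path

def is_cached_error(path: Path) -> bool:
    """Hypothetical helper: True if `path` is a cached error marker,
    i.e. a CSV whose header row is exactly ERROR,ERROR_MSG."""
    if not path.exists():
        return False
    with open(path, newline="") as f:
        header = next(csv.reader(f), [])
    return header == ["ERROR", "ERROR_MSG"]

# A marker file as make_error_csv would write it for a dead URL:
marker = Path("2020-01-01.csv")  # illustrative name
marker.write_text("ERROR,ERROR_MSG\n404,https://example.com/data.csv not found\n")
print(is_cached_error(marker))   # True, so a rebuild can skip this URL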
@@ -2,6 +2,7 @@ import argparse
 import enum  # importing this since needed for argparse
 import datetime
 from pathlib import Path
+import csv
 
 try:
     import dateutil.parser  # used to decipher types
@@ -140,13 +141,22 @@ def make_url(location: Locations, format: Formats, date: datetime.datetime
     return urllib.parse.urlunparse(urlparts), filename
 
 
-def download(url: str, output_loc, wait: typing.Optional[int]=None):
+def make_error_csv(output_loc, error_no, msg):
+    with open(output_loc, "w") as f:
+        writer = csv.DictWriter(f, fieldnames=["ERROR", "ERROR_MSG"])
+        writer.writeheader()
+        writer.writerow({"ERROR": error_no,
+                         "ERROR_MSG": msg})
+
+
+def download(url: str, output_loc, wait: typing.Optional[int]=None,
+             cache_404: bool=True):
     """Downloads the given url to the output location.
 
     Arguments:
         url: The url to download
         output_loc: Where to download the url to.
         wait: Wait for this many seconds to avoid overloading the server.
+        cache_404: If there is a 404 file-not-found error, cache it.
     """
     if wait is None:
         wait = 6  # wait for 6 seconds between downloads
@@ -157,9 +167,20 @@ def download(url: str, output_loc, wait: typing.Optional[int]=None):
     temp_loc = output_loc.with_suffix(output_loc.suffix + ".tmp")
     try:
         urllib.request.urlretrieve(url, temp_loc)
-        temp_loc.rename(output_loc)
+        # for some reason the csv is sometimes empty, so
+        # raise an error if that happens
+        if temp_loc.stat().st_size == 0:
+            make_error_csv(output_loc, 204, f"{url} was empty")
+            raise Exception(f"Empty CSV Error when downloading {url}")
+    except urllib.error.HTTPError as e:
+        if e.code == 404 and cache_404:
+            make_error_csv(output_loc, error_no=404, msg=f"{url} not found")
+        raise Exception(f"{e} when downloading {url}") from e
     except Exception as e:
         raise Exception(f"{e} when downloading {url}") from e
+    else:
+        # if no errors, move the downloaded file to the real location
+        temp_loc.rename(output_loc)
     finally:
         time.sleep(wait)  # wait for x seconds to avoid the server blocking us
 
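
Note the restructure above: the .tmp file is renamed into place only in the else branch, so a failed or empty download never leaves a partial file at output_loc, while the finally branch rate-limits every attempt, successful or not. A self-contained illustration of how the four branches interact (generic names, not this repository's code):

def attempt(action):
    try:
        action()                             # may raise, like urlretrieve
    except ValueError as e:
        print(f"except: {e}")                # like the HTTPError branch
        raise RuntimeError("download failed") from e
    else:
        print("else: commit (rename .tmp)")  # only runs if nothing raised
    finally:
        print("finally: rate-limit sleep")   # runs on success and failure

attempt(lambda: None)  # prints the else branch, then finally

def fail():
    raise ValueError("404")

try:
    attempt(fail)
except RuntimeError as e:
    print(f"caller sees: {e}")  # except and finally ran before this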
@@ -196,6 +217,13 @@ def uppercase_columns(pandas_dataset):
     pandas_dataset.columns = pandas_dataset.columns.str.upper()
     return pandas_dataset
 
+def read_csv(csv_file_path: str):
+    try:
+        return pd.read_csv(csv_file_path)
+    except pd.errors.EmptyDataError as e:
+        new_e = pd.errors.EmptyDataError(f"{e} {csv_file_path}")
+        raise new_e from e
+
 
 def combine_csvs(input_csv_files: list, output_loc):
     """Combines the given csv files into one big one.
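
pandas' EmptyDataError does not say which file was empty, which matters when concatenating a whole directory of per-date CSVs; the wrapper above re-raises it with the path appended. A standalone demonstration (the wrapper is copied from the hunk above; empty.csv is created only for the demo):

import pandas as pd

def read_csv(csv_file_path: str):
    try:
        return pd.read_csv(csv_file_path)
    except pd.errors.EmptyDataError as e:
        # re-raise with the offending path appended to the message
        raise pd.errors.EmptyDataError(f"{e} {csv_file_path}") from e

open("empty.csv", "w").close()  # simulate a zero-byte download
try:
    read_csv("empty.csv")
except pd.errors.EmptyDataError as e:
    print(e)  # "No columns to parse from file empty.csv"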
@@ -205,8 +233,9 @@ def combine_csvs(input_csv_files: list, output_loc):
         input_csv_files: The list of csv files to combine.
         output_loc: Where to save the combined csv file.
     """
-    combined = pd.concat([uppercase_columns(pd.read_csv(f))
-                          for f in input_csv_files])
+    combined = pd.concat([uppercase_columns(read_csv(f))
+                          for f in input_csv_files],
+                         sort=False)
     combined.to_csv(output_loc, index=False, encoding='utf-8')
 
 def make_csv(location: Locations,
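
The new sort=False pins down column order, which matters if the per-date CSVs do not all share identical columns: pandas keeps the union of columns in first-seen order instead of sorting it alphabetically, and on the pandas versions of that era it also silences the FutureWarning about the default changing. A small illustration with made-up column names:

import pandas as pd

a = pd.DataFrame({"PRICE": [1.0], "DATE": ["2020-01-01"]})
b = pd.DataFrame({"PRICE": [2.0], "DATE": ["2020-01-02"], "VOLUME": [10]})

# Columns stay in first-seen order; rows missing a column get NaN.
combined = pd.concat([a, b], sort=False)
print(list(combined.columns))  # ['PRICE', 'DATE', 'VOLUME']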
@@ -244,12 +273,14 @@ def make_csv(location: Locations,
 
             # show the file we are downloading
             pbar.set_description(str(filename))
             try:
-                download(url, output_loc=output_loc, wait=wait)
-                # add the file to the list of downloaded files
-                file_name_list.append(output_loc)
+                download(url, output_loc=output_loc, wait=wait, cache_404=True)
             except Exception as e:
                 if not quiet:
                     pbar.write(f"{e}")
+            else:
+                # add the file to the list of downloaded files if there was
+                # no exception
+                file_name_list.append(output_loc)
             pbar.update(1)  # update the fancy progressbar
 