Commit b2434588 authored by Alois Klink

Handle 404 and other download errors better

Caches 404 errors by default since they remain constant.
This means building the new dataset is much faster.
parent d4d81ab3
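Why caching 404s helps: a URL that returned 404 for a given location and date will keep returning 404, so once make_error_csv has written a small placeholder CSV at output_loc, a later run can treat that path as already fetched and skip both the HTTP request and the per-download wait. A minimal sketch of that idea, assuming the surrounding script keys downloads off the output path (the needs_download helper is illustrative, not part of this commit):

    from pathlib import Path

    def needs_download(output_loc: Path) -> bool:
        # Hypothetical guard: real data and cached error CSVs both end up at
        # output_loc, so any existing file (even one recording a 404) means
        # the download and its wait can be skipped on the next build.
        return not output_loc.exists()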
@@ -2,6 +2,7 @@ import argparse
 import enum  # importing this since needed for argparse
 import datetime
 from pathlib import Path
+import csv
 
 try:
     import dateutil.parser  # used to decipher types
@@ -140,13 +141,22 @@ def make_url(location: Locations, format: Formats, date: datetime.datetime
     return urllib.parse.urlunparse(urlparts), filename
 
 
-def download(url: str, output_loc, wait: typing.Optional[int]=None):
+def make_error_csv(output_loc, error_no, msg):
+    with open(output_loc, "w") as f:
+        writer = csv.DictWriter(f, fieldnames=["ERROR", "ERROR_MSG"])
+        writer.writeheader()
+        writer.writerow({"ERROR": error_no,
+                         "ERROR_MSG": msg})
+
+def download(url: str, output_loc, wait: typing.Optional[int]=None,
+             cache_404: bool=True):
     """Downloads the given url to the output location.
 
     Arguments:
         url: The url to download
         output_loc: Where to download the url to.
         wait: Wait for this many seconds to avoid overloading the server.
+        cache_404: If there is a 404 file-not-found error, cache it.
     """
     if wait is None:
         wait = 6  # wait for 6 seconds between downloads
@@ -157,9 +167,20 @@ def download(url: str, output_loc, wait: typing.Optional[int]=None):
     temp_loc = output_loc.with_suffix(output_loc.suffix + ".tmp")
     try:
         urllib.request.urlretrieve(url, temp_loc)
-        temp_loc.rename(output_loc)
+        # for some reason sometimes the csv is empty, so
+        # make an error if that happens
+        if temp_loc.stat().st_size == 0:
+            make_error_csv(output_loc, 204, f"{url} was empty")
+            raise Exception(f"Empty CSV Error when downloading {url}")
+    except urllib.error.HTTPError as e:
+        if e.code == 404 and cache_404:
+            make_error_csv(output_loc, error_no=404, msg=f"{url} not found")
+        raise Exception(f"{e} when downloading {url}") from e
     except Exception as e:
         raise Exception(f"{e} when downloading {url}") from e
+    else:
+        # if no errors, move the downloaded file to the real location
+        temp_loc.rename(output_loc)
     finally:
         time.sleep(wait)  # wait for x seconds to avoid the server blocking us
 
@@ -196,6 +217,13 @@ def uppercase_columns(pandas_dataset):
     pandas_dataset.columns = pandas_dataset.columns.str.upper()
     return pandas_dataset
 
 
+def read_csv(csv_file_path: str):
+    try:
+        return pd.read_csv(csv_file_path)
+    except pd.errors.EmptyDataError as e:
+        new_e = pd.errors.EmptyDataError(f"{e} {csv_file_path}")
+        raise new_e from e
+
 def combine_csvs(input_csv_files: list, output_loc):
     """Combines the given csv files into one big one.
@@ -205,8 +233,9 @@ def combine_csvs(input_csv_files: list, output_loc):
         input_csv_files: The list of csv files to combine.
         output_loc: Where to save the combined csv file.
     """
-    combined = pd.concat([uppercase_columns(pd.read_csv(f))
-                          for f in input_csv_files])
+    combined = pd.concat([uppercase_columns(read_csv(f))
+                          for f in input_csv_files],
+                         sort=False)
     combined.to_csv(output_loc, index=False, encoding='utf-8')
 
 def make_csv(location: Locations,
@@ -244,12 +273,14 @@ def make_csv(location: Locations,
 
             # show the file we are downloading
             pbar.set_description(str(filename))
             try:
-                download(url, output_loc=output_loc, wait=wait)
-                # add the file to the list of downloaded files
-                file_name_list.append(output_loc)
+                download(url, output_loc=output_loc, wait=wait, cache_404=True)
             except Exception as e:
                 if not quiet:
                     pbar.write(f"{e}")
+            else:
+                # add the file to the list of downloaded files if there was
+                # no exception
+                file_name_list.append(output_loc)
             pbar.update(1)  # update the fancy progressbar
 
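Since make_error_csv writes a fixed two-column header, downstream code can tell a cached failure apart from real data before handing files to combine_csvs. A minimal sketch, assuming only the ERROR/ERROR_MSG column names introduced above (the is_error_csv helper is illustrative, not part of this commit):

    import pandas as pd

    def is_error_csv(csv_file_path) -> bool:
        # make_error_csv writes exactly the header ERROR,ERROR_MSG followed
        # by one row, so matching the columns identifies a cached 404 or an
        # empty-download marker.
        columns = list(pd.read_csv(csv_file_path, nrows=0).columns)
        return columns == ["ERROR", "ERROR_MSG"]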