From b243458832dd4acf53f4f99759538fc2dfcf5dda Mon Sep 17 00:00:00 2001
From: Alois Klink <alois.klink@soton.ac.uk>
Date: Thu, 30 May 2019 16:58:06 +0100
Subject: [PATCH] Handle 404 and other download errors better

Caches 404 errors by default since they remain constant. This means
building the new dataset is much faster.
---
 chimet-scraper.py | 45 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/chimet-scraper.py b/chimet-scraper.py
index 81d8325..42f75a0 100644
--- a/chimet-scraper.py
+++ b/chimet-scraper.py
@@ -2,6 +2,7 @@ import argparse
 import enum # importing this since needed for argparse
 import datetime
 from pathlib import Path
+import csv
 
 try:
     import dateutil.parser # used to decihper types
@@ -140,13 +141,22 @@ def make_url(location: Locations, format: Formats, date: datetime.datetime
 
     return urllib.parse.urlunparse(urlparts), filename
 
-def download(url: str, output_loc, wait: typing.Optional[int]=None):
+def make_error_csv(output_loc, error_no, msg):
+    with open(output_loc, "w") as f:
+        writer = csv.DictWriter(f, fieldnames=["ERROR", "ERROR_MSG"])
+        writer.writeheader()
+        writer.writerow({"ERROR": error_no,
+                         "ERROR_MSG": msg})
+
+def download(url: str, output_loc, wait: typing.Optional[int]=None,
+             cache_404: bool=True):
     """Downloads the given url to the output location.
 
     Arguments:
         url: The url to download
         output_loc: Where to download the url to.
         wait: Wait for this many seconds to avoid overloading the server.
+        cache_404: If there is a 404 file-not-found error, cache it.
     """
     if wait is None:
         wait = 6 # wait for 6 seconds between downloads
@@ -157,9 +167,20 @@ def download(url: str, output_loc, wait: typing.Optional[int]=None):
     temp_loc = output_loc.with_suffix(output_loc.suffix + ".tmp")
     try:
         urllib.request.urlretrieve(url, temp_loc)
-        temp_loc.rename(output_loc)
+        # for some reason sometimes the csv is empty, so
+        # make an error if that happens
+        if temp_loc.stat().st_size == 0:
+            make_error_csv(output_loc, 204, f"{url} was empty")
+            raise Exception(f"Empty CSV Error when downloading {url}")
+    except urllib.error.HTTPError as e:
+        if e.code == 404 and cache_404:
+            make_error_csv(output_loc, error_no=404, msg=f"{url} not found")
+        raise Exception(f"{e} when downloading {url}") from e
     except Exception as e:
         raise Exception(f"{e} when downloading {url}") from e
+    else:
+        # if no errors, move the downloaded file to the real location
+        temp_loc.rename(output_loc)
     finally:
         time.sleep(wait) # wait for x seconds to avoid the server blocking us
 
@@ -196,6 +217,13 @@ def uppercase_columns(pandas_dataset):
     pandas_dataset.columns = pandas_dataset.columns.str.upper()
     return pandas_dataset
 
+def read_csv(csv_file_path: str):
+    try:
+        return pd.read_csv(csv_file_path)
+    except pd.errors.EmptyDataError as e:
+        new_e = pd.errors.EmptyDataError(f"{e} {csv_file_path}")
+        raise new_e from e
+
 def combine_csvs(input_csv_files: list, output_loc):
     """Combines the given csv files into one big one.
     Arguments:
@@ -202,8 +230,9 @@ def combine_csvs(input_csv_files: list, output_loc):
         input_csv_files: The list of csv files to combine.
        output_loc: Where the save the combined csv file.
     """
-    combined = pd.concat([uppercase_columns(pd.read_csv(f))
-                          for f in input_csv_files])
+    combined = pd.concat([uppercase_columns(read_csv(f))
+                          for f in input_csv_files],
+                         sort=False)
     combined.to_csv(output_loc, index=False, encoding='utf-8')
 
 def make_csv(location: Locations,
@@ -244,12 +273,14 @@ def make_csv(location: Locations,
             # show the file we are downloading
             pbar.set_description(str(filename))
             try:
-                download(url, output_loc=output_loc, wait=wait)
-                # add the file to the list of downloaded files
-                file_name_list.append(output_loc)
+                download(url, output_loc=output_loc, wait=wait, cache_404=True)
             except Exception as e:
                 if not quiet:
                     pbar.write(f"{e}")
+            else:
+                # add the file to the list of downloaded files if there was
+                # no exception
+                file_name_list.append(output_loc)
             pbar.update(1) # update the fancy progressbar
 
 
-- 
GitLab