From b243458832dd4acf53f4f99759538fc2dfcf5dda Mon Sep 17 00:00:00 2001
From: Alois Klink <alois.klink@soton.ac.uk>
Date: Thu, 30 May 2019 16:58:06 +0100
Subject: [PATCH] Handle 404 and other download errors better

Caches 404 errors by default since they remain constant. This means
building the new dataset is much faster.
---
 chimet-scraper.py | 45 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/chimet-scraper.py b/chimet-scraper.py
index 81d8325..42f75a0 100644
--- a/chimet-scraper.py
+++ b/chimet-scraper.py
@@ -2,6 +2,7 @@ import argparse
 import enum # importing this since needed for argparse
 import datetime
 from pathlib import Path
+import csv
 
 try:
     import dateutil.parser # used to decihper types
@@ -140,13 +141,22 @@ def make_url(location: Locations, format: Formats, date: datetime.datetime
 
     return urllib.parse.urlunparse(urlparts), filename
 
-def download(url: str, output_loc, wait: typing.Optional[int]=None):
+def make_error_csv(output_loc, error_no, msg):
+    with open(output_loc, "w") as f:
+        writer = csv.DictWriter(f, fieldnames=["ERROR", "ERROR_MSG"])
+        writer.writeheader()
+        writer.writerow({"ERROR": error_no,
+                         "ERROR_MSG": msg})
+
+def download(url: str, output_loc, wait: typing.Optional[int]=None,
+             cache_404: bool=True):
     """Downloads the given url to the output location.
 
     Arguments:
         url: The url to download
         output_loc: Where to download the url to.
         wait: Wait for this many seconds to avoid overloading the server.
+        cache_404: If there is a 404 file-not-found error, cache it.
     """
     if wait is None:
         wait = 6 # wait for 6 seconds between downloads
@@ -157,9 +167,20 @@ def download(url: str, output_loc, wait: typing.Optional[int]=None):
     temp_loc = output_loc.with_suffix(output_loc.suffix + ".tmp")
     try:
         urllib.request.urlretrieve(url, temp_loc)
-        temp_loc.rename(output_loc)
+        # for some reason sometimes the csv is empty, so
+        # make an error if that happens
+        if temp_loc.stat().st_size == 0:
+            make_error_csv(output_loc, 204, f"{url} was empty")
+            raise Exception(f"Empty CSV Error when downloading {url}")
+    except urllib.error.HTTPError as e:
+        if e.code == 404 and cache_404:
+            make_error_csv(output_loc, error_no=404, msg=f"{url} not found")
+        raise Exception(f"{e} when downloading {url}") from e
     except Exception as e:
         raise Exception(f"{e} when downloading {url}") from e
+    else:
+        # if no errors, move the downloaded file to the real location
+        temp_loc.rename(output_loc)
     finally:
         time.sleep(wait) # wait for x seconds to avoid the server blocking us
 
@@ -196,6 +217,13 @@ def uppercase_columns(pandas_dataset):
     pandas_dataset.columns = pandas_dataset.columns.str.upper()
     return pandas_dataset
 
+def read_csv(csv_file_path: str):
+    try:
+        return pd.read_csv(csv_file_path)
+    except pd.errors.EmptyDataError as e:
+        new_e = pd.errors.EmptyDataError(f"{e} {csv_file_path}")
+        raise new_e from e
+
 def combine_csvs(input_csv_files: list, output_loc):
     """Combines the given csv files into one big one.
     Arguments:
@@ -202,8 +230,9 @@ def combine_csvs(input_csv_files: list, output_loc):
         input_csv_files: The list of csv files to combine.
        output_loc: Where the save the combined csv file.
     """
-    combined = pd.concat([uppercase_columns(pd.read_csv(f))
-                          for f in input_csv_files])
+    combined = pd.concat([uppercase_columns(read_csv(f))
+                          for f in input_csv_files],
+                         sort=False)
     combined.to_csv(output_loc, index=False, encoding='utf-8')
 
 def make_csv(location: Locations,
@@ -244,12 +273,14 @@ def make_csv(location: Locations,
             # show the file we are downloading
             pbar.set_description(str(filename))
             try:
-                download(url, output_loc=output_loc, wait=wait)
-                # add the file to the list of downloaded files
-                file_name_list.append(output_loc)
+                download(url, output_loc=output_loc, wait=wait, cache_404=True)
             except Exception as e:
                 if not quiet:
                     pbar.write(f"{e}")
+            else:
+                # add the file to the list of downloaded files if there was
+                # no exception
+                file_name_list.append(output_loc)
             pbar.update(1) # update the fancy progressbar
 
 
-- 
GitLab