Commit b2434588 authored by Alois Klink

Handle 404 and other download errors better

Caches 404 errors by default since they remain constant.
This means building the new dataset is much faster.
parent d4d81ab3
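Why caching 404s helps: a URL that returned 404 for a given location and date will keep returning 404, so once make_error_csv has written a small placeholder CSV at output_loc, a later run can treat that path as already fetched and skip both the HTTP request and the per-download wait. A minimal sketch of that idea, assuming the surrounding script keys downloads off the output path (the needs_download helper is illustrative, not part of this commit):

    from pathlib import Path

    def needs_download(output_loc: Path) -> bool:
        # Hypothetical guard: real data and cached error CSVs both end up at
        # output_loc, so any existing file (even one recording a 404) means
        # the download and its wait can be skipped on the next build.
        return not output_loc.exists()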
@@ -2,6 +2,7 @@ import argparse
 import enum  # importing this since needed for argparse
 import datetime
 from pathlib import Path
+import csv
 
 try:
     import dateutil.parser  # used to decipher types
@@ -140,13 +141,22 @@ def make_url(location: Locations, format: Formats, date: datetime.datetime
     return urllib.parse.urlunparse(urlparts), filename
 
 
-def download(url: str, output_loc, wait: typing.Optional[int]=None):
+def make_error_csv(output_loc, error_no, msg):
+    with open(output_loc, "w") as f:
+        writer = csv.DictWriter(f, fieldnames=["ERROR", "ERROR_MSG"])
+        writer.writeheader()
+        writer.writerow({"ERROR": error_no,
+                         "ERROR_MSG": msg})
+
+def download(url: str, output_loc, wait: typing.Optional[int]=None,
+             cache_404: bool=True):
     """Downloads the given url to the output location.
 
     Arguments:
         url: The url to download
         output_loc: Where to download the url to.
         wait: Wait for this many seconds to avoid overloading the server.
+        cache_404: If there is a 404 file-not-found error, cache it.
     """
     if wait is None:
         wait = 6  # wait for 6 seconds between downloads
@@ -157,9 +167,20 @@ def download(url: str, output_loc, wait: typing.Optional[int]=None):
     temp_loc = output_loc.with_suffix(output_loc.suffix + ".tmp")
     try:
         urllib.request.urlretrieve(url, temp_loc)
-        temp_loc.rename(output_loc)
+        # for some reason sometimes the csv is empty, so
+        # make an error if that happens
+        if temp_loc.stat().st_size == 0:
+            make_error_csv(output_loc, 204, f"{url} was empty")
+            raise Exception(f"Empty CSV Error when downloading {url}")
+    except urllib.error.HTTPError as e:
+        if e.code == 404 and cache_404:
+            make_error_csv(output_loc, error_no=404, msg=f"{url} not found")
+        raise Exception(f"{e} when downloading {url}") from e
     except Exception as e:
         raise Exception(f"{e} when downloading {url}") from e
+    else:
+        # if no errors, move the downloaded file to the real location
+        temp_loc.rename(output_loc)
     finally:
         time.sleep(wait)  # wait for x seconds to avoid the server blocking us
 
@@ -196,6 +217,13 @@ def uppercase_columns(pandas_dataset):
     pandas_dataset.columns = pandas_dataset.columns.str.upper()
     return pandas_dataset
 
 
+def read_csv(csv_file_path: str):
+    try:
+        return pd.read_csv(csv_file_path)
+    except pd.errors.EmptyDataError as e:
+        new_e = pd.errors.EmptyDataError(f"{e} {csv_file_path}")
+        raise new_e from e
+
 def combine_csvs(input_csv_files: list, output_loc):
     """Combines the given csv files into one big one.
@@ -205,8 +233,9 @@ def combine_csvs(input_csv_files: list, output_loc):
         input_csv_files: The list of csv files to combine.
         output_loc: Where to save the combined csv file.
     """
-    combined = pd.concat([uppercase_columns(pd.read_csv(f))
-                          for f in input_csv_files])
+    combined = pd.concat([uppercase_columns(read_csv(f))
+                          for f in input_csv_files],
+                         sort=False)
     combined.to_csv(output_loc, index=False, encoding='utf-8')
 
 def make_csv(location: Locations,
@@ -244,12 +273,14 @@ def make_csv(location: Locations,
 
             # show the file we are downloading
             pbar.set_description(str(filename))
             try:
-                download(url, output_loc=output_loc, wait=wait)
-                # add the file to the list of downloaded files
-                file_name_list.append(output_loc)
+                download(url, output_loc=output_loc, wait=wait, cache_404=True)
             except Exception as e:
                 if not quiet:
                     pbar.write(f"{e}")
+            else:
+                # add the file to the list of downloaded files if there was
+                # no exception
+                file_name_list.append(output_loc)
             pbar.update(1)  # update the fancy progressbar
 
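Since make_error_csv writes a fixed two-column header, downstream code can tell a cached failure apart from real data before handing files to combine_csvs. A minimal sketch, assuming only the ERROR/ERROR_MSG column names introduced above (the is_error_csv helper is illustrative, not part of this commit):

    import pandas as pd

    def is_error_csv(csv_file_path) -> bool:
        # make_error_csv writes exactly the header ERROR,ERROR_MSG followed
        # by one row, so matching the columns identifies a cached 404 or an
        # empty-download marker.
        columns = list(pd.read_csv(csv_file_path, nrows=0).columns)
        return columns == ["ERROR", "ERROR_MSG"]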