Skip to content
Snippets Groups Projects
Select Git revision
  • 4d38c5bfee6cad8d7a0971abc93521e8b0f96b68
  • master default protected
2 results

chimet-scraper.py

Blame
  • chimet-scraper.py 9.76 KiB
    import argparse
    import enum # needed for the Locations/Formats enums below
    import datetime
    from pathlib import Path

    try:
        import dateutil.parser # used to decipher date strings
        import pandas as pd # fast csv combining
        import tqdm # cool progress bar
    except ImportError as e:
        # The last segment must be an f-string: the original was a plain
        # string, so users saw the literal text "{e}" instead of the error.
        raise ImportError("Could not find needed modules. Please run \n"
            "pip3 install python-dateutil pandas tqdm\n"
            f"to install all modules. Original error was {e}") from e
    
    def parse_date(date_str):
        """Parses a date string into a datetime; used as an argparse type.

        Arguments:
            date_str: The date string to parse, e.g. "2009-01-01".

        Raises:
            argparse.ArgumentTypeError: If the string is not a recognisable
                date, so argparse can report a clean CLI error.

        Returns:
            The parsed datetime.datetime.
        """
        try:
            return dateutil.parser.parse(date_str)
        except ValueError as e:
            # f-prefix was missing here, so the CLI printed the literal
            # text "{e}" instead of the actual parse error.
            raise argparse.ArgumentTypeError(f"{e}") from e
    
    def dir(dir_str):
        """Converts a string to a Path, creating the directory if needed.

        Used as an argparse type for --cachedir/--outputdir.
        NOTE(review): this shadows the builtin `dir`, but parse_args
        references it by this name, so the name is kept.

        Arguments:
            dir_str: The directory path as a string.

        Returns:
            The directory as a pathlib.Path (guaranteed to exist).
        """
        dir_path = Path(dir_str)
        # parents=True so nested paths work even when intermediate
        # directories are missing (plain exist_ok=True would raise).
        dir_path.mkdir(parents=True, exist_ok=True)
        return dir_path
    
    class Locations(enum.Enum):
        """Met stations that publish data; values are the sites' filename prefixes."""
        SOTON_DOCKHEAD = "Sot"
        BRAMBLE_BANK = "Bra"
        CHICHESTER_BAR = "Chi"
        # NOTE(review): "HABOUR" is a typo for "HARBOUR", but this member
        # name is referenced elsewhere in the file (netloc_map), so it is
        # kept for compatibility.
        CHICHESTER_HABOUR = "Cam"
    
    def parse_args(args=None):
        """Parses the command-line arguments for the scraper.

        Arguments:
            args: Optional list of argument strings; defaults to sys.argv.

        Returns:
            The parsed argparse.Namespace.
        """
        parser = argparse.ArgumentParser(
            description="Download some historic MET data from Chimet.co.uk.",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument(
            "--location", choices=[e.value for e in Locations], required=False,
            # typo fix: was "Downoad"
            help=("Download data only from a specific location. "
                  "Leave unspecified to download from all locations."))
        # argparse runs string defaults through type=, so these defaults are
        # converted by parse_date as well.
        parser.add_argument(
            "--start", type=parse_date,
            help="The first date to download data from.",
            default="2009-01-01")
        yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
        parser.add_argument(
            "--end", type=parse_date,
            help="The last date to download data from.",
            default=yesterday.strftime("%Y-%m-%d"))
        parser.add_argument(
            "--dlperiod", type=float,
            help=("Period between downloads in seconds. "
                 "Increase this to download data slower if the server is blocking "
                 "you."), default=1)
        parser.add_argument("--cachedir", type=dir,
            help="The location of the directory to keep the cached downloads",
            default=Path(__file__).parent / "cache",
        )
        parser.add_argument("--outputdir", type=dir,
            help="The location of the directory to store the output super CSV",
            default=Path(__file__).parent / "output",
        )
        parser.add_argument("-q", "--quiet", action="store_true",
            # typo fix: was "Supresses"
            help="Suppresses HTTP errors when downloading files."
        )
        return parser.parse_args(args=args)
    
    # Parse CLI args only when running as a script; `args` is consumed by
    # the main() call in the __main__ guard at the bottom of the file.
    if __name__ == "__main__": # only parse args if this is a main script
        args = parse_args()
    
    # Deferred imports: these were not needed for the arg parsing above.
    import urllib.parse
    import urllib.request
    import time # for time.sleep between downloads
    import typing
    
    opener = urllib.request.build_opener()
    # Install a global opener with a custom User-Agent so the met-data
    # websites can identify this scraper in their logs.
    opener.addheaders = [("User-agent", "chimet-scraper")]
    urllib.request.install_opener(opener)
    
    class Formats(enum.Enum):
        """Archive file formats offered by the chimet-family websites."""
        HTML = "html"
        CSV = "csv"
    
    def make_url(location: Locations, format: Formats, date: datetime.datetime
    ) -> typing.Tuple[str, str]:
        """Makes a URL for downloading a file from chimet.

        Arguments:
            location: The location of the data, ie "Sot", "Bra", etc.
            format: The format of the data, either "csv" or "html"
            date: The date of the data to download. Must be a past day,
                since the sites only archive completed days.

        Examples:
            >>> url, filename = make_url(
            ...     location="Bra",
            ...     format="csv",
            ...     date=datetime.datetime(2010, 4, 14))
            >>> url
            'http://www.bramblemet.co.uk/archive/2010/April/CSV/Bra14Apr2010.csv'
            >>> filename
            'Bra14Apr2010.csv'

        Raises:
            ValueError: If the location or format is unknown, or the date
                is not in the past.

        Returns:
            the url, filename of the CSV file
        """
        location = Locations(location) # ensure location is valid
        format = Formats(format).value # ensure format is valid
        caps_format = format.upper() # ie CSV, used in the URL path

        netloc_map = {
            Locations.SOTON_DOCKHEAD: "www.sotonmet.co.uk",
            Locations.BRAMBLE_BANK: "www.bramblemet.co.uk",
            Locations.CHICHESTER_BAR: "www.chimet.co.uk",
            Locations.CHICHESTER_HABOUR: "www.cambermet.co.uk",
        }
        # location was validated above, so the lookup cannot fail
        netloc = netloc_map[location]

        # Only fully-elapsed days are archived, so reject today and any
        # future date. (The old check compared day-of-month numbers and so
        # let future dates through; its error also printed the current
        # date instead of the offending one.)
        if date.date() >= datetime.datetime.now().date():
            raise ValueError(f"Date {date.date()} is not in the past!")

        file_date_name = date.strftime("%d%b%Y") # ie 01Dec1999
        long_month = date.strftime("%B") # ie December

        filename = f"{location.value}{file_date_name}.{format}"

        urlparts = urllib.parse.ParseResult(
            scheme="http",
            netloc=netloc,
            # the filename must terminate the path, as the doctest shows
            path=f"archive/{date.year}/{long_month}/{caps_format}/{filename}",
            params="",
            query="",
            fragment="")

        return urllib.parse.urlunparse(urlparts), filename
    
    def download(url: str, output_loc, wait: typing.Optional[float]=None):
        """Downloads the given url to the output location.

        Does nothing (and does not sleep) if the output file already
        exists, so re-runs are cheap.

        Arguments:
            url: The url to download
            output_loc: Where to download the url to.
            wait: Wait for this many seconds to avoid overloading the server.
                Defaults to 6 seconds.

        Raises:
            Exception: Re-raised from any download failure with the url
                appended for context.
        """
        if wait is None:
            wait = 6 # wait for 6 seconds between downloads

        output_loc = Path(output_loc)
        if output_loc.is_file():
            return # already downloaded, do nothing

        # Download to a temp file first so an interrupted transfer never
        # leaves a half-written file at output_loc.
        temp_loc = output_loc.with_suffix(output_loc.suffix + ".tmp")
        try:
            urllib.request.urlretrieve(url, temp_loc)
            temp_loc.rename(output_loc)
        except Exception as e:
            # Remove the partial download so a retry starts fresh
            # (previously the stale .tmp file was left behind).
            if temp_loc.exists():
                temp_loc.unlink()
            raise Exception(f"{e} when downloading {url}") from e
        finally:
            time.sleep(wait) # wait for x seconds to avoid the server blocking us
    
    def daterange(from_time, to_time) -> typing.List[datetime.datetime]:
        """Builds the list of daily datetimes from one time to another.

        Arguments:
            from_time: The first datetime in the range (inclusive).
            to_time: The last datetime the range may reach (inclusive).

        Returns:
            Every datetime from from_time onwards, one day apart, up to
            and including to_time; empty if to_time precedes from_time.
        """
        one_day = datetime.timedelta(days=1)
        # number of whole days that fit between the two endpoints
        total_days = (to_time - from_time).days + 1
        return [from_time + one_day * offset for offset in range(total_days)]
    
    def uppercase_columns(pandas_dataset):
        """Uppercases every column name of the given dataset, in place.

        Arguments:
            pandas_dataset:
                The dataset whose column names are uppercased.
                Warning: this mutates the passed-in object.

        Returns:
            The same dataset, now with uppercase column names.
        """
        upper_names = pandas_dataset.columns.str.upper()
        pandas_dataset.columns = upper_names
        return pandas_dataset
    
    def combine_csvs(input_csv_files: list, output_loc):
        """Concatenates the given csv files into one large csv file.

        The input files must share the same headers (ignoring case, since
        every header is uppercased before combining).

        Arguments:
            input_csv_files: The list of csv files to combine.
            output_loc: Where to save the combined csv file.
        """
        frames = (uppercase_columns(pd.read_csv(path))
                  for path in input_csv_files)
        pd.concat(frames).to_csv(output_loc, index=False, encoding='utf-8')
    
    def make_csv(location: Locations,
                 from_time: datetime.datetime,
                 to_time: datetime.datetime,
                 outputdir: Path,
                 cachedir: Path,
                 wait: typing.Optional[float] = None,
                 quiet: bool = False):
        """Makes a giant csv of a certain location between two times.

        Arguments:
            location:
                The location of where the data was taken,
                ie "Sot" for Southampton Docks. May be a Locations member
                or its string value.
            from_time: The first day of data to download.
            to_time: The last day of data to download.
            outputdir: Where to save the output.
            cachedir: Where to cache the downloads.
            wait: The amount of time to wait between downloads.
            quiet: If False, print HTTP errors of failed downloads.
        """
        # Normalise so enum members and raw strings behave identically;
        # previously passing a Locations member produced filenames like
        # "Locations.SOTON_DOCKHEAD-2009-01-01-....csv".
        location = Locations(location)

        datelist = daterange(from_time, to_time)

        cache_dir = Path(cachedir)
        # parents=True so a nested cache path works out of the box
        cache_dir.mkdir(parents=True, exist_ok=True)

        file_name_list = []

        with tqdm.tqdm(total=len(datelist)) as pbar:
            for date in datelist:
                url, filename = make_url(location, "csv", date)
                output_loc = cache_dir / filename # ie ./cache/Sot27May2019.csv
                # show the file we are downloading
                pbar.set_description(str(filename))
                try:
                    download(url, output_loc=output_loc, wait=wait)
                    # only combine the files that actually downloaded
                    file_name_list.append(output_loc)
                except Exception as e:
                    if not quiet:
                        pbar.write(f"{e}")

                pbar.update(1) # update the fancy progressbar

        output_dir = Path(outputdir)
        output_dir.mkdir(parents=True, exist_ok=True)

        output_filename = (
            f"{location.value}-{from_time.date()}-{to_time.date()}.csv")
        output_loc = output_dir / output_filename

        combine_csvs(file_name_list, output_loc)
        print(f"Made a CSV of {location.value} from date "
            f"{from_time.date()} to {to_time.date()} at {output_loc}")
    
    def download_all():
        """Downloads all the data from all locations.

        Uses the default cache/output directories next to this script,
        mirroring the parse_args defaults.
        """
        yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
        first_date = datetime.datetime(2009, 1, 1) # earliest archived data
        base_dir = Path(__file__).parent
        for loc in "Bra", "Sot", "Chi", "Cam":
            # The original call omitted outputdir/cachedir, which are
            # required parameters of make_csv, so this function always
            # raised TypeError. Supply the script-relative defaults.
            make_csv(loc, first_date, yesterday,
                     outputdir=base_dir / "output",
                     cachedir=base_dir / "cache",
                     wait=1)
    
    def main(args):
        """Runs the scraper with the parsed command-line arguments.

        If --location was given, only that location is downloaded;
        otherwise every known location is downloaded in turn.

        Arguments:
            args: The argparse.Namespace from parse_args().
        """
        kwargs = dict(from_time=args.start, to_time=args.end,
                      outputdir=args.outputdir, cachedir=args.cachedir,
                      wait=args.dlperiod, quiet=args.quiet)
        if args.location:
            make_csv(args.location, **kwargs)
            # Bug fix: without this return, a single-location run fell
            # through and downloaded every location anyway.
            return
        # else download all locations
        pbar = tqdm.tqdm(Locations, desc=("Iterating over "
                                          f"{[x.value for x in Locations]}"))
        for loc in pbar:
            # typo fix: was "Donwloading"
            pbar.set_description(f"Downloading {loc.value} data.")
            make_csv(loc, **kwargs)
    
    if __name__ == "__main__":
        # `args` was parsed further up the file, under the same guard,
        # right after parse_args was defined.
        main(args)