Skip to content
Snippets Groups Projects
Select Git revision
  • 4d38c5bfee6cad8d7a0971abc93521e8b0f96b68
  • master default protected
2 results

chimet-scraper.py

Blame
  • chimet-scraper.py 9.76 KiB
    import argparse
    import enum # needed for the Locations/Formats enums below
    import datetime
    from pathlib import Path

    try:
        import dateutil.parser # used to decipher date strings
        import pandas as pd # fast csv combining
        import tqdm # cool progress bar
    except ImportError as e:
        # The last segment must be an f-string: the original was a plain
        # string, so users saw the literal text "{e}" instead of the error.
        raise ImportError("Could not find needed modules. Please run \n"
            "pip3 install python-dateutil pandas tqdm\n"
            f"to install all modules. Original error was {e}") from e
    
    def parse_date(date_str):
        """Parses a date string into a datetime; used as an argparse type.

        Arguments:
            date_str: The date string to parse, e.g. "2009-01-01".

        Raises:
            argparse.ArgumentTypeError: If the string is not a recognisable
                date, so argparse can report a clean CLI error.

        Returns:
            The parsed datetime.datetime.
        """
        try:
            return dateutil.parser.parse(date_str)
        except ValueError as e:
            # f-prefix was missing here, so the CLI printed the literal
            # text "{e}" instead of the actual parse error.
            raise argparse.ArgumentTypeError(f"{e}") from e
    
    def dir(dir_str):
        """Converts a string to a Path, creating the directory if needed.

        Used as an argparse type for --cachedir/--outputdir.
        NOTE(review): this shadows the builtin `dir`, but parse_args
        references it by this name, so the name is kept.

        Arguments:
            dir_str: The directory path as a string.

        Returns:
            The directory as a pathlib.Path (guaranteed to exist).
        """
        dir_path = Path(dir_str)
        # parents=True so nested paths work even when intermediate
        # directories are missing (plain exist_ok=True would raise).
        dir_path.mkdir(parents=True, exist_ok=True)
        return dir_path
    
    class Locations(enum.Enum):
        """Met stations that publish data; values are the sites' filename prefixes."""
        SOTON_DOCKHEAD = "Sot"
        BRAMBLE_BANK = "Bra"
        CHICHESTER_BAR = "Chi"
        # NOTE(review): "HABOUR" is a typo for "HARBOUR", but this member
        # name is referenced elsewhere in the file (netloc_map), so it is
        # kept for compatibility.
        CHICHESTER_HABOUR = "Cam"
    
    def parse_args(args=None):
        """Parses the command-line arguments for the scraper.

        Arguments:
            args: Optional list of argument strings; defaults to sys.argv.

        Returns:
            The parsed argparse.Namespace.
        """
        parser = argparse.ArgumentParser(
            description="Download some historic MET data from Chimet.co.uk.",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument(
            "--location", choices=[e.value for e in Locations], required=False,
            # typo fix: was "Downoad"
            help=("Download data only from a specific location. "
                  "Leave unspecified to download from all locations."))
        # argparse runs string defaults through type=, so these defaults are
        # converted by parse_date as well.
        parser.add_argument(
            "--start", type=parse_date,
            help="The first date to download data from.",
            default="2009-01-01")
        yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
        parser.add_argument(
            "--end", type=parse_date,
            help="The last date to download data from.",
            default=yesterday.strftime("%Y-%m-%d"))
        parser.add_argument(
            "--dlperiod", type=float,
            help=("Period between downloads in seconds. "
                 "Increase this to download data slower if the server is blocking "
                 "you."), default=1)
        parser.add_argument("--cachedir", type=dir,
            help="The location of the directory to keep the cached downloads",
            default=Path(__file__).parent / "cache",
        )
        parser.add_argument("--outputdir", type=dir,
            help="The location of the directory to store the output super CSV",
            default=Path(__file__).parent / "output",
        )
        parser.add_argument("-q", "--quiet", action="store_true",
            # typo fix: was "Supresses"
            help="Suppresses HTTP errors when downloading files."
        )
        return parser.parse_args(args=args)
    
    # Parse CLI args only when running as a script; `args` is consumed by
    # the main() call in the __main__ guard at the bottom of the file.
    if __name__ == "__main__": # only parse args if this is a main script
        args = parse_args()
    
    # Deferred imports: these were not needed for the arg parsing above.
    import urllib.parse
    import urllib.request
    import time # for time.sleep between downloads
    import typing
    
    opener = urllib.request.build_opener()
    # Install a global opener with a custom User-Agent so the met-data
    # websites can identify this scraper in their logs.
    opener.addheaders = [("User-agent", "chimet-scraper")]
    urllib.request.install_opener(opener)
    
    class Formats(enum.Enum):
        """Archive file formats offered by the chimet-family websites."""
        HTML = "html"
        CSV = "csv"
    
    def make_url(location: Locations, format: Formats, date: datetime.datetime
    ) -> typing.Tuple[str, str]:
        """Makes a URL for downloading a file from chimet.

        Arguments:
            location: The location of the data, ie "Sot", "Bra", etc.
            format: The format of the data, either "csv" or "html"
            date: The date of the data to download. Must be a past day,
                since the sites only archive completed days.

        Examples:
            >>> url, filename = make_url(
            ...     location="Bra",
            ...     format="csv",
            ...     date=datetime.datetime(2010, 4, 14))
            >>> url
            'http://www.bramblemet.co.uk/archive/2010/April/CSV/Bra14Apr2010.csv'
            >>> filename
            'Bra14Apr2010.csv'

        Raises:
            ValueError: If the location or format is unknown, or the date
                is not in the past.

        Returns:
            the url, filename of the CSV file
        """
        location = Locations(location) # ensure location is valid
        format = Formats(format).value # ensure format is valid
        caps_format = format.upper() # ie CSV, used in the URL path

        netloc_map = {
            Locations.SOTON_DOCKHEAD: "www.sotonmet.co.uk",
            Locations.BRAMBLE_BANK: "www.bramblemet.co.uk",
            Locations.CHICHESTER_BAR: "www.chimet.co.uk",
            Locations.CHICHESTER_HABOUR: "www.cambermet.co.uk",
        }
        # location was validated above, so the lookup cannot fail
        netloc = netloc_map[location]

        # Only fully-elapsed days are archived, so reject today and any
        # future date. (The old check compared day-of-month numbers and so
        # let future dates through; its error also printed the current
        # date instead of the offending one.)
        if date.date() >= datetime.datetime.now().date():
            raise ValueError(f"Date {date.date()} is not in the past!")

        file_date_name = date.strftime("%d%b%Y") # ie 01Dec1999
        long_month = date.strftime("%B") # ie December

        filename = f"{location.value}{file_date_name}.{format}"

        urlparts = urllib.parse.ParseResult(
            scheme="http",
            netloc=netloc,
            # the filename must terminate the path, as the doctest shows
            path=f"archive/{date.year}/{long_month}/{caps_format}/{filename}",
            params="",
            query="",
            fragment="")

        return urllib.parse.urlunparse(urlparts), filename
    
    def download(url: str, output_loc, wait: typing.Optional[float]=None):
        """Downloads the given url to the output location.

        Does nothing (and does not sleep) if the output file already
        exists, so re-runs are cheap.

        Arguments:
            url: The url to download
            output_loc: Where to download the url to.
            wait: Wait for this many seconds to avoid overloading the server.
                Defaults to 6 seconds.

        Raises:
            Exception: Re-raised from any download failure with the url
                appended for context.
        """
        if wait is None:
            wait = 6 # wait for 6 seconds between downloads

        output_loc = Path(output_loc)
        if output_loc.is_file():
            return # already downloaded, do nothing

        # Download to a temp file first so an interrupted transfer never
        # leaves a half-written file at output_loc.
        temp_loc = output_loc.with_suffix(output_loc.suffix + ".tmp")
        try:
            urllib.request.urlretrieve(url, temp_loc)
            temp_loc.rename(output_loc)
        except Exception as e:
            # Remove the partial download so a retry starts fresh
            # (previously the stale .tmp file was left behind).
            if temp_loc.exists():
                temp_loc.unlink()
            raise Exception(f"{e} when downloading {url}") from e
        finally:
            time.sleep(wait) # wait for x seconds to avoid the server blocking us
    
    def daterange(from_time, to_time) -> typing.List[datetime.datetime]:
        """Builds the list of daily datetimes from one time to another.

        Arguments:
            from_time: The first datetime in the range (inclusive).
            to_time: The last datetime the range may reach (inclusive).

        Returns:
            Every datetime from from_time onwards, one day apart, up to
            and including to_time; empty if to_time precedes from_time.
        """
        one_day = datetime.timedelta(days=1)
        # number of whole days that fit between the two endpoints
        total_days = (to_time - from_time).days + 1
        return [from_time + one_day * offset for offset in range(total_days)]
    
    def uppercase_columns(pandas_dataset):
        """Uppercases every column name of the given dataset, in place.

        Arguments:
            pandas_dataset:
                The dataset whose column names are uppercased.
                Warning: this mutates the passed-in object.

        Returns:
            The same dataset, now with uppercase column names.
        """
        upper_names = pandas_dataset.columns.str.upper()
        pandas_dataset.columns = upper_names
        return pandas_dataset
    
    def combine_csvs(input_csv_files: list, output_loc):
        """Concatenates the given csv files into one large csv file.

        The input files must share the same headers (ignoring case, since
        every header is uppercased before combining).

        Arguments:
            input_csv_files: The list of csv files to combine.
            output_loc: Where to save the combined csv file.
        """
        frames = (uppercase_columns(pd.read_csv(path))
                  for path in input_csv_files)
        pd.concat(frames).to_csv(output_loc, index=False, encoding='utf-8')
    
    def make_csv(location: Locations,
                 from_time: datetime.datetime,
                 to_time: datetime.datetime,
                 outputdir: Path,
                 cachedir: Path,
                 wait: typing.Optional[float] = None,
                 quiet: bool = False):
        """Makes a giant csv of a certain location between two times.

        Arguments:
            location:
                The location of where the data was taken,
                ie "Sot" for Southampton Docks. May be a Locations member
                or its string value.
            from_time: The first day of data to download.
            to_time: The last day of data to download.
            outputdir: Where to save the output.
            cachedir: Where to cache the downloads.
            wait: The amount of time to wait between downloads.
            quiet: If False, print HTTP errors of failed downloads.
        """
        # Normalise so enum members and raw strings behave identically;
        # previously passing a Locations member produced filenames like
        # "Locations.SOTON_DOCKHEAD-2009-01-01-....csv".
        location = Locations(location)

        datelist = daterange(from_time, to_time)

        cache_dir = Path(cachedir)
        # parents=True so a nested cache path works out of the box
        cache_dir.mkdir(parents=True, exist_ok=True)

        file_name_list = []

        with tqdm.tqdm(total=len(datelist)) as pbar:
            for date in datelist:
                url, filename = make_url(location, "csv", date)
                output_loc = cache_dir / filename # ie ./cache/Sot27May2019.csv
                # show the file we are downloading
                pbar.set_description(str(filename))
                try:
                    download(url, output_loc=output_loc, wait=wait)
                    # only combine the files that actually downloaded
                    file_name_list.append(output_loc)
                except Exception as e:
                    if not quiet:
                        pbar.write(f"{e}")

                pbar.update(1) # update the fancy progressbar

        output_dir = Path(outputdir)
        output_dir.mkdir(parents=True, exist_ok=True)

        output_filename = (
            f"{location.value}-{from_time.date()}-{to_time.date()}.csv")
        output_loc = output_dir / output_filename

        combine_csvs(file_name_list, output_loc)
        print(f"Made a CSV of {location.value} from date "
            f"{from_time.date()} to {to_time.date()} at {output_loc}")
    
    def download_all():
        """Downloads all the data from all locations.

        Uses the default cache/output directories next to this script,
        mirroring the parse_args defaults.
        """
        yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
        first_date = datetime.datetime(2009, 1, 1) # earliest archived data
        base_dir = Path(__file__).parent
        for loc in "Bra", "Sot", "Chi", "Cam":
            # The original call omitted outputdir/cachedir, which are
            # required parameters of make_csv, so this function always
            # raised TypeError. Supply the script-relative defaults.
            make_csv(loc, first_date, yesterday,
                     outputdir=base_dir / "output",
                     cachedir=base_dir / "cache",
                     wait=1)
    
    def main(args):
        """Runs the scraper with the parsed command-line arguments.

        If --location was given, only that location is downloaded;
        otherwise every known location is downloaded in turn.

        Arguments:
            args: The argparse.Namespace from parse_args().
        """
        kwargs = dict(from_time=args.start, to_time=args.end,
                      outputdir=args.outputdir, cachedir=args.cachedir,
                      wait=args.dlperiod, quiet=args.quiet)
        if args.location:
            make_csv(args.location, **kwargs)
            # Bug fix: without this return, a single-location run fell
            # through and downloaded every location anyway.
            return
        # else download all locations
        pbar = tqdm.tqdm(Locations, desc=("Iterating over "
                                          f"{[x.value for x in Locations]}"))
        for loc in pbar:
            # typo fix: was "Donwloading"
            pbar.set_description(f"Downloading {loc.value} data.")
            make_csv(loc, **kwargs)
    
    if __name__ == "__main__":
        # `args` was parsed further up the file, under the same guard,
        # right after parse_args was defined.
        main(args)