import argparse
import enum  # used for the Locations choices passed to argparse
import datetime
from pathlib import Path

try:
    import dateutil.parser  # used to decipher dates
    import pandas as pd  # fast csv combining
    import tqdm  # cool progress bar
except ImportError as e:
    raise ImportError("Could not find needed modules. Please run\n"
                      "pip3 install python-dateutil pandas tqdm\n"
                      f"to install all modules. Original error was {e}") from e


def parse_date(date_str):
    try:
        return dateutil.parser.parse(date_str)
    except ValueError as e:
        raise argparse.ArgumentTypeError(f"{e}") from e


def dir_arg(dir_str):
    dir_path = Path(dir_str)
    dir_path.mkdir(exist_ok=True)
    return dir_path
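
# dir_arg("cache"), for example, creates ./cache if it does not already exist
# and hands argparse a ready Path instead of a raw string (path illustrative).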


class Locations(enum.Enum):
    SOTON_DOCKHEAD = "Sot"
    BRAMBLE_BANK = "Bra"
    CHICHESTER_BAR = "Chi"
    CHICHESTER_HARBOUR = "Cam"


def parse_args(args=None):
    parser = argparse.ArgumentParser(
        description="Download some historic MET data from Chimet.co.uk.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--location", choices=[e.value for e in Locations], required=False,
        help=("Download data only from a specific location. "
              "Leave unspecified to download from all locations."))
    parser.add_argument(
        "--start", type=parse_date,
        help="The first date to download data from.",
        default="2009-01-01")
    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
    parser.add_argument(
        "--end", type=parse_date,
        help="The last date to download data from.",
        default=yesterday.strftime("%Y-%m-%d"))
    parser.add_argument(
        "--dlperiod", type=float,
        help=("Period between downloads in seconds. "
              "Increase this to download data slower if the server is "
              "blocking you."), default=1)
parser.add_argument("--cachedir", type=dir,
help="The location of the directory to keep the cached downloads",
default=Path(__file__).parent / "cache",
)
parser.add_argument("--outputdir", type=dir,
help="The location of the directory to store the output super CSV",
default=Path(__file__).parent / "output",
)
parser.add_argument("-q", "--quiet", action="store_true",
help="Supresses HTTP errors when downloading files."
)
return parser.parse_args(args=args)
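
# Calling the parser directly also works; the values below are illustrative:
#     args = parse_args(["--location", "Bra", "--start", "2019-01-01"])
# argparse runs string defaults through type= as well, so args.start and
# args.end come back as datetime objects either way.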


if __name__ == "__main__":  # only parse args if this is the main script
    args = parse_args()

# these imports were not needed for parse_args
import urllib.parse
import urllib.request
import time  # for time.sleep
import typing

opener = urllib.request.build_opener()
# set a User-agent header so the website can identify us
opener.addheaders = [("User-agent", "chimet-scraper")]
urllib.request.install_opener(opener)


class Formats(enum.Enum):
    HTML = "html"
    CSV = "csv"


def make_url(location: Locations, format: Formats, date: datetime.datetime
             ) -> typing.Tuple[str, str]:
    """Makes a URL for downloading a file from chimet.

    Arguments:
        location: The location of the data, ie "Sot", "Bra", etc.
        format: The format of the data, either "csv" or "html".
        date: The date of the data to download.

    Examples:
        >>> url, filename = make_url(
        ...     location="Bra",
        ...     format="csv",
        ...     date=datetime.datetime(2010, 4, 14))
        >>> url
        'http://www.bramblemet.co.uk/archive/2010/April/CSV/Bra14Apr2010.csv'
        >>> filename
        'Bra14Apr2010.csv'

    Returns:
        the url, filename of the CSV file
    """
    location = Locations(location)  # ensure location is valid
    format = Formats(format).value  # ensure format is valid
    caps_format = format.upper()
    netloc_map = {
        Locations.SOTON_DOCKHEAD: "www.sotonmet.co.uk",
        Locations.BRAMBLE_BANK: "www.bramblemet.co.uk",
        Locations.CHICHESTER_BAR: "www.chimet.co.uk",
        Locations.CHICHESTER_HARBOUR: "www.cambermet.co.uk",
    }
    # look up the appropriate netloc for the location
    netloc = netloc_map[location]
    current_date = datetime.datetime.now()
    yesterday = current_date - datetime.timedelta(days=1)
    if date > yesterday and date.day == current_date.day:
        raise ValueError(f"Date {date.date()} is not in the past!")
    file_date_name = date.strftime("%d%b%Y")  # ie 01Dec1999
    long_month = date.strftime("%B")  # ie December
    year = date.year
    filename = f"{location.value}{file_date_name}.{format}"
    urlparts = urllib.parse.ParseResult(
        scheme="http",
        netloc=netloc,
        path=f"archive/{year}/{long_month}/{caps_format}/{filename}",
        params="",
        query="",
        fragment="")
    return urllib.parse.urlunparse(urlparts), filename


def download(url: str, output_loc, wait: typing.Optional[float] = None):
    """Downloads the given url to the output location.

    Arguments:
        url: The url to download.
        output_loc: Where to download the url to.
        wait: Wait for this many seconds to avoid overloading the server.
    """
    if wait is None:
        wait = 6  # default: wait 6 seconds between downloads
    output_loc = Path(output_loc)
    if output_loc.is_file():
        return  # already downloaded, do nothing
    temp_loc = output_loc.with_suffix(output_loc.suffix + ".tmp")
    try:
        urllib.request.urlretrieve(url, temp_loc)
        temp_loc.rename(output_loc)
    except Exception as e:
        raise Exception(f"{e} when downloading {url}") from e
    finally:
        time.sleep(wait)  # wait to avoid the server blocking us
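
# Downloading one day's CSV by hand (the date and paths are illustrative):
#     url, filename = make_url("Bra", "csv", datetime.datetime(2010, 4, 14))
#     download(url, output_loc=Path("cache") / filename, wait=1)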


def daterange(from_time, to_time) -> typing.List[datetime.datetime]:
    """Makes a list of all dates between two times.

    Arguments:
        from_time: The start time to make the range from.
        to_time: The last time to make the range to.

    Returns:
        a list of dates between the two datetimes, inclusive of both ends
    """
    start = from_time
    step = datetime.timedelta(days=1)
    dates = []
    while start <= to_time:
        dates.append(start)
        start += step
    return dates
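
# Example: daterange(datetime.datetime(2020, 1, 1),
#                    datetime.datetime(2020, 1, 3))
# gives one datetime per day for 1, 2 and 3 January 2020.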


def uppercase_columns(pandas_dataset):
    """Makes sure all the column names are uppercase.

    Arguments:
        pandas_dataset:
            The dataset to uppercase column names of.
            Warning, this modifies this object.

    Returns:
        The dataset with uppercase column names.
    """
    pandas_dataset.columns = pandas_dataset.columns.str.upper()
    return pandas_dataset
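
# Example: a DataFrame with columns ["date", "Wind"] comes back with columns
# ["DATE", "WIND"]; note the rename also happens in place on the argument.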


def combine_csvs(input_csv_files: list, output_loc):
    """Combines the given csv files into one big one.

    Make sure the CSV files all have the same headers.

    Arguments:
        input_csv_files: The list of csv files to combine.
        output_loc: Where to save the combined csv file.
    """
    combined = pd.concat([uppercase_columns(pd.read_csv(f))
                          for f in input_csv_files])
    combined.to_csv(output_loc, index=False, encoding="utf-8")
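
# Example (file names are illustrative):
#     combine_csvs([Path("cache/Bra01Jan2009.csv"),
#                   Path("cache/Bra02Jan2009.csv")],
#                  Path("output/combined.csv"))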


def make_csv(location: Locations,
             from_time: datetime.datetime,
             to_time: datetime.datetime,
             outputdir: Path,
             cachedir: Path,
             wait: typing.Optional[float] = None,
             quiet: bool = False):
    """Makes a giant csv of a certain location between two times.

    Arguments:
        location:
            The location of where the data was taken,
            ie "Sot" for Southampton Dockhead.
        from_time: The first day of data to download.
        to_time: The last day of data to download.
        outputdir: Where to save the output.
        cachedir: Where to cache the downloads.
        wait: The amount of time to wait between downloads.
        quiet: If True, suppress the HTTP errors of failed downloads.
    """
    datelist = daterange(from_time, to_time)
    cache_dir = Path(cachedir)
    cache_dir.mkdir(exist_ok=True)  # make the cache folder if needed
    file_name_list = []
    with tqdm.tqdm(total=len(datelist)) as pbar:
        for date in datelist:
            url, filename = make_url(location, "csv", date)
            output_loc = cache_dir / filename  # ie ./cache/Sot27May2019.csv
            # show the file we are downloading
            pbar.set_description(str(filename))
            try:
                download(url, output_loc=output_loc, wait=wait)
                # add the file to the list of downloaded files
                file_name_list.append(output_loc)
            except Exception as e:
                if not quiet:
                    pbar.write(f"{e}")
            pbar.update(1)  # update the fancy progressbar
    output_dir = Path(outputdir)
    output_dir.mkdir(exist_ok=True)
    location_name = Locations(location).value  # ie "Sot", not the enum repr
    output_filename = f"{location_name}-{from_time.date()}-{to_time.date()}.csv"
    output_loc = output_dir / output_filename
    combine_csvs(file_name_list, output_loc)
    print(f"Made a CSV of {location_name} from date "
          f"{from_time.date()} to {to_time.date()} at {output_loc}")


def download_all():
    """Downloads all the data from all locations."""
    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
    first_date = datetime.datetime(2009, 1, 1)
    for loc in "Bra", "Sot", "Chi", "Cam":
        make_csv(loc, first_date, yesterday,
                 outputdir=Path(__file__).parent / "output",
                 cachedir=Path(__file__).parent / "cache",
                 wait=1)


def main(args):
    kwargs = dict(from_time=args.start, to_time=args.end,
                  outputdir=args.outputdir, cachedir=args.cachedir,
                  wait=args.dlperiod, quiet=args.quiet)
    if args.location:
        make_csv(args.location, **kwargs)
        return
    # else download all locations
    pbar = tqdm.tqdm(Locations, desc=("Iterating over "
                                      f"{[x.value for x in Locations]}"))
    for loc in pbar:
        pbar.set_description(f"Downloading {loc.value} data.")
        make_csv(loc, **kwargs)


if __name__ == "__main__":
    main(args)