# _drake_cleanFeeders.R

# basic _drake.R style file
# but adapted for use in a project where there might be multiple plans in the same folder
# called using r_make() from make_cleanFeeders.R
# see https://books.ropensci.org/drake/projects.html#usage for explanation

# Libraries/Packages ----
# the drake book suggests putting this in packages.R but...

library(dataCleaning) # remember to build it first :-)
dataCleaning::setup() # load env.R set up the default paths etc

makeLibs <- c("data.table", # data munching
              "drake", # need to load this here too as r_make() doesn't (why not?)
              "here", # here
              "lubridate", # dates and times
              "hms", # times
              "ggplot2", # plots
              "skimr" # for skim
)
# load them
dataCleaning::loadLibraries(makeLibs)

# Parameters ----
# The two 'update' strings are passed as (otherwise unused) arguments to
# getData() and makeReport() in the plan below, so changing them changes the
# target's dependencies.
updateData <- "rerun" # edit this in any way (at all) to get drake to re-load the data
updateReport <- "yep" # edit this in any way (at all) to get drake to re-run the report

# Some data to play with:

dFile <- "/mnt/SERG_data/Ellis_IOW/Cleaned_SS_Amps/amps_all_substations.csv.gz" # edit for your set up

# Rmd file to run as report
rmdFile <- "report_cleanFeeders" # <- name (without the .Rmd extension) of the file in Rmd/ to run at the end
version <- "_allData" # suffix appended to the report's output file name (see makeReport)
# yaml for Rmd (see makeReport function) - passed to the .Rmd as params
title <- "Testing electricity substation/feeder data"
subtitle <- "Outliers and missing data..."
authors <- "Ben Anderson & Ellis Ridett"

# Functions ----
# for use in drake
# the drake book suggests putting this in functions.R but...

addSeason <- function(dt, dateVar, h) {
  # Adds a `season` factor column to a data.table, derived from the calendar
  # month of a date/date-time column.
  #
  # Args:
  #   dt:      data.table to annotate. Columns are added by reference, so the
  #            caller's table gains `season` too.
  #   dateVar: name (character) of the column passed to lubridate::month()
  #   h:       hemisphere: "N" (northern) or "S" (southern)
  #
  # Returns:
  #   dt with `season` as a factor, levels ordered
  #   Spring, Summer, Autumn, Winter. Rows whose month is NA get season NA.
  #
  # Errors if h is not "N" or "S" (previously this silently returned dt with
  # no season column).
  if (!is.character(h) || length(h) != 1L || !(h %in% c("N", "S"))) {
    stop("addSeason(): h must be \"N\" (northern) or \"S\" (southern)",
         call. = FALSE)
  }

  # labels for months 3-5, 6-8, 9-11 and for the year-bridging block (12, 1, 2)
  if (h == "N") {
    season3to5 <- "Spring"
    season6to8 <- "Summer"
    season9to11 <- "Autumn"
    seasonBridging <- "Winter"
  } else {
    season3to5 <- "Autumn"
    season6to8 <- "Winter"
    season9to11 <- "Spring"
    seasonBridging <- "Summer"
  }

  # scratch month column: 1 (Jan) - 12 (Dec)
  dt <- dt[, tmpM := lubridate::month(get(dateVar))]
  # easiest to set the default to be the season that bridges years
  dt <- dt[, season := seasonBridging]
  dt <- dt[tmpM >= 3 & tmpM <= 5, season := season3to5]
  dt <- dt[tmpM >= 6 & tmpM <= 8, season := season6to8]
  dt <- dt[tmpM >= 9 & tmpM <= 11, season := season9to11]
  # an unknown date must not be reported as the default season
  dt <- dt[is.na(tmpM), season := NA_character_]
  # re-order to make sense
  dt <- dt[, season := factor(season,
                              levels = c("Spring", "Summer", "Autumn", "Winter"))]
  # drop the scratch column by reference: `dt$tmpM <- NULL` would deep-copy the
  # whole table and leave tmpM behind on the caller's copy
  dt <- dt[, tmpM := NULL]
  return(dt)
}


getData <- function(f,updateData){
  # Loads the feeder data from file `f` and derives the working variables.
  #
  # Args:
  #   f:          path of the file to read with data.table::fread()
  #   updateData: not used in the body; it exists so that drake sees a changed
  #               dependency (and re-runs this target) when the value is edited
  #
  # Returns: a data.table with rDateTime, rTime, rDate, rYear, rDoW, kW
  #   (renamed from Value), a filled-in sub_region, feeder_ID and season added.
  rawDT <- data.table::fread(f)

  # the dateTime column in the file is called Time
  rawDT[, rDateTime := lubridate::as_datetime(Time)]
  # time-of-day and date both derive from rDateTime
  rawDT[, `:=`(rTime = hms::as_hms(rDateTime),
               rDate = as.Date(rDateTime))]
  # year and (labelled) day of week derive from rDate
  rawDT[, `:=`(rYear = lubridate::year(rDate),
               rDoW = lubridate::wday(rDate, label = TRUE))]

  # carry the reading under a clearer name, then drop the original column
  rawDT[, kW := Value]
  rawDT[, Value := NULL]

  # where sub_region is empty (only 1 feeder) fall back to the region
  rawDT[, sub_region := ifelse(sub_region == "", region, sub_region)]
  # feeder IDs are re-used across substations, so build a unique one
  rawDT[, feeder_ID := paste0(region,"_", sub_region)]

  # add the (northern hemisphere) season here so it's done once
  addSeason(rawDT, dateVar = "rDateTime", h = "N")
}

makeUniq <- function(dt){
  # Drops suspected duplicate observations (reported in the .Rmd).
  # Two rows count as duplicates when they share the dateTime, our constructed
  # unique feeder ID and the kW value.
  #
  # Args:    dt - the feeder data
  # Returns: dt with one row kept per distinct key combination
  dedupKeys <- c("rDateTime", "feeder_ID", "kW")
  unique(dt, by = dedupKeys)
}

toWide <- function(dt){
  # Converts the long feeder data to wide form, one column per feeder_ID, so
  # we can check across feeders for dateTimes with no missing data.
  #
  # Args:
  #   dt: long-form data.table with rDateTime, feeder_ID and kW columns
  #
  # Returns: a data.table with rDateTime, one kW column per feeder, plus
  #   nNA               - number of feeder columns that are NA in that row
  #   nFeedersReporting - number of feeder columns that are not NA
  wDT <- dcast(dt, rDateTime ~ feeder_ID,
               # NOTE(review): the original comment said no aggregation happens,
               # but any rows sharing rDateTime & feeder_ID with different kW
               # values would be averaged here - confirm upstream de-duplication
               fun.aggregate = mean,
               fill = NA, # missing dateTimes will be NA
               #drop = FALSE, # keep all missing combinations - don't do this, bloats the data with useless NAs
               value.var = "kW")
  # count NAs over the feeder columns only: including rDateTime (and relying on
  # a hard-coded "- 2") miscounts whenever rDateTime itself is NA
  feederCols <- setdiff(names(wDT), "rDateTime")
  nFeeders <- length(feederCols)
  # := adds the columns in place rather than copying the whole wide table
  wDT[, nNA := rowSums(is.na(.SD)), .SDcols = feederCols]
  wDT[, nFeedersReporting := nFeeders - nNA]
  return(wDT)
}

saveData <- function(dt, which,
                     outDir = "/mnt/SERG_data/Ellis_IOW/Cleaned_SS_Amps/uniq"){
  # Saves the (newly) cleaned data as a .csv and then gzips it.
  #
  # Args:
  #   dt:     the data.table to save
  #   which:  "L" for the long data, "W" for the wide data (feeders as columns)
  #   outDir: directory to write to (defaults to the original hard-coded path)
  #
  # Returns: NULL, invisibly - called for its side effect only.
  #
  # Errors if `which` is not "L" or "W" (previously nothing was written and no
  # complaint was made).
  if (!is.character(which) || length(which) != 1L ||
      !(which %in% c("L", "W"))) {
    stop("saveData(): which must be \"L\" (long) or \"W\" (wide)",
         call. = FALSE)
  }

  if (which == "L") {
    # long data
    of <- file.path(outDir, "uniq_amps_all_substations.csv")
    out <- dt[, .(rDateTime, region, sub_region, feeder_ID, kW)] # only the vars we can't recreate
  } else {
    # wide data (feeders as columns)
    of <- file.path(outDir, "uniq_amps_all_substations_wide.csv")
    # remove the vars we don't need
    out <- dt
    if ("nNA" %in% names(out)) {
      out$nNA <- NULL
    }
  }

  data.table::fwrite(out, of)

  # quote the path so spaces etc. in outDir cannot break the shell command
  cmd <- paste0("gzip -f ", shQuote(of))
  message("Gzip file: ", of)
  # best effort: this seems to throw an error on the RStudio server but it
  # still works, so report a problem rather than fail the target
  status <- try(system(cmd), silent = TRUE)
  if (inherits(status, "try-error") || !identical(as.integer(status), 0L)) {
    message("gzip may not have completed cleanly - check ", of, ".gz")
  }
  message("Done ")
  invisible(NULL)
}

makeReport <- function(f,version, type = "html", updateReport){
  # Renders Rmd/<f>.Rmd to docs/<f><version>.<type> using rmarkdown::render().
  #
  # Args:
  #   f:            name of the .Rmd file (without extension) under Rmd/
  #   version:      suffix appended to the output file name
  #   type:         "html" (default) or "pdf"
  #   updateReport: not used in the body; it exists so that drake sees a
  #                 changed dependency (and re-runs the report) when edited
  #
  # Reads `title`, `subtitle` and `authors` from the enclosing (script)
  # environment and passes them to the .Rmd as params.
  #
  # Returns: the value of rmarkdown::render(), invisibly, for both types
  #   (previously only the pdf branch returned it).
  #
  # Errors on an unsupported `type` (previously the "Rendering ..." message was
  # printed and nothing was rendered).
  outputFormat <- NULL
  if (is.character(type) && length(type) == 1L) {
    outputFormat <- switch(type,
                           html = "html_document",
                           pdf = "pdf_document")
  }
  if (is.null(outputFormat)) {
    stop("makeReport(): type must be \"html\" or \"pdf\"", call. = FALSE)
  }

  message("Rendering ", f, ".Rmd (version: ", version, ") to ", type)
  rendered <- rmarkdown::render(
    input = paste0(here::here("Rmd", f), ".Rmd"), # here::here() finds the .Rmd to use
    params = list(title = title,
                  subtitle = subtitle,
                  authors = authors),
    output_format = outputFormat,
    output_file = paste0(here::here("docs", f),
                         version, ".", type) # where the output goes
  )
  invisible(rendered)
}


# Set the drake plan ----
# the drake book suggests putting this in plan.R but...
# I had expected r_make() to load drake() in the new clean R session  but it doesn't
# - hence the explicit drake:: prefixes below (and "drake" in makeLibs above)
my_plan <- drake::drake_plan(
  origData = getData(dFile, updateData), # returns data as data.table. If you edit 'updateData' in any way it will reload - drake is watching you!
  uniqData = makeUniq(origData), # remove duplicates
  wideData = toWide(uniqData), # one column per feeder plus the NA / reporting counts
  saveLong = saveData(uniqData, "L"), # doesn't actually return anything
  saveWide = saveData(wideData, "W"), # doesn't actually return anything
  # the report targets take no data target as an argument
  # pdf output fails
  pdfOut = makeReport(rmdFile, version, "pdf", updateReport), # pdf - must be some way to do this without re-running the whole thing
  htmlOut = makeReport(rmdFile, version, "html", updateReport) # html output
)

# see https://books.ropensci.org/drake/projects.html#usage
# that chapter describes a _drake.R style file as ending with a drake_config() call,
# so keep this as the last expression in the file
drake::drake_config(my_plan, verbose = 2)