# _drake_cleanFeeders.R
# Author: B. Anderson
# basic _drake.R style file
# but adapted for use in a project where there might be multiple plans in the same folder
# called using r_make() from make_cleanFeeders.R
# see https://books.ropensci.org/drake/projects.html#usage for explanation
# Libraries/Packages ----
# The drake book suggests putting this in packages.R, but everything is kept in
# this one file (several plans may share the same folder).
library(dataCleaning) # remember to build it first :-)
dataCleaning::setup() # load env.R set up the default paths etc
# Packages needed by the functions and the plan below
makeLibs <- c("data.table", # data munching
"drake", # need to load this here too as r_make() doesn't (why not?)
"here", # project-relative file paths via here::here()
"lubridate", # dates and times
"hms", # times of day
"ggplot2", # plots
"skimr" # for skim
)
# load them (presumably attaches each package named above - confirm in dataCleaning::loadLibraries)
dataCleaning::loadLibraries(makeLibs)
# Parameters ----
# The two values below are passed as arguments to the plan's commands purely as
# re-run triggers: the functions that receive them never read them.
updateData <- "rerun" # edit this in any way (at all) to get drake to re-load the data
updateReport <- "yep" # edit this in any way (at all) to get drake to re-run the report
# Some data to play with:
dFile <- "/mnt/SERG_data/Ellis_IOW/Cleaned_SS_Amps/amps_all_substations.csv.gz" # edit for your set up
# Rmd file to run as report
rmdFile <- "report_cleanFeeders" # <- name of the .Rmd file to run at the end (without the .Rmd extension)
version <- "_allData" # suffix appended to the report's output file name (see makeReport function)
# yaml for Rmd (see makeReport function) - passed to the .Rmd as render params
title <- "Testing electricity substation/feeder data"
subtitle <- "Outliers and missing data..."
authors <- "Ben Anderson & Ellis Ridett"
# Functions ----
# for use in drake
# the drake book suggests putting this in functions.R but...
# Adds a `season` factor column to a data.table, derived from the calendar
# month of a date (or date-time) column.
#
# Args:
#   dt:      data.table to label. It is modified by reference (as it always
#            was): the caller's table gains the `season` column.
#   dateVar: name (character) of the column in `dt` holding the date or
#            date-time from which the month is taken.
#   h:       hemisphere: "N" (northern) or "S" (southern).
#
# Returns: `dt` with `season` added as a factor whose levels are ordered
#   Spring, Summer, Autumn, Winter. Rows whose date is NA get an NA season.
#
# Changes from the previous version:
#   * any other value of `h` is now an error (it used to return `dt` silently
#     without a `season` column);
#   * no temporary `tmpM` column is created, so the caller's table is no longer
#     left with a stray `tmpM`, a genuine column of that name is no longer
#     deleted, and the table is not copied just to drop the helper column;
#   * rows with a missing date are NA rather than being labelled with the
#     default season (Winter for "N", Summer for "S").
addSeason <- function(dt, dateVar, h) {
  if (length(h) != 1L || !h %in% c("N", "S")) {
    stop("h must be \"N\" or \"S\"", call. = FALSE)
  }
  if (!dateVar %in% names(dt)) {
    stop("dateVar '", dateVar, "' is not a column of dt", call. = FALSE)
  }

  # Season for each calendar month, January (1) to December (12).
  # Dec-Feb is the season that bridges years.
  if (h == "N") {
    seasonByMonth <- c("Winter", "Winter",
                       "Spring", "Spring", "Spring",
                       "Summer", "Summer", "Summer",
                       "Autumn", "Autumn", "Autumn",
                       "Winter")
  } else {
    seasonByMonth <- c("Summer", "Summer",
                       "Autumn", "Autumn", "Autumn",
                       "Winter", "Winter", "Winter",
                       "Spring", "Spring", "Spring",
                       "Summer")
  }

  monthNo <- lubridate::month(dt[[dateVar]]) # 1 (Jan) - 12 (Dec), NA if the date is NA

  # set() assigns by reference, like `:=`, without needing a helper column
  data.table::set(
    dt,
    j = "season",
    value = factor(seasonByMonth[monthNo],
                   levels = c("Spring", "Summer", "Autumn", "Winter")) # order that makes sense in plots/tables
  )
  return(dt)
}
# Loads the feeder data file and derives the date/time, power and feeder ID
# variables used by the rest of the plan.
#
# Args:
#   f:          path of the file to read with data.table::fread(). The code
#               below requires the columns Time, Value, region and sub_region.
#   updateData: not used in the body. It is passed in from the plan so that
#               editing its value changes the command's inputs and the data
#               gets re-loaded.
#
# Returns: a data.table with rDateTime, rTime, rDate, rYear, rDoW, kW (renamed
#   from Value), a filled-in sub_region, feeder_ID and season added.
getData <- function(f, updateData) {
  # gets the data
  dt <- data.table::fread(f)
  dt[, rDateTime := lubridate::as_datetime(Time)] # the dateTime is now called Time!!!
  dt[, rTime := hms::as_hms(rDateTime)]
  dt[, rDate := as.Date(rDateTime)]
  dt[, rYear := lubridate::year(rDate)]
  dt[, rDoW := lubridate::wday(rDate, label = TRUE)]
  dt[, kW := Value] # for clarity
  dt[, Value := NULL] # drop
  # Set sub_region to region when sub_region is not set (only 1 feeder).
  # "Not set" covers NA as well as the empty string: an NA sub_region used to
  # stay NA and produced feeder IDs ending in "_NA".
  dt[, sub_region := ifelse(is.na(sub_region) | sub_region == "",
                            region, # set to region if empty or missing
                            sub_region)]
  # some of the feeder IDs are re-used across the substations
  dt[, feeder_ID := paste0(region, "_", sub_region)] # create a unique feeder ID
  dt <- addSeason(dt, dateVar = "rDateTime", h = "N") # do this here so it's done
  return(dt)
}
# Drops repeated observations from the feeder data.
# Rows count as duplicates when they share the same dateTime, constructed
# feeder ID and kW value; one row is kept for each such combination.
# (The number removed is reported in the .Rmd.)
#
# Args:
#   dt: table with rDateTime, feeder_ID and kW columns.
#
# Returns: the de-duplicated table.
makeUniq <- function(dt) {
  dedupKeys <- c(
    "rDateTime", # dateTime
    "feeder_ID", # our constructed unique feeder ID
    "kW"         # kW
  )
  unique(dt, by = dedupKeys)
}
# Converts the long feeder data to wide form, one row per dateTime and one
# column per feeder, so we can check across feeders for dateTimes with no
# missing data.
#
# Args:
#   dt: long-form data.table with rDateTime, feeder_ID and kW columns.
#
# Returns: a data.table with rDateTime, one kW column per feeder_ID, plus
#   nNA               - number of feeder columns that are NA at that dateTime
#   nFeedersReporting - number of feeder columns that are not NA
#
# Changes from the previous version:
#   * the counts are taken over the feeder columns only. Previously rDateTime
#     was included in the NA count, so a row with an NA dateTime reported one
#     feeder too few;
#   * the count columns are added with `:=` rather than `wDT$nNA <- ...`,
#     which avoids copying the whole wide table.
toWide <- function(dt) {
  wDT <- dcast(dt, rDateTime ~ feeder_ID,
               # NOTE(review): originally described as "irrelevant, no aggregation
               # happening". If the same feeder has two different kW values at one
               # dateTime they would be averaged here - confirm that cannot occur.
               fun.aggregate = "mean",
               fill = NA, # missing dateTimes will be NA
               #drop = FALSE, # keep all missing combinations - don't do this, bloats the data with useless NAs
               value.var = "kW")
  # every column except the dateTime is a feeder
  feederCols <- setdiff(names(wDT), "rDateTime")
  wDT[, nNA := rowSums(is.na(.SD)), .SDcols = feederCols]
  wDT[, nFeedersReporting := length(feederCols) - nNA]
  return(wDT)
}
# Saves the (newly) cleaned data as a .csv and then gzips it.
#
# Args:
#   dt:     data.table to save.
#   which:  "L" for the long data (only rDateTime, region, sub_region,
#           feeder_ID and kW are written - the vars we can't recreate) or
#           "W" for the wide data (feeders as columns; nNA is dropped).
#   outDir: folder to write to. Defaults to the location that was previously
#           hard-coded, so existing calls behave as before.
#
# Returns: NULL, invisibly. Called for its side effect of writing
#   <outDir>/uniq_amps_all_substations.csv.gz ("L") or
#   <outDir>/uniq_amps_all_substations_wide.csv.gz ("W").
#
# Changes from the previous version:
#   * any other value of `which` is now an error (it used to write nothing and
#     return silently);
#   * the write/gzip steps are shared between the two cases;
#   * the file path is quoted in the shell command, so an outDir containing
#     spaces or shell metacharacters is handled safely.
saveData <- function(dt, which, outDir = "/mnt/SERG_data/Ellis_IOW/Cleaned_SS_Amps/uniq") {
  if (length(which) != 1L || !which %in% c("L", "W")) {
    stop("which must be \"L\" (long) or \"W\" (wide)", call. = FALSE)
  }

  if (which == "L") {
    # long data
    of <- file.path(outDir, "uniq_amps_all_substations.csv")
    outDT <- dt[, .(rDateTime, region, sub_region, feeder_ID, kW)] # only the vars we can't recreate
  } else {
    # wide data (feeders as columns)
    of <- file.path(outDir, "uniq_amps_all_substations_wide.csv")
    # Drop nNA by selecting the other columns: this builds a new table, so the
    # caller's object is left untouched.
    outDT <- dt[, setdiff(names(dt), "nNA"), with = FALSE]
  }

  data.table::fwrite(outDT, of)
  cmd <- paste0("gzip -f ", shQuote(of))
  message("Gzip file: ", of)
  try(system(cmd)) # seems to throw an error on the RStudio server but it still works
  message("Done ")
  invisible(NULL)
}
# Renders the report .Rmd to html or pdf.
#
# Args:
#   f:            name of the .Rmd file, without the extension. It is looked up
#                 in the project's Rmd/ folder via here::here().
#   version:      suffix appended to the output file name.
#   type:         "html" (default) or "pdf".
#   updateReport: not used in the body. It is passed in from the plan so that
#                 editing its value changes the command's inputs and the report
#                 gets re-run.
#
# The Rmd params `title`, `subtitle` and `authors` are not arguments: they are
# read from the enclosing environment (the script-level variables of the same
# names).
#
# Returns: the value of rmarkdown::render() for both types. The output goes to
#   docs/<f><version>.html or docs/<f><version>.pdf.
#
# Changes from the previous version:
#   * any other value of `type` is now an error (it used to print the
#     "Rendering" message and then do nothing);
#   * the html case now returns render()'s value as well (it used to return
#     NULL while the pdf case returned render()'s value);
#   * the two near-identical render() calls are merged into one.
makeReport <- function(f, version, type = "html", updateReport) {
  if (length(type) != 1L || !type %in% c("html", "pdf")) {
    stop("type must be \"html\" or \"pdf\"", call. = FALSE)
  }
  message("Rendering ", f, ".Rmd (version: ", version, ") to ", type)
  rmarkdown::render(
    input = paste0(here::here("Rmd", f), ".Rmd"), # we love here::here() - it helps us find the .Rmd to use
    params = list(title = title,
                  subtitle = subtitle,
                  authors = authors),
    output_format = paste0(type, "_document"), # "html_document" or "pdf_document"
    output_file = paste0(here::here("docs", f),
                         version, ".", type) # where the output goes
  )
}
# Set the drake plan ----
# the drake book suggests putting this in plan.R but...
# Each entry is a target: name = the command that builds it. Targets that use
# another target's name as an argument are built after it.
my_plan <- drake::drake_plan(
origData = getData(dFile, updateData), # returns data as data.table. If you edit 'updateData' in any way it will reload - drake is watching you!
uniqData = makeUniq(origData), # remove duplicates
wideData = toWide(uniqData), # one column per feeder
saveLong = saveData(uniqData, "L"), # doesn't actually return anything
saveWide = saveData(wideData, "W"), # doesn't actually return anything
# pdf output fails
# NOTE(review): neither report target takes a data target as an argument, so
# nothing in this plan orders the reports after the data targets - confirm
# how the .Rmd gets its data.
pdfOut = makeReport(rmdFile, version, "pdf", updateReport), # pdf - must be some way to do this without re-running the whole thing
htmlOut = makeReport(rmdFile, version, "html", updateReport) # html output
)
# see https://books.ropensci.org/drake/projects.html#usage
# I had expected r_make() to load drake() in the new clean R session but it doesn't
# Keep this as the last call in the file (see the link above for how r_make() uses it).
drake::drake_config(my_plan, verbose = 2)