---
params:
  subtitle: ""
  title: ""
  authors: ""
title: '`r params$title`'
subtitle: '`r params$subtitle`'
author: '`r params$authors`'
date: 'Last run at: `r Sys.time()`'
output:
  bookdown::html_document2:
    self_contained: no
    fig_caption: yes
    code_folding: hide
    number_sections: yes
    toc: yes
    toc_depth: 2
    toc_float: TRUE
  bookdown::pdf_document2:
    fig_caption: yes
    number_sections: yes
    toc: yes
    toc_depth: 2
  bookdown::word_document2:
    fig_caption: yes
    number_sections: yes
    toc: yes
    toc_depth: 2
    fig_width: 5
bibliography: '`r paste0(here::here(), "/bibliography.bib")`'
---
# Knitr setup ----
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning = FALSE) # for final tidy run
knitr::opts_chunk$set(message = FALSE) # for final tidy run
# Set start time ----
startTime <- proc.time()
# Libraries ----
rmdLibs <- c("data.table", # uniqueN(), shift() etc. are used unqualified below
             "ggplot2",    # aes(), geom_line() etc. are used unqualified below
             "kableExtra"  # tables
)
# load them
dataCleaning::loadLibraries(rmdLibs)
# Parameters ----
#dFile <- "~/Dropbox/Ben_IOW_SS.csv" # edit for your set up
# Functions ----
# put more general ones that could be useful to everyone in /R so they are built into the package.
# put functions relevant to this analysis here
Intro
We have some electricity substation feeder data that has been cleaned to give mean kW per 15 minutes.
There seem to be some NA kW values and a lot of missing time stamps. We want to select the 'best' (i.e. most complete) days within a day-of-the-week/season/year sampling frame. If we can't do that we may have to resort to seasonal mean kW profiles by hour & day of the week...
The code used to generate this report is in: https://git.soton.ac.uk/ba1e12/dataCleaning/Rmd/
Data prep
Load data
Loaded data from `r dFile` ... (using drake)
origDataDT <- drake::readd(origData) # readd the drake object
uniqDataDT <- drake::readd(uniqData) # readd the drake object
kableExtra::kable(head(origDataDT), digits = 2,
caption = "First 6 rows of data")
Do a duplicate check by feeder_ID, dateTime & kW. In theory there should not be any.
message("Original data nrows: ", tidyNum(nrow(origDataDT)))
message("Unique data nrows: ", tidyNum(nrow(uniqDataDT)))
nDups <- nrow(origDataDT) - nrow(uniqDataDT)
message("So we have ", tidyNum(nDups), " duplicates...")
pc <- 100*((nrow(origDataDT) - nrow(uniqDataDT))/nrow(origDataDT))
message("That's ", round(pc,2), "%")
feederDT <- uniqDataDT[!is.na(rDateTime)] # use dt with no duplicates
origDataDT <- NULL # save memory
There were `r tidyNum(nDups)` duplicates - that's ~ `r round(pc,2)`% of the observations loaded. So we remove the duplicates...
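The de-duplication itself is done upstream in the drake plan. For reference, here is a minimal, self-contained data.table sketch of the same idea using toy data (not the real feeder table):
# toy illustration of duplicate removal by key columns (not the real feeder data)
toyDT <- data.table::data.table(feeder_ID = c("A", "A", "B"),
                                rDateTime = as.POSIXct("2003-01-01 00:15", tz = "UTC"),
                                kW = c(10, 10, 12)) # first two rows are exact duplicates
uniqToyDT <- unique(toyDT, by = c("feeder_ID", "rDateTime", "kW")) # keeps the first of each duplicate set
nrow(toyDT) - nrow(uniqToyDT) # 1 duplicate removed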
Basic patterns
Try aggregated demand profiles of mean kW by season, feeder and day of the week... Remove the legend so we can see the plot.
plotDT <- feederDT[, .(meankW = mean(kW),
nObs = .N), keyby = .(rTime, season, feeder_ID, rDoW)]
ggplot2::ggplot(plotDT, aes(x = rTime, y = meankW, colour = feeder_ID)) +
geom_line() +
theme(legend.position="none") + # remove legend so we can see the plot
facet_grid(season ~ rDoW)
Is that what we expect?
Test for missing
Number of observations per feeder per day - gaps (totally missing days) will be visible, as will low counts (partially missing days) - we would expect 24 * 4 = 96 per day... Convert this to a % of expected...
plotDT <- feederDT[, .(nObs = .N), keyby = .(rDate, feeder_ID)]
plotDT[, propExpected := nObs/(24*4)]
ggplot2::ggplot(plotDT, aes(x = rDate, y = feeder_ID, fill = 100*propExpected)) +
geom_tile() +
scale_x_date(date_breaks = "3 months", date_labels = "%B %Y") +
theme(axis.text.x=element_text(angle=90, hjust=1)) +
theme(legend.position="bottom") +
scale_fill_viridis_c(name="% expected")
This is not good. There are both gaps (missing days) and partial days. Lots of partial days. Why is the data relatively good up to the end of 2003?
What does it look like if we aggregate across all feeders by time? There are `r uniqueN(feederDT$feeder_ID)` feeders so at best we should get this many per dateTime. How close do we get?
plotDT <- feederDT[, .(nObs = .N,
meankW = mean(kW)), keyby = .(rTime, rDate, season)]
plotDT[, propExpected := nObs/uniqueN(feederDT$feeder_ID)] # we now have all feeders per time so...
ggplot2::ggplot(plotDT, aes(x = rDate, y = rTime, fill = 100*propExpected)) +
geom_tile() +
scale_x_date(date_breaks = "6 months", date_labels = "%B %Y") +
theme(axis.text.x=element_text(angle=90, hjust=1)) +
theme(legend.position="bottom") +
scale_fill_viridis_c(name="% expected")
That really doesn't look too good. There are some very odd fluctuations in there. And something changed after 2003...
What do the mean kW patterns look like per feeder per day?
plotDT <- feederDT[, .(meankW = mean(kW, na.rm = TRUE)), keyby = .(rDate, feeder_ID)]
ggplot2::ggplot(plotDT, aes(x = rDate, y = feeder_ID, fill = meankW)) +
geom_tile() +
scale_x_date(date_breaks = "3 months", date_labels = "%B %Y") +
theme(axis.text.x=element_text(angle=90, hjust=1)) +
theme(legend.position="bottom") +
scale_fill_viridis_c(name="Mean kW")
Missing data is even more clearly visible.
What about mean kW across all feeders?
plotDT <- feederDT[, .(nObs = .N,
meankW = mean(kW)), keyby = .(rTime, rDate, season)]
ggplot2::ggplot(plotDT, aes(x = rDate, y = rTime, fill = meankW)) +
geom_tile() +
scale_x_date(date_breaks = "6 months", date_labels = "%B %Y") +
theme(axis.text.x=element_text(angle=90, hjust=1)) +
theme(legend.position="bottom") +
scale_fill_viridis_c(name="kW")
Which days have the 'least' missing?
This is quite tricky as we may have completely missing dateTimes. But we can test for this by counting the number of observations per dateTime and then seeing if the dateTimes are contiguous.
dateTimesDT <- feederDT[, .(nFeeders = uniqueN(feeder_ID),
meankW = mean(kW, na.rm = TRUE)),
keyby = .(rDateTime, rTime, rDate, season)] # keep season
dateTimesDT[, dtDiff := rDateTime - shift(rDateTime)] # should be 15 mins
summary(dateTimesDT)
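To make the contiguity test explicit we can tabulate the gaps between successive dateTimes; a minimal sketch (coercing dtDiff to minutes first, since difftime units can vary):
# tabulate gap sizes between successive dateTimes - anything > 15 minutes is a missing period
dateTimesDT[, dtDiffMins := as.numeric(dtDiff, units = "mins")] # force consistent units
table(dateTimesDT[dtDiffMins > 15, dtDiffMins]) # how big are the gaps, and how often do they occur?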
Let's see how many unique feeders we have per dateTime. Surely we have at least one sending data in each 15 minute period?
ggplot2::ggplot(dateTimesDT, aes(x = rDate, y = rTime, fill = nFeeders)) +
geom_tile() +
scale_fill_viridis_c() +
labs(caption = "Number of unique feeders in each dateTime")
No. As we suspected from the previous plots, we clearly have some dateTimes where we have no data at all!
Are there time of day patterns? It looks like it...
dateTimesDT[, rYear := lubridate::year(rDateTime)]
plotDT <- dateTimesDT[, .(meanN = mean(nFeeders),
meankW = mean(meankW)), keyby = .(rTime, season, rYear)]
ggplot2::ggplot(plotDT, aes(y = meanN, x = rTime, colour = season)) +
geom_line() +
facet_wrap(rYear ~ .) +
labs(y = "Mean n feeders reporting",
caption = "Mean n feeders by time of day")
Oh yes. After 2003. Why?
What about the kW?
ggplot2::ggplot(plotDT, aes(y = meankW, x = rTime, colour = season)) +
geom_line() +
facet_wrap(rYear ~ .) +
labs(y = "Mean kw reporting",
caption = "Mean kw by time of day")
Those look as we'd expect. But do we see a correlation between the number of observations per hour and the mean kW after 2003? There is a suspicion that as mean kW goes up, so does the number of observations per hour... although this could just be a correlation with low demand periods (night time?)
ggplot2::ggplot(plotDT, aes(y = meankW, x = meanN, colour = season)) +
geom_point() +
facet_wrap(rYear ~ .) +
labs(y = "Mean kw per quarter hour",
x = "Mean number feeders reporting")
Yes. The higher the kW, the more observations we get from 2004 onwards. Why?
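As a quick numeric check we can compute the correlation between the mean number of feeders reporting and mean kW by year, using the plotDT built above:
# correlation between n feeders reporting and mean kW, by year
plotDT[, .(r = cor(meanN, meankW, use = "complete.obs")), keyby = rYear]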
It is distinctly odd that after 2003:
- we appear to have the most feeders reporting data at 'peak' times
- we have a lot of missing dateTimes between 00:30 and 05:00
If the monitors were set to only collect data when the power (or Wh in a given time frame) was above a given threshold then it would look like this... That wouldn't happen... would it?
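As a thought experiment, a minimal simulation (with purely hypothetical numbers) shows that threshold-triggered logging would produce exactly this pattern - more readings surviving at high-demand times of day:
# hypothetical: a feeder that only logs when demand exceeds a threshold
simDT <- data.table::data.table(hour = rep(0:23, times = 365)) # a year of hourly slots
simDT[, kW := 40 + 30 * sin((hour - 6) * pi / 12) + rnorm(.N, sd = 10)] # crude daily demand curve
threshold <- 45 # hypothetical logging threshold
simDT[, logged := kW > threshold]
simDT[, .(propLogged = mean(logged)), keyby = hour] # proportion of readings kept, by time of day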
Selecting the 'best' days
Here we use a wide form of the feeder data which has each feeder as a column.
We should have `r uniqueN(feederDT$feeder_ID)` feeders. We want to find days when all of these feeders have complete data.
The wide dataset has a count of NAs per row (dateTime) from which we infer how many feeders are reporting:
wDT <- drake::readd(wideData) # back from the drake
names(wDT)
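For reference, a minimal sketch of how such a per-row count could be derived, assuming the feeder readings are every column other than rDateTime and the pre-built count (the real nFeedersReporting is created upstream in the drake plan):
# illustrative cross-check of the per-dateTime feeder count
feederCols <- setdiff(names(wDT), c("rDateTime", "nFeedersReporting")) # assumed layout
wDT[, nFeedersCheck := rowSums(!is.na(.SD)), .SDcols = feederCols]
summary(wDT$nFeedersCheck)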
If we take the mean of the number of feeders reporting per day (date) then a value of 25 will indicate a day when all feeders have all data (since it would be the mean of all the '25's).
wDT <- addSeason(wDT, dateVar = "rDateTime", h = "N")
wDT[, rDoW := lubridate::wday(rDateTime)]
wDT[, rDate := lubridate::date(rDateTime)]
# how many days have all feeders sending data in all dateTimes?
aggDT <- wDT[, .(meanOK = mean(nFeedersReporting),
minOk = min(nFeedersReporting),
maxOk = max(nFeedersReporting),
sumOK = sum(nFeedersReporting) # will have a max of n feeders * 24 hours * 4 quarter hours
),
keyby = .(rDate, season)]
aggDT[, propExpected := sumOK/(uniqueN(feederDT$feeder_ID)*24*4)] # we expect 25*24*4
summary(aggDT)
message("How many days have 100%?")
n <- nrow(aggDT[propExpected == 1])
n
So, there are `r n` days with 100% data...
If we plot the mean then we will see which days get closest to having a full dataset.
ggplot2::ggplot(aggDT, aes(x = rDate, colour = season, y = meanOK)) +
geom_point()
Re-plot as the % of expected observations, assuming we should have n feeders * 24 hours * 4 readings per hour (the shape will be the same). This also tells us that there is some reason why we get fluctuations in the number of data points per hour after 2003.
For fun we then print 4 tables of the 'best' days per season.
ggplot2::ggplot(aggDT, aes(x = rDate, colour = season,
y = 100*propExpected)) +
geom_point() +
labs(y = "%")
aggDT[, rDoW := lubridate::wday(rDate, label = TRUE)]
h <- head(aggDT[season == "Spring"][order(-propExpected)])
kableExtra::kable(h, caption = "Best Spring days overall",
digits = 3)
h <- head(aggDT[season == "Summer"][order(-propExpected)])
kableExtra::kable(h, caption = "Best Summer days overall",
digits = 3)
h <- head(aggDT[season == "Autumn"][order(-propExpected)])
kableExtra::kable(h, caption = "Best Autumn days overall",
digits = 3)
h <- head(aggDT[season == "Winter"][order(-propExpected)])
kableExtra::kable(h, caption = "Best Winter days overall",
digits = 3)
Summary
So there are no days with 100% data. We need a different approach.
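One option, sketched here rather than implemented, is to return to the day-of-the-week/season sampling frame from the intro and take the most complete day in each cell instead of requiring 100%:
# illustrative: pick the single most complete day within each season x day-of-week cell
bestDaysDT <- aggDT[order(-propExpected), .SD[1], keyby = .(season, rDoW)]
kableExtra::kable(bestDaysDT, caption = "Most complete day per season & day of the week",
                  digits = 3)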
Runtime
t <- proc.time() - startTime
elapsed <- t[[3]]
Analysis completed in `r round(elapsed,2)` seconds ( `r round(elapsed/60,2)` minutes) using knitr in RStudio with `r R.version.string` running on `r R.version$platform`.
R environment
R packages used
- base R [@baseR]
- bookdown [@bookdown]
- data.table [@data.table]
- ggplot2 [@ggplot2]
- kableExtra [@kableExtra]
- knitr [@knitr]
- lubridate [@lubridate]
- rmarkdown [@rmarkdown]
- skimr [@skimr]
Session info
sessionInfo()
The raw data cleaning code
(c) Mikey Harper :-)
Starts here:
Scripts used to clean and merge substation data.
knitr::opts_chunk$set(echo = TRUE)
library(here)
library(tidyverse)
Input files
Analysis will first look at the primary data. There are different types of files which refer to different parameters. Different search terms are used to extract these:
# Find files with AMPS. Exclude files which contain DI~CO
files_AMPS <- list.files("../Primary", recursive = T, pattern = "~AMPS", full.names = T) %>%
.[!stringr::str_detect (., "DI~CO")]
files_AMPS
Process Amps
# Show a sample
fileSelect <- files_AMPS[4]
head(read_csv(fileSelect, skip = 3))
Again a function is used to do all the processing on the input CSVs. This is slightly amended from the `processkV` function.
processAMPS <- function(filePath, databaseCon = con){
message("Processing ", filePath)
# 1st Level
dirName_1 <- filePath %>%
dirname() %>%
basename
# 2nd Level
dirName_2 <- filePath %>%
dirname() %>%
dirname() %>%
basename
if (dirName_2 == "Primary"){
dirName_2 <- dirName_1
dirName_1 <- ""
}
# Load the CSV. Some tab-separated files were saved with a .csv extension, which confuses the search. Therefore, if the data is loaded incorrectly (i.e. it only has a single column), the code will try to load it as a TSV.
dataLoaded <- suppressWarnings(read_csv(filePath, skip = 3, col_types = cols(Value = col_number())))
if(ncol(dataLoaded) == 1){
dataLoaded <- suppressWarnings(read_tsv(filePath, skip = 3, col_types = cols()))
}
# Reformat data
dataLoaded <-
dataLoaded %>%
mutate_at(vars(Time), function(x){gsub('[^ -~]', '', x)}) %>% # Remove invalid UTF characters
mutate(Time = lubridate::dmy_hms(Time),
Time = lubridate::floor_date(Time, unit = "15 minutes")) %>%
group_by(Time) %>%
summarise(Value = mean(Value, na.rm = T)) %>%
mutate(region = dirName_2,
sub_region = dirName_1
)
# There are some datasets which contain no values, which can cause errors when running.
# If this happens, return NULL
if(is.character(dataLoaded$Value)) return(NULL)
return(dataLoaded)
}
Run the function over all of the AMPS files:
Amps <- purrr::map_df(files_AMPS, processAMPS)
Amps_stats <- Amps %>%
group_by(region) %>%
summarise(mean = (mean(Value, na.rm = T)),
n = n(),
sd = sd(Value, na.rm = T),
var = var(Value, na.rm = T))
Amps_stats
readr::write_csv(Amps_stats, path = "../Amps_stats.csv")
ggplot(Amps) +
geom_point(aes(x = Time, y = Value, colour = region)) +
facet_grid(region~., scales = "free_y") +
labs(title = "Cleaned data for Amps")
Processing data
readr::write_csv(Amps, path = "amps_all_substations.csv")
library(odbc)
library(DBI)
# Connect to an RSQLite database stored in the file amps.sqlite
con <- dbConnect(RSQLite::SQLite(), "amps.sqlite")
dbListTables(con)
dbWriteTable(con, "amps", Amps, overwrite = TRUE) # overwrite any existing table so the chunk can be re-run
dbListTables(con)
Querying the data
con <- dbConnect(RSQLite::SQLite(), "amps.sqlite")
library(dbplyr)
Amps_db <- tbl(con, "amps")
Amps_db %>%
group_by(region) %>%
summarise(mean = (mean(Value, na.rm = T)),
n = n(),
sd = sd(Value, na.rm = T),
var = var(Value, na.rm = T))
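Note that dbplyr builds this query lazily. Assuming the Amps_db table created above, show_query() reveals the generated SQL and collect() executes it and pulls the results back into R:
# inspect the SQL that dbplyr generates, then bring the summary back into R
Amps_db %>%
  group_by(region) %>%
  summarise(mean = mean(Value, na.rm = T), n = n()) %>%
  show_query()
Amps_summary <- Amps_db %>%
  group_by(region) %>%
  summarise(mean = mean(Value, na.rm = T), n = n()) %>%
  collect() # runs the query and returns a tibble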
Round to Nearest N minutes
processAMPS_5mins <- function(filePath){
message("Processing ", filePath)
# 1st Level
dirName_1 <- filePath %>%
dirname() %>%
basename
# 2nd Level
dirName_2 <- filePath %>%
dirname() %>%
dirname() %>%
basename
if (dirName_2 == "Primary"){
dirName_2 <- dirName_1
dirName_1 <- ""
}
# Load the CSV. Some tab-separated files were saved with a .csv extension, which confuses the search. Therefore, if the data is loaded incorrectly (i.e. it only has a single column), the code will try to load it as a TSV.
dataLoaded <- suppressWarnings(read_csv(filePath, skip = 3, col_types = cols()))
if(ncol(dataLoaded) == 1){
dataLoaded <- suppressWarnings(read_tsv(filePath, skip = 3, col_types = cols()))
}
# Reformat data
dataLoaded <-
dataLoaded %>%
mutate_at(vars(Time), function(x){gsub('[^ -~]', '', x)}) %>% # Remove invalid UTF characters
mutate(Time = lubridate::dmy_hms(Time),
region = dirName_2,
sub_region = dirName_1,
code = paste(region, sub_region, sep = "_")
) %>%
mutate(Time = lubridate::floor_date(Time, unit = "5 minutes")) %>%
group_by(Time, region, code) %>%
summarise(Value = mean(Value)) %>%
arrange(Time)
# There are some datasets which contain no values, which can cause errors when running.
# If this happens, return NULL
if(is.character(dataLoaded$Value)) return(NULL)
# Returns the loaded and cleaned dataframe
return(dataLoaded)
}
Nearest 5 minutes:
Amps_5mins <- purrr::map_df(files_AMPS[1:4], processAMPS_5mins)
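Note that the function above floors timestamps with lubridate::floor_date. If rounding to the nearest 5 minutes is really what is wanted, lubridate::round_date behaves slightly differently, as this small example shows:
# floor_date vs round_date for a timestamp 3 minutes past a 5 minute boundary
x <- lubridate::ymd_hms("2003-06-01 10:03:00")
lubridate::floor_date(x, unit = "5 minutes") # 10:00:00
lubridate::round_date(x, unit = "5 minutes") # 10:05:00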