---
title: 'Cleaning Feeder Data'
subtitle: 'Code and notes'
author: "Ben Anderson (b.anderson@soton.ac.uk), [SERG, Energy & Climate Change](http://www.energy.soton.ac.uk/), University of Southampton"
date: 'Last run at: `r Sys.time()`'
output:
  bookdown::html_document2:
    code_folding: hide
    fig_caption: yes
    number_sections: yes
    self_contained: yes
    toc: yes
    toc_depth: 3
    toc_float: yes
  bookdown::word_document2:
    fig_caption: yes
    toc: yes
    toc_depth: 2
  bookdown::pdf_document2:
    fig_caption: yes
    keep_tex: yes
    number_sections: yes
    toc: yes
    toc_depth: 2
always_allow_html: yes
bibliography: '`r path.expand("~/bibliography.bib")`'
---
```{r setup, include=FALSE}
# Knitr setup ----
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning = FALSE) # for final tidy run
knitr::opts_chunk$set(message = FALSE) # for final tidy run
# Set start time ----
startTime <- proc.time()
# Libraries ----
library(data.table) # cos we like data.table (you may not in which case dplyr is fine :-)
library(lubridate) # for date & time manip
library(hms) # for hh:mm:ss if we need it
library(ggplot2) # fancy plots
library(kableExtra) # for extra kable
library(skimr) # for skim (data description)
# Parameters ----
dFile <- "~/Dropbox/Ben_IOW_SS.csv" # edit for your set up
# Functions ----
# set season
addSeason <- function(dt,dateVar,h){
# h = hemisphere (N or S)
# dateVar is a date that lubridate can convert to months
dt <- dt[, tmpM := lubridate::month(get(dateVar))] # sets 1 (Jan) - 12 (Dec). May already exist but we can't rely on it
if(h == "S"){
dt <- dt[, season := "Summer"] # easiest to set the default to be the one that bridges years
dt <- dt[tmpM >= 3 & tmpM <= 5, season := "Autumn"]
dt <- dt[tmpM >= 6 & tmpM <= 8 , season := "Winter"]
dt <- dt[tmpM >= 9 & tmpM <= 11, season := "Spring"]
# re-order to make sense
dt <- dt[, season := factor(season, levels = c("Spring", "Summer", "Autumn", "Winter"))]
}
if(h == "N"){
dt <- dt[, season := "Winter"] # easiest to set the default to be the one that bridges years
dt <- dt[tmpM >= 3 & tmpM <= 5, season := "Spring"]
dt <- dt[tmpM >= 6 & tmpM <= 8 , season := "Summer"]
dt <- dt[tmpM >= 9 & tmpM <= 11, season := "Autumn"]
# re-order to make sense
dt <- dt[, season := factor(season, levels = c("Spring", "Summer", "Autumn", "Winter"))]
}
dt$tmpM <- NULL
return(dt)
}
```
# Intro
We have some feeder data. There seem to be NAs and missing time stamps. We want to select the 'best' (i.e. most complete) days within a day-of-the-week/season/year sampling frame.
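As a sketch of where this is heading (hypothetical code, not run: it assumes the prepped data.table `dt` built in the next section, with up to 96 fifteen-minute observations per feeder per day), the selection could rank days by completeness within each feeder/year/season/day-of-the-week cell:
```{r bestDaySketch, eval=FALSE}
# Hypothetical sketch only - assumes the prepped `dt` built below
# count non-NA observations per feeder per date, keeping the sampling frame variables
obsPerDay <- dt[!is.na(kW), .(nObs = .N),
                keyby = .(sub_region, rYear, season, rDoW, rDate)]
# rank days by completeness within each feeder/year/season/day-of-week cell
obsPerDay[, rankN := frank(-nObs, ties.method = "first"),
          by = .(sub_region, rYear, season, rDoW)]
bestDays <- obsPerDay[rankN == 1] # the most complete day in each cell
```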
# Data prep
## Load data
Loading from `r dFile`...
```{r loadData}
dt <- data.table::fread(dFile)
dt[, V1 := NULL] # drop
head(dt)
```
Do some data prep
```{r skimIt}
dt[, rDateTime := lubridate::as_datetime(DateTime)]
dt[, rTime := hms::as_hms(rDateTime)]
dt[, rDate := as.Date(rDateTime)]
dt[, rYear := lubridate::year(rDate)]
dt[, rDoW := lubridate::wday(rDate, label = TRUE)]
dt[, kW := Value] # for clarity
dt[, Value := NULL] #drop
dt <- addSeason(dt, dateVar = "rDate", h = "N")
skimr::skim(dt)
```
Do a sense check by season & feeder.
```{r senseCheck}
t <- dt[, .(mean_kW = mean(kW),
nObs = .N), keyby = .(rYear, sub_region,season)]
NAt <- dt[is.na(kW), .(nObs = .N), keyby = .(rYear, sub_region,season)]
kableExtra::kable(t,
caption = "Sense check on raw data") %>%
kable_styling()
```
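The NA counts computed above (`NAt`) aren't displayed; if useful, they could be shown in the same way (a sketch, not run):
```{r naCounts, eval=FALSE}
# optional: display the NA counts computed in the previous chunk
kableExtra::kable(NAt,
                  caption = "Count of NA kW observations by year, feeder and season") %>%
  kable_styling()
```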
Histograms of kW by season and feeder...
```{r histo}
ggplot2::ggplot(dt, aes(x = kW)) +
geom_histogram(binwidth = 5) +
facet_grid(sub_region ~ season)
```
Demand profiles of kW by season and feeder and day of the week...
```{r kwProfiles}
plotDT <- dt[, .(meankW = mean(kW)), keyby = .(rTime, season, sub_region, rDoW)]
ggplot2::ggplot(plotDT, aes(x = rTime, y = meankW, colour = season)) +
geom_line() +
facet_grid(sub_region ~ rDoW)
```
Is that what we expect?
# Test for missing
Can we see missing data?
```{r missingVis}
ggplot2::ggplot(dt, aes(x = rDate, y = rTime, fill = kW)) +
geom_tile() +
facet_grid(sub_region ~ .) +
scale_fill_viridis_c() +
labs(caption = "All kW values by feeder, time (y) and date (x)")
```
oooh.
Try aggregating...
```{r aggVis}
dt[, rHour := lubridate::hour(rDateTime)]
plotDT <- dt[, .(mean_kW = mean(kW),
nObs = .N), keyby = .(rHour, rDate, sub_region)]
ggplot2::ggplot(plotDT, aes(x = rDate, y = rHour, fill = mean_kW)) +
geom_tile() +
scale_fill_viridis_c() +
facet_grid(sub_region ~ .) +
labs(caption = "Mean kW per hour")
ggplot2::ggplot(plotDT, aes(x = rDate, y = rHour, fill = nObs)) +
geom_tile() +
scale_fill_viridis_c() +
facet_grid(sub_region ~ .) +
labs(caption = "Number of obs per hour")
ggplot2::ggplot(plotDT, aes(x = nObs, y = mean_kW, group = nObs)) +
geom_boxplot() +
facet_grid(.~sub_region) +
labs(caption = "mean kW per hour by number of observations contributing")
```
# Which days have the 'least' missing?
This is quite tricky as we may have completely missing dateTimes. But we can test for this by counting the number of observations per dateTime and then seeing if the dateTimes are contiguous.
```{r testDateTimes}
dateTimesDT <- dt[, .(nFeeders = uniqueN(sub_region)), keyby = .(rDateTime, rTime, rDate, season)] # keep season
dateTimesDT[, dtDiff := rDateTime - shift(rDateTime)] # should be 15 mins
summary(dateTimesDT)
ggplot2::ggplot(dateTimesDT, aes(x = rDate, y = rTime, fill = nFeeders)) +
geom_tile() +
scale_fill_viridis_c() +
labs(caption = "Number of unique feeders in each dateTime")
```
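To make the contiguity test explicit, a follow-up sketch (not run; it uses the `dateTimesDT` built above) could list any gaps between successive dateTimes that are longer than the expected 15 minutes:
```{r gapCheck, eval=FALSE}
# sketch: list gaps between successive dateTimes that exceed the expected 15 minutes
dateTimesDT[as.numeric(dtDiff, units = "mins") > 15,
            .(rDateTime, gapMins = as.numeric(dtDiff, units = "mins"))]
```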
Well we clearly have some dateTimes where we have no data _at all_!
Are there time of day patterns? It looks like it...
```{r missingProfiles}
plotDT <- dateTimesDT[, .(meanN = mean(nFeeders)), keyby = .(rTime, season)]
ggplot2::ggplot(plotDT, aes(y = meanN, x = rTime, colour = season)) +
geom_line() +
labs(y = "Mean n feeders reporting",
caption = "Mean n feeders by time of day")
```
So...
I find it distinctly odd that:
* we appear to have the most feeders reporting data at 'peak' times
* we have a lot of missing dateTimes between 00:30 and 05:00
If the monitors were set to only collect data when the power (or Wh in a given time frame) was above a given threshold, then it would look like this...
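One way to probe that threshold hypothesis (a sketch only, not run; it re-uses `dt` and `dateTimesDT` from above, and the split of `nFeeders` into thirds is arbitrary) would be to compare the kW values we _do_ see at well-reported and poorly-reported dateTimes:
```{r thresholdCheck, eval=FALSE}
# Sketch: if loggers only report above some demand threshold, the kW values observed
# at poorly-reported dateTimes should cluster just above that cut-off
checkDT <- merge(dt, dateTimesDT[, .(rDateTime, nFeeders)], by = "rDateTime")
ggplot2::ggplot(checkDT, aes(x = kW, colour = cut(nFeeders, 3))) +
  geom_density() +
  facet_grid(sub_region ~ .) +
  labs(colour = "N feeders reporting",
       caption = "kW distributions by how many feeders reported at that dateTime")
```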
# Runtime
```{r check runtime, include=FALSE}
t <- proc.time() - startTime
elapsed <- t[[3]]
```
Analysis completed in `r round(elapsed,2)` seconds ( `r round(elapsed/60,2)` minutes) using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com) with `r R.version.string` running on `r R.version$platform`.
# R environment
## R packages used
* base R [@baseR]
* bookdown [@bookdown]
* data.table [@data.table]
* ggplot2 [@ggplot2]
* kableExtra [@kableExtra]
* knitr [@knitr]
* lubridate [@lubridate]
* rmarkdown [@rmarkdown]
* skimr [@skimr]
* XML [@XML]
## Session info
```{r sessionInfo, echo=FALSE}
sessionInfo()
```
# The data cleaning code
(c) Mikey Harper :-)
Starts here:
<hr>
Scripts used to clean and merge substation data.
```{r setupMH, include=FALSE, eval=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(here)
library(tidyverse)
```
## Input files
The analysis will first look at the primary data. There are different types of files which refer to different parameters, so different search terms are used to extract these:
```{r, eval=FALSE}
# Find files with AMPS. Exclude files which contain DI~CO
files_AMPS <- list.files("../Primary", recursive = T, pattern = "~AMPS", full.names = T) %>%
.[!stringr::str_detect(., "DI~CO")]
files_AMPS
```
## Process Amps
```{r, eval=FALSE}
# Show a sample
fileSelect <- files_AMPS[4]
head(read_csv(fileSelect, skip = 3))
```
Again, a function is used to do all the processing on the input CSVs. This is slightly amended from the `processkV` function.
```{r, eval=FALSE}
processAMPS <- function(filePath, databaseCon = con){
message("Processing ", filePath)
# 1st Level
dirName_1 <- filePath %>%
dirname() %>%
basename
# 2nd Level
dirName_2 <- filePath %>%
dirname() %>%
dirname() %>%
basename
if (dirName_2 == "Primary"){
dirName_2 <- dirName_1
dirName_1 <- ""
}
# Load the CSV. Some tab-separated files were saved with a .csv extension, which confuses the search.
# Therefore, if the data loads incorrectly (only a single column), the code will try to load it as a TSV.
dataLoaded <- suppressWarnings(read_csv(filePath, skip = 3, col_types = cols(Value = col_number())))
if(ncol(dataLoaded) == 1){
dataLoaded <- suppressWarnings(read_tsv(filePath, skip = 3, col_types = cols()))
}
# Reformat data
dataLoaded <-
dataLoaded %>%
mutate_at(vars(Time), function(x){gsub('[^ -~]', '', x)}) %>% # Remove invalid UTF characters
mutate(Time = lubridate::dmy_hms(Time),
Time = lubridate::floor_date(Time, unit = "15 minutes")) %>%
group_by(Time) %>%
summarise(Value = mean(Value, na.rm = T)) %>%
mutate(region = dirName_2,
sub_region = dirName_1
)
# There are some datasets which contain no values, which can cause errors when running.
# If this happens, return NULL
if(is.character(dataLoaded$Value)) return(NULL)
return(dataLoaded)
}
```
Run the function across all the AMPS files:
```{r, eval=FALSE}
Amps <- purrr::map_df(files_AMPS, processAMPS)
```
```{r, eval=FALSE}
Amps_stats <- Amps %>%
group_by(region) %>%
summarise(mean = (mean(Value, na.rm = T)),
n = n(),
sd = sd(Value, na.rm = T),
var = var(Value, na.rm = T))
Amps_stats
readr::write_csv(Amps_stats, path = "../Amps_stats.csv")
```
```{r, eval=FALSE}
ggplot(Amps) +
geom_point(aes(x = Time, y = Value, colour = region)) +
facet_grid(region~., scales = "free_y") +
labs(title = "Cleaned data for Amps")
```
## Saving the processed data
```{r, eval=FALSE}
readr::write_csv(Amps, path = "amps_all_substations.csv")
```
```{r, eval=FALSE}
library(odbc)
library(DBI)
# Create a file-backed RSQLite database (amps.sqlite)
con <- dbConnect(RSQLite::SQLite(), "amps.sqlite")
dbListTables(con)
dbWriteTable(con, "amps", Amps)
dbListTables(con)
```
## Querying the data
```{r, eval=FALSE}
con <- dbConnect(RSQLite::SQLite(), "amps.sqlite")
library(dbplyr)
Amps_db <- tbl(con, "amps")
Amps_db %>%
group_by(region) %>%
summarise(mean = (mean(Value, na.rm = T)),
n = n(),
sd = sd(Value, na.rm = T),
var = var(Value, na.rm = T))
```
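If the connection isn't needed again it can be closed (not run here):
```{r, eval=FALSE}
# tidy up: close the SQLite connection once querying is finished
DBI::dbDisconnect(con)
```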
## Round to Nearest N minutes
```{r, eval=FALSE}
processAMPS_5mins <- function(filePath){
message("Processing ", filePath)
# 1st Level
dirName_1 <- filePath %>%
dirname() %>%
basename
# 2nd Level
dirName_2 <- filePath %>%
dirname() %>%
dirname() %>%
basename
if (dirName_2 == "Primary"){
dirName_2 <- dirName_1
dirName_1 <- ""
}
# Load the CSV. Some tab-separated files were saved with a .csv extension, which confuses the search.
# Therefore, if the data loads incorrectly (only a single column), the code will try to load it as a TSV.
dataLoaded <- suppressWarnings(read_csv(filePath, skip = 3, col_types = cols()))
if(ncol(dataLoaded) == 1){
dataLoaded <- suppressWarnings(read_tsv(filePath, skip = 3, col_types = cols()))
}
# Reformat data
dataLoaded <-
dataLoaded %>%
mutate_at(vars(Time), function(x){gsub('[^ -~]', '', x)}) %>% # Remove invalid UTF characters
mutate(Time = lubridate::dmy_hms(Time),
region = dirName_2,
sub_region = dirName_1,
code = paste(region, sub_region, sep = "_"),
) %>%
mutate(Time = lubridate::floor_date(Time, unit = "5 minutes")) %>%
group_by(Time, region, code) %>%
summarise(Value = mean(Value)) %>%
arrange(Time)
# There are some datasets which contain no values, which can cause errors when running.
# If this happens, return NULL
if(is.character(dataLoaded$Value)) return(NULL)
# Returns the loaded and cleaned dataframe
return(dataLoaded)
}
```
Nearest 5 minutes:
```{r, eval=FALSE}
Amps_5mins <- purrr::map_df(files_AMPS[1:4], processAMPS_5mins)
```
# References