# analyseCERdata.R
# (Git web-UI scrape residue removed from the top of this file so it parses as R;
#  the original script begins below)
    # Meta -----
    # Analyse Irish CER Smart Meter Trial data
    # See: http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
    
    # code by: b.anderson@soton.ak.uk (@dataknut) with additions by lsb1@soton.ac.uk
    
    # Purpose: ----
    # introductory code for use in teaching using this dataset
    # introduce large scale time series data
    # introduce data tables
    # introduce sub-setting 
    # introduce zoo package for manipulating time series
    # introduce autocorrelation
    
    # Housekeeping ----
    # clear the workspace
    rm(list=ls())
    
    # install a couple of packages that we will need
    # if you have them already, you can comment out these lines with a "#"
    #install.packages("data.table") 
    library(data.table)
    #install.packages("zoo")
    library(zoo)
    
    print(paste0("Working directory is: ", getwd()))
    
    # change the working directory to where you put the data
    # setwd("<where you put the data>")
    # In my case:
    setwd("~/Documents/Work/Data/CER Smart Metering Project/data/processed")
    
    # Load data & describe it ----
    # Load the data by looping over months and years
    months <- c("October", "December")
    years <- c("2009", "2010")
    for (m in months) {
      for (y in years) {
        print(paste0("Reading in CER_",m,"_",y,"_residential.csv"))
        # need to set this table name according to values of m and y
        # why is this soooo hard in R? It must be a common need
        name <- paste0("resCER_",m,"_",y,"_DT")
        assign(name, as.data.table(
                          read.csv(paste0("CER_",m,"_",y,"_residential.csv"))))
      }
    }
    
    # what's here?
    tables()
    
    # combine the data into one big table in long format
    # more use for summaries etc
    l = list(resCER_October_2009_DT,resCER_December_2009_DT,resCER_October_2010_DT,resCER_December_2010_DT)
    resCER_kWh_DT <- as.data.table(rbindlist(l))
    
    # delete old tables to save memory
    resCER_October_2009_DT <- NULL
    resCER_December_2009_DT <- NULL
    resCER_October_2010_DT <- NULL
    resCER_December_2010_DT <- NULL
    
    #L: We can create a date vector using R's date conversion functions
    resCER_kWh_DT$l_datetime<-as.POSIXct(resCER_kWh_DT$datetime_start,tz = "", "%Y-%m-%d %H:%M:%S")
    # now remove old datetime to save memory
    resCER_kWh_DT$datetime_start <- NULL
    
    # extract:
    # year
    # month
    # date
    # hour of the day variable (24 hour format)
    # relies on the POSIXct time having a regular structure so that the hour is ALWAYS character 12 & 13
    resCER_kWh_DT$year <- as.numeric(substr(resCER_kWh_DT$l_datetime, 1, 4))
    resCER_kWh_DT$month <- as.numeric(substr(resCER_kWh_DT$l_datetime, 6, 7))
    resCER_kWh_DT$date <- as.POSIXct(substr(resCER_kWh_DT$l_datetime, 1, 10))
    resCER_kWh_DT$hour <- as.numeric(substr(resCER_kWh_DT$l_datetime, 12, 13))
    
    # Plots ----
    # . is a synonym for list
    # we can quote functions and then eval them
    toRun = quote( .(MeankWh = mean(kWh)))
    hourlykWh <- resCER_kWh_DT[, 
                  eval(toRun),
                  by=.(hour,month,year)] # mean per half hour by month &  year
    
    # use min/max to set limits of plot correctly
    minkwh <- min(hourlykWh$MeankWh)
    maxkwh <- max(hourlykWh$MeankWh)
    
    plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 10],
         ylim = c(minkwh,maxkwh),
         col = "1",
         ylab = "Mean kWh",
         lab = c(12,8,2),
         xlab = "Hour",
         type="l")
    # familiar daily profile?
    par(new = TRUE)
    plot(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 10],
         ylim = c(minkwh,maxkwh),
         col = "2",
         lab = c(12,8,2),
         xlab = "",
         ylab = "",
         type="l")
    par(new = TRUE)
    plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 12],
         ylim = c(minkwh,maxkwh),
         col = "3",
         lab = c(12,8,2),
         xlab = "",
         ylab = "",
         type="l")
    par(new = TRUE)
    plot(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 12],
         ylim = c(minkwh,maxkwh),
         col = "4",
         lab = c(12,8,2),
         xlab = "",
         ylab = "",
         type="l")
    
    legend("topleft",
           col=c(1:4),
           lty=1,
           legend=c("October 2009","October 2010","December 2009","December 2010")
           )
    
    # calculate mean kWh consumption for evening peak 16:00 - 19:00 
    # October 2009 (pre trial)
    base_Oct2009kWhDT <- as.data.table(resCER_kWh_DT[resCER_kWh_DT$hour > 15 & resCER_kWh_DT$hour < 20 &
                                             resCER_kWh_DT$year == 2009 &
                                             resCER_kWh_DT$month == 10])
    
    m <- mean(base_Oct2009kWhDT$kWh)
    
    # how skewed is that?
    summary(base_Oct2009kWhDT$kWh)
    
    # sd
    s <- sd(base_Oct2009kWhDT$kWh)
    
    # sample size calculation
    # see ?power.t.test
    # let us assume we see a 10% change in the mean due to higher evening prices
    # we want to test for a reduction (1 sided) 
    # for now assume this is a two sample test
    power.t.test(delta = 0.1*m, sd = s, power = 0.8, type = "two.sample",
                 alternative = "one.sided")
    
    # Time series analysis ----
    # create a subset for the first household only
    hh_1002 <- subset(resCER_kWh_DT, resCER_kWh_DT$ID == 1002)
    
    # select just October - you'll see why in a minute
    ##hh_1024oct <- subset(hh_1024, hh_1024$oct == 1)
    #L: we can select data from the dataframe by date range
    date_start<-as.POSIXct("2009-10-01",tz="")
    date_end<-as.POSIXct("2009-10-31",tz="")
    hh_1002oct <- hh_1002[hh_1002$l_datetime %in% date_start:date_end, ]
    
    # need to check is sorted by datetime (always increasing) and evenly spaced
    # create zoo (time series) object to do this
    hh_1002oct_z=zoo(hh_1002oct,order.by=hh_1002oct$l_datetime)
    
    # is it a regular time series? (using zoo function)
    is.regular(hh_1002oct_z)
    # if it wasn't, how could we get round this problem?
    # hint: look at na.approx() and approx()
    
    # plot data
    plot(hh_1002oct$kwh)
    
    # run acf with the first household only up to just over 48 hours (96 half hours)
    acf(hh_1002oct$kwh, lag.max = 100)
    # what do we conclude from that?
    
    # let's find the *partial autocorrelation function* (pacf)
    # this is the effect of successive lags with the effect of previous lags removed
    # It shows more clearly how the random variation depends on the previous lags
    # see https://www.youtube.com/watch?v=R-oWTWdS1Jg
    pacf(hh_1002oct$kwh, lag.max = 100)
    # how many lags are significant in this case?
    
    # what might happen if we excluded sleep time (00:00 - 06:00?)
    # hint: 
    # hh_1024oct$my_hod<-(as.double(hh_1024oct$l_datetime)%%86400)/3600
    # ...gives decimal hour of day
    # then can use logical indexing to remove this data:
    # (hh_1024oct$my_hod>6)
    
    # Should we also filter out weekdays from weekends? Why?
    
    # what kind of household is this?
    table(hh_1002oct$ba_nadults)
    # how many adults?
    
    table(hh_1002oct$ba_nchildren)
    # how many children?
    
    table(hh_1002oct$ba_empl)
    # Employed?