Commit 8a1fee1b, authored 9 years ago by Ben Anderson
Commit message: testing data analysis
Parent: 32d24efe

1 changed file: analyseCERdata.R (new file, mode 100644), 205 additions, 0 deletions
# Meta -----
# Analyse Irish CER Smart Meter Trial data
# See: http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
# code by: b.anderson@soton.ac.uk (@dataknut) with additions by lsb1@soton.ac.uk

# Purpose: ----
# introductory code for use in teaching using this dataset
# introduce large scale time series data
# introduce data tables
# introduce sub-setting
# introduce the zoo package for manipulating time series
# introduce autocorrelation

# Housekeeping ----
# clear the workspace
rm(list = ls())

# install a couple of packages that we will need
# if you have them already, you can comment out these lines with a "#"
#install.packages("data.table")
library(data.table)
#install.packages("zoo")
library(zoo)

print(paste0("Working directory is: ", getwd()))

# change the working directory to where you put the data
# setwd("<where you put the data>")
# In my case:
setwd("~/Documents/Work/Data/CER Smart Metering Project/data/processed")
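
# Added sketch (not in the original script): a quick check that the residential
# CSV files read in below are actually present in this directory.
list.files(pattern = "residential\\.csv$")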

# Load data & describe it ----
# Load the data
# loop over months and years
months <- c("October", "December")
years <- c("2009", "2010")

for (m in months) {
  for (y in years) {
    print(paste0("Reading in CER_", m, "_", y, "_residential.csv"))
    # need to set this table name according to values of m and y
    # why is this soooo hard in R? It must be a common need
    # (an alternative is sketched just after this loop)
    name <- paste0("resCER_", m, "_", y, "_DT")
    assign(name, as.data.table(read.csv(paste0("CER_", m, "_", y, "_residential.csv"))))
  }
}

# what's here?
tables()
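
# Added sketch, not part of the original flow: the same files could be collected
# in a named list instead of using assign(), which avoids building object names
# as strings. resCER_list and "source" are illustrative names only.
# resCER_list <- list()
# for (m in months) {
#   for (y in years) {
#     resCER_list[[paste0(m, "_", y)]] <-
#       as.data.table(read.csv(paste0("CER_", m, "_", y, "_residential.csv")))
#   }
# }
# rbindlist(resCER_list, idcol = "source") would then give one long table directly.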

# combine the data into one big table in long format
# more useful for summaries etc
l = list(resCER_October_2009_DT,
         resCER_December_2009_DT,
         resCER_October_2010_DT,
         resCER_December_2010_DT)
resCER_kWh_DT <- as.data.table(rbindlist(l))

# delete old tables to save memory
resCER_October_2009_DT <- NULL
resCER_December_2009_DT <- NULL
resCER_October_2010_DT <- NULL
resCER_December_2010_DT <- NULL
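
# Added sketch: an equivalent clean-up would be to remove the objects entirely
# and ask R to release the memory (names as above).
# rm(resCER_October_2009_DT, resCER_December_2009_DT,
#    resCER_October_2010_DT, resCER_December_2010_DT, l)
# gc()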

#L: We can create a date vector using R's date conversion functions
resCER_kWh_DT$l_datetime <- as.POSIXct(resCER_kWh_DT$datetime_start,
                                       tz = "", "%Y-%m-%d %H:%M:%S")

# now remove old datetime to save memory
resCER_kWh_DT$datetime_start <- NULL
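
# Added check (sketch): the conversion above should not have produced NAs and the
# first few values should look like half-hourly timestamps.
summary(is.na(resCER_kWh_DT$l_datetime))
head(resCER_kWh_DT$l_datetime)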

# extract:
# year
# month
# date
# hour of the day variable (24 hour format)
# relies on the POSIXct time having a regular structure so that the hour is ALWAYS character 12 & 13
resCER_kWh_DT$year  <- as.numeric(substr(resCER_kWh_DT$l_datetime, 1, 4))
resCER_kWh_DT$month <- as.numeric(substr(resCER_kWh_DT$l_datetime, 6, 7))
resCER_kWh_DT$date  <- as.POSIXct(substr(resCER_kWh_DT$l_datetime, 1, 10))
resCER_kWh_DT$hour  <- as.numeric(substr(resCER_kWh_DT$l_datetime, 12, 13))
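
# Added sketch: format() gives the same values without relying on fixed character
# positions, which is safer if the timestamp layout ever changes.
# resCER_kWh_DT$year  <- as.numeric(format(resCER_kWh_DT$l_datetime, "%Y"))
# resCER_kWh_DT$month <- as.numeric(format(resCER_kWh_DT$l_datetime, "%m"))
# resCER_kWh_DT$hour  <- as.numeric(format(resCER_kWh_DT$l_datetime, "%H"))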

# Plots ----
# . is a synonym for list
# we can quote functions and then eval them
toRun = quote(.(MeankWh = mean(kWh)))
hourlykWh <- resCER_kWh_DT[, eval(toRun), by = .(hour, month, year)]
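
# For reference (added sketch): the quoted-expression version above is equivalent
# to calling data.table directly with the same column names:
# hourlykWh <- resCER_kWh_DT[, .(MeankWh = mean(kWh)), by = .(hour, month, year)]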

# mean half-hourly kWh by hour of day, month & year
# use min/max to set the limits of the plot correctly
minkwh <- min(hourlykWh$MeankWh)
maxkwh <- max(hourlykWh$MeankWh)

plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 10],
     ylim = c(minkwh, maxkwh),
     col = "1",
     ylab = "Mean kWh",
     lab = c(12, 8, 2),
     xlab = "Hour",
     type = "l")
# familiar daily profile?
par(new = TRUE)
plot(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 10],
     ylim = c(minkwh, maxkwh),
     col = "2",
     lab = c(12, 8, 2),
     xlab = "",
     ylab = "",
     type = "l")
par(new = TRUE)
plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 12],
     ylim = c(minkwh, maxkwh),
     col = "3",
     lab = c(12, 8, 2),
     xlab = "",
     ylab = "",
     type = "l")
par(new = TRUE)
plot(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 12],
     ylim = c(minkwh, maxkwh),
     col = "4",
     lab = c(12, 8, 2),
     xlab = "",
     ylab = "",
     type = "l")

legend("topleft",
       col = c(1:4),
       lty = 1,
       legend = c("October 2009", "October 2010", "December 2009", "December 2010"))
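
# Added sketch: the same figure can be built with lines(), which draws onto the
# existing plot and avoids the repeated par(new = TRUE) calls (colours as above).
# plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 10],
#      ylim = c(minkwh, maxkwh), col = 1, type = "l",
#      xlab = "Hour", ylab = "Mean kWh", lab = c(12, 8, 2))
# lines(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 10], col = 2)
# lines(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 12], col = 3)
# lines(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 12], col = 4)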

# calculate mean kWh consumption for evening peak 16:00 - 19:00
# October 2009 (pre trial)
base_Oct2009kWhDT <- as.data.table(resCER_kWh_DT[resCER_kWh_DT$hour > 15 &
                                                   resCER_kWh_DT$hour < 20 &
                                                   resCER_kWh_DT$year == 2009 &
                                                   resCER_kWh_DT$month == 10])
m <- mean(base_Oct2009kWhDT$kWh)

# how skewed is that?
summary(base_Oct2009kWhDT$kWh)
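
# Added sketch: a histogram makes the skew visible directly (same column as above).
hist(base_Oct2009kWhDT$kWh, main = "Evening peak half-hourly kWh, October 2009", xlab = "kWh")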

# sd
s <- sd(base_Oct2009kWhDT$kWh)

# sample size calculation
# see ?power.t.test
# let us assume we see a 10% change in the mean due to higher evening prices
# we want to test for a reduction (1 sided)
# for now assume this is a two sample test
power.t.test(delta = 0.1 * m,
             sd = s,
             power = 0.8,
             type = "two.sample",
             alternative = "one.sided")
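
# Added sketch: storing the result lets us pull out the implied sample size;
# pt_result is an illustrative name only.
pt_result <- power.t.test(delta = 0.1 * m, sd = s, power = 0.8,
                          type = "two.sample", alternative = "one.sided")
ceiling(pt_result$n)  # approximate number of observations needed per group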

# Time series analysis ----
# create a subset for the first household only
hh_1002 <- subset(resCER_kWh_DT, resCER_kWh_DT$ID == 1002)

# select just October - you'll see why in a minute
##hh_1024oct <- subset(hh_1024, hh_1024$oct == 1)
#L: we can select data from the dataframe by date range
date_start <- as.POSIXct("2009-10-01", tz = "")
date_end <- as.POSIXct("2009-10-31", tz = "")
hh_1002oct <- hh_1002[hh_1002$l_datetime %in% date_start:date_end, ]
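
# Added sketch: a direct range comparison gives the same subset without building
# the very long one-second sequence that date_start:date_end expands to.
# hh_1002oct <- hh_1002[hh_1002$l_datetime >= date_start &
#                         hh_1002$l_datetime <= date_end, ]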

# need to check it is sorted by datetime (always increasing) and evenly spaced
# create a zoo (time series) object to do this
hh_1002oct_z = zoo(hh_1002oct, order.by = hh_1002oct$l_datetime)

# is it a regular time series? (using a zoo function)
is.regular(hh_1002oct_z)
# if it wasn't, how could we get round this problem?
# hint: look at na.approx() and approx()
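# Added sketch for the irregular case (hypothetical here, kWh column only):
# merge onto a regular half-hourly index and interpolate the gaps with na.approx().
# kwh_z <- zoo(hh_1002oct$kWh, order.by = hh_1002oct$l_datetime)
# reg_index <- seq(min(index(kwh_z)), max(index(kwh_z)), by = 30 * 60)
# kwh_z_reg <- na.approx(merge(kwh_z, zoo(, reg_index)))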

# plot data
plot(hh_1002oct$kWh)

# run acf with the first household only up to just over 48 hours (96 half hours)
acf(hh_1002oct$kWh, lag.max = 100)
# what do we conclude from that?

# let's find the *partial autocorrelation function* (pacf)
# this is the effect of successive lags with the effect of previous lags removed
# It shows more clearly how the random variation depends on the previous lags
# see https://www.youtube.com/watch?v=R-oWTWdS1Jg
pacf(hh_1002oct$kWh, lag.max = 100)
# how many lags are significant in this case?
# what might happen if we excluded sleep time (00:00 - 06:00)?
# hint:
# hh_1002oct$my_hod <- (as.double(hh_1002oct$l_datetime) %% 86400) / 3600
# ...gives decimal hour of day
# then we can use logical indexing to remove this data:
# (hh_1002oct$my_hod > 6)
# Should we also separate weekdays from weekends? Why?
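
# Added sketch applying the hint above (exercise, not part of the original analysis):
# drop the 00:00 - 06:00 readings before re-running pacf, and build a weekend flag.
# Note the filtered series is no longer contiguous, so interpret the result with care.
# hh_1002oct$my_hod <- (as.double(hh_1002oct$l_datetime) %% 86400) / 3600
# pacf(hh_1002oct$kWh[hh_1002oct$my_hod > 6], lag.max = 100)
# hh_1002oct$weekend <- weekdays(hh_1002oct$l_datetime) %in% c("Saturday", "Sunday")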

# what kind of household is this?
table(hh_1002oct$ba_nadults)   # how many adults?
table(hh_1002oct$ba_nchildren) # how many children?
table(hh_1002oct$ba_empl)      # Employed?