Select Git revision
analyseCERdata.R
-
Ben Anderson authoredBen Anderson authored
analyseCERdata.R 6.41 KiB
# Meta -----
# Analyse Irish CER Smart Meter Trial data
# See: http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
# code by: b.anderson@soton.ak.uk (@dataknut) with additions by lsb1@soton.ac.uk
# Purpose: ----
# introductory code for use in teaching using this dataset
# introduce large scale time series data
# introduce data tables
# introduce sub-setting
# introduce zoo package for manipulating time series
# introduce autocorrelation
# Housekeeping ----
# clear the workspace
rm(list=ls())
# install a couple of packages that we will need
# if you have them already, you can comment out these lines with a "#"
#install.packages("data.table")
library(data.table)
#install.packages("zoo")
library(zoo)
print(paste0("Working directory is: ", getwd()))
# change the working directory to where you put the data
# setwd("<where you put the data>")
# In my case:
setwd("~/Documents/Work/Data/CER Smart Metering Project/data/processed")
# Load data & describe it ----
# Load the data by looping over months and years
months <- c("October", "December")
years <- c("2009", "2010")
for (m in months) {
for (y in years) {
print(paste0("Reading in CER_",m,"_",y,"_residential.csv"))
# need to set this table name according to values of m and y
# why is this soooo hard in R? It must be a common need
name <- paste0("resCER_",m,"_",y,"_DT")
assign(name, as.data.table(
read.csv(paste0("CER_",m,"_",y,"_residential.csv"))))
}
}
# what's here?
tables()
# combine the data into one big table in long format
# more use for summaries etc
l = list(resCER_October_2009_DT,resCER_December_2009_DT,resCER_October_2010_DT,resCER_December_2010_DT)
resCER_kWh_DT <- as.data.table(rbindlist(l))
# delete old tables to save memory
resCER_October_2009_DT <- NULL
resCER_December_2009_DT <- NULL
resCER_October_2010_DT <- NULL
resCER_December_2010_DT <- NULL
#L: We can create a date vector using R's date conversion functions
resCER_kWh_DT$l_datetime<-as.POSIXct(resCER_kWh_DT$datetime_start,tz = "", "%Y-%m-%d %H:%M:%S")
# now remove old datetime to save memory
resCER_kWh_DT$datetime_start <- NULL
# extract:
# year
# month
# date
# hour of the day variable (24 hour format)
# relies on the POSIXct time having a regular structure so that the hour is ALWAYS character 12 & 13
resCER_kWh_DT$year <- as.numeric(substr(resCER_kWh_DT$l_datetime, 1, 4))
resCER_kWh_DT$month <- as.numeric(substr(resCER_kWh_DT$l_datetime, 6, 7))
resCER_kWh_DT$date <- as.POSIXct(substr(resCER_kWh_DT$l_datetime, 1, 10))
resCER_kWh_DT$hour <- as.numeric(substr(resCER_kWh_DT$l_datetime, 12, 13))
# Plots ----
# . is a synonym for list
# we can quote functions and then eval them
toRun = quote( .(MeankWh = mean(kWh)))
hourlykWh <- resCER_kWh_DT[,
eval(toRun),
by=.(hour,month,year)] # mean per half hour by month & year
# use min/max to set limits of plot correctly
minkwh <- min(hourlykWh$MeankWh)
maxkwh <- max(hourlykWh$MeankWh)
plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 10],
ylim = c(minkwh,maxkwh),
col = "1",
ylab = "Mean kWh",
lab = c(12,8,2),
xlab = "Hour",
type="l")
# familiar daily profile?
par(new = TRUE)
plot(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 10],
ylim = c(minkwh,maxkwh),
col = "2",
lab = c(12,8,2),
xlab = "",
ylab = "",
type="l")
par(new = TRUE)
plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 12],
ylim = c(minkwh,maxkwh),
col = "3",
lab = c(12,8,2),
xlab = "",
ylab = "",
type="l")
par(new = TRUE)
plot(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 12],
ylim = c(minkwh,maxkwh),
col = "4",
lab = c(12,8,2),
xlab = "",
ylab = "",
type="l")
legend("topleft",
col=c(1:4),
lty=1,
legend=c("October 2009","October 2010","December 2009","December 2010")
)
# calculate mean kWh consumption for evening peak 16:00 - 19:00
# October 2009 (pre trial)
base_Oct2009kWhDT <- as.data.table(resCER_kWh_DT[resCER_kWh_DT$hour > 15 & resCER_kWh_DT$hour < 20 &
resCER_kWh_DT$year == 2009 &
resCER_kWh_DT$month == 10])
m <- mean(base_Oct2009kWhDT$kWh)
# how skewed is that?
summary(base_Oct2009kWhDT$kWh)
# sd
s <- sd(base_Oct2009kWhDT$kWh)
# sample size calculation
# see ?power.t.test
# let us assume we see a 10% change in the mean due to higher evening prices
# we want to test for a reduction (1 sided)
# for now assume this is a two sample test
power.t.test(delta = 0.1*m, sd = s, power = 0.8, type = "two.sample",
alternative = "one.sided")
# Time series analysis ----
# create a subset for the first household only
hh_1002 <- subset(resCER_kWh_DT, resCER_kWh_DT$ID == 1002)
# select just October - you'll see why in a minute
##hh_1024oct <- subset(hh_1024, hh_1024$oct == 1)
#L: we can select data from the dataframe by date range
date_start<-as.POSIXct("2009-10-01",tz="")
date_end<-as.POSIXct("2009-10-31",tz="")
hh_1002oct <- hh_1002[hh_1002$l_datetime %in% date_start:date_end, ]
# need to check is sorted by datetime (always increasing) and evenly spaced
# create zoo (time series) object to do this
hh_1002oct_z=zoo(hh_1002oct,order.by=hh_1002oct$l_datetime)
# is it a regular time series? (using zoo function)
is.regular(hh_1002oct_z)
# if it wasn't, how could we get round this problem?
# hint: look at na.approx() and approx()
# plot data
plot(hh_1002oct$kwh)
# run acf with the first household only up to just over 48 hours (96 half hours)
acf(hh_1002oct$kwh, lag.max = 100)
# what do we conclude from that?
# let's find the *partial autocorrelation function* (pacf)
# this is the effect of successive lags with the effect of previous lags removed
# It shows more clearly how the random variation depends on the previous lags
# see https://www.youtube.com/watch?v=R-oWTWdS1Jg
pacf(hh_1002oct$kwh, lag.max = 100)
# how many lags are significant in this case?
# what might happen if we excluded sleep time (00:00 - 06:00?)
# hint:
# hh_1024oct$my_hod<-(as.double(hh_1024oct$l_datetime)%%86400)/3600
# ...gives decimal hour of day
# then can use logical indexing to remove this data:
# (hh_1024oct$my_hod>6)
# Should we also filter out weekdays from weekends? Why?
# what kind of household is this?
table(hh_1002oct$ba_nadults)
# how many adults?
table(hh_1002oct$ba_nchildren)
# how many children?
table(hh_1002oct$ba_empl)
# Employed?