# analyseCERdata.R
# (Git web-UI scrape residue removed from the top of this file so it parses as R;
#  the original script begins below)
    # Meta -----
    # Analyse Irish CER Smart Meter Trial data
    # See: http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
    
    # code by: b.anderson@soton.ak.uk (@dataknut) with additions by lsb1@soton.ac.uk
    
    # Purpose: ----
    # introductory code for use in teaching using this dataset
    # introduce large scale time series data
    # introduce data tables
    # introduce sub-setting 
    # introduce zoo package for manipulating time series
    # introduce autocorrelation
    
    # Housekeeping ----
    # clear the workspace
    rm(list=ls())
    
    # install a couple of packages that we will need
    # if you have them already, you can comment out these lines with a "#"
    #install.packages("data.table") 
    library(data.table)
    #install.packages("zoo")
    library(zoo)
    
    print(paste0("Working directory is: ", getwd()))
    
    # change the working directory to where you put the data
    # setwd("<where you put the data>")
    # In my case:
    setwd("~/Documents/Work/Data/CER Smart Metering Project/data/processed")
    
    # Load data & describe it ----
    # Load the data by looping over months and years
    months <- c("October", "December")
    years <- c("2009", "2010")
    for (m in months) {
      for (y in years) {
        print(paste0("Reading in CER_",m,"_",y,"_residential.csv"))
        # need to set this table name according to values of m and y
        # why is this soooo hard in R? It must be a common need
        name <- paste0("resCER_",m,"_",y,"_DT")
        assign(name, as.data.table(
                          read.csv(paste0("CER_",m,"_",y,"_residential.csv"))))
      }
    }
    
    # what's here?
    tables()
    
    # combine the data into one big table in long format
    # more use for summaries etc
    l = list(resCER_October_2009_DT,resCER_December_2009_DT,resCER_October_2010_DT,resCER_December_2010_DT)
    resCER_kWh_DT <- as.data.table(rbindlist(l))
    
    # delete old tables to save memory
    resCER_October_2009_DT <- NULL
    resCER_December_2009_DT <- NULL
    resCER_October_2010_DT <- NULL
    resCER_December_2010_DT <- NULL
    
    #L: We can create a date vector using R's date conversion functions
    resCER_kWh_DT$l_datetime<-as.POSIXct(resCER_kWh_DT$datetime_start,tz = "", "%Y-%m-%d %H:%M:%S")
    # now remove old datetime to save memory
    resCER_kWh_DT$datetime_start <- NULL
    
    # extract:
    # year
    # month
    # date
    # hour of the day variable (24 hour format)
    # relies on the POSIXct time having a regular structure so that the hour is ALWAYS character 12 & 13
    resCER_kWh_DT$year <- as.numeric(substr(resCER_kWh_DT$l_datetime, 1, 4))
    resCER_kWh_DT$month <- as.numeric(substr(resCER_kWh_DT$l_datetime, 6, 7))
    resCER_kWh_DT$date <- as.POSIXct(substr(resCER_kWh_DT$l_datetime, 1, 10))
    resCER_kWh_DT$hour <- as.numeric(substr(resCER_kWh_DT$l_datetime, 12, 13))
    
    # Plots ----
    # . is a synonym for list
    # we can quote functions and then eval them
    toRun = quote( .(MeankWh = mean(kWh)))
    hourlykWh <- resCER_kWh_DT[, 
                  eval(toRun),
                  by=.(hour,month,year)] # mean per half hour by month &  year
    
    # use min/max to set limits of plot correctly
    minkwh <- min(hourlykWh$MeankWh)
    maxkwh <- max(hourlykWh$MeankWh)
    
    plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 10],
         ylim = c(minkwh,maxkwh),
         col = "1",
         ylab = "Mean kWh",
         lab = c(12,8,2),
         xlab = "Hour",
         type="l")
    # familiar daily profile?
    par(new = TRUE)
    plot(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 10],
         ylim = c(minkwh,maxkwh),
         col = "2",
         lab = c(12,8,2),
         xlab = "",
         ylab = "",
         type="l")
    par(new = TRUE)
    plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 12],
         ylim = c(minkwh,maxkwh),
         col = "3",
         lab = c(12,8,2),
         xlab = "",
         ylab = "",
         type="l")
    par(new = TRUE)
    plot(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 12],
         ylim = c(minkwh,maxkwh),
         col = "4",
         lab = c(12,8,2),
         xlab = "",
         ylab = "",
         type="l")
    
    legend("topleft",
           col=c(1:4),
           lty=1,
           legend=c("October 2009","October 2010","December 2009","December 2010")
           )
    
    # calculate mean kWh consumption for evening peak 16:00 - 19:00 
    # October 2009 (pre trial)
    base_Oct2009kWhDT <- as.data.table(resCER_kWh_DT[resCER_kWh_DT$hour > 15 & resCER_kWh_DT$hour < 20 &
                                             resCER_kWh_DT$year == 2009 &
                                             resCER_kWh_DT$month == 10])
    
    m <- mean(base_Oct2009kWhDT$kWh)
    
    # how skewed is that?
    summary(base_Oct2009kWhDT$kWh)
    
    # sd
    s <- sd(base_Oct2009kWhDT$kWh)
    
    # sample size calculation
    # see ?power.t.test
    # let us assume we see a 10% change in the mean due to higher evening prices
    # we want to test for a reduction (1 sided) 
    # for now assume this is a two sample test
    power.t.test(delta = 0.1*m, sd = s, power = 0.8, type = "two.sample",
                 alternative = "one.sided")
    
    # Time series analysis ----
    # create a subset for the first household only
    hh_1002 <- subset(resCER_kWh_DT, resCER_kWh_DT$ID == 1002)
    
    # select just October - you'll see why in a minute
    ##hh_1024oct <- subset(hh_1024, hh_1024$oct == 1)
    #L: we can select data from the dataframe by date range
    date_start<-as.POSIXct("2009-10-01",tz="")
    date_end<-as.POSIXct("2009-10-31",tz="")
    hh_1002oct <- hh_1002[hh_1002$l_datetime %in% date_start:date_end, ]
    
    # need to check is sorted by datetime (always increasing) and evenly spaced
    # create zoo (time series) object to do this
    hh_1002oct_z=zoo(hh_1002oct,order.by=hh_1002oct$l_datetime)
    
    # is it a regular time series? (using zoo function)
    is.regular(hh_1002oct_z)
    # if it wasn't, how could we get round this problem?
    # hint: look at na.approx() and approx()
    
    # plot data
    plot(hh_1002oct$kwh)
    
    # run acf with the first household only up to just over 48 hours (96 half hours)
    acf(hh_1002oct$kwh, lag.max = 100)
    # what do we conclude from that?
    
    # let's find the *partial autocorrelation function* (pacf)
    # this is the effect of successive lags with the effect of previous lags removed
    # It shows more clearly how the random variation depends on the previous lags
    # see https://www.youtube.com/watch?v=R-oWTWdS1Jg
    pacf(hh_1002oct$kwh, lag.max = 100)
    # how many lags are significant in this case?
    
    # what might happen if we excluded sleep time (00:00 - 06:00?)
    # hint: 
    # hh_1024oct$my_hod<-(as.double(hh_1024oct$l_datetime)%%86400)/3600
    # ...gives decimal hour of day
    # then can use logical indexing to remove this data:
    # (hh_1024oct$my_hod>6)
    
    # Should we also filter out weekdays from weekends? Why?
    
    # what kind of household is this?
    table(hh_1002oct$ba_nadults)
    # how many adults?
    
    table(hh_1002oct$ba_nchildren)
    # how many children?
    
    table(hh_1002oct$ba_empl)
    # Employed?