Census2022-CER_regP_48_CLUSTER_std-1-5-15.R

####################################################################################
## Data Analysis for ESRC Transformative project
## - Using the Commission for Energy Regulation (CER)'s Irish Smart Meter Trial data
##   - http://www.ucd.ie/issda/data/commissionforenergyregulationcer/

## midweek & weekend household regressions using:
## - standardised 24 hour power data derived clusters
## - AR coefficients (lag = 35 & 70)
## - power variables

## For the latest version of this code go to: https://github.com/dataknut/Census2022

## This work was funded by RCUK through the ESRC's Transformative Social Science Programme via the
## "Census 2022: Transforming Small Area Socio-Economic Indicators through 'Big Data'" Project 
## - http://gtr.rcuk.ac.uk/project/2D2CD798-4F04-4399-B1AF-D810A233DD21
## - http://www.energy.soton.ac.uk/tag/census2022/
 
## Copyright (C) 2014  University of Southampton

## Author: Sharon Lin (X.Lin@soton.ac.uk) 
##  [Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]

## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License 
## (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

## #YMMV - http://en.wiktionary.org/wiki/YMMV

rm(list=ls())
## install.packages("psych")
library("psych")
library(lme4)
require(plyr)
library("lmtest")
library(xlsx)
library(nnet)


## data load and preparation
setwd("//soton.ac.uk/ude/PersonalFiles/Users/xl25g12/mydocuments/census2022/CER_analysis/data")
HHsurvey = read.table("HH09_pretrial.txt",header=T,stringsAsFactors=FALSE,sep="\t")   ## read in txt file, need sep="t", HHpresurvey data, the income column is from Q4021 only
HH09FA = read.table("HH_floor_pretrial_survey.txt",header=T,stringsAsFactors=FALSE,sep="\t")   ## read in txt file, need sep="t"; pretrial floor area data
## income category 6 has been taken out of the sample.
HH09income = read.xlsx("CER_income09.xlsx",3, header=T, stringsAsFactors=FALSE) ## 4232 obs of 13 variables  ## note in excel file, use space to replace "NA"
tmp1 = HHsurvey[,1:11]
tmp2 = HH09FA[,-2:-3]
tmp3 = merge(tmp1,tmp2)
tmp4 = merge(tmp3,HH09income)
describe(tmp4)
svdata = tmp4
## make a dummy variable with presence of chilren 0, no children, 1 with children
svdata["dum_child"] = NA
svdata= within(svdata,dum_child[HHtype_y09==3]<-1)   ##HHtype = 3, living with children (< 15 years)
svdata= within(svdata,dum_child[HHtype_y09==2]<-0)   ##HHtype = 2, living with adults(> 15years)
svdata= within(svdata,dum_child[HHtype_y09==1]<-0)   ##HHtype = 1, living alone 

## make a dummy variable for no. of residents,  1 or 2 residents, dum_people = 0, 3 or above residents, dum_people = 1 
svdata["dum_people"]= NA
svdata = within(svdata,dum_people[HHtype_y09==1]<-0)   
svdata = within(svdata,dum_people[HHtype_y09==3&HHchild_y09==1]<-0)   
svdata = within(svdata,dum_people[HHtype_y09==2&HHadult_y09<3]<-0)   
svdata = within(svdata,dum_people[HHchild_y09>1]<-1) 
svdata = within(svdata,dum_people[HHadult_y09>2]<-1)
## make a dummy variable for employment status,  1-3 as employment (0), 4-7 as unemployment (1)
svdata["dum_em"]= NA
svdata = within(svdata,dum_em[employment_y09==1]<-0)   
svdata = within(svdata,dum_em[employment_y09==2]<-0)   
svdata = within(svdata,dum_em[employment_y09==3]<-0)   
svdata = within(svdata,dum_em[employment_y09>3]<-1)   


## careful with weekend or midweek data


## 8 LHS indicators: 1) peak time, 2) peak load, 3) baseload, 4) mean demand, 5) sum, 6) 97.5%, 7) ECF, 8) LF    
## energy data
setwd("//soton.ac.uk/ude/PersonalFiles/Users/xl25g12/mydocuments/census2022/CER_analysis/data")

## weekend data
  cluster_48 = read.table("CER_Oct09HH_48obs_weekend.txt", header=T, stringsAsFactors=FALSE)        ## read in text files
  energydata = read.table("CER_OctHH_wkend_long", header=T, stringsAsFactors=FALSE)        
  ARlong = read.csv("CER_Oct2009_archr-all_hubids_Weekends_v2.csv", header=T, stringsAsFactors=FALSE) ## 4232 obs of 13 variables  ## note in excel file, use space to replace "NA"
# 
# 


## midweek data
#     cluster_48 = read.table("CER_Oct09HH_48obs_midweek.txt", header=T, stringsAsFactors=FALSE)        ## read in text files
#     energydata = read.table("CER_OctHH_midwk_long", header=T, stringsAsFactors=FALSE) 
#     ARlong = read.csv("CER_Oct2009_archr-all_hubids_Midweek_v2.csv", header=T, stringsAsFactors=FALSE) ## 4232 obs of 13 variables  ## note in excel file, use space to replace "NA"

## processing AR from long to shor data form
ARlong = ARlong[,1:3]
AR.sort= ARlong[order(ARlong$ID),]    ## unique IDs: 3486
AR = reshape(AR.sort, timevar="lag",idvar="ID", direction = "wide")    ## checked, done correctly, 3486 IDs

tmp = energydata[energydata$DateOct<365,]
regdata = ddply(tmp, .(ID), summarize,
                octpeaktime = round(mean(dailypktime), 4),
                octmax = round(mean(dailymax), 4),
                octammean = round(mean(dailyammean), 4),
                octbase = round(mean(dailybase), 4),
                octmean = round(mean(dailymean), 4),
                octsum = round(mean(dailysum), 4),
                octq975= round(mean(dailyq975),4),
                octECF = round(mean(ECF), 4),
                octLF = round(mean(LF), 4),
                na.rm=TRUE)  

## merge data together
tmp = merge(svdata,AR)           ## AR35 is the autocorrleation of 35 lags, or a day.  AR70 is the autocorrelation of 70 lags, or two days
tmp1 = merge(tmp,regdata)
mydatam = merge(tmp1,cluster_48)

######### standardised clustering  ############
stddata = mydatam[,177:224]
stddata = na.omit(stddata)
stddata = scale(stddata)   ## make mean 0, stdev 1

## k means clustering
## partitioning by a plot of the within groups sum of squares (WSS) by number of clusters extracted (looking for a bend in the plot, http://www.statmethods.net/advstats/cluster.html)
## wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var))
wss <- (nrow(stddata)-1)*sum(apply(stddata,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(stddata,centers=i,iter.max=20,nstart=1)$withinss)   ## allow 20 iterations
plot(1:15, wss, type="b", xlab="Number of Clusters",
     ylab="Within groups sum of squares",
     main="CER_midweek_clusters (30min profile")

### fit kmeans analysis for 6 clusters ##########
set.seed(1)
fit <- kmeans(stddata, 6) # no. of cluster solution
ckmean = aggregate(stddata,by=list(fit$cluster),FUN=mean)    # get cluster means
# append cluster assignment
stddataf <- data.frame(mydatam$ID,stddata, fit$cluster)    ## standardized clusters with ID column
mydata = stddataf
mydata["dum_ck1"] = 0
mydata = within(mydata, dum_ck1[fit.cluster==1]<-1)
mydata["dum_ck2"] = 0
mydata = within(mydata, dum_ck2[fit.cluster==2]<-1)
mydata["dum_ck3"] = 0
mydata = within(mydata, dum_ck3[fit.cluster==3]<-1)
mydata["dum_ck4"] = 0
mydata = within(mydata, dum_ck4[fit.cluster==4]<-1)
mydata["dum_ck5"] = 0
mydata = within(mydata, dum_ck5[fit.cluster==5]<-1)
mydata["dum_ck6"] = 0
mydata = within(mydata, dum_ck6[fit.cluster==6]<-1)
names(mydata)[names(mydata)=="mydatam.ID"]<-"ID"

############## end of standardizing clusters,  mydata has 48 obs of standardized means and dummy variables for 6 clusters  ################

## merge energy, HH survey, AR, cluster data together
datam = merge(mydatam[,1:175],mydata)   ### data in daily 48 obs are standardized (mean 0, std = 1)
## write.table(mydata,"test_wkend_cluster")
mydata = as.data.frame(datam)
## test=do.call(data.frame,lapply(mydata, function(x) replace(x, is.infinite(x),NA)))

## change data structure
mydata$dum_em = as.factor(mydata$dum_em)
mydata$dum_people = as.factor(mydata$dum_people)
mydata$dum_child = as.factor(mydata$dum_child)
mydata$dum_ck1 = as.factor(mydata$dum_ck1)
mydata$dum_ck2 = as.factor(mydata$dum_ck2)
mydata$dum_ck3 = as.factor(mydata$dum_ck3)
mydata$dum_ck4 = as.factor(mydata$dum_ck4)
mydata$dum_ck5 = as.factor(mydata$dum_ck5)
mydata$dum_ck6 = as.factor(mydata$dum_ck6)


##### logit regresssion  models  

## Wkend
# logit(dum_resident) ~ ampkmean + mean
# logit(dum_child) ~ ampkmean + mean 


# ## dummy people    1/2   v 3/3+
tmp = mydata
tmp1 = tmp[!is.na(tmp$dum_people),]
mydata_r = tmp1
### weekend regresssion 
D1_logit = glm(dum_people ~ octammean + octmean, na.action="na.exclude", data=mydata_r,family="binomial")    ## linear regression 
D1a_logit = glm(dum_people ~ octammean + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.exclude", data=mydata_r,family="binomial")    ## linear regression 
D1b_logit = glm(dum_people ~ octammean + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6+ archr.35+archr.70, na.action="na.exclude", data=mydata_r,family="binomial")    ## linear regression 

lrtest(D1_logit,D1a_logit)      ## 4.297e-09 ***
lrtest(D1a_logit,D1b_logit)    ## 0.002879 **
# 
# ## dummy children     0 or 1 (presence of chilidren)
tmp = mydata
tmp1 = tmp[!is.na(tmp$dum_child),]
mydata_r = tmp1

D2_logit = glm(dum_child ~ octammean + octmean, na.action="na.exclude", data=mydata_r,family="binomial")    ## linear regression 
 D2a_logit = glm(dum_child ~ octammean + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.exclude", data=mydata_r,family="binomial")
 D2b_logit = glm(dum_child ~ octammean + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35+archr.70, na.action="na.exclude", data=mydata_r,family="binomial")
 lrtest(D2_logit,D2a_logit)      ## 2.258e-15 ***
 lrtest(D2a_logit,D2b_logit)      ##  0.08583
## Midweek regression
# logit(dum_people) ~ ampkmean + base + mean
# logit(dum_child) ~ ampkmean + mean + ECF
# logit(employment_y09) ~ ECF + LF

# dummy people as LHV
tmp = mydata
tmp1 = tmp[!is.na(tmp$dum_people),]
mydata_r = tmp1
C1_logit = glm(dum_people ~ octammean + octbase + octmean, na.action="na.exclude", data=mydata_r,family="binomial")    ## linear regression 
C1a_logit = glm(dum_people ~ octammean + octbase + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.exclude", data=mydata_r,family="binomial")    ## linear regression 
C1b_logit = glm(dum_people ~ octammean + octbase + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 +archr.35+archr.70 , na.action="na.exclude", data=mydata_r,family="binomial")    ## linear regression 
lrtest(C1_logit,C1a_logit)     ## 4.393e-12 ***
lrtest(C1a_logit,C1b_logit)   ## 0.4109 .
# 
# ## dummy child as LHV
tmp = mydata
tmp1 = tmp[!is.na(tmp$dum_child),]
mydata_r = tmp1

C2_logit = glm(dum_child ~ octmax + octammean + octmean + octECF, na.action="na.exclude", data=mydata_r,family="binomial")    ## linear regression 
C2a_logit = glm(dum_child ~ octmax +  octammean + octmean + octECF  + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.exclude", data=mydata_r,family="binomial")    ## linear regression 
C2b_logit = glm(dum_child ~ octmax + octammean + octmean + octECF+ dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 +archr.35+archr.70, na.action="na.exclude", data=mydata_r,family="binomial")    ## linear regression 

lrtest(C2_logit,C2a_logit)     ##  2.973e-09 *** midweek
lrtest(C2a_logit,C2b_logit)    ##   0.00177 ** midweek

# 

### dum_employment as LHV, midweek only
#  tmp = mydata
#  tmp1 = tmp[!is.na(tmp$dum_em),]
#  mydata_r = tmp1
# 
# S5_logit = glm(dum_em ~ octECF + octLF, na.action="na.exclude", data=mydata_r,family="binomial")
# S5a_logit = glm(dum_em ~ octECF + octLF + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 , na.action="na.exclude", data=mydata_r,family="binomial")
# S5b_logit = glm(dum_em ~ octECF + octLF + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70, na.action="na.exclude", data=mydata_r,family="binomial")
# S5c_logit = glm(dum_em ~ dum_child+ dum_people + octECF + octLF + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70, na.action="na.exclude", data=mydata_r,family="binomial")
# lrtest(S5_logit,S5a_logit)     ## 2.2e-16
# lrtest(S5a_logit,S5b_logit)    ## 0.00013
# lrtest(S5b_logit,S5c_logit)    ## 2.2e-16 

####### classification
## input output regression
logit = C2b_logit

## input depedent variable
true_dum = as.numeric(as.character(mydata_r$dum_child))   ## put dummy back to 0 and 1; dummy for employed is 1

## run the following, 
## extract fitted values
fitted = fitted(logit)
predited_dum = 0
logit_out= as.data.frame(cbind(true_dum,fitted,predited_dum))      ## 
colnames(logit_out) = c("true_dum", "fitted","predited_dum")       ## 
logit_out = within(logit_out, predited_dum[fitted>0.5]<-1)

## fill in the correct "+" preditions and "-" predictions
## 
TrueD = as.data.frame(count(logit_out,"true_dum"))
Total = TrueD[1,2] + TrueD[2,2]
# true_dum freq
# 1        0 2089
# 2        1 1399

correctP = sum(logit_out[,1]==0&logit_out[,3]==0)  ## corret negative prediction  , eg. 1757
correctN= sum(logit_out[,1]==1&logit_out[,3]==1)  ## correct positive prediction  ,  e.g. 763
(sum(logit_out[,1]==0&logit_out[,3]==0) + sum(logit_out[,1]==1&logit_out[,3]==1)) /(TrueD[1,2]+TrueD[2,2])  
## correct predictions (both positive and negative)
(correctP + correctN) /Total  ## 0.7225
###### end of classification test ##############


## regression ##  
## income band
## multinomail income band regression as LHV
 tmp = mydata
 tmp1 = tmp[!is.na(tmp$HH_income),]
 mydata_r = tmp1
# midweek
#  test = multinom(HH_income ~ octammean + octbase + octmean,na.action="na.omit",data=mydata_r )   ## 2LL = 2*2179 (residual deviance)
#  test_cluster = multinom(HH_income ~ octammean + octbase + octmean + dum_ck3+ dum_ck4+ dum_ck5+dum_ck6 ,na.action="na.omit",data=mydata_r )   ## 2LL = 2*2166 (residual deviance)
#  test_clusterar = multinom(HH_income ~ octammean + octbase + octmean  + dum_ck2 + dum_ck3+ dum_ck4+ dum_ck5+dum_ck6 + archr.35 + archr.70 + archr.105,na.action="na.omit",data=mydata_r )   ## 2LL = 2*2163 (residual deviance)
#  test_clusterar_admin = multinom(HH_income ~ dum_people + dum_child + octammean + octbase + octmean  + dum_ck2 + dum_ck3+ dum_ck4+ dum_ck5+dum_ck6 + archr.35 + archr.70 + archr.105,na.action="na.omit",data=mydata_r)   ## 2LL = 2*2163 (residual deviance)

##weekend 
test = multinom(HH_income ~ octmax + octbase + octmean,na.action="na.omit",data=mydata_r )   ## 2LL = 2*2179 (residual deviance)
test_cluster = multinom(HH_income ~ octmax + octbase + octmean + dum_ck3+ dum_ck4+ dum_ck5+dum_ck6 ,na.action="na.omit",data=mydata_r )   ## 2LL = 2*2166 (residual deviance)
test_clusterar = multinom(HH_income ~ octmax + octbase + octmean  + dum_ck2 + dum_ck3+ dum_ck4+ dum_ck5+dum_ck6 + archr.35 + archr.70 + archr.105,na.action="na.omit",data=mydata_r )   ## 2LL = 2*2163 (residual deviance)
test_clusterar_admin = multinom(HH_income ~ dum_people + dum_child + octmax + octbase + octmean  + dum_ck2 + dum_ck3+ dum_ck4+ dum_ck5+dum_ck6 + archr.35 + archr.70 +  archr.105,na.action="na.omit",data=mydata_r)

 lrtest(test,test_cluster)                     ### 0.005248 (midweek)         ### 0.5166 (weekend)
 lrtest(test_cluster,test_clusterar)           ## 0.002872 (midweek)         ##  0.9379 (weekend)
 lrtest(test_clusterar,test_clusterar_admin)   ## 2.483e-13  (midweek)       ##  1.403e-11 (weekend) 
# 
test = test_clusterar_admin
summary(test)
z <- summary(test)$coefficients/summary(test)$standard.errors   ## z-score
z
p <- (1 - pnorm(abs(z), 0, 1)) * 2    ## p-value
p
exp(coef(test))    ## relative risk ratio
pp <- fitted(test)   ## 
head(pp)

# ## floor space weekend
#  FL = lm(M2 ~ octbase + octLF, ,na.action="na.exclude",data=mydata)    ## working order
#  FLa = lm(M2 ~ octbase + octLF + dum_ck2 + dum_ck3+ dum_ck4+ dum_ck5+dum_ck6, ,na.action="na.exclude",data=mydata)    ## working order
#  FLb = lm(M2 ~ octbase + octLF + dum_ck2 + dum_ck3+ dum_ck4+ dum_ck5+dum_ck6 + archr.35 + archr.70 + archr.105, ,na.action="na.exclude",data=mydata)    ## working order
#  FLc = lm(M2 ~ octbase + dum_people + dum_child + octLF + dum_ck2 + dum_ck3+ dum_ck4+ dum_ck5+dum_ck6 + archr.35 + archr.70 + archr.105, ,na.action="na.exclude",data=mydata)    ## working order
#  lrtest(FL,FLa)    ## 0.0843,  0.2686 (weekend)
#  lrtest(FLa,FLb)   ## 0.546,   0.7867 (weekend)
#  lrtest(FLb,FLc)   ## 0.0024,  0.008342 (weekend)  


######### output cluster data  ###################
# test = mydata[,-2:-223]
# write.xlsx(test, "weekend_cluster.xlsx")

# test = mydata[,-2:-223]
# write.xlsx(test, "midweek_cluster.xlsx")