diff --git a/Census2022-CER_cluster_profile_48points_dataprep.R b/Census2022-CER_cluster_profile_48points_dataprep.R
new file mode 100755
index 0000000000000000000000000000000000000000..e1a97c6fdbed8bc3b6fa014c8496cb5af82c3520
--- /dev/null
+++ b/Census2022-CER_cluster_profile_48points_dataprep.R
@@ -0,0 +1,102 @@
+####################################################################################
+## Data Exploration for ESRC Transformative project
+## - Using the Commission for Energy Regulation (CER)'s Irish Smart Meter Trial data
+## - http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
+
+## Data prep to extract the October 2009 sub-sample.
+
+## This work was funded by RCUK through the ESRC's Transformative Social Science Programme via the
+## "Census 2022: Transforming Small Area Socio-Economic Indicators through 'Big Data'" Project
+## - http://gtr.rcuk.ac.uk/project/2D2CD798-4F04-4399-B1AF-D810A233DD21
+## - http://www.energy.soton.ac.uk/tag/census2022/
+
+## Copyright (C) 2014 University of Southampton
+
+## Author: Sharon Lin (X.Lin@soton.ac.uk)
+## [Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]
+
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License
+## (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+
+## #YMMV - http://en.wiktionary.org/wiki/YMMV
+
+rm(list=ls())
+require(plyr)
+
+## read in the six raw half-hourly data files
+setwd("//soton.ac.uk/ude/PersonalFiles/Users/xl25g12/mydocuments/census2022/CER_data/unzipped/CER_Electricity_Revised_March_2012")
+real1 <- read.table("file1.txt", header=F, stringsAsFactors=FALSE)
+real2 <- read.table("file2.txt", header=F, stringsAsFactors=FALSE)
+real3 <- read.table("file3.txt", header=F, stringsAsFactors=FALSE)
+real4 <- read.table("file4.txt", header=F, stringsAsFactors=FALSE)
+real5 <- read.table("file5.txt", header=F, stringsAsFactors=FALSE)
+real6 <- read.table("file6.txt", header=F, stringsAsFactors=FALSE)
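+
+## Illustrative alternative (not part of the original pipeline): the six
+## read.table() calls above could be collapsed into one loop:
+## reals <- lapply(sprintf("file%d.txt", 1:6),
+##                 read.table, header = FALSE, stringsAsFactors = FALSE)
+## names(reals) <- paste0("real", 1:6)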
+
+setwd("//soton.ac.uk/ude/PersonalFiles/Users/xl25g12/mydocuments/census2022/CER_analysis/data")
+ResID = read.csv("CER_HHID.csv", header = F, dec=".") ## file with residential IDs from the survey file
+ID = as.vector(as.numeric(ResID$V1)) ## convert ResID to a numeric vector
+
+## midweek dates (day codes start from 1 = 1/1/2009)
+## DateOct = c(272,273,274,279,280,281,286,287,288,293,294,295)
+
+## weekend dates
+DateOct = c(270,276,277,283,284,290,291,297)
+
+## keep only residential IDs and the selected October dates
+Fun1 = function (x) {
+  x = x[is.element(x$V1, ID), ]
+  x["Dateoct"] = NA
+  x$Dateoct = as.numeric(substring(x$V2, 1, 3)) ## day code = first 3 digits of the timestamp
+  x = x[is.element(x$Dateoct, DateOct), ]
+  x
+}
+
+data1 = Fun1(real1)
+data2 = Fun1(real2)
+data3 = Fun1(real3)
+data4 = Fun1(real4)
+data5 = Fun1(real5)
+data6 = Fun1(real6)
+
+dataOct = rbind(data1, data2, data3, data4, data5, data6)
+colnames(dataOct) <- c("ID","DS","KW","DateOct")
+
+dataOct["time"] = NA
+dataOct$time = substring(dataOct$DS, 4, 5) ## half-hour code = digits 4-5 of the timestamp
+
+## mean KW per household and half-hour; note na.rm must be passed to mean(),
+## not given as a separate summarize() column
+tmp = ddply(dataOct, .(ID, time), summarize,
+            time_mean = round(mean(KW, na.rm = TRUE), 4))
+mydata = tmp
+## convert data to wide form: one row per household, 48 half-hourly columns
+l.sort <- mydata[order(mydata$ID, mydata$time),]
+
+w <- reshape(l.sort,
+             timevar = "time",
+             idvar = c("ID"),
+             direction = "wide")
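+
+## Quick sanity check on the wide reshape (illustrative; assumes exactly 48
+## distinct half-hour codes, so 49 columns including ID):
+## stopifnot(nrow(w) == length(unique(mydata$ID)), ncol(w) == 49)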
+
+setwd("//soton.ac.uk/ude/PersonalFiles/Users/xl25g12/mydocuments/census2022/CER_analysis/data")
+# write.table(w, file = "CER_Oct09HH_48obs_midweek.txt", append = FALSE, sep = "\t",
+#             eol = "\n", na = "NA", dec = ".", row.names = F,
+#             col.names = TRUE, qmethod = c("escape", "double"),
+#             fileEncoding = "")
+
+# write.table(w, file = "CER_Oct09HH_48obs_weekend.txt", append = FALSE, sep = "\t",
+#             eol = "\n", na = "NA", dec = ".", row.names = F,
+#             col.names = TRUE, qmethod = c("escape", "double"),
+#             fileEncoding = "")
diff --git a/Census2022-CER_multinomial_classification_tests.R b/Census2022-CER_multinomial_classification_tests.R
new file mode 100644
index 0000000000000000000000000000000000000000..c566a6b175e29f5ecc4120ef296d152c4387e100
--- /dev/null
+++ b/Census2022-CER_multinomial_classification_tests.R
@@ -0,0 +1,253 @@
+####################################################################################
+## Data Analysis for ESRC Transformative project
+## - Using the Commission for Energy Regulation (CER)'s Irish Smart Meter Trial data
+## - http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
+
+## Evaluating the usefulness of logistic models: the benchmark we use to
+## characterise a multinomial logistic regression model as useful is a 25%
+## improvement over the rate of accuracy achievable by chance alone.
+## (After lecture slides from UTexas, found via the Google search
+## "R multinomial logistic regression interpretation".)
+
+## This file is used to generate forecasts for the banded variables:
+## LHV: 4 categories on number of children: 0, 1, 2, 3+
+## LHV: 4 categories on number of adults: 1, 2, 3, 4+
+## LHV: 5 categories on income band: 1, 2, 3, 4, 5
+
+## For the latest version of this code go to: https://github.com/dataknut/Census2022
+
+## This work was funded by RCUK through the ESRC's Transformative Social Science Programme via the
+## "Census 2022: Transforming Small Area Socio-Economic Indicators through 'Big Data'" Project
+## - http://gtr.rcuk.ac.uk/project/2D2CD798-4F04-4399-B1AF-D810A233DD21
+## - http://www.energy.soton.ac.uk/tag/census2022/
+
+## Copyright (C) 2014 University of Southampton
+
+## Author: Sharon Lin (X.Lin@soton.ac.uk)
+## [Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]
+
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License
+## (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+
+## #YMMV - http://en.wiktionary.org/wiki/YMMV
+
+
+rm(list=ls())
+## install.packages("psych")
+library("psych")
+library(lme4)
+require(plyr)
+library("lmtest")
+library(xlsx)
+library(nnet)
+## install.packages("mlogit")
+library(mlogit)
+
+
+## load and prepare data
+## HH survey data
+setwd("//soton.ac.uk/ude/PersonalFiles/Users/xl25g12/mydocuments/census2022/CER_analysis/data")
+HHsurvey = read.table("HH09_pretrial.txt", header=T, stringsAsFactors=FALSE, sep="\t") ## HH pre-trial survey data; needs sep="\t"; the income column is from Q4021 only
+HH09FA = read.table("HH_floor_pretrial_survey.txt", header=T, stringsAsFactors=FALSE, sep="\t") ## pre-trial floor area data; needs sep="\t"
+## income category 6 has been taken out of the sample as it is a non-response code.
+HH09income = read.xlsx("CER_income09.xlsx", 3, header=T, stringsAsFactors=FALSE) ## 4232 obs of 13 variables; note: in the Excel file, spaces were used to replace "NA"
+tmp1 = HHsurvey[,1:11]
+tmp2 = HH09FA[,-2:-3]
+tmp3 = merge(tmp1,tmp2)
+tmp4 = merge(tmp3,HH09income)
+describe(tmp4)
+svdata = tmp4
+## make a dummy variable for the presence of children: 0 = no children, 1 = with children
+svdata["dum_child"] = NA
+svdata = within(svdata, dum_child[HHtype_y09==3] <- 1) ## HHtype = 3, living with children (< 15 years)
+svdata = within(svdata, dum_child[HHtype_y09==2] <- 0) ## HHtype = 2, living with adults (> 15 years)
+svdata = within(svdata, dum_child[HHtype_y09==1] <- 0) ## HHtype = 1, living alone
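+
+## Equivalent one-step recode (illustrative alternative to the within() calls above):
+## svdata$dum_child <- ifelse(svdata$HHtype_y09 == 3, 1,
+##                            ifelse(svdata$HHtype_y09 %in% c(1, 2), 0, NA))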
+
+## make a dummy variable for number of residents: 1 or 2 residents -> dum_people = 0; 3 or more residents -> dum_people = 1
+svdata["dum_people"] = NA
+svdata = within(svdata, dum_people[HHtype_y09==1] <- 0)
+svdata = within(svdata, dum_people[HHtype_y09==3 & HHchild_y09==1] <- 0)
+svdata = within(svdata, dum_people[HHtype_y09==2 & HHadult_y09<3] <- 0)
+svdata = within(svdata, dum_people[HHchild_y09>1] <- 1)
+svdata = within(svdata, dum_people[HHadult_y09>2] <- 1)
+
+## track number of children and adult residents
+num_children = unique(svdata$HHchild_y09)
+num_resident = unique(svdata$HHadult_y09)
+## ddply(svdata,.(HHchild_y09),nrow) ## NA = 3003
+## ddply(svdata, .(HHadult_y09),nrow)
+svdata["num_children"] = NA
+svdata$num_children = svdata$HHchild_y09
+svdata = within(svdata, num_children[HHtype_y09==1] <- 0)
+svdata = within(svdata, num_children[HHtype_y09==2] <- 0)
+svdata = within(svdata, num_children[num_children>3] <- 3) ## top-code at 3+
+ddply(svdata, .(num_children), nrow)
+
+svdata["num_adult"] = NA
+svdata$num_adult = svdata$HHadult_y09
+svdata = within(svdata, num_adult[num_adult>4] <- 4) ## top-code at 4+
+ddply(svdata, .(HHadult_y09), nrow)
+ddply(svdata, .(num_adult), nrow)
+
+## NB: be careful to select either weekend or midweek data below
+
+## 8 LHS indicators: 1) peak time, 2) peak load, 3) baseload, 4) mean demand, 5) sum, 6) 97.5%, 7) ECF, 8) LF
+## energy data
+setwd("//soton.ac.uk/ude/PersonalFiles/Users/xl25g12/mydocuments/census2022/CER_analysis/data")
+
+## weekend data
+## energydata = read.table("CER_OctHH_wkend_long", header=T, stringsAsFactors=FALSE)
+## ARlong = read.csv("CER_Oct2009_archr-all_hubids_Weekends_v1.csv", header=T, stringsAsFactors=FALSE)
+
+## midweek data
+energydata = read.table("CER_OctHH_midwk_long", header=T, stringsAsFactors=FALSE)
+ARlong = read.csv("CER_Oct2009_archr-all_hubids_Midweek_v2.csv", header=T, stringsAsFactors=FALSE) ## 4232 obs of 13 variables
+
+## reshape the AR data from long form to wide form
+## head(ARlong)
+AR.sort = ARlong[order(ARlong$ID),] ## unique IDs: 3486
+AR = reshape(AR.sort, timevar="lag", idvar="ID", direction = "wide") ## checked, done correctly, 3486 IDs
+
+tmp = energydata[energydata$DateOct<365,]
+## daily summaries per household; note na.rm is passed to mean(), not to summarize()
+regdata = ddply(tmp, .(ID), summarize,
+                octpeaktime = round(mean(dailypktime, na.rm=TRUE), 4),
+                octmax = round(mean(dailymax, na.rm=TRUE), 4),
+                octbase = round(mean(dailybase, na.rm=TRUE), 4),
+                octmean = round(mean(dailymean, na.rm=TRUE), 4),
+                octsum = round(mean(dailysum, na.rm=TRUE), 4),
+                octq975 = round(mean(dailyq975, na.rm=TRUE), 4),
+                octECF = round(mean(ECF, na.rm=TRUE), 4),
+                octLF = round(mean(LF, na.rm=TRUE), 4))
+
+ARtmp = as.data.frame(cbind(AR$ID, AR$archr.35, AR$archr.70, AR$archr.105))
+colnames(ARtmp) = c("ID","AR35","AR70","AR105")
+
+tmp = merge(svdata, ARtmp)
+datam = merge(tmp, regdata) ## 3486 IDs
+names(datam)
+datam$dum_people = as.factor(datam$dum_people)
+datam$dum_child = as.factor(datam$dum_child)
+datam$HH_income = as.factor(datam$HH_income)
+
+
+
+## Important! Remove NAs from the variables used in the logit model; if not,
+## observations and fitted values will not match up, which breaks the classification step!
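+
+## One-step alternative (illustrative) to the stepwise filtering below, using
+## complete.cases() on just the model variables:
+## vars <- c("HH_income", "dum_people", "dum_child", "octbase", "octECF")
+## datamf <- datam[complete.cases(datam[, vars]), ]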
+
+## Take out NAs from HH_income (total obs of HH_income is 1481), then from the other model variables
+datam1 = datam[!is.na(datam$HH_income),]
+datam2 = datam1[!is.na(datam1$dum_people),]
+datam3 = datam2[!is.na(datam2$dum_child),]
+datam4 = datam3[!is.na(datam3$octbase),]
+datam5 = datam4[!is.na(datam4$octECF),]
+
+datamf = datam5
+
+testest = multinom(HH_income ~ dum_people + dum_child + octbase + octECF, na.action="na.omit", data=datamf)
+summary(testest)
+
+test = testest
+summary(test)
+z <- summary(test)$coefficients/summary(test)$standard.errors ## z-scores
+z
+p <- (1 - pnorm(abs(z), 0, 1)) * 2 ## two-sided p-values
+p
+exp(coef(test)) ## relative risk ratios
+pp <- fitted(test) ## fitted category probabilities
+head(pp)
+
+
+
+####### classification #######
+## Assign each household to the category with the highest predicted probability;
+## code households whose predicted category equals the true category as "1".
+## Table of true classifications given by the income data:
+
+## ddply(datamf,.(HH_income),nrow)
+# HH_income V1
+# 1 1 153
+# 2 2 222
+# 3 3 343
+# 4 4 475
+# 5 5 285
+
+### classification test
+####### classification code #######################
+fitted = as.data.frame(pp)
+colnames(fitted) = c("pp1","pp2","pp3","pp4","pp5")
+names(fitted)
+true_income = as.numeric(as.character(datamf$HH_income)) ## convert the factor back to numeric band codes 1-5
+predicted_cat = 0
+predicted_dum = 0
+out = as.matrix(cbind(true_income, fitted, predicted_cat, predicted_dum))
+
+## find the max row value with max.col(), stored in predicted_cat
+tmp1 = max.col(out[,2:6], "first")
+out = as.data.frame(out)
+out$predicted_cat = as.numeric(tmp1)
+
+## if predicted_cat equals the true income band, put 1 in predicted_dum
+out$predicted_dum = ifelse(out$predicted_cat==out$true_income, 1, 0)
+# ddply(out,.(predicted_dum),nrow)
+# predicted_dum V1
+# 1 0 1007
+# 2 1 471
+
+## examples of calculated probabilities against observed & predicted categories using this method
+## essentially the majority are allocated to band 4
+# head(out)
+# true_income pp1 pp2 pp3 pp4 pp5 predicted_cat predicted_dum
+# 1 1 0.225094742 0.23912560 0.1712665 0.2564000 0.1081132 4 0
+# 2 3 0.020366367 0.09399776 0.2354826 0.4401418 0.2100115 4 0
+# 8 3 0.037214194 0.12067370 0.2321019 0.3690928 0.2409174 4 0
+# 18 4 0.009786489 0.05460458 0.2638757 0.3738515 0.2978817 4 1
+# 24 4 0.190800918 0.22274730 0.1890820 0.2766414 0.1207283 4 1
+# 26 1 0.031743912 0.09372484 0.2716323 0.3721229 0.2307760 4 0
+
+# ddply(out,.(predicted_dum,predicted_cat),nrow)
+# predicted_dum predicted_cat V1
+# 1 0 1 23
+# 2 0 2 23
+# 3 0 3 21
+# 4 0 4 912
+# 5 0 5 28
+# 6 1 1 12
+# 7 1 2 8
+# 8 1 3 9
+# 9 1 4 430
+# 10 1 5 12
+
+## classification table: income bands 1 - 5, 10 entries as follows
+## (rows 1-5 = incorrect, predicted_dum 0; rows 6-10 = correct, predicted_dum 1):
+predtab = ddply(out, .(predicted_dum, predicted_cat), nrow)
+table_T = ddply(out, .(true_income), nrow)
+cltable = table_T
+colnames(cltable) = c("income_range", "true_income")
+cltable$correctPre = predtab[6:10,3]
+cltable$incorrectPre = predtab[1:5,3]
+cltable$correctPercent = cltable$correctPre/cltable$true_income ## correct predictions as a share of all true outcomes in the band
+cltable$overallPercent = cltable$correctPre/(cltable$correctPre + cltable$incorrectPre)
+## grand correct percentage
+grandPct = sum((cltable$correctPercent*cltable$overallPercent)^2) ## 0.085167
+grandPct
+
+### classification by chance as a comparison
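+
+## Illustrative, runnable version of the by-chance comparison summarised in the
+## comments below (assumes cltable as built above):
+## CA <- sum((cltable$true_income / sum(cltable$true_income))^2)
+## c(chance_accuracy = CA, criterion_25pct = 1.25 * CA)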
+
+# cltable
+# income_range true_income correctPre incorrectPre correctPercent overallPercent
+# 1 1 153 12 23 0.07843137 0.3428571
+# 2 2 222 8 23 0.03603604 0.2580645
+# 3 3 343 9 21 0.02623907 0.3000000
+# 4 4 475 430 912 0.90526316 0.3204173
+# 5 5 285 12 28 0.04210526 0.3000000
+
+## The proportional by chance accuracy rate is:
+## CA = sum((cltable$true_income/sum(cltable$true_income))^2)
+## CA ## 0.2276
+## The proportional by chance accuracy criterion is CA * 1.25 = 0.2276 * 1.25 = 0.2845 (not good!)
diff --git a/Census2022-CER_regP_48_CLUSTER_std-1-5-15.R b/Census2022-CER_regP_48_CLUSTER_std-1-5-15.R
new file mode 100755
index 0000000000000000000000000000000000000000..5a2fa6f4eb475880c19ccdfa3206c70748341275
--- /dev/null
+++ b/Census2022-CER_regP_48_CLUSTER_std-1-5-15.R
@@ -0,0 +1,333 @@
+####################################################################################
+## Data Analysis for ESRC Transformative project
+## - Using the Commission for Energy Regulation (CER)'s Irish Smart Meter Trial data
+## - http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
+
+## Midweek & weekend household regressions using:
+## - standardised 24 hour power data derived clusters
+## - AR coefficients (lag = 35 & 70)
+## - power variables
+
+## For the latest version of this code go to: https://github.com/dataknut/Census2022
+
+## This work was funded by RCUK through the ESRC's Transformative Social Science Programme via the
+## "Census 2022: Transforming Small Area Socio-Economic Indicators through 'Big Data'" Project
+## - http://gtr.rcuk.ac.uk/project/2D2CD798-4F04-4399-B1AF-D810A233DD21
+## - http://www.energy.soton.ac.uk/tag/census2022/
+
+## Copyright (C) 2014 University of Southampton
+
+## Author: Sharon Lin (X.Lin@soton.ac.uk)
+## [Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]
+
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License
+## (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+
+## #YMMV - http://en.wiktionary.org/wiki/YMMV
+
+rm(list=ls())
+## install.packages("psych")
+library("psych")
+library(lme4)
+require(plyr)
+library("lmtest")
+library(xlsx)
+library(nnet)
+
+
+## data load and preparation
+setwd("//soton.ac.uk/ude/PersonalFiles/Users/xl25g12/mydocuments/census2022/CER_analysis/data")
+HHsurvey = read.table("HH09_pretrial.txt", header=T, stringsAsFactors=FALSE, sep="\t") ## HH pre-trial survey data; needs sep="\t"; the income column is from Q4021 only
+HH09FA = read.table("HH_floor_pretrial_survey.txt", header=T, stringsAsFactors=FALSE, sep="\t") ## pre-trial floor area data; needs sep="\t"
+## income category 6 has been taken out of the sample.
+HH09income = read.xlsx("CER_income09.xlsx", 3, header=T, stringsAsFactors=FALSE) ## 4232 obs of 13 variables; note: in the Excel file, spaces were used to replace "NA"
+tmp1 = HHsurvey[,1:11]
+tmp2 = HH09FA[,-2:-3]
+tmp3 = merge(tmp1,tmp2)
+tmp4 = merge(tmp3,HH09income)
+describe(tmp4)
+svdata = tmp4
+## make a dummy variable for the presence of children: 0 = no children, 1 = with children
+svdata["dum_child"] = NA
+svdata = within(svdata, dum_child[HHtype_y09==3] <- 1) ## HHtype = 3, living with children (< 15 years)
+svdata = within(svdata, dum_child[HHtype_y09==2] <- 0) ## HHtype = 2, living with adults (> 15 years)
+svdata = within(svdata, dum_child[HHtype_y09==1] <- 0) ## HHtype = 1, living alone
+
+## make a dummy variable for number of residents: 1 or 2 residents -> dum_people = 0; 3 or more residents -> dum_people = 1
+svdata["dum_people"] = NA
+svdata = within(svdata, dum_people[HHtype_y09==1] <- 0)
+svdata = within(svdata, dum_people[HHtype_y09==3 & HHchild_y09==1] <- 0)
+svdata = within(svdata, dum_people[HHtype_y09==2 & HHadult_y09<3] <- 0)
+svdata = within(svdata, dum_people[HHchild_y09>1] <- 1)
+svdata = within(svdata, dum_people[HHadult_y09>2] <- 1)
+
+## make a dummy variable for employment status: codes 1-3 = employed (0), codes 4-7 = not employed (1)
+svdata["dum_em"] = NA
+svdata = within(svdata, dum_em[employment_y09==1] <- 0)
+svdata = within(svdata, dum_em[employment_y09==2] <- 0)
+svdata = within(svdata, dum_em[employment_y09==3] <- 0)
+svdata = within(svdata, dum_em[employment_y09>3] <- 1)
+
+
+
+## NB: be careful to select either weekend or midweek data below
+
+
+## 8 LHS indicators: 1) peak time, 2) peak load, 3) baseload, 4) mean demand, 5) sum, 6) 97.5%, 7) ECF, 8) LF
+## energy data
+setwd("//soton.ac.uk/ude/PersonalFiles/Users/xl25g12/mydocuments/census2022/CER_analysis/data")
+
+## weekend data
+cluster_48 = read.table("CER_Oct09HH_48obs_weekend.txt", header=T, stringsAsFactors=FALSE)
+energydata = read.table("CER_OctHH_wkend_long", header=T, stringsAsFactors=FALSE)
+ARlong = read.csv("CER_Oct2009_archr-all_hubids_Weekends_v2.csv", header=T, stringsAsFactors=FALSE) ## 4232 obs of 13 variables
+
+## midweek data
+# cluster_48 = read.table("CER_Oct09HH_48obs_midweek.txt", header=T, stringsAsFactors=FALSE)
+# energydata = read.table("CER_OctHH_midwk_long", header=T, stringsAsFactors=FALSE)
+# ARlong = read.csv("CER_Oct2009_archr-all_hubids_Midweek_v2.csv", header=T, stringsAsFactors=FALSE)
+
+## reshape the AR data from long form to wide form
+ARlong = ARlong[,1:3]
+AR.sort = ARlong[order(ARlong$ID),] ## unique IDs: 3486
+AR = reshape(AR.sort, timevar="lag", idvar="ID", direction = "wide") ## checked, done correctly, 3486 IDs
+
+tmp = energydata[energydata$DateOct<365,]
+## daily summaries per household; note na.rm is passed to mean(), not to summarize()
+regdata = ddply(tmp, .(ID), summarize,
+                octpeaktime = round(mean(dailypktime, na.rm=TRUE), 4),
+                octmax = round(mean(dailymax, na.rm=TRUE), 4),
+                octammean = round(mean(dailyammean, na.rm=TRUE), 4),
+                octbase = round(mean(dailybase, na.rm=TRUE), 4),
+                octmean = round(mean(dailymean, na.rm=TRUE), 4),
+                octsum = round(mean(dailysum, na.rm=TRUE), 4),
+                octq975 = round(mean(dailyq975, na.rm=TRUE), 4),
+                octECF = round(mean(ECF, na.rm=TRUE), 4),
+                octLF = round(mean(LF, na.rm=TRUE), 4))
+
+## merge data together
+tmp = merge(svdata, AR) ## archr.35 is the autocorrelation at lag 35 (one day); archr.70 at lag 70 (two days)
+tmp1 = merge(tmp, regdata)
+mydatam = merge(tmp1, cluster_48)
+
+######### standardised clustering ############
+stddata = mydatam[,177:224] ## the 48 half-hourly profile columns
+stddata = na.omit(stddata) ## NB: if this drops any rows, the cbind with mydatam$ID below will mis-align
+stddata = scale(stddata) ## standardise to mean 0, std dev 1
+
+## choose the number of clusters from a plot of the within-groups sum of squares
+## (WSS) against the number of clusters extracted, looking for a bend ("elbow")
+## in the plot (http://www.statmethods.net/advstats/cluster.html)
+## wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var))
+wss <- (nrow(stddata)-1)*sum(apply(stddata,2,var))
+for (i in 2:15) wss[i] <- sum(kmeans(stddata, centers=i, iter.max=20, nstart=1)$withinss) ## allow 20 iterations
+plot(1:15, wss, type="b", xlab="Number of Clusters",
+     ylab="Within groups sum of squares",
+     main="CER clusters (30 min profile)")
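+
+## Note (illustrative): kmeans() depends on the random initialisation; several
+## restarts give a more stable solution, e.g.:
+## fit <- kmeans(stddata, centers = 6, nstart = 25)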
+
+### fit kmeans analysis for 6 clusters ##########
+set.seed(1)
+fit <- kmeans(stddata, 6) ## 6-cluster solution
+ckmean = aggregate(stddata, by=list(fit$cluster), FUN=mean) ## get cluster means
+## append the cluster assignment
+stddataf <- data.frame(mydatam$ID, stddata, fit$cluster) ## standardised profiles with ID column (assumes na.omit() above dropped no rows)
+mydata = stddataf
+## one dummy variable per cluster
+mydata["dum_ck1"] = 0
+mydata = within(mydata, dum_ck1[fit.cluster==1] <- 1)
+mydata["dum_ck2"] = 0
+mydata = within(mydata, dum_ck2[fit.cluster==2] <- 1)
+mydata["dum_ck3"] = 0
+mydata = within(mydata, dum_ck3[fit.cluster==3] <- 1)
+mydata["dum_ck4"] = 0
+mydata = within(mydata, dum_ck4[fit.cluster==4] <- 1)
+mydata["dum_ck5"] = 0
+mydata = within(mydata, dum_ck5[fit.cluster==5] <- 1)
+mydata["dum_ck6"] = 0
+mydata = within(mydata, dum_ck6[fit.cluster==6] <- 1)
+names(mydata)[names(mydata)=="mydatam.ID"] <- "ID"
+
+############## end of standardised clustering: mydata holds the 48 standardised half-hourly means and dummy variables for the 6 clusters ################
+
+## merge energy, HH survey, AR and cluster data together
+datam = merge(mydatam[,1:175], mydata) ## the 48 daily half-hourly columns are standardised (mean 0, std dev 1)
+## write.table(mydata,"test_wkend_cluster")
+mydata = as.data.frame(datam)
+## test = do.call(data.frame, lapply(mydata, function(x) replace(x, is.infinite(x), NA)))
+
+## change data structure
+mydata$dum_em = as.factor(mydata$dum_em)
+mydata$dum_people = as.factor(mydata$dum_people)
+mydata$dum_child = as.factor(mydata$dum_child)
+mydata$dum_ck1 = as.factor(mydata$dum_ck1)
+mydata$dum_ck2 = as.factor(mydata$dum_ck2)
+mydata$dum_ck3 = as.factor(mydata$dum_ck3)
+mydata$dum_ck4 = as.factor(mydata$dum_ck4)
+mydata$dum_ck5 = as.factor(mydata$dum_ck5)
+mydata$dum_ck6 = as.factor(mydata$dum_ck6)
+
+
+
+##### logit regression models
+
+## Weekend
+# logit(dum_people) ~ ampkmean + mean
+# logit(dum_child) ~ ampkmean + mean
+
+
+
+## dummy people: 1/2 vs 3+ residents
+tmp = mydata
+tmp1 = tmp[!is.na(tmp$dum_people),]
+mydata_r = tmp1
+### weekend regression
+D1_logit = glm(dum_people ~ octammean + octmean, na.action="na.exclude", data=mydata_r, family="binomial") ## logit (binomial) model
+D1a_logit = glm(dum_people ~ octammean + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.exclude", data=mydata_r, family="binomial")
+D1b_logit = glm(dum_people ~ octammean + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70, na.action="na.exclude", data=mydata_r, family="binomial")
+
+lrtest(D1_logit, D1a_logit) ## 4.297e-09 ***
+lrtest(D1a_logit, D1b_logit) ## 0.002879 **
+
+## dummy children: 0 or 1 (presence of children)
+tmp = mydata
+tmp1 = tmp[!is.na(tmp$dum_child),]
+mydata_r = tmp1
+
+D2_logit = glm(dum_child ~ octammean + octmean, na.action="na.exclude", data=mydata_r, family="binomial") ## logit (binomial) model
+D2a_logit = glm(dum_child ~ octammean + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.exclude", data=mydata_r, family="binomial")
+D2b_logit = glm(dum_child ~ octammean + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70, na.action="na.exclude", data=mydata_r, family="binomial")
+lrtest(D2_logit, D2a_logit) ## 2.258e-15 ***
+lrtest(D2a_logit, D2b_logit) ## 0.08583
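+
+## Illustrative: odds ratios with approximate Wald 95% CIs for a fitted logit:
+## exp(cbind(OR = coef(D2b_logit), confint.default(D2b_logit)))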
+
+## Midweek regressions
+# logit(dum_people) ~ ampkmean + base + mean
+# logit(dum_child) ~ ampkmean + mean + ECF
+# logit(employment_y09) ~ ECF + LF
+
+## dummy people as LHV
+tmp = mydata
+tmp1 = tmp[!is.na(tmp$dum_people),]
+mydata_r = tmp1
+C1_logit = glm(dum_people ~ octammean + octbase + octmean, na.action="na.exclude", data=mydata_r, family="binomial") ## logit (binomial) model
+C1a_logit = glm(dum_people ~ octammean + octbase + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.exclude", data=mydata_r, family="binomial")
+C1b_logit = glm(dum_people ~ octammean + octbase + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70, na.action="na.exclude", data=mydata_r, family="binomial")
+lrtest(C1_logit, C1a_logit) ## 4.393e-12 ***
+lrtest(C1a_logit, C1b_logit) ## 0.4109
+
+## dummy child as LHV
+tmp = mydata
+tmp1 = tmp[!is.na(tmp$dum_child),]
+mydata_r = tmp1
+
+C2_logit = glm(dum_child ~ octmax + octammean + octmean + octECF, na.action="na.exclude", data=mydata_r, family="binomial") ## logit (binomial) model
+C2a_logit = glm(dum_child ~ octmax + octammean + octmean + octECF + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.exclude", data=mydata_r, family="binomial")
+C2b_logit = glm(dum_child ~ octmax + octammean + octmean + octECF + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70, na.action="na.exclude", data=mydata_r, family="binomial")
+
+lrtest(C2_logit, C2a_logit) ## 2.973e-09 *** midweek
+lrtest(C2a_logit, C2b_logit) ## 0.00177 ** midweek
+
+
+### dum_em (employment) as LHV, midweek only
+# tmp = mydata
+# tmp1 = tmp[!is.na(tmp$dum_em),]
+# mydata_r = tmp1
+#
+# S5_logit = glm(dum_em ~ octECF + octLF, na.action="na.exclude", data=mydata_r, family="binomial")
+# S5a_logit = glm(dum_em ~ octECF + octLF + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.exclude", data=mydata_r, family="binomial")
+# S5b_logit = glm(dum_em ~ octECF + octLF + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70, na.action="na.exclude", data=mydata_r, family="binomial")
+# S5c_logit = glm(dum_em ~ dum_child + dum_people + octECF + octLF + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70, na.action="na.exclude", data=mydata_r, family="binomial")
+# lrtest(S5_logit,S5a_logit) ## 2.2e-16
+# lrtest(S5a_logit,S5b_logit) ## 0.00013
+# lrtest(S5b_logit,S5c_logit) ## 2.2e-16
+
+####### classification
+## choose which fitted regression to classify from
+logit = C2b_logit
+
+## the observed dependent variable
+true_dum = as.numeric(as.character(mydata_r$dum_child)) ## convert the factor back to 0 and 1
+
+## extract fitted probabilities and classify with a 0.5 cut-off
+fitted = fitted(logit)
+predicted_dum = 0
+logit_out = as.data.frame(cbind(true_dum, fitted, predicted_dum))
+colnames(logit_out) = c("true_dum", "fitted", "predicted_dum")
+logit_out = within(logit_out, predicted_dum[fitted>0.5] <- 1)
+
+## count the correct "+" and "-" predictions
+TrueD = as.data.frame(count(logit_out, "true_dum"))
+Total = TrueD[1,2] + TrueD[2,2]
+# true_dum freq
+# 1 0 2089
+# 2 1 1399
+
+correctNeg = sum(logit_out[,1]==0 & logit_out[,3]==0) ## correct negative predictions, e.g. 1757
+correctPos = sum(logit_out[,1]==1 & logit_out[,3]==1) ## correct positive predictions, e.g. 763
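+
+## Illustrative: the same counts as a 2x2 confusion table:
+## with(logit_out, table(true = true_dum, predicted = predicted_dum))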
+(sum(logit_out[,1]==0 & logit_out[,3]==0) + sum(logit_out[,1]==1 & logit_out[,3]==1)) / (TrueD[1,2] + TrueD[2,2])
+## correct predictions (both positive and negative)
+(correctNeg + correctPos) / Total ## 0.7225
+###### end of classification test ##############
+
+
+
+
+
+## regression ##
+## income band
+## multinomial regression with income band as LHV
+tmp = mydata
+tmp1 = tmp[!is.na(tmp$HH_income),]
+mydata_r = tmp1
+## midweek
+# test = multinom(HH_income ~ octammean + octbase + octmean, na.action="na.omit", data=mydata_r) ## 2LL = 2*2179 (residual deviance)
+# test_cluster = multinom(HH_income ~ octammean + octbase + octmean + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.omit", data=mydata_r) ## 2LL = 2*2166 (residual deviance)
+# test_clusterar = multinom(HH_income ~ octammean + octbase + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70 + archr.105, na.action="na.omit", data=mydata_r) ## 2LL = 2*2163 (residual deviance)
+# test_clusterar_admin = multinom(HH_income ~ dum_people + dum_child + octammean + octbase + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70 + archr.105, na.action="na.omit", data=mydata_r) ## 2LL = 2*2163 (residual deviance)
+
+## weekend
+test = multinom(HH_income ~ octmax + octbase + octmean, na.action="na.omit", data=mydata_r) ## 2LL = 2*2179 (residual deviance)
+test_cluster = multinom(HH_income ~ octmax + octbase + octmean + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.omit", data=mydata_r) ## 2LL = 2*2166 (residual deviance)
+test_clusterar = multinom(HH_income ~ octmax + octbase + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70 + archr.105, na.action="na.omit", data=mydata_r) ## 2LL = 2*2163 (residual deviance)
+test_clusterar_admin = multinom(HH_income ~ dum_people + dum_child + octmax + octbase + octmean + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70 + archr.105, na.action="na.omit", data=mydata_r)
+
+lrtest(test, test_cluster) ## 0.005248 (midweek); 0.5166 (weekend)
+lrtest(test_cluster, test_clusterar) ## 0.002872 (midweek); 0.9379 (weekend)
+lrtest(test_clusterar, test_clusterar_admin) ## 2.483e-13 (midweek); 1.403e-11 (weekend)
+
+test = test_clusterar_admin
+summary(test)
+z <- summary(test)$coefficients/summary(test)$standard.errors ## z-scores
+z
+p <- (1 - pnorm(abs(z), 0, 1)) * 2 ## two-sided p-values
+p
+exp(coef(test)) ## relative risk ratios
+pp <- fitted(test) ## fitted category probabilities
+head(pp)
+
+## floor space, weekend
+# FL = lm(M2 ~ octbase + octLF, na.action="na.exclude", data=mydata)
+# FLa = lm(M2 ~ octbase + octLF + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6, na.action="na.exclude", data=mydata)
+# FLb = lm(M2 ~ octbase + octLF + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70 + archr.105, na.action="na.exclude", data=mydata)
+# FLc = lm(M2 ~ octbase + dum_people + dum_child + octLF + dum_ck2 + dum_ck3 + dum_ck4 + dum_ck5 + dum_ck6 + archr.35 + archr.70 + archr.105, na.action="na.exclude", data=mydata)
+# lrtest(FL,FLa) ## 0.0843; 0.2686 (weekend)
+# lrtest(FLa,FLb) ## 0.546; 0.7867 (weekend)
+# lrtest(FLb,FLc) ## 0.0024; 0.008342 (weekend)
+
+
+######### output cluster data ###################
+# test = mydata[,-2:-223]
+# write.xlsx(test, "weekend_cluster.xlsx")
+
+# test = mydata[,-2:-223]
+# write.xlsx(test, "midweek_cluster.xlsx")
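+
+## Utility sketch (illustrative, not part of the original analysis): wrap the
+## manual Wald z / p-value computation used above for any fitted multinom() model:
+## multinom_pvalues <- function(fit) {
+##   s <- summary(fit)
+##   z <- s$coefficients / s$standard.errors
+##   list(z = z, p = 2 * (1 - pnorm(abs(z))))
+## }
+## multinom_pvalues(test_clusterar_admin)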