####################################################################################
## Data Analysis for ESRC Transformative project
## - Using the Commission for Energy Regulation (CER)'s Irish Smart Meter Trial data
## - http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
## Evaluating usefulness of logistic models: the benchmark we will use to characterise a
## multinomial logistic regression model as useful is a 25% improvement over the rate of
## accuracy achievable by chance alone.
## Method reference: UT Austin lecture slides, found via the Google search
## "R multinomial logistic regression interpretation"
## This file is used to generate forecasts of banded household information.
## LHVs (left-hand, i.e. dependent, variables):
## - 4 categories on number of children: 0, 1, 2, 3+
## - 4 categories on number of adults: 1, 2, 3, 4+
## - 5 categories on income band: 1, 2, 3, 4, 5
## For the latest version of this code go to: https://github.com/dataknut/Census2022
## This work was funded by RCUK through the ESRC's Transformative Social Science Programme via the
## "Census 2022: Transforming Small Area Socio-Economic Indicators through 'Big Data'" Project
## - http://gtr.rcuk.ac.uk/project/2D2CD798-4F04-4399-B1AF-D810A233DD21
## - http://www.energy.soton.ac.uk/tag/census2022/
## Copyright (C) 2014 University of Southampton
## Author: Sharon Lin (X.Lin@soton.ac.uk)
## [Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License
## (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
## #YMMV - http://en.wiktionary.org/wiki/YMMV
rm(list=ls())
## install.packages("psych")
library("psych")
library(lme4)
require(plyr)
library("lmtest")
library(xlsx)
library(nnet)
install.packages("mlogit")
library(mlogit)
## load and prepare data
## HH survey data
setwd("//soton.ac.uk/ude/PersonalFiles/Users/xl25g12/mydocuments/census2022/CER_analysis/data")
HHsurvey = read.table("HH09_pretrial.txt",header=T,stringsAsFactors=FALSE,sep="\t") ## tab-delimited HH pre-trial survey data; the income column is from Q4021 only
HH09FA = read.table("HH_floor_pretrial_survey.txt",header=T,stringsAsFactors=FALSE,sep="\t") ## tab-delimited pre-trial floor area data
## income category 6 has been taken out of the sample as it is a non-response code.
HH09income = read.xlsx("CER_income09.xlsx",3, header=T, stringsAsFactors=FALSE) ## 4232 obs. of 13 variables; NB: in the Excel file, spaces are used in place of "NA"
tmp1 = HHsurvey[,1:11]
tmp2 = HH09FA[,-2:-3]
tmp3 = merge(tmp1,tmp2)
tmp4 = merge(tmp3,HH09income)
describe(tmp4)
svdata = tmp4
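## Optional sanity check (added sketch, not part of the original workflow): confirm the two
## merges above kept one row per household, assuming the household identifier column is
## named ID, as the later merge with ARtmp implies.
nrow(svdata)
stopifnot(!any(duplicated(svdata$ID)))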
## make a dummy variable for presence of children: 0 = no children, 1 = with children
svdata["dum_child"] = NA
svdata = within(svdata,dum_child[HHtype_y09==3]<-1) ## HHtype = 3, living with children (<15 years)
svdata = within(svdata,dum_child[HHtype_y09==2]<-0) ## HHtype = 2, living with adults (>15 years)
svdata = within(svdata,dum_child[HHtype_y09==1]<-0) ## HHtype = 1, living alone
## make a dummy variable for number of residents: 1 or 2 residents -> dum_people = 0; 3 or more -> dum_people = 1
svdata["dum_people"]= NA
svdata = within(svdata,dum_people[HHtype_y09==1]<-0)
svdata = within(svdata,dum_people[HHtype_y09==3&HHchild_y09==1]<-0)
svdata = within(svdata,dum_people[HHtype_y09==2&HHadult_y09<3]<-0)
svdata = within(svdata,dum_people[HHchild_y09>1]<-1)
svdata = within(svdata,dum_people[HHadult_y09>2]<-1)
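## Optional cross-check (added sketch): tabulate the two dummies against each other to
## verify the recoding above before moving on.
ddply(svdata,.(dum_child,dum_people),nrow)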
## track number of children, residents
num_children = unique(svdata$HHchild_y09)
num_resident = unique(svdata$HHadult_y09)
## ddply(svdata,.(HHchild_y09),nrow) NA = 3003
## ddply(svdata, .(HHadult_y09),nrow)
svdata["num_children"]= NA
svdata$num_children = svdata$HHchild_y09
svdata = within(svdata,num_children[HHtype_y09==1]<-0)
svdata = within(svdata,num_children[HHtype_y09==2]<-0)
svdata = within(svdata,num_children[num_children>3]<-3)
ddply(svdata,.(num_children),nrow) ##
svdata["num_adult"] = NA
svdata$num_adult = svdata$HHadult_y09
svdata = within(svdata,num_adult[num_adult>4]<-4)
ddply(svdata,.(HHadult_y09),nrow) ##
ddply(svdata,.(num_adult),nrow) ##
## careful with weekend or midweek data
## 8 LHS indicators: 1) peak time, 2) peak load, 3) baseload, 4) mean demand, 5) sum, 6) 97.5%, 7) ECF, 8) LF
## energy data
setwd("//soton.ac.uk/ude/PersonalFiles/Users/xl25g12/mydocuments/census2022/CER_analysis/data")
## weekend data
## energydata = read.table("CER_OctHH_wkend_long", header=T, stringsAsFactors=FALSE)
## ARlong = read.csv("CER_Oct2009_archr-all_hubids_Weekends_v1.csv", header=T, stringsAsFactors=FALSE) ## 4232 obs. of 13 variables
## midweek data
energydata = read.table("CER_OctHH_midwk_long", header=T, stringsAsFactors=FALSE)
ARlong = read.csv("CER_Oct2009_archr-all_hubids_Midweek_v2.csv", header=T, stringsAsFactors=FALSE) ## 4232 obs. of 13 variables
## reshape the AR data from long form to wide form
## head(ARlong)
AR.sort= ARlong[order(ARlong$ID),] ## unique IDs: 3486
AR = reshape(AR.sort, timevar="lag",idvar="ID", direction = "wide") ## checked, done correctly, 3486 IDs
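## Optional check (added sketch): the wide reshape should yield exactly one row per unique
## ID (3486, per the notes above).
stopifnot(nrow(AR) == length(unique(ARlong$ID)))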
tmp = energydata[energydata$DateOct<365,]
## NB: na.rm=TRUE must go inside each mean() call; as a trailing summarize argument it
## would only create a literal na.rm column instead of dropping missing values
regdata = ddply(tmp, .(ID), summarize,
                octpeaktime = round(mean(dailypktime, na.rm=TRUE), 4),
                octmax = round(mean(dailymax, na.rm=TRUE), 4),
                octbase = round(mean(dailybase, na.rm=TRUE), 4),
                octmean = round(mean(dailymean, na.rm=TRUE), 4),
                octsum = round(mean(dailysum, na.rm=TRUE), 4),
                octq975 = round(mean(dailyq975, na.rm=TRUE), 4),
                octECF = round(mean(ECF, na.rm=TRUE), 4),
                octLF = round(mean(LF, na.rm=TRUE), 4))
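## Optional (added sketch): inspect the per-household indicator summaries, mirroring the
## describe() check used on the survey data above.
describe(regdata)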
ARtmp = as.data.frame(cbind(AR$ID,AR$archr.35,AR$archr.70,AR$archr.105))
colnames(ARtmp) = c("ID","AR35","AR70","AR105")
tmp = merge(svdata,ARtmp)
datam = merge(tmp,regdata) ## 3486 IDs
names(datam)
datam$dum_people = as.factor(datam$dum_people)
datam$dum_child = as.factor(datam$dum_child)
datam$HH_income = as.factor(datam$HH_income)
## Important: remove NAs from every variable used in the logit model; otherwise the
## observations and fitted values will not line up, which breaks the classification below.
## Remove NAs from HH_income (total non-missing obs of HH_income: 1481)
datam1 = datam[!is.na(datam$HH_income),]
datam2 = datam1[!is.na(datam1$dum_people),]
datam3 = datam2[!is.na(datam2$dum_child),]
datam4 = datam3[!is.na(datam3$octbase),]
datam5 = datam4[!is.na(datam4$octECF),]
datamf = datam5
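## Equivalent one-step alternative (added sketch) using complete.cases(); kept only as a
## cross-check, it should select exactly the same rows as the stepwise filtering above.
modelvars = c("HH_income","dum_people","dum_child","octbase","octECF")
datamf_chk = datam[complete.cases(datam[,modelvars]),]
stopifnot(nrow(datamf_chk) == nrow(datamf))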
test = multinom(HH_income ~ dum_people + dum_child + octbase + octECF, na.action=na.omit, data=datamf)
summary(test)
z <- summary(test)$coefficients/summary(test)$standard.errors ## z-score
z
p <- (1 - pnorm(abs(z), 0, 1)) * 2 ## p-value
p
exp(coef(test)) ## relative risk ratio
pp <- fitted(test) ##
head(pp)
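## Optional preview (added sketch): the modal predicted class for the first few households,
## anticipating the max.col() classification below.
head(colnames(pp)[max.col(pp, "first")])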
####### classification #######
## Assign each household the category with the highest predicted probability.
## Mark households whose predicted category equals the true category with a 1.
## Table of true classifications given by the income data:
#
## ddply(datamf,.(HH_income),nrow)
#   HH_income  V1
# 1         1 153
# 2         2 222
# 3         3 343
# 4         4 475
# 5         5 285
### classification test
####### classification codes #######################
fitted = as.data.frame(pp)
colnames(fitted) = c("pp1","pp2","pp3","pp4","pp5")
names(fitted)
true_income = as.numeric(as.character(datamf$HH_income)) ## convert the factor back to numeric income bands 1-5
predicted_cat = 0
predicted_dum = 0
out = as.matrix(cbind(true_income,fitted,predicted_cat,predicted_dum))
## find the max row value with max.col(); store the winning category in predicted_cat
tmp1 = max.col(out[,2:6],"first")
out = as.data.frame(out)
out$predicted_cat = as.numeric(tmp1)
## if predicted_cat equals the true income band, put 1 in predicted_dum
out$predicted_dum = ifelse(out$predicted_cat==out$true_income,1,0)
# ddply(out,.(predicted_dum),nrow)
#   predicted_dum   V1
# 1             0 1007
# 2             1  471
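## Overall hit rate implied by the counts above, 471/(471+1007) ~ 0.319 (added cross-check)
mean(out$predicted_dum)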
## examples of probabilities calculated against observed & predicted categories using this method
## essentially the majority are allocated to band 4
# head(out)
#    true_income         pp1        pp2       pp3       pp4       pp5 predicted_cat predicted_dum
# 1            1 0.225094742 0.23912560 0.1712665 0.2564000 0.1081132             4             0
# 2            3 0.020366367 0.09399776 0.2354826 0.4401418 0.2100115             4             0
# 8            3 0.037214194 0.12067370 0.2321019 0.3690928 0.2409174             4             0
# 18           4 0.009786489 0.05460458 0.2638757 0.3738515 0.2978817             4             1
# 24           4 0.190800918 0.22274730 0.1890820 0.2766414 0.1207283             4             1
# 26           1 0.031743912 0.09372484 0.2716323 0.3721229 0.2307760             4             0
# ddply(out,.(predicted_dum,predicted_cat),nrow)
#    predicted_dum predicted_cat  V1
# 1              0             1  23
# 2              0             2  23
# 3              0             3  21
# 4              0             4 912
# 5              0             5  28
# 6              1             1  12
# 7              1             2   8
# 8              1             3   9
# 9              1             4 430
# 10             1             5  12
## classification table: income bands 1-5, built from the 10 entries above
predtable = ddply(out,.(predicted_dum,predicted_cat),nrow) ## renamed so base::table is not masked
table_T = ddply(out,.(true_income),nrow)
cltable = table_T
colnames(cltable) = c("income_range", "true_income")
cltable$correctPre = predtable[6:10,3]  ## rows 6-10: correct predictions per band
cltable$incorrectPre = predtable[1:5,3] ## rows 1-5: incorrect predictions, grouped by predicted band
cltable$correctPercent = cltable$correctPre/cltable$true_income ## correct predictions over all true outcomes in the band (recall)
cltable$overallPercent = cltable$correctPre/(cltable$correctPre + cltable$incorrectPre) ## correct over all predictions of the band (precision)
## grand correct percentage
Fstat = sum((cltable$correctPercent*cltable$overallPercent)^2) ## 0.085167; renamed from F to avoid masking FALSE
Fstat
### classification by random as a comparison
# cltable
#   income_range true_income correctPre incorrectPre correctPercent overallPercent
# 1            1         153         12           23     0.07843137      0.3428571
# 2            2         222          8           23     0.03603604      0.2580645
# 3            3         343          9           21     0.02623907      0.3000000
# 4            4         475        430          912     0.90526316      0.3204173
# 5            5         285         12           28     0.04210526      0.3000000
#
## The proportional-by-chance accuracy rate is:
## CA = sum((cltable$true_income/sum(cltable$true_income))^2)
## CA ## 0.2276
## The proportional-by-chance accuracy criterion is CA * 1.25 = 0.2276 * 1.25 = 0.2845 (not good!)
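## A runnable version of the benchmark stated in the header (added sketch): the model is
## judged useful if its overall hit rate beats the proportional-by-chance criterion CA * 1.25.
CA = sum((cltable$true_income/sum(cltable$true_income))^2) ## proportional by chance accuracy, ~0.2276
criterion = 1.25*CA ## ~0.2845
hitrate = sum(cltable$correctPre)/sum(cltable$true_income) ## overall correct rate, 471/1478 ~ 0.319
hitrate > criterion ## the comparison the benchmark calls for; NB most hits come from band 4 alone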