Commit c069e1bb authored by Ben Anderson's avatar Ben Anderson
Browse files

copied repo from github

parents
# Meta ----
# Script to load & analyse DECC NEED public use file data
# Data & documentation at:
# https://www.gov.uk/government/collections/national-energy-efficiency-data-need-framework
# code by: b.anderson@soton.ak.uk (@dataknut)
### Housekeeping ------------------------
# clear the workspace
rm(list=ls())
# set input file name
ifile <- "http://www.soton.ac.uk/~ba1e12/need_public_use_file_2014_2012_10pc_extract.csv"
### Required packages ----
#install.packages("data.table") # if needed
library(data.table)
#install.packages("car") # if needed
library(car)
### Load data ----
# load the data into a data table
# this may take a few seconds - it is quite large!
needPulf2014DT <- as.data.table(read.csv(ifile))
# how much data do we have?
dim(needPulf2014DT)
# create some factor variables ----
# see http://bit.ly/1LazmvE -> "Excel workbook explaining variable codes"
# The gcons*valid variable codes:
# G = Gas consumption invalid, greater than 50,000
# L = Gas consumption invalid, less than 100
# M = Gas consumption data is missing in source data
# 0 = Property does not have a gas connection
# V = Valid gas consumption (between 100 and 50,000 inclusive)
# NB - there are valid gas readings of '0' which presumably were > 100 but < 249 (first gas 'heap' = 'nearest 500')
# The Econs*valid variable codes:
# G = Electricity consumption invalid, greater than 25,000 (DECC lookup table says 50,000)
# L = Electricity consumption invalid, less than 100
# M = Electricity consumption data is missing in source dataset
# V = Valid electricity consumption (between 100 and 25,000 inclusive)
# test values
table(needPulf2014DT$gcons2012valid)
# all 4
needPulf2014DT$gcons2012valid <- factor(needPulf2014DT$gcons2012valid,
labels = c("> 50k",
"< 100",
"Missing",
"No gas",
"Valid 100 - 25k" ))
# test values
table(needPulf2014DT$econs2012valid)
# only two
needPulf2014DT$econs2012valid <- factor(needPulf2014DT$econs2012valid,
labels = c("> 25k",
#"< 100",
#"Missing",
"Valid 100 - 25k" ))
needPulf2014DT$main_heat_fuel <- factor(needPulf2014DT$main_heat_fuel,
labels = c("Gas", "Electricity"))
needPulf2014DT$e7flag2012 <- factor(needPulf2014DT$e7flag2012,
labels = c("Econ7 = Yes"))
needPulf2014DT$prop_age <- factor(needPulf2014DT$prop_age,
labels = c("before 1930",
"1930-1949",
"1950-1966",
"1967-1982",
"1983-1995",
"1996 onwards"))
needPulf2014DT$prop_type <- factor(needPulf2014DT$prop_type,
labels = c("Detached",
"Semi-detached",
"End terrace",
"Mid terrace",
"Bungalow",
"Flat (incl. maisonette)"))
needPulf2014DT$floor_area_band <- factor(needPulf2014DT$floor_area_band,
labels = c("1 to 50 m2",
"51-100 m2",
"101-150 m2",
"> 151 m2"))
needPulf2014DT$ee_band <- factor(needPulf2014DT$ee_band,
labels = c("A or B",
"C",
"D",
"E",
"F",
"G"))
needPulf2014DT$wall_cons <- factor(needPulf2014DT$wall_cons,
labels = c("Cavity",
"Other"))
# recode loft depth
needPulf2014DT$loft_depth <- factor(needPulf2014DT$loft_depth,
labels = c("Under 150 mm", "> 150 mm", "Unknown"),
) # 99 = NA (helpfully!)
# only 1 valid value
needPulf2014DT$li <- factor(needPulf2014DT$li,
labels = c("Has loft insulation"))
## Q0: Check out the data ----
# Q0.0: What are the variable names?
# Hint: use names()
names(needPulf2014DT)
# Q0.1: What kind of data do we have and what are the central tendencies?
# Hint: use summary() and the with() function so you don't have to keep typing the data table name
with(needPulf2014DT, summary(gcons2012))
with(needPulf2014DT, summary(econs2012))
# Note that there are NA in both - why?
# Hint: see above and also http://bit.ly/1LazmvE -> "Excel workbook explaining variable codes"
# Q0.2: How many properties do not have 'valid' elec/gas readings for 2012?
# Hint: use table(x,y) and the econsvalid variable
table(needPulf2014DT$econs2012valid, needPulf2014DT$gcons2012valid)
## Q1: Predicting gas consumption in 2012 ----
# Task: Run a linear regression model to understand the relationship between
# gas consumption in 2012 and dwelling characteristics
# First test your assumptions:
# Q1a: Is the outcome variable (gas consumption in 2012) normally distributed?
hist(needPulf2014DT$gcons2012)
qqnorm(needPulf2014DT$gcons2012); qqline(needPulf2014DT$gcons2012, col = 2)
shapiro.test(needPulf2014DT$gcons2012)
# why do you think that happened?
# so the gas data is not normally distributed - there is a long right tail
# Transform it using a sqrt function to try to fix this
needPulf2014DT$sqrtgcons2012 <- sqrt(needPulf2014DT$gcons2012)
# now check the results
hist(needPulf2014DT$sqrtgcons2012)
qqnorm(needPulf2014DT$sqrtgcons2012); qqline(needPulf2014DT$sqrtgcons2012, col = 2)
shapiro.test(needPulf2014DT$sqrtgcons2012)
# better?
# Q1b: Are the observations likely to be independent? Justify your answer
# Q1c: Run the linear regression model
# Hint: use gasModel <- lm(needPulf2014DT$sqrtgcons2012 ~ dwelling characteristics of your choice)
# Notice that 1 category within each of your factor variables 'vanishes' in the
# regression summary output. Why? Answer: r is comparing all the other categories to
# the 'vanished' contrast category - ask for help if this is not clear!
# Be careful if you include LOFT_DEPTH (why?)
# Hint: check variable frequencies for 'unknown'
# If you use an incremental hierarchical/nested approach be sure to compare the models
with(needPulf2014DT, summary(main_heat_fuel))
with(needPulf2014DT, summary(prop_age))
with(needPulf2014DT, summary(prop_type))
with(needPulf2014DT, summary(floor_area_band))
with(needPulf2014DT, summary(loft_depth))
gasModel1 <- lm(sqrtgcons2012 ~ prop_age + prop_type + floor_area_band,
data = needPulf2014DT)
summary(gasModel1)
# use confint to report confidence intervals with 99% level)
confint(gasModel1, level = 0.99)
# add LOFT_DEPTH to a different model
gasModel2 <- lm(sqrtgcons2012 ~ prop_age + prop_type + floor_area_band +
loft_depth,
data = needPulf2014DT)
summary(gasModel2)
# You have a very large sample so use confint to report confidence intervals at 99% level
# instead of the usual 95%
# Hint: see ?confint to find out how
confint(gasModel2, level = 0.999)
# compare the models
anova(gasModel1, gasModel2)
# Q1d: check for collinearity
# collinearity
# Hint: vif()
vif(gasModel1)
vif(gasModel2)
# if any values > 10 -> problem
# tolerance
1/vif(gasModel1)
1/vif(gasModel2)
# if any values < 0.2 -> possible problem
# if any values < 0.1 -> definitely a problem
# Q1e: check for homoskedasticity
# hint: library(car) and then a spreadLevelPlot()
spreadLevelPlot(gasModel1)
spreadLevelPlot(gasModel2)
# do we think the variance of the residuals is constant?
# why did the plot suggest a transformation?
# formal test
ncvTest(gasModel1)
ncvTest(gasModel2)
# if p > 0.05 then there is heteroskedasticity
# Q1f: Normality of residuals
# Hint: library(car) and then qqPlot
qqPlot(gasModel1) # shows default 95% CI
qqPlot(gasModel2) # shows default 95% CI
# Q1g autocorrelation/independence of errors
durbinWatsonTest(gasModel1)
durbinWatsonTest(gasModel2)
# if p < 0.05 then a problem as implies autocorrelation
# What do you conclude from the tests and the models?
# If necessary adjust your model and re-run
# save your results out so you can put them into an excel table/chart etc
# these will be saved to the folder reported by getwd()
print(paste0("* Look for your results in: ",getwd()))
# Hint:
# conf ints
write.csv(confint(gasModel1),
file = "gasModel1_confint.csv")
write.csv(confint(gasModel2),
file = "gasModel2_confint.csv")
# coefficients
write.csv(coef(gasModel1),
file = "gasModel1_coef.csv")
write.csv(coef(gasModel2),
file = "gasModel2_coef.csv")
# Q1 conclusion: which factors have the strongest effect on gas consumption?
# Q2: Probability of being a low gas household ----
# Task: Run a logistic regression model to analyse the dwelling characteristics that
# predict low gas consumption where 'low' = in the lowest 25% of gas consumption
# first create dummy (binary) variable to flag cases in the lowest 25%
# check value at which to cut for lowest 25%
quantile(needPulf2014DT$gcons2012, na.rm = TRUE)
# 18000
needPulf2014DT$lowgcons2012 <- as.factor(
ifelse(needPulf2014DT$gcons2012 < 8500,
c("Low gas"), c("Other"))
)
# re-order the factor so the logistic regression treats 'low gas' as the
# category of interest!
needPulf2014DT$lowgcons2012 = factor(needPulf2014DT$lowgcons2012,
levels(needPulf2014DT$lowgcons2012)[c(2,1)])
# check
needPulf2014DT[, mean(gcons2012, na.rm = TRUE), by = lowgcons2012]
# Q2a: Run your logistic regression model and interpret the results
# Hint: use glm(formula = Higcons2012 ~ dwelling characteristics of your choice, family = family = binomial(logit), needPulf2014DT)
loGasModel1 <- glm(formula = lowgcons2012 ~ prop_age + prop_type + floor_area_band +
loft_depth,
family = binomial(logit), needPulf2014DT)
# use summary to report the log odds
# how do we interpret them?
# Hint: http://www.ats.ucla.edu/stat/stata/library/logit_wgould.htm might help?
summary(loGasModel1)
# use confint to report confidence intervals with 99% level)
confint(loGasModel1, level = 0.99)
# convert the log odds given by summary() to odds rations (easier to understand)
# How do we interpret odds ratios?
# Hint: http://www.ats.ucla.edu/stat/stata/library/logit_wgould.htm might help?
exp(cbind(OR = coef(loGasModel1), confint(loGasModel1)))
# Q2b:What diagnostics should you use (they are not the same as for OLS :-)?
# Hint: http://scg.sdsu.edu/logit_r/ might help? Don't try the ROC curve unless you really
# want to.
# Run diagnostics and if necessary adjust your model
# vif, durbinwatson, crPlots
vif(loGasModel1)
durbinWatsonTest(loGasModel1)
crPlots(loGasModel1)
# Q2 conclusion: which factors have the greatest effect on the probability of being a low gas household?
# If you have time:
# Q3: Probability of being in any particular gas consumption quartile ----
# Which factors have the greatest effect on the probability of being in the top 25%
# of gas consumption compared to being in any other quartile?
# Hint: Convert the original gas consumption variable to quartiles and run the an ordinal
# regression model.
quantile(needPulf2014DT$gcons2012, na.rm = TRUE)
# now use those values as the quartile cut points
needPulf2014DT$quartgcons2012[needPulf2014DT$gcons2012 <= 8500] <- 1
needPulf2014DT$quartgcons2012[needPulf2014DT$gcons2012 > 8500 &
needPulf2014DT$gcons2012 <= 12600] <- 2
needPulf2014DT$quartgcons2012[needPulf2014DT$gcons2012 > 12600 &
needPulf2014DT$gcons2012 <= 17500] <- 3
needPulf2014DT$quartgcons2012[needPulf2014DT$gcons2012 > 17500 ] <- 4
# convert to factors
needPulf2014DT$quartgcons2012 <- factor(needPulf2014DT$quartgcons2012,
labels = c("< 25%",
"25% - 50%",
"50% - 75%",
"> 75%"))
# check
needPulf2014DT[, mean(gcons2012, na.rm = TRUE), by = quartgcons2012]
# run model
# Hint: you will need to install & load the MASS package first
# install.packages("MASS") # if needed
# see http://www.ats.ucla.edu/stat/r/dae/ologit.htm for guidance/interpretation
library(MASS)
# then use polr(quartgcons2012 ~ dwelling characteristics of your choice, data = needPulf2014DT)
ordGasModel1 <- polr(quartgcons2012 ~ prop_age + prop_type + floor_area_band +
loft_depth,
data = needPulf2014DT)
## view a summary of the model
summary(ordGasModel1)
confint(ordGasModel1)
# as for the logit model, use odds ratios to make interpretation easier
ci <- confint(ordGasModel1)
# this may take some time!
exp(cbind(OR = coef(ordGasModel1), ci))
# what diagnostics should you use?
# Run them and adjust your model if needed.
# Q3 Conclusion: Which factors have the greatest effect on the probability of being in the top 25%
# of gas consumption compared to being in any other quartile?
\ No newline at end of file
# Meta ----
# Script to load & analyse DECC NEED public use file data
# Data & documentation at:
# https://www.gov.uk/government/collections/national-energy-efficiency-data-need-framework
# code by: b.anderson@soton.ak.uk (@dataknut)
### Housekeeping ------------------------
# clear the workspace
rm(list=ls())
# set input file name
ifile <- "http://www.soton.ac.uk/~ba1e12/need_public_use_file_2014_2012_10pc_extract.csv"
### Required packages ----
#install.packages("data.table") # if needed
library(data.table)
#install.packages("car") # if needed
library(car)
### Load data ----
# load the data into a data table
# this may take a few seconds - it is quite large!
needPulf2014DT <- as.data.table(read.csv(ifile))
# how much data do we have?
dim(needPulf2014DT)
# create some factor variables ----
# see http://bit.ly/1LazmvE -> "Excel workbook explaining variable codes"
# The gcons*valid variable codes:
# G = Gas consumption invalid, greater than 50,000
# L = Gas consumption invalid, less than 100
# M = Gas consumption data is missing in source data
# 0 = Property does not have a gas connection
# V = Valid gas consumption (between 100 and 50,000 inclusive)
# NB - there are valid gas readings of '0' which presumably were > 100 but < 249 (first gas 'heap' = 'nearest 500')
# The Econs*valid variable codes:
# G = Electricity consumption invalid, greater than 25,000 (DECC lookup table says 50,000)
# L = Electricity consumption invalid, less than 100
# M = Electricity consumption data is missing in source dataset
# V = Valid electricity consumption (between 100 and 25,000 inclusive)
# test values
table(needPulf2014DT$gcons2012valid)
# all 4
needPulf2014DT$gcons2012valid <- factor(needPulf2014DT$gcons2012valid,
labels = c("> 50k",
"< 100",
"Missing",
"No gas",
"Valid 100 - 25k" ))
# test values
table(needPulf2014DT$econs2012valid)
# only two
needPulf2014DT$econs2012valid <- factor(needPulf2014DT$econs2012valid,
labels = c("> 25k",
#"< 100",
#"Missing",
"Valid 100 - 25k" ))
needPulf2014DT$main_heat_fuel <- factor(needPulf2014DT$main_heat_fuel,
labels = c("Gas", "Electricity"))
needPulf2014DT$e7flag2012 <- factor(needPulf2014DT$e7flag2012,
labels = c("Econ7 = Yes"))
needPulf2014DT$prop_age <- factor(needPulf2014DT$prop_age,
labels = c("before 1930",
"1930-1949",
"1950-1966",
"1967-1982",
"1983-1995",
"1996 onwards"))
needPulf2014DT$prop_type <- factor(needPulf2014DT$prop_type,
labels = c("Detached",
"Semi-detached",
"End terrace",
"Mid terrace",
"Bungalow",
"Flat (incl. maisonette)"))
needPulf2014DT$floor_area_band <- factor(needPulf2014DT$floor_area_band,
labels = c("1 to 50 m2",
"51-100 m2",
"101-150 m2",
"> 151 m2"))
needPulf2014DT$ee_band <- factor(needPulf2014DT$ee_band,
labels = c("A or B",
"C",
"D",
"E",
"F",
"G"))
needPulf2014DT$wall_cons <- factor(needPulf2014DT$wall_cons,
labels = c("Cavity",
"Other"))
# recode loft depth
needPulf2014DT$loft_depth <- factor(needPulf2014DT$loft_depth,
labels = c("Under 150 mm", "> 150 mm", "Unknown"),
) # 99 = NA (helpfully!)
# only 1 valid value
needPulf2014DT$li <- factor(needPulf2014DT$li,
labels = c("Has loft insulation"))
## Q0: Check out the data ----
# Q0.0: What are the variable names?
# Hint: use names()
names(needPulf2014DT)
# Q0.1: What kind of data do we have and what are the central tendencies?
# Hint: use summary() and the with() function so you don't have to keep typing the data table name
with(needPulf2014DT, summary(gcons2012))
with(needPulf2014DT, summary(econs2012))
# Note that there are NA in both - why?
# Hint: see above and also http://bit.ly/1LazmvE -> "Excel workbook explaining variable codes"
# Q0.2: How many properties do not have 'valid' elec/gas readings for 2012?
# Hint: use table(x,y) and the econsvalid variable
table(needPulf2014DT$econs2012valid, needPulf2014DT$gcons2012valid)
## Q1: Predicting gas consumption in 2012 ----
# Task: Run a linear regression model to understand the relationship between
# gas consumption in 2012 and dwelling characteristics
# First test your assumptions:
# Q1a: Is the outcome variable (gas consumption in 2012) normally distributed?
# why do you think that happened?
# so the gas data is not normally distributed - there is a long right tail
# Transform it using a sqrt function to try to fix this
# now check the results
# better?
# Q1b: Are the observations likely to be independent? Justify your answer
# Q1c: Run the linear regression model
# Hint: use gasModel <- lm(needPulf2014DT$sqrtgcons2012 ~ dwelling characteristics of your choice)
# Notice that 1 category within each of your factor variables 'vanishes' in the
# regression summary output. Why? Answer: r is comparing all the other categories to
# the 'vanished' contrast category - ask for help if this is not clear!
# Be careful if you include LOFT_DEPTH (why?)
# Hint: check variable frequencies for 'unknown'
# If you use an incremental hierarchical/nested approach be sure to compare the models
# use confint to report confidence intervals with 99% level)
# You have a large sample so use confint to report confidence intervals at 99% level
# instead of the usual 95%
# Hint: see ?confint to find out how
# compare the models
# Q1d: check for collinearity
# collinearity
# Hint: vif()
# tolerance
# Q1e: check for homoskedasticity
# hint: library(car) and then a spreadLevelPlot()
# do we think the variance of the residuals is constant?
# why did the plot suggest a transformation?
# formal test
# Q1f: Normality of residuals
# Hint: library(car) and then qqPlot
# Q1g autocorrelation/independence of errors
# What do you conclude from the tests and the models?
# If necessary adjust your model and re-run
# save your results out so you can put them into an excel table/chart etc
# these will be saved to the folder reported by getwd()
print(paste0("* Look for your results in: ",getwd()))
# Hint:
# conf ints
write.csv(confint(my_final_model),
file = "my_final_model_confint.csv")
# coefficients
write.csv(coef(my_final_model),
file = "my_final_model_coef.csv")
# Q1 conclusion: which factors have the strongest effect on gas consumption?
# Meta ----
# Script to load & analyse DECC NEED public use file data
# Data & documentation at:
# https://www.gov.uk/government/collections/national-energy-efficiency-data-need-framework
# code by: b.anderson@soton.ak.uk (@dataknut)
### Housekeeping ------------------------
# clear the workspace
rm(list=ls())
# set input file name
#ifile <- "~/OneDrive - University of Southampton/PG/Southampton/FEEG6025 Data Analysis & Experimental Methods for Engineers/Data/need_public_use_file_2014_2012_10pc_extract.csv"
ifile <- "http://www.soton.ac.uk/~ba1e12/need_public_use_file_2014_2012_10pc_extract.csv"
# which week is this?
week <- "week8"
### Required packages ----
#install.packages("data.table") # if needed
library(data.table)
#install.packages("car") # if needed
library(car)
### Load data ----
# load the data into a data table
# this may take a few seconds - it is quite large!
needPulf2014DT <- as.data.table(read.csv(ifile))
# how much data do we have?
dim(needPulf2014DT)
# create some factor variables ----
# see http://bit.ly/1LazmvE -> "Excel workbook explaining variable codes"
# Valid gas values?
needPulf2014DT$gcons2012valid <- factor(needPulf2014DT$gcons2012valid,
labels = c("> 50k",
"< 100",
"Missing",
"No gas",
"Valid 100 - 25k" ))
# Valid electricity values?