From 5d05c34ee15684602393a22a0db4011ecae572e7 Mon Sep 17 00:00:00 2001
From: Ben Anderson <b.anderson@soton.ac.uk>
Date: Wed, 17 Sep 2014 15:42:35 +0100
Subject: [PATCH] updated readme and models

---
 NEED/analyse-NEED-EULF-2014-descriptives.do   |   9 +-
 ... => analyse-NEED-EULF-2014-models-v1.1.do} |  10 +-
 NEED/analyse-NEED-EULF-2014-models-v2.0.do    | 238 ++++++++++++++++++
 3 files changed, 253 insertions(+), 4 deletions(-)
 rename NEED/{analyse-NEED-EULF-2014-models.do => analyse-NEED-EULF-2014-models-v1.1.do} (96%)
 create mode 100644 NEED/analyse-NEED-EULF-2014-models-v2.0.do

diff --git a/NEED/analyse-NEED-EULF-2014-descriptives.do b/NEED/analyse-NEED-EULF-2014-descriptives.do
index 858cc1b..4b0b538 100644
--- a/NEED/analyse-NEED-EULF-2014-descriptives.do
+++ b/NEED/analyse-NEED-EULF-2014-descriptives.do
@@ -38,8 +38,13 @@ table EconsValid year, c(count Econs min Econs mean Econs max Econs)
 table GconsValid year, c(count Gcons min Gcons mean Gcons max Gcons)
 
 * distributions
-histogram Econs, by(year) name(histo_econs)
-histogram Gcons, by(year) name(histo_gcons)
+local vars "Econs Gcons" // consumption variables: one histogram and one box plot each, by year
+foreach v of local vars {
+	histogram `v', by(year) name(histo_`v')
+	graph export "`rpath'/NEED-EULF-2014-histo_`v'_by_year.png", replace // filename follows graph export directly, no using keyword
+	graph box `v', by(year) name(box_`v')
+	graph export "`rpath'/NEED-EULF-2014-box_`v'_by_year.png", replace // exports the box plot just drawn
+}
 
 di "* Done!"
 
diff --git a/NEED/analyse-NEED-EULF-2014-models.do b/NEED/analyse-NEED-EULF-2014-models-v1.1.do
similarity index 96%
rename from NEED/analyse-NEED-EULF-2014-models.do
rename to NEED/analyse-NEED-EULF-2014-models-v1.1.do
index afa9f78..cf12a97 100644
--- a/NEED/analyse-NEED-EULF-2014-models.do
+++ b/NEED/analyse-NEED-EULF-2014-models-v1.1.do
@@ -25,7 +25,13 @@ local proot "`home'/Work/Data/Social Science Datatsets/DECC"
 local dpath "`proot'/NEED/End User Licence File 2014/processed"
 local rpath "`proot'/results/NEED/"
 
-local version "v1.1"
+* local version "v1.0"
+* initial models - all households for electricity models
+
+local version "v1.1" // must be named version: the log using line below expands this macro into the log filename
+* restrict to gas only households to avoid complications of:
+* - primary electric heating (presumably)
+* - oil heating
 
 set more off
 
@@ -34,7 +40,7 @@ log using "`rpath'/analyse-NEED-EULF-2014-models-`version'-$S_DATE.smcl", replac
 * use the pre-processed wide form file which contains all years of consumption data but not the constant values which are in the xwave file
 use "`dpath'/need_eul_may2014_consumptionfile_wide.dta", clear
 
-* we're goinmg to use 2012 data only
+* we're going to use 2012 data only
 
 keep HH_ID *2012*
 
diff --git a/NEED/analyse-NEED-EULF-2014-models-v2.0.do b/NEED/analyse-NEED-EULF-2014-models-v2.0.do
new file mode 100644
index 0000000..8cb9c0b
--- /dev/null
+++ b/NEED/analyse-NEED-EULF-2014-models-v2.0.do
@@ -0,0 +1,238 @@
+* Script to analyse DECC's NEED data to:
+* investigate % variance of energy consumption due to dwelling type variables as a way to infer the % of variance due to people
+
+* NB this script uses 2 data files derived from the original data using the 'process' script
+
+* Original data available from: UK DATA ARCHIVE: Study Number 7518 - National Energy Efficiency Data-Framework, 2014
+* http://discover.ukdataservice.ac.uk/catalogue/?sn=7518
+
+* Ben Anderson, Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton
+* b.anderson@soton.ac.uk
+* (c) University of Southampton
+
+* Unless there is a different license file in the folder in which this script is found, the Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) license applies
+* http://creativecommons.org/licenses/by-nc/4.0/
+
+clear all
+
+capture noisily log close
+
+* written for Mac OSX - remember to change filesystem delimiter for other platforms
+local home "/Users/ben/Documents"
+local proot "`home'/Work/Data/Social Science Datatsets/DECC"
+* for clam
+* local proot "`home'/Work/NEED"
+local dpath "`proot'/NEED/End User Licence File 2014/processed"
+local rpath "`proot'/results/NEED"
+
+*local version "1.0"
+* initial models - all households for electricity models
+
+*local version "1.1"
+* restrict to gas only households to avoid complications of:
+* - primary electric heating (presumably)
+* - oil heating
+
+*local version "v2a_1pc"
+*local sample 1
+*local sampleby "EE_BAND PROP_TYPE"
+* changed from using log consumption to consumption decile to avoid complications due to variable rounding ranges in original data (see readme)
+* restricted analysis to households where gas is main heat source as it is better predicted by variables included & is more relevant to EPC (heat)
+* uses 1% sample (c 30k) making sure keep proportions of property type and EE_Band to see if linktest fails with smaller n
+
+*local version "v2b_10pc"
+*local sample 10
+*local sampleby "EE_BAND PROP_TYPE"
+* uses 10% sample (c 300k) making sure keep proportions of property type and EE_Band to see if margin plots and co-efficients are the same 
+* (linktest etc will probably now fail due to larger n)
+
+local version "v2c_full"
+local sample 100
+local sampleby "EE_BAND PROP_TYPE"
+* uses the full sample (100% of every EE_BAND x PROP_TYPE cell) to see if margin plots and co-efficients are the same as in the 1% and 10% runs
+* (linktest etc will probably now fail due to larger n)
+
+set more off
+
+log using "`rpath'/analyse-NEED-EULF-2014-models-`version'-$S_DATE.smcl", replace
+
+* use the pre-processed wide form file which contains all years of consumption data but not the constant values which are in the xwave file
+use "`dpath'/need_eul_may2014_consumptionfile_wide.dta", clear
+
+* we're going to use 2012 data only
+
+keep HH_ID *2012*
+
+* merge in the pre-processed cross-year fixed values file
+merge 1:1 HH_ID using "`dpath'/need_eul_may2014_xwavefile.dta"
+
+* check what's valid
+tab Gcons2012Valid Econs2012Valid, mi // O = off gas, V = valid, L = too low, G = too big, M = missing
+tabstat Gcons2012, by(Gcons2012Valid) s(mean min max n)
+* do off-gas use a lot more electricity (heating)?
+tabstat Econs2012, by(Gcons2012Valid) s(mean min max n)
+
+histogram Gcons2012, by(MAIN_HEAT_FUEL, total) name(histo_Gcons2012)
+graph export "`rpath'/histo_Gcons2012_by_main_heating_fuel.png", replace
+
+tabstat Gcons2012, by(MAIN_HEAT_FUEL) s(n mean min max)
+
+* keep if valid gas & gas = main heat fuel
+keep if Gcons2012Valid == "V" & MAIN_HEAT_FUEL == 1
+
+***** random sample: keeps the proportions of sampleby; seed is fixed so reruns draw the same households ****
+set seed 20140917
+di "* Keeping `sample'% sample by `sampleby'"
+sample `sample', by(`sampleby')
+
+tab `sampleby', mi
+
+* log the consumption as it's very skewed -> becomes semi-normal & OK for linear regression
+* Gcons = gas
+* Econs = Electricity
+
+* create log & deciles
+* log - creates a normal distribution
+* deciles - avoids the consumption rounding range differences (hopefully)
+gen log_Gcons2012 = log(Gcons2012)
+egen Gcons2012_dec = cut(Gcons2012), group(10)
+gen log_Econs2012 = log(Econs2012)
+egen Econs2012_dec = cut(Econs2012), group(10)
+
+* combine consumption
+* treat missing (gas) as 0
+egen Allcons2012 = rowtotal(Gcons2012 Econs2012)
+
+*gen log_Allcons2012 = log(Allcons2012)
+egen Allcons2012_dec = cut(Allcons2012), group(10)
+
+* create log consumption quintiles
+*egen quinlog_Allcons2012 = cut(log_Allcons2012), group(5)
+*egen quinlog_Gcons2012 = cut(log_Gcons2012), group(5)
+*egen quinlog_Econs2012 = cut(log_Econs2012), group(5)
+
+* fix some of the variables
+
+* combine IMD: this is a bit dodgy as they are not strictly comparable
+gen ba_imd = IMD_ENG
+replace ba_imd = IMD_WALES if ba_imd == .
+
+* must use as category variables!!
+* set unknown to be 10 -> adds to end of contrasts so can see effect
+replace LOFT_DEPTH = 10 if LOFT_DEPTH == .
+
+* set unknown to be 2020 -> adds to end of contrasts so can see effect
+replace BOILER_YEAR = 2020 if BOILER_YEAR == .
+replace CWI_YEAR = 2020 if CWI_YEAR == .
+replace LI_YEAR  = 2020 if LI_YEAR  == .
+
+* 0 = no
+destring BOILER, force replace
+replace BOILER = 0 if BOILER == .
+
+* household level vars - all entered as factor (i.) variables; generic_hvarsnp is the same list without PROP_TYPE, for the per-property-type models
+local generic_hvars "i.BOILER_YEAR i.MAIN_HEAT_FUEL i.LI_YEAR i.LOFT_DEPTH i.FLOOR_AREA_BAND i.WALL_CONS i.CWI_YEAR i.PROP_TYPE i.PROP_AGE i.EE_BAND"
+local generic_hvarsnp "i.BOILER_YEAR i.MAIN_HEAT_FUEL i.LI_YEAR i.LOFT_DEPTH i.FLOOR_AREA_BAND i.WALL_CONS i.CWI_YEAR i.PROP_AGE i.EE_BAND"
+
+* area level vars - region and the combined England/Wales IMD variable, both as factor variables
+local generic_rvars "i.ba_region i.ba_imd"
+
+* define different property types
+local ptypes "101 102 103 104 105 106"
+local pt101 "detached"
+local pt102 "semi"
+local pt103 "end_terr"
+local pt104 "mid_terr"
+local pt105 "bung"
+local pt106 "flat"
+
+* now loop over the energy types & run linear regression models
+* NB - the rounding of the consumption values may lead to modelling problems
+
+* add Econs Allcons for electricity & sum of both
+* rename so graph names don't break
+rename log_Gcons2012 lg2012
+rename Gcons2012_dec g2012dec
+local vars "lg2012 g2012dec"
+foreach v of local vars {
+	* v is the dependent variable: lg2012 (log gas consumption) or g2012dec (gas consumption decile)
+	
+	* all hhs model - property type enters as a predictor via generic_hvars
+	qui: regress `v' `generic_hvars' ///
+		`generic_rvars' ///
+		i.BOILER_YEAR // NOTE(review): i.BOILER_YEAR is already in generic_hvars - looks redundant, confirm
+	
+	est store `v'
+	di "* -> `v' estat to test for heteroskedasticity & omitted vars"
+	estat ovtest
+	estat hettest
+	
+	* we ought to be testing for linearity too
+	di "* -> `v' linktest to test for model specification"
+	di "* if p of _hatsq < 0.05 -> mis-spec"
+	di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm"
+	linktest	
+	
+	di "* test EPC margins for `v'"
+	margins EE_BAND
+	marginsplot, name(mplot_`v'_EE_BAND)
+	graph export "`rpath'/mplot_`v'_EE_BAND-`version'.png", replace
+	
+	* models by property type (generic_hvarsnp = same predictors without PROP_TYPE) - to see if rsq & coefficients vary
+	foreach p of local ptypes {
+		di "* -> testing `v' for `pt`p''"
+		qui: regress `v' `generic_hvarsnp' ///
+			`generic_rvars'	///
+			i.BOILER_YEAR ///
+			if PROP_TYPE == `p'
+		est store `v'_`pt`p''
+		
+		di "* -> `v' 2012 `pt`p'' - estat to test for heteroskedasticity & omitted vars"
+		estat ovtest
+		estat hettest
+		
+		* we ought to be testing for linearity too
+		di "* -> `v' `pt`p'' linktest to test for model specification"
+		di "* if p of _hatsq < 0.05 -> mis-spec"
+		di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm"
+		linktest
+		di "* test EPC margins for `v' (`pt`p'')"
+		margins EE_BAND
+		marginsplot, name(mplot_`v'_EE_BAND_`pt`p'')
+		graph export "`rpath'/mplot_`v'_EE_BAND_`pt`p''-`version'.png", replace
+
+	}
+	* models for different consumption quintiles - disabled: the block below is commented out and the quinlog_ variables it needs are not created above
+	/* doesn't make much sense to do this if using deciles as dependent variable
+	foreach q of numlist 0/4 {
+		di "* -> testing log_`v'2012 for quintile: `q'"
+		qui: regress log_`v'2012 `generic_hvars' ///
+			`generic_rvars'	///
+			i.BOILER_YEAR ///
+			if quinlog_`v'2012 == `q'
+		est store rlog_`v'2012q`q'
+		
+		di "* -> quintile: `q' - estat to test for heteroskedasticity & omitted vars"
+		estat ovtest
+		estat hettest
+		
+		* we ought to be testing for linearity too
+		di "* -> quintile: `q' - linktest"
+		di "* if p of _hatsq < 0.05 -> mis-spec"
+		di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm"
+		linktest	
+	}
+	*/
+}
+
+* output all the results - that's a lot of t tests!
+* we could put them all out in one file but it would be really hard to find the ones you want!
+estout lg2012 using "`rpath'/NEED-EULF-2014-log-gas-model-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
+estout lg2012_* using "`rpath'/NEED-EULF-2014-log-gas-models-by-property-type-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
+
+estout g2012dec using "`rpath'/NEED-EULF-2014-gas-deciles-model-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
+estout g2012dec_* using "`rpath'/NEED-EULF-2014-gas-deciles-models-by-property-type-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
+
+di "* Done!"
+
+log close
-- 
GitLab