analyse-NEED-EULF-2014-models.do 7.19 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
* Script to analyse DECC's NEED data to:
* investigate % variance of energy consumption due to dwelling type variables as a way to infer the % of variance due to people

* NB this script uses 2 data files derived from the original data using the 'process' script

* Original data available from: UK DATA ARCHIVE: Study Number 7518 - National Energy Efficiency Data-Framework, 2014
* http://discover.ukdataservice.ac.uk/catalogue/?sn=7518

* Ben Anderson, Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton
* b.anderson@soton.ac.uk
* (c) University of Southampton

* Unless there is a different license file in the folder in which this script is found, the Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) license applies
* http://creativecommons.org/licenses/by-nc/4.0/

clear all

capture noisily log close

* written for Mac OSX - remember to change filesystem delimiter for other platforms
local home "/Users/ben/Documents"
local proot "`home'/Work/Data/Social Science Datatsets/DECC"
* for clam
* local proot "`home'/Work/NEED"
local dpath "`proot'/NEED/End User Licence File 2014/processed"
local rpath "`proot'/results/NEED/"

local version "v1.1"

set more off

log using "`rpath'/analyse-NEED-EULF-2014-models-`version'-$S_DATE.smcl", replace

* use the pre-processed wide form file which contains all years of consumption data but not the constant values which are in the xwave file
use "`dpath'/need_eul_may2014_consumptionfile_wide.dta", clear

* we're goinmg to use 2012 data only

keep HH_ID *2012*

* log the consumption as it's very skewed -> becomes semi-normal & OK for linear regression
* Gcons = gas
* Econs = Electricity
* Presumably those without gas use oil or electricity for heating - we don't have oil so we should probably restrict analysis to gas-using hosueholds only to avoid this confounding factor?

* check what's valid
tab Gcons2012Valid Econs2012Valid, mi // what does G,L,M mean? Presumably O = off gas?
tabstat Gcons2012, by(Gcons2012Valid) s(mean min max n)

keep if Gcons2012Valid == "V"

gen log_Gcons2012 = log(Gcons2012)
gen log_Econs2012 = log(Econs2012)

* combine consumption
* treat missing (gas) as 0
egen Allcons2012 = rowtotal(Gcons2012 Econs2012)

gen log_Allcons2012 = log(Allcons2012)
* create log consumption quintiles
egen quinlog_Allcons2012 = cut(log_Allcons2012), group(5)
egen quinlog_Gcons2012 = cut(log_Gcons2012), group(5)
egen quinlog_Econs2012 = cut(log_Econs2012), group(5)

* merge in the pre-processed cross-year fixed values file
merge 1:1 HH_ID using "`dpath'/need_eul_may2014_xwavefile.dta"

* fix some of the variables

* combine IMD: this is a bit dodgy as they are not strictly comparable
gen ba_imd = IMD_ENG
replace ba_imd = IMD_WALES if ba_imd == .

* must use as category variables!!
* set unkown to be 10 -> adds to end of contrasts so can see effect
replace LOFT_DEPTH = 10 if LOFT_DEPTH == .

* set unkown to be 2020 -> adds to end of contrasts so can see effect
replace BOILER_YEAR = 2020 if BOILER_YEAR == .
replace CWI_YEAR = 2020 if CWI_YEAR == .
replace LI_YEAR  = 2020 if LI_YEAR  == .

* 0 = no
destring BOILER, force replace
replace BOILER = 0 if BOILER == .

* household level vars
local generic_hvars "i.BOILER_YEAR i.MAIN_HEAT_FUEL i.LI_YEAR i.LOFT_DEPTH i.FLOOR_AREA_BAND WALL_CONS i.CWI_YEAR i.PROP_TYPE i.PROP_AGE i.EE_BAND "
local generic_hvarsnp "i.BOILER_YEAR i.MAIN_HEAT_FUEL i.LI_YEAR i.LOFT_DEPTH i.FLOOR_AREA_BAND WALL_CONS i.CWI_YEAR i.PROP_AGE i.EE_BAND "

* area level vars
local generic_rvars "i.ba_region i.ba_imd"

* define different property types
local ptypes "101 102 103 104 105 106"
local pt101 "detached"
local pt102 "semi"
local pt103 "end_terr"
local pt104 "mid_terr"
local pt105 "bung"
local pt106 "flat"

* now loop over the energy types
local vars "Gcons Econs Allcons"
foreach v of local vars {
	* check distributions of original consumption values
	histogram `v'2012, by(MAIN_HEAT_FUEL, total) name(histo_`v')
	tabstat `v'2012, by(MAIN_HEAT_FUEL) s(n mean min max)
	
	* all hhs model
	qui: regress log_`v'2012 `generic_hvars' ///
		`generic_rvars' ///
		i.BOILER_YEAR
	
	est store rlog_`v'2012
	di "* -> `v' estat to test for heteroskedasticity & omitted vars"
	estat ovtest
	estat hettest
	
	* we ought to be testing for linearity too
	di "* -> `v' linktest to test for model specification"
	di "* if p of _hatsq < 0.05 -> mis-spec"
	di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm"
	linktest	
	
	* models by property type - to see if rsq & coefficients vary
	foreach p of local ptypes {
		di "* -> testing log_`v'2012 for `pt`p''"
		qui: regress log_`v'2012 `generic_hvarsnp' ///
			`generic_rvars'	///
			i.BOILER_YEAR ///
			if PROP_TYPE == `p'
		est store rlog_`v'2012_`pt`p''
		
		di "* -> `v' 2012 `pt`p'' - estat to test for heteroskedasticity & omitted vars"
		estat ovtest
		estat hettest
		
		* we ought to be testing for linearity too
		di "* -> `v' `pt`p'' linktest to test for model specification"
		di "* if p of _hatsq < 0.05 -> mis-spec"
		di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm"
		linktest	
	}
	* models for different consumption quintiles - to see if rsq & coefficients vary
	foreach q of numlist 0/4 {
		di "* -> testing log_`v'2012 for quintile: `q'"
		qui: regress log_`v'2012 `generic_hvars' ///
			`generic_rvars'	///
			i.BOILER_YEAR ///
			if quinlog_`v'2012 == `q'
		est store rlog_`v'2012q`q'
		
		di "* -> quintile: `q' - estat to test for heteroskedasticity & omitted vars"
		estat ovtest
		estat hettest
		
		* we ought to be testing for linearity too
		di "* -> quintile: `q' - linktest"
		di "* if p of _hatsq < 0.05 -> mis-spec"
		di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm"
		linktest	
	}
}

* output all the results - that's a lot of t tests!
* we could put them all out in one file but it would be really hard to find the ones you want!

estout rlog_Gcons2012 using "`rpath'/NEED-EULF-2014-log-gas-model-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
estout rlog_Gcons2012q* using "`rpath'/NEED-EULF-2014-log-gas-models-quintiles-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
estout rlog_Gcons2012_* using "`rpath'/NEED-EULF-2014-log-gas-models-by-property-type-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
estout rlog_Econs2012 using "`rpath'/NEED-EULF-2014-log-elec-model-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
estout rlog_Econs2012q* using "`rpath'/NEED-EULF-2014-log-elec-models-quintiles-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
estout rlog_Econs2012_* using "`rpath'/NEED-EULF-2014-log-elec-models-by-property-type-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
estout rlog_Allcons2012 using "`rpath'/NEED-EULF-2014-log-energy-model-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
estout rlog_Allcons2012q* using "`rpath'/NEED-EULF-2014-log-energy-models-quintiles-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
estout rlog_Allcons2012_* using "`rpath'/NEED-EULF-2014-log-energy-models-by-property-type-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)

di "* Done!"

log close