analyse-NEED-EULF-2014-electricity-consumption.do 14.5 KB
Newer Older
1
2
*******************************************
* Script to:
Ben Anderson's avatar
Ben Anderson committed
3
* - analyse DECC's EULF 2014 NEED data to examine distributions etc
4
5
6
7
8
9
10
11

* Original data available from: UK DATA ARCHIVE: Study Number 7518 - National Energy Efficiency Data-Framework, 2014
* http://discover.ukdataservice.ac.uk/catalogue/?sn=7518

* Most recent version of this script can be found at https://github.com/dataknut/DECC-data/tree/master/NEED
* The script requires the following to have been run first:
* https://github.com/dataknut/DECC-data/blob/master/NEED/process-NEED-EULF-2014.do

Ben Anderson's avatar
Ben Anderson committed
12
/*
13
14
15

Copyright (C) 2014  University of Southampton

Ben Anderson's avatar
Ben Anderson committed
16
Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut, https://github.com/dataknut)
17
18
19
20
	[Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Ben Anderson's avatar
Ben Anderson committed
21
the Free Software Foundation; either version 2 of the License
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
(http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

#YMMV - http://en.wiktionary.org/wiki/YMMV

*/

clear all

capture noisily log close

set more off

* written for Mac OSX - remember to change filesystem delimiter for other platforms
40
global home "~/Documents/Work"
41

42
43
global dpath "$home/Data/Social Science Datasets/DECC/NEED/End User Licence File 2014/processed"
global rpath "$home/Papers and Conferences/RSS-2015/results"
44
45

local version "v1"
46
47
48

log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace

49
50
51
* set sample
* 100 = 100pc
* etc
52
local sample = 100
53

54
55
56
57
58
59
* use weights (leave blank to ignore)
/*
"Weighting based on Region, property age, property type and floor area band. 
Summing all weights gives (approximate) total number of households in England and Wales 2011."
*/
local need_weight = "[pw = WEIGHT]" // use probability weights
60

61
62
63
64
65
66
* control what happens
local do_desc = 0 // do descriptives
local do_histo = 0 // toggle graph drawing
local do_box = 0 // toggle graph drawing
local do_xsec = 1 // run cross-sectional analysis
local do_longit = 1 // run longitudinal analysis
67

Ben Anderson's avatar
Ben Anderson committed
68

69
* Be aware that the consumption is rounded in buckets:
70
/*
Ben Anderson's avatar
Ben Anderson committed
71
GconsYEAR	.	Missing, off gas or invalid consumption
72
73
74
75
76
	100 - 7,999	Gas consumption kWh rounded to nearest 500 kWh
	8,000 - 15,999	Gas consumption kWh rounded to nearest 100 kWh
	16,000 - 24,999	Gas consumption kWh rounded to nearest 500 kWh
	25,000 - 34,999	Gas consumption kWh rounded to nearest 1,000 kWh
	35,000 - 50,000	Gas consumption kWh rounded to nearest 5,000 kWh
Ben Anderson's avatar
Ben Anderson committed
77
78
79
80
81
82
83
84


EconsYEAR	.	Missing or invalid consumption
	100 - 9,999	Electricity consumption kWh rounded to nearest 50 kWh
	10,000 - 11,999	Electricity consumption kWh rounded to nearest 100 kWh
	12,000 - 14,999	Electricity consumption kWh rounded to nearest 500 kWh
	15,000 - 19,999	Electricity consumption kWh rounded to nearest 1,000 kWh
	20,000 - 25,000	Electricity consumption kWh rounded to nearest 5,000 kWh
85
86
87
88
set more off
*/


Ben Anderson's avatar
Ben Anderson committed
89

Ben Anderson's avatar
Ben Anderson committed
90
91
92
93
di "************************"
di "* Using `sample'% sample"
* load the yearly consumption data
use "$dpath/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear
94

Ben Anderson's avatar
Ben Anderson committed
95
96
* merge in the xwave file (fixed data - we assume!)
merge m:1 HH_ID using "$dpath/need_eul_may2014_xwavefile_100pc.dta"
97

98
99
100
101
102
* set variable labels
* ref DECC look up table
lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,000" 5 "M(issing in source)"
* NB DECC look up table says max elec = 50,000
lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)"
Ben Anderson's avatar
Ben Anderson committed
103

104
105
* set up correct long form 'is X present in year' using year (if known)
local vars "BOILER LI CWI"
Ben Anderson's avatar
Ben Anderson committed
106

107
* what will hapen if there are multiple replacements in a household
Ben Anderson's avatar
Ben Anderson committed
108
foreach v of local vars {
109
110
111
112
113
114
115
116
117
118
119
120
121
	gen ba_have_`v' = 0
	destring `v'_YEAR, force replace
	replace ba_have_`v' = 1 if `v'_YEAR <= year
}

* labels
lab var Econs "Electricity (kWh/year)"
lab var Gcons "Gas (kWh/year)"

* set up data
* include Gcons if you want gas too
local setupvars "Econs Gcons"
foreach v of local setupvars {
Ben Anderson's avatar
Ben Anderson committed
122
123
124
125
126
127
128
129
130
131
132
133
	di "***************"
	di "* Testing `v' for `sample'% sample"

	di "* check the panel transitions for each valid"
	gen `v'Validr = 1 if `v'Valid == "V"
	replace `v'Validr = 2 if `v'Valid == "O" // off gas (from EPC) only relevant for gas
	replace `v'Validr = 3 if `v'Valid == "L"
	replace `v'Validr = 4 if `v'Valid == "G"
	replace `v'Validr = 5 if `v'Valid == "M"

	lab var `v'Validr "Recoded `v'Valid"
	lab val `v'Validr `v'Validr
134
	tab `v'Validr `v'Valid
Ben Anderson's avatar
Ben Anderson committed
135

136
137
138
139
140
	* set up consumption deciles and outlier flags
	gen u99_`v' = 0
	gen l99_`v' = 0
	gen m98_`v' = 0
	gen m90_`v' = 0
Ben Anderson's avatar
Ben Anderson committed
141

Ben Anderson's avatar
Ben Anderson committed
142
143
	levelsof(year), local(levels)
	foreach l of local levels {
144
145
146
147
		di "* Calculating consumption deciles and outlier flags for `v' for `l'"
		
		* creates missing for other years have to do this as egen does not allow by & can't 'replace'
		di "* `v' deciles for `l'"
Ben Anderson's avatar
Ben Anderson committed
148
		egen `v'_dec_`l' = cut(`v') if year == `l', group(10)
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

		di "* `v' outlier flags for `l'"
		qui: su `v' if year == `l', de
		
		di "* define outliers"
		di "* -> Upper: valid = G & in highest 1% of valid"
		replace u99_`v' = 1 if (`v' > `r(p99)' | `v'Validr == 4) & year == `l' //  upper 1% 'valid'
		
		di "* -> Lower: valid = L & in lowest 1% of valid"
		replace l99_`v' = 1 if (`v' < `r(p1)' | `v'Validr == 3) & year == `l' // lower 1% 'valid'
		replace m98_`v' = 1 if u99_`v' == 0 & l99_`v' == 0 & year == `l' // middle 98% 'valid'	
		replace m90_`v' = 1 if `v' > `r(p5)' & `v' < `r(p95)' & year == `l' // middle 90% 'valid'
		
		* do checks here otherwise tab puts things into the return list
		*tab u99_`v' if year == `l', mi 
		*tab l99_`v' if year == `l', mi 
		*tab m98_`v' if year == `l', mi
		*tab m90_`v' if year == `l', mi
		
Ben Anderson's avatar
Ben Anderson committed
168
	}
169
	* now combine the deciles - set missing option otherwise it counts a row where all are missing as 0
Ben Anderson's avatar
Ben Anderson committed
170
171
172
	egen `v'_dec = rowtotal(`v'_dec_*), missing
	* remove temporary ones
	drop `v'_dec_*
173
174
175
176
177
178
179
180
181
182
183
	* check unweighted
	table `v'_dec year

	tabstat u* l* m*, by(year)

	tab l99_`v' `v'Validr
	tab u99_`v' `v'Validr

	* log vars
	gen log`v' = log(`v')
	lab var log`v' "Log `v'"
Ben Anderson's avatar
Ben Anderson committed
184
185
}

186
187
188
* set 'survey' weight
svyset `need_weight'

Ben Anderson's avatar
Ben Anderson committed
189
190
191
192
193
194
* flag dwellings which are off gas for electricity
* NB - in this dataset we don't know if they use electricity as main heat (could be oil)
gen ba_off_gas = 0
replace ba_off_gas = 1 if  GconsValidr == 2
lab def ba_off_gas 0 "On gas (GconsValid!=O)" 1 "Off gas (GconsValid=O, from EPC)"
lab val ba_off_gas ba_off_gas
Ben Anderson's avatar
Ben Anderson committed
195

Ben Anderson's avatar
Ben Anderson committed
196
* check
197
svy: mean Gcons Econs, over(ba_off_gas)
Ben Anderson's avatar
Ben Anderson committed
198
di "* MAIN_HEAT_FUEL - Description of main heating fuel (gas or other). EPC - but NB could be 'other' but still be 'on gas'"
Ben Anderson's avatar
Ben Anderson committed
199

200
table ba_off_gas MAIN_HEAT_FUEL `need_weight', missing // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'?
201

202
table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas)
Ben Anderson's avatar
Ben Anderson committed
203
* roughly constant rate throughout years
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas) c(mean Gcons n Gcons)
* but off gas have no gas readings as you'd expect (DECC applied filter)

local descvars "Econs"
if `do_desc' {
	foreach v of local descvars {
		di "***************"
		di "* Testing `v' for `sample'% sample"

		di "* Check `v' by year"
		di "* Check `v' for 0s (`sample'% sample)"
		if "`v'" == "Econs" {
			di "* -> `v' always rounded to nearest 50 so min should always be 100"
			local min = 500
		}
		if "`v'" == "Gcons" {
			di "* -> 100 < `v' < 250 so included but rounded to nearest 500 which in this case = 0"
			local min = 1000
		}
		table `v' year if `v' < `min' `need_weight'

		di "* Check no values of `v' appear for 'non valid' codes"
		di "* test values for all valid codes"
		table `v'Valid `need_weight', c(count `v' mean `v')

		di "* test `v' per year for valid codes (use pw)"
		local table_stats "count `v' min `v' mean `v' p50 `v' max `v'" // set table stats - 5 max!!
		table year if `v'Valid == "V" `need_weight', c(`table_stats`n'')

		di "* table won't allow semean for iw so use svy: mean `v' instead"
		svy: mean `v' if `v'Valid == "V", over(year)
		
		di "* use table to force sd `v' calculation using iw not pw, include mean to check"
		table year [iw=WEIGHT], c(mean `v' sd `v')

		if `do_histo' {
			di "* Running graphs: histo for `v' (can't use `need_weight')"
			histogram `v' if `v'Valid == "V", frequency by(year) ylab(,angle(0)) scale(0.75)
			graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-histo_`v'_by_year_valid.png", replace
		}

		if `do_box' {
			di "* Running graphs: boxes for `v'"
			di "* `v' over year"
			graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) ///
				ylab(,angle(0)) scale(0.75)
			graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_over_year_valid.png", replace

			di "* `v' over year by floor area"
			svy: mean `v', over(FLOOR_AREA_BAND)
			graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical)))  ///
				ylab(,angle(0)) by(FLOOR_AREA_BAND) scale(0.75)
			graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_floor_valid.png", replace

			di "* `v' over year by EE band"
			graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical)))  ///
				ylab(,angle(0)) by(EE_BAND) scale(0.75)
			graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_ee_valid.png", replace

		}
		
		di "* check the distributions of the outliers"
		local tvars "MAIN_HEAT_FUEL E7Flag2012 ba_off_gas FLOOR_AREA_BAND EE_BAND IMD_ENG"
		foreach tv of local tvars {
			di "* Checking top 1% against `tv'"
			tab `tv' u99_Econs , col
		}  
		
Ben Anderson's avatar
Ben Anderson committed
272
273


274
275
	}
}
Ben Anderson's avatar
Ben Anderson committed
276

277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
if `do_xsec' {
	di "* Running cross sectional analysis using 2012 (all valid cases)"
	* make sure test vars are destringed
	local tvars "IMD_ENG FP_ENG E7Flag2012 MAIN_HEAT_FUEL PROP_AGE PROP_TYPE FLOOR_AREA_BAND EE_BAND LOFT_DEPTH WALL_CONS CWI BOILER"
	foreach tv of local tvars {
		destring `tv', force replace
	}
	* kitchen sink model - use 99% signif level
	* use vce(robust) as running without & using hettest suggests heteroscedasticity
	* regress would throw out collinear variables
	* do not include IMD & FP as really need multilvel model for that
	regress logEcons i.ba_region i.E7Flag2012 i.MAIN_HEAT_FUEL ///
		i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS  ///
		if year == 2012, level(99) vce(robust)
	est store est_logEcons_2012

	estout est_logEcons_2012 using "$rpath/regress_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace

	* tests
	predict logEconsr if e(sample), residuals
	di "* test for normality of residuals using graphs"
	* see http://www-personal.umich.edu/~bwest/ch3diags.do
	* do not use sktest as large n & also see 
	* http://www.stata.com/statalist/archive/2005-12/msg00238.html
	
	* Plot residuals
	* pnormal plot of residuals
	pnorm logEconsr, name(pnorm_logEconsr)
	graph export "$rpath/graphs/pnorm_logEconsr.png", replace
	
	* qnormal plot
	qnorm logEconsr, name(qnorm_logEconsr)
	graph export "$rpath/graphs/qnorm_logEconsr.png", replace

	* test the null hypothesis that the variance of the residuals is homogenous. 
	* Therefore, if the p-value is very small, we would have to reject the hypothesis 
	* and accept the alternative hypothesis that the variance is not homogenous. 
	* so if p < 0.05 -> heteroscedasticity and so need robust
	* BUT check rvfplot (http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm)
	
	/* can't use after cluster
	di "* test for Heteroscedasticity"
	di "* if p < 0.05 -> heteroscedasticity and so need robust"
	estat hettest
	*/

	di "* test for omitted vars"
	* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm
	di "* if p < 0.05 -> omitted vars"
	estat ovtest

	* check for mis-specification
	* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm
	* The model is then refit using these two variables as predictors. 
	* _hat should be significant since it is the predicted value. 
	* On the other hand, _hatsq shouldn't, because if our model is specified correctly, 
	* the squared predictions should not have much explanatory power. 
	* That is we wouldn't  expect  _hatsq to be a significant predictor if our model is specified correctly. 
	* So we will be looking at the p-value for _hatsq.
	* p of _hatsq < 0.05 -> mis-spec
		
	di "* linktest"
	di "* if p of _hatsq < 0.05 -> mis-spec"
	linktest
	
	* test BIC etc
	estat ic
Ben Anderson's avatar
Ben Anderson committed
344

345
}
Ben Anderson's avatar
Ben Anderson committed
346

347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
if `do_longit' {
	di "* Running longitudinal analysis"
	di "* Check boiler transitions"
	xttrans ba_have_BOILER, freq
	
	di "* Check loft insulation transitions"
	xttrans ba_have_LI, freq
	
	di "* Check cavity wall insulation transitions"
	xttrans ba_have_CWI, freq

	foreach v of local descvars {
		di "***************"
		di "* Longitudinal (panel) analysis"
		* set as panel in case it wasn't
		* fix format of year so xtset doesn't break
		format year %ty
		xtset HH_ID year, delta(1 year)

		di "* examine panel status"
		xtdescribe

		di "* examine panel status for valid observations"
		xtdescribe if Econs < .
		
		di "* xtsum `v' for all"
		xtsum `v'
		
		di "* Check transitions (`v'Validr)"
		xttrans `v'Validr, freq

		di "* Check `v' consumption decile transitions"
		xttrans `v'_dec, freq

		di "* Check `v' outlier transitions"
		bysort MAIN_HEAT_FUEL: xttrans u99_`v', freq
		bysort MAIN_HEAT_FUEL: xttrans l99_`v', freq

		di "* check lag model"
		bysort HH_ID (year): gen logEconslag = logEcons[_n-1]

		gen yeart = year - 2005
		regress logEcons logEconslag i.ba_region i.E7Flag2012 ///
			 i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS  ///
		 	yeart ba_have_BOILER ba_have_LI ba_have_CWI
		est store lag_model
		estout lag_model using "$rpath/lag_model_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace
		
		di "* xt regression model of change (use log Econs)"
		xtreg  logEcons i.ba_region i.E7Flag2012 ///
			 i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS  ///
		 	yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) fe
		 
		est store xtr_fe_logEcons
		estout xtr_fe_logEcons using "$rpath/xtr_fe_logEcons_`version'.txt",  replace
		 
		di "* now including time constant variables"
		xtreg  logEcons i.ba_region i.E7Flag2012 ///
			 i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS  ///
		 	yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) re
		 
		est store xtr_re_logEcons
		estout xtr_re_logEcons using "$rpath/xtr_re_logEcons_`version'.txt",  replace
410
411
412
413
414
415
	}
}

di "* Done!"

log close