******************************************* * Script to: * - analyse DECC's EULF 2014 NEED data to examine distributions etc * Original data available from: UK DATA ARCHIVE: Study Number 7518 - National Energy Efficiency Data-Framework, 2014 * http://discover.ukdataservice.ac.uk/catalogue/?sn=7518 * Most recent version of this script can be found at https://github.com/dataknut/DECC-data/tree/master/NEED * The script requires the following to have been run first: * https://github.com/dataknut/DECC-data/blob/master/NEED/process-NEED-EULF-2014.do /* Copyright (C) 2014 University of Southampton Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut, https://github.com/dataknut) [Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton] This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. #YMMV - http://en.wiktionary.org/wiki/YMMV */ clear all capture noisily log close set more off * written for Mac OSX - remember to change filesystem delimiter for other platforms global home "~/Documents/Work" global dpath "$home/Data/Social Science Datasets/DECC/NEED/End User Licence File 2014/processed" global rpath "$home/Papers and Conferences/RSS-2015/results" local version "v1" log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace * set sample * 100 = 100pc * etc local sample = 100 * use weights (leave blank to ignore) /* "Weighting based on Region, property age, property type and floor area band. Summing all weights gives (approximate) total number of households in England and Wales 2011." */ local need_weight = "[pw = WEIGHT]" // use probability weights * control what happens local do_desc = 0 // do descriptives local do_histo = 0 // toggle graph drawing local do_box = 0 // toggle graph drawing local do_xsec = 1 // run cross-sectional analysis local do_longit = 1 // run longitudinal analysis * Be aware that the consumption is rounded in buckets: /* GconsYEAR . Missing, off gas or invalid consumption 100 - 7,999 Gas consumption kWh rounded to nearest 500 kWh 8,000 - 15,999 Gas consumption kWh rounded to nearest 100 kWh 16,000 - 24,999 Gas consumption kWh rounded to nearest 500 kWh 25,000 - 34,999 Gas consumption kWh rounded to nearest 1,000 kWh 35,000 - 50,000 Gas consumption kWh rounded to nearest 5,000 kWh EconsYEAR . Missing or invalid consumption 100 - 9,999 Electricity consumption kWh rounded to nearest 50 kWh 10,000 - 11,999 Electricity consumption kWh rounded to nearest 100 kWh 12,000 - 14,999 Electricity consumption kWh rounded to nearest 500 kWh 15,000 - 19,999 Electricity consumption kWh rounded to nearest 1,000 kWh 20,000 - 25,000 Electricity consumption kWh rounded to nearest 5,000 kWh set more off */ di "************************" di "* Using `sample'% sample" * load the yearly consumption data use "$dpath/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear * merge in the xwave file (fixed data - we assume!) merge m:1 HH_ID using "$dpath/need_eul_may2014_xwavefile_100pc.dta" * set variable labels * ref DECC look up table lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,000" 5 "M(issing in source)" * NB DECC look up table says max elec = 50,000 lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)" * set up correct long form 'is X present in year' using year (if known) local vars "BOILER LI CWI" * what will hapen if there are multiple replacements in a household foreach v of local vars { gen ba_have_`v' = 0 destring `v'_YEAR, force replace replace ba_have_`v' = 1 if `v'_YEAR <= year } * labels lab var Econs "Electricity (kWh/year)" lab var Gcons "Gas (kWh/year)" * set up data * include Gcons if you want gas too local setupvars "Econs Gcons" foreach v of local setupvars { di "***************" di "* Testing `v' for `sample'% sample" di "* check the panel transitions for each valid" gen `v'Validr = 1 if `v'Valid == "V" replace `v'Validr = 2 if `v'Valid == "O" // off gas (from EPC) only relevant for gas replace `v'Validr = 3 if `v'Valid == "L" replace `v'Validr = 4 if `v'Valid == "G" replace `v'Validr = 5 if `v'Valid == "M" lab var `v'Validr "Recoded `v'Valid" lab val `v'Validr `v'Validr tab `v'Validr `v'Valid * set up consumption deciles and outlier flags gen u99_`v' = 0 gen l99_`v' = 0 gen m98_`v' = 0 gen m90_`v' = 0 levelsof(year), local(levels) foreach l of local levels { di "* Calculating consumption deciles and outlier flags for `v' for `l'" * creates missing for other years have to do this as egen does not allow by & can't 'replace' di "* `v' deciles for `l'" egen `v'_dec_`l' = cut(`v') if year == `l', group(10) di "* `v' outlier flags for `l'" qui: su `v' if year == `l', de di "* define outliers" di "* -> Upper: valid = G & in highest 1% of valid" replace u99_`v' = 1 if (`v' > `r(p99)' | `v'Validr == 4) & year == `l' // upper 1% 'valid' di "* -> Lower: valid = L & in lowest 1% of valid" replace l99_`v' = 1 if (`v' < `r(p1)' | `v'Validr == 3) & year == `l' // lower 1% 'valid' replace m98_`v' = 1 if u99_`v' == 0 & l99_`v' == 0 & year == `l' // middle 98% 'valid' replace m90_`v' = 1 if `v' > `r(p5)' & `v' < `r(p95)' & year == `l' // middle 90% 'valid' * do checks here otherwise tab puts things into the return list *tab u99_`v' if year == `l', mi *tab l99_`v' if year == `l', mi *tab m98_`v' if year == `l', mi *tab m90_`v' if year == `l', mi } * now combine the deciles - set missing option otherwise it counts a row where all are missing as 0 egen `v'_dec = rowtotal(`v'_dec_*), missing * remove temporary ones drop `v'_dec_* * check unweighted table `v'_dec year tabstat u* l* m*, by(year) tab l99_`v' `v'Validr tab u99_`v' `v'Validr * log vars gen log`v' = log(`v') lab var log`v' "Log `v'" } * set 'survey' weight svyset `need_weight' * flag dwellings which are off gas for electricity * NB - in this dataset we don't know if they use electricity as main heat (could be oil) gen ba_off_gas = 0 replace ba_off_gas = 1 if GconsValidr == 2 lab def ba_off_gas 0 "On gas (GconsValid!=O)" 1 "Off gas (GconsValid=O, from EPC)" lab val ba_off_gas ba_off_gas * check svy: mean Gcons Econs, over(ba_off_gas) di "* MAIN_HEAT_FUEL - Description of main heating fuel (gas or other). EPC - but NB could be 'other' but still be 'on gas'" table ba_off_gas MAIN_HEAT_FUEL `need_weight', missing // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'? table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas) * roughly constant rate throughout years table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas) c(mean Gcons n Gcons) * but off gas have no gas readings as you'd expect (DECC applied filter) local descvars "Econs" if `do_desc' { foreach v of local descvars { di "***************" di "* Testing `v' for `sample'% sample" di "* Check `v' by year" di "* Check `v' for 0s (`sample'% sample)" if "`v'" == "Econs" { di "* -> `v' always rounded to nearest 50 so min should always be 100" local min = 500 } if "`v'" == "Gcons" { di "* -> 100 < `v' < 250 so included but rounded to nearest 500 which in this case = 0" local min = 1000 } table `v' year if `v' < `min' `need_weight' di "* Check no values of `v' appear for 'non valid' codes" di "* test values for all valid codes" table `v'Valid `need_weight', c(count `v' mean `v') di "* test `v' per year for valid codes (use pw)" local table_stats "count `v' min `v' mean `v' p50 `v' max `v'" // set table stats - 5 max!! table year if `v'Valid == "V" `need_weight', c(`table_stats`n'') di "* table won't allow semean for iw so use svy: mean `v' instead" svy: mean `v' if `v'Valid == "V", over(year) di "* use table to force sd `v' calculation using iw not pw, include mean to check" table year [iw=WEIGHT], c(mean `v' sd `v') if `do_histo' { di "* Running graphs: histo for `v' (can't use `need_weight')" histogram `v' if `v'Valid == "V", frequency by(year) ylab(,angle(0)) scale(0.75) graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-histo_`v'_by_year_valid.png", replace } if `do_box' { di "* Running graphs: boxes for `v'" di "* `v' over year" graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) /// ylab(,angle(0)) scale(0.75) graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_over_year_valid.png", replace di "* `v' over year by floor area" svy: mean `v', over(FLOOR_AREA_BAND) graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) /// ylab(,angle(0)) by(FLOOR_AREA_BAND) scale(0.75) graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_floor_valid.png", replace di "* `v' over year by EE band" graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) /// ylab(,angle(0)) by(EE_BAND) scale(0.75) graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_ee_valid.png", replace } di "* check the distributions of the outliers" local tvars "MAIN_HEAT_FUEL E7Flag2012 ba_off_gas FLOOR_AREA_BAND EE_BAND IMD_ENG" foreach tv of local tvars { di "* Checking top 1% against `tv'" tab `tv' u99_Econs , col } } } if `do_xsec' { di "* Running cross sectional analysis using 2012 (all valid cases)" * make sure test vars are destringed local tvars "IMD_ENG FP_ENG E7Flag2012 MAIN_HEAT_FUEL PROP_AGE PROP_TYPE FLOOR_AREA_BAND EE_BAND LOFT_DEPTH WALL_CONS CWI BOILER" foreach tv of local tvars { destring `tv', force replace } * kitchen sink model - use 99% signif level * use vce(robust) as running without & using hettest suggests heteroscedasticity * regress would throw out collinear variables * do not include IMD & FP as really need multilvel model for that regress logEcons i.ba_region i.E7Flag2012 i.MAIN_HEAT_FUEL /// i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS /// if year == 2012, level(99) vce(robust) est store est_logEcons_2012 estout est_logEcons_2012 using "$rpath/regress_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace * tests predict logEconsr if e(sample), residuals di "* test for normality of residuals using graphs" * see http://www-personal.umich.edu/~bwest/ch3diags.do * do not use sktest as large n & also see * http://www.stata.com/statalist/archive/2005-12/msg00238.html * Plot residuals * pnormal plot of residuals pnorm logEconsr, name(pnorm_logEconsr) graph export "$rpath/graphs/pnorm_logEconsr.png", replace * qnormal plot qnorm logEconsr, name(qnorm_logEconsr) graph export "$rpath/graphs/qnorm_logEconsr.png", replace * test the null hypothesis that the variance of the residuals is homogenous. * Therefore, if the p-value is very small, we would have to reject the hypothesis * and accept the alternative hypothesis that the variance is not homogenous. * so if p < 0.05 -> heteroscedasticity and so need robust * BUT check rvfplot (http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm) /* can't use after cluster di "* test for Heteroscedasticity" di "* if p < 0.05 -> heteroscedasticity and so need robust" estat hettest */ di "* test for omitted vars" * http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm di "* if p < 0.05 -> omitted vars" estat ovtest * check for mis-specification * http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm * The model is then refit using these two variables as predictors. * _hat should be significant since it is the predicted value. * On the other hand, _hatsq shouldn't, because if our model is specified correctly, * the squared predictions should not have much explanatory power. * That is we wouldn't expect _hatsq to be a significant predictor if our model is specified correctly. * So we will be looking at the p-value for _hatsq. * p of _hatsq < 0.05 -> mis-spec di "* linktest" di "* if p of _hatsq < 0.05 -> mis-spec" linktest * test BIC etc estat ic } if `do_longit' { di "* Running longitudinal analysis" di "* Check boiler transitions" xttrans ba_have_BOILER, freq di "* Check loft insulation transitions" xttrans ba_have_LI, freq di "* Check cavity wall insulation transitions" xttrans ba_have_CWI, freq foreach v of local descvars { di "***************" di "* Longitudinal (panel) analysis" * set as panel in case it wasn't * fix format of year so xtset doesn't break format year %ty xtset HH_ID year, delta(1 year) di "* examine panel status" xtdescribe di "* examine panel status for valid observations" xtdescribe if Econs < . di "* xtsum `v' for all" xtsum `v' di "* Check transitions (`v'Validr)" xttrans `v'Validr, freq di "* Check `v' consumption decile transitions" xttrans `v'_dec, freq di "* Check `v' outlier transitions" bysort MAIN_HEAT_FUEL: xttrans u99_`v', freq bysort MAIN_HEAT_FUEL: xttrans l99_`v', freq di "* check lag model" bysort HH_ID (year): gen logEconslag = logEcons[_n-1] gen yeart = year - 2005 regress logEcons logEconslag i.ba_region i.E7Flag2012 /// i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS /// yeart ba_have_BOILER ba_have_LI ba_have_CWI est store lag_model estout lag_model using "$rpath/lag_model_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace di "* xt regression model of change (use log Econs)" xtreg logEcons i.ba_region i.E7Flag2012 /// i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS /// yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) fe est store xtr_fe_logEcons estout xtr_fe_logEcons using "$rpath/xtr_fe_logEcons_`version'.txt", replace di "* now including time constant variables" xtreg logEcons i.ba_region i.E7Flag2012 /// i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS /// yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) re est store xtr_re_logEcons estout xtr_re_logEcons using "$rpath/xtr_re_logEcons_`version'.txt", replace } } di "* Done!" log close