From 7dac12c9767a9ee776877fa586f4394b48aee4e9 Mon Sep 17 00:00:00 2001 From: Ben Anderson <dataknut@icloud.com> Date: Mon, 24 Aug 2015 16:22:00 +0100 Subject: [PATCH] included RSS Conference analysis - longitudinal analysis of variation - cross-sectional analysis of variation for 2012 - some distributional charts --- ...-NEED-EULF-2014-electricity-consumption.do | 367 ++++++++++++++---- 1 file changed, 294 insertions(+), 73 deletions(-) diff --git a/NEED/analyse-NEED-EULF-2014-electricity-consumption.do b/NEED/analyse-NEED-EULF-2014-electricity-consumption.do index 08c4100..8ec7093 100644 --- a/NEED/analyse-NEED-EULF-2014-electricity-consumption.do +++ b/NEED/analyse-NEED-EULF-2014-electricity-consumption.do @@ -37,37 +37,43 @@ capture noisily log close set more off * written for Mac OSX - remember to change filesystem delimiter for other platforms -global home "~/Documents" +global home "~/Documents/Work" -global dpath "$home/Work/Data/Social Science Datatsets/DECC/NEED/End User Licence File 2014/processed" -global rpath "$home/Work/Papers and Conferences/RSS-2015/results" +global dpath "$home/Data/Social Science Datasets/DECC/NEED/End User Licence File 2014/processed" +global rpath "$home/Papers and Conferences/RSS-2015/results" local version "v1" + +log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace + * set sample * 100 = 100pc * etc -local sample "100" - -* control what happens -local do_desc = 1 +local sample = 100 -* toggle graph drawing -local do_graphs = 1 +* use weights (leave blank to ignore) +/* +"Weighting based on Region, property age, property type and floor area band. +Summing all weights gives (approximate) total number of households in England and Wales 2011." +*/ +local need_weight = "[pw = WEIGHT]" // use probability weights -* ref DECC look up table -lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,000" 5 "M(issing in source)" -* NB DECC look up table says max elec = 50,000 -lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)" +* control what happens +local do_desc = 0 // do descriptives +local do_histo = 0 // toggle graph drawing +local do_box = 0 // toggle graph drawing +local do_xsec = 1 // run cross-sectional analysis +local do_longit = 1 // run longitudinal analysis -* also be aware that the consumption is rounded in buckets: +* Be aware that the consumption is rounded in buckets: /* GconsYEAR . Missing, off gas or invalid consumption - 100 � 7,999 Gas consumption kWh rounded to nearest 500 kWh - 8,000- 15,999 Gas consumption kWh rounded to nearest 100 kWh - 16,000 � 24,999 Gas consumption kWh rounded to nearest 500 kWh - 25,000 � 34,999 Gas consumption kWh rounded to nearest 1,000 kWh - 35,000 � 50,000 Gas consumption kWh rounded to nearest 5,000 kWh + 100 - 7,999 Gas consumption kWh rounded to nearest 500 kWh + 8,000 - 15,999 Gas consumption kWh rounded to nearest 100 kWh + 16,000 - 24,999 Gas consumption kWh rounded to nearest 500 kWh + 25,000 - 34,999 Gas consumption kWh rounded to nearest 1,000 kWh + 35,000 - 50,000 Gas consumption kWh rounded to nearest 5,000 kWh EconsYEAR . Missing or invalid consumption @@ -79,7 +85,6 @@ EconsYEAR . Missing or invalid consumption set more off */ -log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace di "************************" @@ -90,20 +95,30 @@ use "$dpath/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear * merge in the xwave file (fixed data - we assume!) merge m:1 HH_ID using "$dpath/need_eul_may2014_xwavefile_100pc.dta" -lab var Econs "Electricity (KwH/year)" -lab var Gcons "Gas (KwH/year)" - -* set as panel in case it wasn't -* fix format of year so xtset doesn't break -format year %ty -xtset HH_ID year, delta(1 year) +* set variable labels +* ref DECC look up table +lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,000" 5 "M(issing in source)" +* NB DECC look up table says max elec = 50,000 +lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)" -* examine panel status -xtdescribe +* set up correct long form 'is X present in year' using year (if known) +local vars "BOILER LI CWI" -* set up -local vars "Econs Gcons" +* what will hapen if there are multiple replacements in a household foreach v of local vars { + gen ba_have_`v' = 0 + destring `v'_YEAR, force replace + replace ba_have_`v' = 1 if `v'_YEAR <= year +} + +* labels +lab var Econs "Electricity (kWh/year)" +lab var Gcons "Gas (kWh/year)" + +* set up data +* include Gcons if you want gas too +local setupvars "Econs Gcons" +foreach v of local setupvars { di "***************" di "* Testing `v' for `sample'% sample" @@ -116,25 +131,61 @@ foreach v of local vars { lab var `v'Validr "Recoded `v'Valid" lab val `v'Validr `v'Validr + tab `v'Validr `v'Valid - di "* Check transitions (`v'Validr)" - xttrans `v'Validr, freq + * set up consumption deciles and outlier flags + gen u99_`v' = 0 + gen l99_`v' = 0 + gen m98_`v' = 0 + gen m90_`v' = 0 - * set up consumption deciles levelsof(year), local(levels) foreach l of local levels { - di "* Calculating consumption deciles for `v' for `l'" - * creates missing for other years have to do this as egen does not allow by + di "* Calculating consumption deciles and outlier flags for `v' for `l'" + + * creates missing for other years have to do this as egen does not allow by & can't 'replace' + di "* `v' deciles for `l'" egen `v'_dec_`l' = cut(`v') if year == `l', group(10) + + di "* `v' outlier flags for `l'" + qui: su `v' if year == `l', de + + di "* define outliers" + di "* -> Upper: valid = G & in highest 1% of valid" + replace u99_`v' = 1 if (`v' > `r(p99)' | `v'Validr == 4) & year == `l' // upper 1% 'valid' + + di "* -> Lower: valid = L & in lowest 1% of valid" + replace l99_`v' = 1 if (`v' < `r(p1)' | `v'Validr == 3) & year == `l' // lower 1% 'valid' + replace m98_`v' = 1 if u99_`v' == 0 & l99_`v' == 0 & year == `l' // middle 98% 'valid' + replace m90_`v' = 1 if `v' > `r(p5)' & `v' < `r(p95)' & year == `l' // middle 90% 'valid' + + * do checks here otherwise tab puts things into the return list + *tab u99_`v' if year == `l', mi + *tab l99_`v' if year == `l', mi + *tab m98_`v' if year == `l', mi + *tab m90_`v' if year == `l', mi + } - * now combine them - set missing option otherwise it counts a row where all are missing as 0 + * now combine the deciles - set missing option otherwise it counts a row where all are missing as 0 egen `v'_dec = rowtotal(`v'_dec_*), missing * remove temporary ones drop `v'_dec_* - * check - tab `v'_dec year + * check unweighted + table `v'_dec year + + tabstat u* l* m*, by(year) + + tab l99_`v' `v'Validr + tab u99_`v' `v'Validr + + * log vars + gen log`v' = log(`v') + lab var log`v' "Log `v'" } +* set 'survey' weight +svyset `need_weight' + * flag dwellings which are off gas for electricity * NB - in this dataset we don't know if they use electricity as main heat (could be oil) gen ba_off_gas = 0 @@ -143,49 +194,219 @@ lab def ba_off_gas 0 "On gas (GconsValid!=O)" 1 "Off gas (GconsValid=O, from EPC lab val ba_off_gas ba_off_gas * check -tabstat Gcons Econs, by(ba_off_gas) +svy: mean Gcons Econs, over(ba_off_gas) di "* MAIN_HEAT_FUEL - Description of main heating fuel (gas or other). EPC - but NB could be 'other' but still be 'on gas'" -tab ba_off_gas MAIN_HEAT_FUEL, mi // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'? +table ba_off_gas MAIN_HEAT_FUEL `need_weight', missing // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'? -table year MAIN_HEAT_FUEL, by(ba_off_gas) +table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas) * roughly constant rate throughout years -table year MAIN_HEAT_FUEL, by(ba_off_gas) c(mean Gcons n Gcons) -* but off gas have no gas readings as you'd expect (DECC filter) - -foreach v of local vars { - di "***************" - di "* Testing `v' for `sample'% sample" - - * overall - xtsum `v' if `v'Valid == "V" - * test values for valid - check for valid 0s for example. This only happens for gas where: - * 100 < gcons < 250 so included but rounded to nearest 500 = 0 - - * elec always rounded to nearest 50 so min should always be 100 - - tabstat `v', by(`v'Valid) s(n mean semean min max) - * by year - di "* check `v' for 0s (`s'% sample)" - table `v' year if `v' < 1000 - table `v'Valid year, c(count `v' min `v' mean `v' max `v') +table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas) c(mean Gcons n Gcons) +* but off gas have no gas readings as you'd expect (DECC applied filter) + +local descvars "Econs" +if `do_desc' { + foreach v of local descvars { + di "***************" + di "* Testing `v' for `sample'% sample" + + di "* Check `v' by year" + di "* Check `v' for 0s (`sample'% sample)" + if "`v'" == "Econs" { + di "* -> `v' always rounded to nearest 50 so min should always be 100" + local min = 500 + } + if "`v'" == "Gcons" { + di "* -> 100 < `v' < 250 so included but rounded to nearest 500 which in this case = 0" + local min = 1000 + } + table `v' year if `v' < `min' `need_weight' + + di "* Check no values of `v' appear for 'non valid' codes" + di "* test values for all valid codes" + table `v'Valid `need_weight', c(count `v' mean `v') + + di "* test `v' per year for valid codes (use pw)" + local table_stats "count `v' min `v' mean `v' p50 `v' max `v'" // set table stats - 5 max!! + table year if `v'Valid == "V" `need_weight', c(`table_stats`n'') + + di "* table won't allow semean for iw so use svy: mean `v' instead" + svy: mean `v' if `v'Valid == "V", over(year) + + di "* use table to force sd `v' calculation using iw not pw, include mean to check" + table year [iw=WEIGHT], c(mean `v' sd `v') + + if `do_histo' { + di "* Running graphs: histo for `v' (can't use `need_weight')" + histogram `v' if `v'Valid == "V", frequency by(year) ylab(,angle(0)) scale(0.75) + graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-histo_`v'_by_year_valid.png", replace + } + + if `do_box' { + di "* Running graphs: boxes for `v'" + di "* `v' over year" + graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) /// + ylab(,angle(0)) scale(0.75) + graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_over_year_valid.png", replace + + di "* `v' over year by floor area" + svy: mean `v', over(FLOOR_AREA_BAND) + graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) /// + ylab(,angle(0)) by(FLOOR_AREA_BAND) scale(0.75) + graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_floor_valid.png", replace + + di "* `v' over year by EE band" + graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) /// + ylab(,angle(0)) by(EE_BAND) scale(0.75) + graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_ee_valid.png", replace + + } + + di "* check the distributions of the outliers" + local tvars "MAIN_HEAT_FUEL E7Flag2012 ba_off_gas FLOOR_AREA_BAND EE_BAND IMD_ENG" + foreach tv of local tvars { + di "* Checking top 1% against `tv'" + tab `tv' u99_Econs , col + } + - if `do_graphs' { - di "* Running graphs - do not keep in memory, just save out" - di "* Running graphs: histo" - histogram `v' if `v'Valid == "V", by(year) scale(0.75) - graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-histo_`v'_by_year_valid.png", replace - di "* Running graphs: boxes" - graph box `v' if `v'Valid == "V", over(year) scale(0.75) - graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_over_year_valid.png", replace + } +} - graph box `v' if `v'Valid == "V", over(year) by(FLOOR_AREA_BAND) scale(0.75) - graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_floor_valid.png", replace +if `do_xsec' { + di "* Running cross sectional analysis using 2012 (all valid cases)" + * make sure test vars are destringed + local tvars "IMD_ENG FP_ENG E7Flag2012 MAIN_HEAT_FUEL PROP_AGE PROP_TYPE FLOOR_AREA_BAND EE_BAND LOFT_DEPTH WALL_CONS CWI BOILER" + foreach tv of local tvars { + destring `tv', force replace + } + * kitchen sink model - use 99% signif level + * use vce(robust) as running without & using hettest suggests heteroscedasticity + * regress would throw out collinear variables + * do not include IMD & FP as really need multilvel model for that + regress logEcons i.ba_region i.E7Flag2012 i.MAIN_HEAT_FUEL /// + i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS /// + if year == 2012, level(99) vce(robust) + est store est_logEcons_2012 + + estout est_logEcons_2012 using "$rpath/regress_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace + + * tests + predict logEconsr if e(sample), residuals + di "* test for normality of residuals using graphs" + * see http://www-personal.umich.edu/~bwest/ch3diags.do + * do not use sktest as large n & also see + * http://www.stata.com/statalist/archive/2005-12/msg00238.html + + * Plot residuals + * pnormal plot of residuals + pnorm logEconsr, name(pnorm_logEconsr) + graph export "$rpath/graphs/pnorm_logEconsr.png", replace + + * qnormal plot + qnorm logEconsr, name(qnorm_logEconsr) + graph export "$rpath/graphs/qnorm_logEconsr.png", replace + + * test the null hypothesis that the variance of the residuals is homogenous. + * Therefore, if the p-value is very small, we would have to reject the hypothesis + * and accept the alternative hypothesis that the variance is not homogenous. + * so if p < 0.05 -> heteroscedasticity and so need robust + * BUT check rvfplot (http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm) + + /* can't use after cluster + di "* test for Heteroscedasticity" + di "* if p < 0.05 -> heteroscedasticity and so need robust" + estat hettest + */ + + di "* test for omitted vars" + * http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm + di "* if p < 0.05 -> omitted vars" + estat ovtest + + * check for mis-specification + * http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm + * The model is then refit using these two variables as predictors. + * _hat should be significant since it is the predicted value. + * On the other hand, _hatsq shouldn't, because if our model is specified correctly, + * the squared predictions should not have much explanatory power. + * That is we wouldn't expect _hatsq to be a significant predictor if our model is specified correctly. + * So we will be looking at the p-value for _hatsq. + * p of _hatsq < 0.05 -> mis-spec + + di "* linktest" + di "* if p of _hatsq < 0.05 -> mis-spec" + linktest + + * test BIC etc + estat ic - graph box `v' if `v'Valid == "V", over(year) by(EE_BAND) scale(0.75) - graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_ee_valid.png", replace +} +if `do_longit' { + di "* Running longitudinal analysis" + di "* Check boiler transitions" + xttrans ba_have_BOILER, freq + + di "* Check loft insulation transitions" + xttrans ba_have_LI, freq + + di "* Check cavity wall insulation transitions" + xttrans ba_have_CWI, freq + + foreach v of local descvars { + di "***************" + di "* Longitudinal (panel) analysis" + * set as panel in case it wasn't + * fix format of year so xtset doesn't break + format year %ty + xtset HH_ID year, delta(1 year) + + di "* examine panel status" + xtdescribe + + di "* examine panel status for valid observations" + xtdescribe if Econs < . + + di "* xtsum `v' for all" + xtsum `v' + + di "* Check transitions (`v'Validr)" + xttrans `v'Validr, freq + + di "* Check `v' consumption decile transitions" + xttrans `v'_dec, freq + + di "* Check `v' outlier transitions" + bysort MAIN_HEAT_FUEL: xttrans u99_`v', freq + bysort MAIN_HEAT_FUEL: xttrans l99_`v', freq + + di "* check lag model" + bysort HH_ID (year): gen logEconslag = logEcons[_n-1] + + gen yeart = year - 2005 + regress logEcons logEconslag i.ba_region i.E7Flag2012 /// + i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS /// + yeart ba_have_BOILER ba_have_LI ba_have_CWI + est store lag_model + estout lag_model using "$rpath/lag_model_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace + + di "* xt regression model of change (use log Econs)" + xtreg logEcons i.ba_region i.E7Flag2012 /// + i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS /// + yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) fe + + est store xtr_fe_logEcons + estout xtr_fe_logEcons using "$rpath/xtr_fe_logEcons_`version'.txt", replace + + di "* now including time constant variables" + xtreg logEcons i.ba_region i.E7Flag2012 /// + i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS /// + yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) re + + est store xtr_re_logEcons + estout xtr_re_logEcons using "$rpath/xtr_re_logEcons_`version'.txt", replace } } -- GitLab