Skip to content
Snippets Groups Projects
Commit 7dac12c9 authored by Ben Anderson's avatar Ben Anderson
Browse files

included RSS Conference analysis

- longitudinal analysis of variation
- cross-sectional analysis of variation for 2012
- some distributional charts
parent 2d3309f7
No related branches found
No related tags found
No related merge requests found
......@@ -37,37 +37,43 @@ capture noisily log close
set more off
* written for Mac OSX - remember to change filesystem delimiter for other platforms
global home "~/Documents"
global home "~/Documents/Work"
global dpath "$home/Work/Data/Social Science Datatsets/DECC/NEED/End User Licence File 2014/processed"
global rpath "$home/Work/Papers and Conferences/RSS-2015/results"
global dpath "$home/Data/Social Science Datasets/DECC/NEED/End User Licence File 2014/processed"
global rpath "$home/Papers and Conferences/RSS-2015/results"
local version "v1"
log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace
* set sample
* 100 = 100pc
* etc
local sample "100"
* control what happens
local do_desc = 1
local sample = 100
* toggle graph drawing
local do_graphs = 1
* use weights (leave blank to ignore)
/*
"Weighting based on Region, property age, property type and floor area band.
Summing all weights gives (approximate) total number of households in England and Wales 2011."
*/
local need_weight = "[pw = WEIGHT]" // use probability weights
* ref DECC look up table
lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,000" 5 "M(issing in source)"
* NB DECC look up table says max elec = 50,000
lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)"
* control what happens
local do_desc = 0 // do descriptives
local do_histo = 0 // toggle graph drawing
local do_box = 0 // toggle graph drawing
local do_xsec = 1 // run cross-sectional analysis
local do_longit = 1 // run longitudinal analysis
* also be aware that the consumption is rounded in buckets:
* Be aware that the consumption is rounded in buckets:
/*
GconsYEAR . Missing, off gas or invalid consumption
100 7,999 Gas consumption kWh rounded to nearest 500 kWh
8,000- 15,999 Gas consumption kWh rounded to nearest 100 kWh
16,000 24,999 Gas consumption kWh rounded to nearest 500 kWh
25,000 34,999 Gas consumption kWh rounded to nearest 1,000 kWh
35,000 50,000 Gas consumption kWh rounded to nearest 5,000 kWh
100 - 7,999 Gas consumption kWh rounded to nearest 500 kWh
8,000 - 15,999 Gas consumption kWh rounded to nearest 100 kWh
16,000 - 24,999 Gas consumption kWh rounded to nearest 500 kWh
25,000 - 34,999 Gas consumption kWh rounded to nearest 1,000 kWh
35,000 - 50,000 Gas consumption kWh rounded to nearest 5,000 kWh
EconsYEAR . Missing or invalid consumption
......@@ -79,7 +85,6 @@ EconsYEAR . Missing or invalid consumption
set more off
*/
log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace
di "************************"
......@@ -90,20 +95,30 @@ use "$dpath/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear
* merge in the xwave file (fixed data - we assume!)
merge m:1 HH_ID using "$dpath/need_eul_may2014_xwavefile_100pc.dta"
lab var Econs "Electricity (KwH/year)"
lab var Gcons "Gas (KwH/year)"
* set as panel in case it wasn't
* fix format of year so xtset doesn't break
format year %ty
xtset HH_ID year, delta(1 year)
* set variable labels
* ref DECC look up table
lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,000" 5 "M(issing in source)"
* NB DECC look up table says max elec = 50,000
lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)"
* examine panel status
xtdescribe
* set up correct long form 'is X present in year' using year (if known)
local vars "BOILER LI CWI"
* set up
local vars "Econs Gcons"
* what will hapen if there are multiple replacements in a household
foreach v of local vars {
gen ba_have_`v' = 0
destring `v'_YEAR, force replace
replace ba_have_`v' = 1 if `v'_YEAR <= year
}
* labels
lab var Econs "Electricity (kWh/year)"
lab var Gcons "Gas (kWh/year)"
* set up data
* include Gcons if you want gas too
local setupvars "Econs Gcons"
foreach v of local setupvars {
di "***************"
di "* Testing `v' for `sample'% sample"
......@@ -116,25 +131,61 @@ foreach v of local vars {
lab var `v'Validr "Recoded `v'Valid"
lab val `v'Validr `v'Validr
tab `v'Validr `v'Valid
di "* Check transitions (`v'Validr)"
xttrans `v'Validr, freq
* set up consumption deciles and outlier flags
gen u99_`v' = 0
gen l99_`v' = 0
gen m98_`v' = 0
gen m90_`v' = 0
* set up consumption deciles
levelsof(year), local(levels)
foreach l of local levels {
di "* Calculating consumption deciles for `v' for `l'"
* creates missing for other years have to do this as egen does not allow by
di "* Calculating consumption deciles and outlier flags for `v' for `l'"
* creates missing for other years have to do this as egen does not allow by & can't 'replace'
di "* `v' deciles for `l'"
egen `v'_dec_`l' = cut(`v') if year == `l', group(10)
di "* `v' outlier flags for `l'"
qui: su `v' if year == `l', de
di "* define outliers"
di "* -> Upper: valid = G & in highest 1% of valid"
replace u99_`v' = 1 if (`v' > `r(p99)' | `v'Validr == 4) & year == `l' // upper 1% 'valid'
di "* -> Lower: valid = L & in lowest 1% of valid"
replace l99_`v' = 1 if (`v' < `r(p1)' | `v'Validr == 3) & year == `l' // lower 1% 'valid'
replace m98_`v' = 1 if u99_`v' == 0 & l99_`v' == 0 & year == `l' // middle 98% 'valid'
replace m90_`v' = 1 if `v' > `r(p5)' & `v' < `r(p95)' & year == `l' // middle 90% 'valid'
* do checks here otherwise tab puts things into the return list
*tab u99_`v' if year == `l', mi
*tab l99_`v' if year == `l', mi
*tab m98_`v' if year == `l', mi
*tab m90_`v' if year == `l', mi
}
* now combine them - set missing option otherwise it counts a row where all are missing as 0
* now combine the deciles - set missing option otherwise it counts a row where all are missing as 0
egen `v'_dec = rowtotal(`v'_dec_*), missing
* remove temporary ones
drop `v'_dec_*
* check
tab `v'_dec year
* check unweighted
table `v'_dec year
tabstat u* l* m*, by(year)
tab l99_`v' `v'Validr
tab u99_`v' `v'Validr
* log vars
gen log`v' = log(`v')
lab var log`v' "Log `v'"
}
* set 'survey' weight
svyset `need_weight'
* flag dwellings which are off gas for electricity
* NB - in this dataset we don't know if they use electricity as main heat (could be oil)
gen ba_off_gas = 0
......@@ -143,49 +194,219 @@ lab def ba_off_gas 0 "On gas (GconsValid!=O)" 1 "Off gas (GconsValid=O, from EPC
lab val ba_off_gas ba_off_gas
* check
tabstat Gcons Econs, by(ba_off_gas)
svy: mean Gcons Econs, over(ba_off_gas)
di "* MAIN_HEAT_FUEL - Description of main heating fuel (gas or other). EPC - but NB could be 'other' but still be 'on gas'"
tab ba_off_gas MAIN_HEAT_FUEL, mi // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'?
table ba_off_gas MAIN_HEAT_FUEL `need_weight', missing // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'?
table year MAIN_HEAT_FUEL, by(ba_off_gas)
table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas)
* roughly constant rate throughout years
table year MAIN_HEAT_FUEL, by(ba_off_gas) c(mean Gcons n Gcons)
* but off gas have no gas readings as you'd expect (DECC filter)
foreach v of local vars {
di "***************"
di "* Testing `v' for `sample'% sample"
* overall
xtsum `v' if `v'Valid == "V"
* test values for valid - check for valid 0s for example. This only happens for gas where:
* 100 < gcons < 250 so included but rounded to nearest 500 = 0
* elec always rounded to nearest 50 so min should always be 100
tabstat `v', by(`v'Valid) s(n mean semean min max)
* by year
di "* check `v' for 0s (`s'% sample)"
table `v' year if `v' < 1000
table `v'Valid year, c(count `v' min `v' mean `v' max `v')
table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas) c(mean Gcons n Gcons)
* but off gas have no gas readings as you'd expect (DECC applied filter)
local descvars "Econs"
if `do_desc' {
foreach v of local descvars {
di "***************"
di "* Testing `v' for `sample'% sample"
di "* Check `v' by year"
di "* Check `v' for 0s (`sample'% sample)"
if "`v'" == "Econs" {
di "* -> `v' always rounded to nearest 50 so min should always be 100"
local min = 500
}
if "`v'" == "Gcons" {
di "* -> 100 < `v' < 250 so included but rounded to nearest 500 which in this case = 0"
local min = 1000
}
table `v' year if `v' < `min' `need_weight'
di "* Check no values of `v' appear for 'non valid' codes"
di "* test values for all valid codes"
table `v'Valid `need_weight', c(count `v' mean `v')
di "* test `v' per year for valid codes (use pw)"
local table_stats "count `v' min `v' mean `v' p50 `v' max `v'" // set table stats - 5 max!!
table year if `v'Valid == "V" `need_weight', c(`table_stats`n'')
di "* table won't allow semean for iw so use svy: mean `v' instead"
svy: mean `v' if `v'Valid == "V", over(year)
di "* use table to force sd `v' calculation using iw not pw, include mean to check"
table year [iw=WEIGHT], c(mean `v' sd `v')
if `do_histo' {
di "* Running graphs: histo for `v' (can't use `need_weight')"
histogram `v' if `v'Valid == "V", frequency by(year) ylab(,angle(0)) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-histo_`v'_by_year_valid.png", replace
}
if `do_box' {
di "* Running graphs: boxes for `v'"
di "* `v' over year"
graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) ///
ylab(,angle(0)) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_over_year_valid.png", replace
di "* `v' over year by floor area"
svy: mean `v', over(FLOOR_AREA_BAND)
graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) ///
ylab(,angle(0)) by(FLOOR_AREA_BAND) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_floor_valid.png", replace
di "* `v' over year by EE band"
graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) ///
ylab(,angle(0)) by(EE_BAND) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_ee_valid.png", replace
}
di "* check the distributions of the outliers"
local tvars "MAIN_HEAT_FUEL E7Flag2012 ba_off_gas FLOOR_AREA_BAND EE_BAND IMD_ENG"
foreach tv of local tvars {
di "* Checking top 1% against `tv'"
tab `tv' u99_Econs , col
}
if `do_graphs' {
di "* Running graphs - do not keep in memory, just save out"
di "* Running graphs: histo"
histogram `v' if `v'Valid == "V", by(year) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-histo_`v'_by_year_valid.png", replace
di "* Running graphs: boxes"
graph box `v' if `v'Valid == "V", over(year) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_over_year_valid.png", replace
}
}
graph box `v' if `v'Valid == "V", over(year) by(FLOOR_AREA_BAND) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_floor_valid.png", replace
if `do_xsec' {
di "* Running cross sectional analysis using 2012 (all valid cases)"
* make sure test vars are destringed
local tvars "IMD_ENG FP_ENG E7Flag2012 MAIN_HEAT_FUEL PROP_AGE PROP_TYPE FLOOR_AREA_BAND EE_BAND LOFT_DEPTH WALL_CONS CWI BOILER"
foreach tv of local tvars {
destring `tv', force replace
}
* kitchen sink model - use 99% signif level
* use vce(robust) as running without & using hettest suggests heteroscedasticity
* regress would throw out collinear variables
* do not include IMD & FP as really need multilvel model for that
regress logEcons i.ba_region i.E7Flag2012 i.MAIN_HEAT_FUEL ///
i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS ///
if year == 2012, level(99) vce(robust)
est store est_logEcons_2012
estout est_logEcons_2012 using "$rpath/regress_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace
* tests
predict logEconsr if e(sample), residuals
di "* test for normality of residuals using graphs"
* see http://www-personal.umich.edu/~bwest/ch3diags.do
* do not use sktest as large n & also see
* http://www.stata.com/statalist/archive/2005-12/msg00238.html
* Plot residuals
* pnormal plot of residuals
pnorm logEconsr, name(pnorm_logEconsr)
graph export "$rpath/graphs/pnorm_logEconsr.png", replace
* qnormal plot
qnorm logEconsr, name(qnorm_logEconsr)
graph export "$rpath/graphs/qnorm_logEconsr.png", replace
* test the null hypothesis that the variance of the residuals is homogenous.
* Therefore, if the p-value is very small, we would have to reject the hypothesis
* and accept the alternative hypothesis that the variance is not homogenous.
* so if p < 0.05 -> heteroscedasticity and so need robust
* BUT check rvfplot (http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm)
/* can't use after cluster
di "* test for Heteroscedasticity"
di "* if p < 0.05 -> heteroscedasticity and so need robust"
estat hettest
*/
di "* test for omitted vars"
* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm
di "* if p < 0.05 -> omitted vars"
estat ovtest
* check for mis-specification
* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm
* The model is then refit using these two variables as predictors.
* _hat should be significant since it is the predicted value.
* On the other hand, _hatsq shouldn't, because if our model is specified correctly,
* the squared predictions should not have much explanatory power.
* That is we wouldn't expect _hatsq to be a significant predictor if our model is specified correctly.
* So we will be looking at the p-value for _hatsq.
* p of _hatsq < 0.05 -> mis-spec
di "* linktest"
di "* if p of _hatsq < 0.05 -> mis-spec"
linktest
* test BIC etc
estat ic
graph box `v' if `v'Valid == "V", over(year) by(EE_BAND) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_ee_valid.png", replace
}
if `do_longit' {
di "* Running longitudinal analysis"
di "* Check boiler transitions"
xttrans ba_have_BOILER, freq
di "* Check loft insulation transitions"
xttrans ba_have_LI, freq
di "* Check cavity wall insulation transitions"
xttrans ba_have_CWI, freq
foreach v of local descvars {
di "***************"
di "* Longitudinal (panel) analysis"
* set as panel in case it wasn't
* fix format of year so xtset doesn't break
format year %ty
xtset HH_ID year, delta(1 year)
di "* examine panel status"
xtdescribe
di "* examine panel status for valid observations"
xtdescribe if Econs < .
di "* xtsum `v' for all"
xtsum `v'
di "* Check transitions (`v'Validr)"
xttrans `v'Validr, freq
di "* Check `v' consumption decile transitions"
xttrans `v'_dec, freq
di "* Check `v' outlier transitions"
bysort MAIN_HEAT_FUEL: xttrans u99_`v', freq
bysort MAIN_HEAT_FUEL: xttrans l99_`v', freq
di "* check lag model"
bysort HH_ID (year): gen logEconslag = logEcons[_n-1]
gen yeart = year - 2005
regress logEcons logEconslag i.ba_region i.E7Flag2012 ///
i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS ///
yeart ba_have_BOILER ba_have_LI ba_have_CWI
est store lag_model
estout lag_model using "$rpath/lag_model_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace
di "* xt regression model of change (use log Econs)"
xtreg logEcons i.ba_region i.E7Flag2012 ///
i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS ///
yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) fe
est store xtr_fe_logEcons
estout xtr_fe_logEcons using "$rpath/xtr_fe_logEcons_`version'.txt", replace
di "* now including time constant variables"
xtreg logEcons i.ba_region i.E7Flag2012 ///
i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS ///
yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) re
est store xtr_re_logEcons
estout xtr_re_logEcons using "$rpath/xtr_re_logEcons_`version'.txt", replace
}
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment