Commit 7dac12c9 authored by Ben Anderson's avatar Ben Anderson
Browse files

included RSS Conference analysis

- longitudinal analysis of variation
- cross-sectional analysis of variation for 2012
- some distributional charts
parent 2d3309f7
...@@ -37,37 +37,43 @@ capture noisily log close ...@@ -37,37 +37,43 @@ capture noisily log close
set more off set more off
* written for Mac OSX - remember to change filesystem delimiter for other platforms * written for Mac OSX - remember to change filesystem delimiter for other platforms
global home "~/Documents" global home "~/Documents/Work"
global dpath "$home/Work/Data/Social Science Datatsets/DECC/NEED/End User Licence File 2014/processed" global dpath "$home/Data/Social Science Datasets/DECC/NEED/End User Licence File 2014/processed"
global rpath "$home/Work/Papers and Conferences/RSS-2015/results" global rpath "$home/Papers and Conferences/RSS-2015/results"
local version "v1" local version "v1"
log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace
* set sample * set sample
* 100 = 100pc * 100 = 100pc
* etc * etc
local sample "100" local sample = 100
* control what happens
local do_desc = 1
* toggle graph drawing * use weights (leave blank to ignore)
local do_graphs = 1 /*
"Weighting based on Region, property age, property type and floor area band.
Summing all weights gives (approximate) total number of households in England and Wales 2011."
*/
local need_weight = "[pw = WEIGHT]" // use probability weights
* ref DECC look up table * control what happens
lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,000" 5 "M(issing in source)" local do_desc = 0 // do descriptives
* NB DECC look up table says max elec = 50,000 local do_histo = 0 // toggle graph drawing
lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)" local do_box = 0 // toggle graph drawing
local do_xsec = 1 // run cross-sectional analysis
local do_longit = 1 // run longitudinal analysis
* also be aware that the consumption is rounded in buckets: * Be aware that the consumption is rounded in buckets:
/* /*
GconsYEAR . Missing, off gas or invalid consumption GconsYEAR . Missing, off gas or invalid consumption
100 7,999 Gas consumption kWh rounded to nearest 500 kWh 100 - 7,999 Gas consumption kWh rounded to nearest 500 kWh
8,000- 15,999 Gas consumption kWh rounded to nearest 100 kWh 8,000 - 15,999 Gas consumption kWh rounded to nearest 100 kWh
16,000 24,999 Gas consumption kWh rounded to nearest 500 kWh 16,000 - 24,999 Gas consumption kWh rounded to nearest 500 kWh
25,000 34,999 Gas consumption kWh rounded to nearest 1,000 kWh 25,000 - 34,999 Gas consumption kWh rounded to nearest 1,000 kWh
35,000 50,000 Gas consumption kWh rounded to nearest 5,000 kWh 35,000 - 50,000 Gas consumption kWh rounded to nearest 5,000 kWh
EconsYEAR . Missing or invalid consumption EconsYEAR . Missing or invalid consumption
...@@ -79,7 +85,6 @@ EconsYEAR . Missing or invalid consumption ...@@ -79,7 +85,6 @@ EconsYEAR . Missing or invalid consumption
set more off set more off
*/ */
log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace
di "************************" di "************************"
...@@ -90,20 +95,30 @@ use "$dpath/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear ...@@ -90,20 +95,30 @@ use "$dpath/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear
* merge in the xwave file (fixed data - we assume!) * merge in the xwave file (fixed data - we assume!)
merge m:1 HH_ID using "$dpath/need_eul_may2014_xwavefile_100pc.dta" merge m:1 HH_ID using "$dpath/need_eul_may2014_xwavefile_100pc.dta"
lab var Econs "Electricity (KwH/year)" * set variable labels
lab var Gcons "Gas (KwH/year)" * ref DECC look up table
lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,000" 5 "M(issing in source)"
* set as panel in case it wasn't * NB DECC look up table says max elec = 50,000
* fix format of year so xtset doesn't break lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)"
format year %ty
xtset HH_ID year, delta(1 year)
* examine panel status * set up correct long form 'is X present in year' using year (if known)
xtdescribe local vars "BOILER LI CWI"
* set up * what will hapen if there are multiple replacements in a household
local vars "Econs Gcons"
foreach v of local vars { foreach v of local vars {
gen ba_have_`v' = 0
destring `v'_YEAR, force replace
replace ba_have_`v' = 1 if `v'_YEAR <= year
}
* labels
lab var Econs "Electricity (kWh/year)"
lab var Gcons "Gas (kWh/year)"
* set up data
* include Gcons if you want gas too
local setupvars "Econs Gcons"
foreach v of local setupvars {
di "***************" di "***************"
di "* Testing `v' for `sample'% sample" di "* Testing `v' for `sample'% sample"
...@@ -116,25 +131,61 @@ foreach v of local vars { ...@@ -116,25 +131,61 @@ foreach v of local vars {
lab var `v'Validr "Recoded `v'Valid" lab var `v'Validr "Recoded `v'Valid"
lab val `v'Validr `v'Validr lab val `v'Validr `v'Validr
tab `v'Validr `v'Valid
di "* Check transitions (`v'Validr)" * set up consumption deciles and outlier flags
xttrans `v'Validr, freq gen u99_`v' = 0
gen l99_`v' = 0
gen m98_`v' = 0
gen m90_`v' = 0
* set up consumption deciles
levelsof(year), local(levels) levelsof(year), local(levels)
foreach l of local levels { foreach l of local levels {
di "* Calculating consumption deciles for `v' for `l'" di "* Calculating consumption deciles and outlier flags for `v' for `l'"
* creates missing for other years have to do this as egen does not allow by
* creates missing for other years have to do this as egen does not allow by & can't 'replace'
di "* `v' deciles for `l'"
egen `v'_dec_`l' = cut(`v') if year == `l', group(10) egen `v'_dec_`l' = cut(`v') if year == `l', group(10)
di "* `v' outlier flags for `l'"
qui: su `v' if year == `l', de
di "* define outliers"
di "* -> Upper: valid = G & in highest 1% of valid"
replace u99_`v' = 1 if (`v' > `r(p99)' | `v'Validr == 4) & year == `l' // upper 1% 'valid'
di "* -> Lower: valid = L & in lowest 1% of valid"
replace l99_`v' = 1 if (`v' < `r(p1)' | `v'Validr == 3) & year == `l' // lower 1% 'valid'
replace m98_`v' = 1 if u99_`v' == 0 & l99_`v' == 0 & year == `l' // middle 98% 'valid'
replace m90_`v' = 1 if `v' > `r(p5)' & `v' < `r(p95)' & year == `l' // middle 90% 'valid'
* do checks here otherwise tab puts things into the return list
*tab u99_`v' if year == `l', mi
*tab l99_`v' if year == `l', mi
*tab m98_`v' if year == `l', mi
*tab m90_`v' if year == `l', mi
} }
* now combine them - set missing option otherwise it counts a row where all are missing as 0 * now combine the deciles - set missing option otherwise it counts a row where all are missing as 0
egen `v'_dec = rowtotal(`v'_dec_*), missing egen `v'_dec = rowtotal(`v'_dec_*), missing
* remove temporary ones * remove temporary ones
drop `v'_dec_* drop `v'_dec_*
* check * check unweighted
tab `v'_dec year table `v'_dec year
tabstat u* l* m*, by(year)
tab l99_`v' `v'Validr
tab u99_`v' `v'Validr
* log vars
gen log`v' = log(`v')
lab var log`v' "Log `v'"
} }
* set 'survey' weight
svyset `need_weight'
* flag dwellings which are off gas for electricity * flag dwellings which are off gas for electricity
* NB - in this dataset we don't know if they use electricity as main heat (could be oil) * NB - in this dataset we don't know if they use electricity as main heat (could be oil)
gen ba_off_gas = 0 gen ba_off_gas = 0
...@@ -143,49 +194,219 @@ lab def ba_off_gas 0 "On gas (GconsValid!=O)" 1 "Off gas (GconsValid=O, from EPC ...@@ -143,49 +194,219 @@ lab def ba_off_gas 0 "On gas (GconsValid!=O)" 1 "Off gas (GconsValid=O, from EPC
lab val ba_off_gas ba_off_gas lab val ba_off_gas ba_off_gas
* check * check
tabstat Gcons Econs, by(ba_off_gas) svy: mean Gcons Econs, over(ba_off_gas)
di "* MAIN_HEAT_FUEL - Description of main heating fuel (gas or other). EPC - but NB could be 'other' but still be 'on gas'" di "* MAIN_HEAT_FUEL - Description of main heating fuel (gas or other). EPC - but NB could be 'other' but still be 'on gas'"
tab ba_off_gas MAIN_HEAT_FUEL, mi // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'? table ba_off_gas MAIN_HEAT_FUEL `need_weight', missing // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'?
table year MAIN_HEAT_FUEL, by(ba_off_gas) table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas)
* roughly constant rate throughout years * roughly constant rate throughout years
table year MAIN_HEAT_FUEL, by(ba_off_gas) c(mean Gcons n Gcons) table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas) c(mean Gcons n Gcons)
* but off gas have no gas readings as you'd expect (DECC filter) * but off gas have no gas readings as you'd expect (DECC applied filter)
foreach v of local vars { local descvars "Econs"
di "***************" if `do_desc' {
di "* Testing `v' for `sample'% sample" foreach v of local descvars {
di "***************"
* overall di "* Testing `v' for `sample'% sample"
xtsum `v' if `v'Valid == "V"
* test values for valid - check for valid 0s for example. This only happens for gas where: di "* Check `v' by year"
* 100 < gcons < 250 so included but rounded to nearest 500 = 0 di "* Check `v' for 0s (`sample'% sample)"
if "`v'" == "Econs" {
* elec always rounded to nearest 50 so min should always be 100 di "* -> `v' always rounded to nearest 50 so min should always be 100"
local min = 500
tabstat `v', by(`v'Valid) s(n mean semean min max) }
* by year if "`v'" == "Gcons" {
di "* check `v' for 0s (`s'% sample)" di "* -> 100 < `v' < 250 so included but rounded to nearest 500 which in this case = 0"
table `v' year if `v' < 1000 local min = 1000
table `v'Valid year, c(count `v' min `v' mean `v' max `v') }
table `v' year if `v' < `min' `need_weight'
di "* Check no values of `v' appear for 'non valid' codes"
di "* test values for all valid codes"
table `v'Valid `need_weight', c(count `v' mean `v')
di "* test `v' per year for valid codes (use pw)"
local table_stats "count `v' min `v' mean `v' p50 `v' max `v'" // set table stats - 5 max!!
table year if `v'Valid == "V" `need_weight', c(`table_stats`n'')
di "* table won't allow semean for iw so use svy: mean `v' instead"
svy: mean `v' if `v'Valid == "V", over(year)
di "* use table to force sd `v' calculation using iw not pw, include mean to check"
table year [iw=WEIGHT], c(mean `v' sd `v')
if `do_histo' {
di "* Running graphs: histo for `v' (can't use `need_weight')"
histogram `v' if `v'Valid == "V", frequency by(year) ylab(,angle(0)) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-histo_`v'_by_year_valid.png", replace
}
if `do_box' {
di "* Running graphs: boxes for `v'"
di "* `v' over year"
graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) ///
ylab(,angle(0)) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_over_year_valid.png", replace
di "* `v' over year by floor area"
svy: mean `v', over(FLOOR_AREA_BAND)
graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) ///
ylab(,angle(0)) by(FLOOR_AREA_BAND) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_floor_valid.png", replace
di "* `v' over year by EE band"
graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) ///
ylab(,angle(0)) by(EE_BAND) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_ee_valid.png", replace
}
di "* check the distributions of the outliers"
local tvars "MAIN_HEAT_FUEL E7Flag2012 ba_off_gas FLOOR_AREA_BAND EE_BAND IMD_ENG"
foreach tv of local tvars {
di "* Checking top 1% against `tv'"
tab `tv' u99_Econs , col
}
if `do_graphs' {
di "* Running graphs - do not keep in memory, just save out"
di "* Running graphs: histo"
histogram `v' if `v'Valid == "V", by(year) scale(0.75)
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-histo_`v'_by_year_valid.png", replace
di "* Running graphs: boxes" }
graph box `v' if `v'Valid == "V", over(year) scale(0.75) }
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_over_year_valid.png", replace
graph box `v' if `v'Valid == "V", over(year) by(FLOOR_AREA_BAND) scale(0.75) if `do_xsec' {
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_floor_valid.png", replace di "* Running cross sectional analysis using 2012 (all valid cases)"
* make sure test vars are destringed
local tvars "IMD_ENG FP_ENG E7Flag2012 MAIN_HEAT_FUEL PROP_AGE PROP_TYPE FLOOR_AREA_BAND EE_BAND LOFT_DEPTH WALL_CONS CWI BOILER"
foreach tv of local tvars {
destring `tv', force replace
}
* kitchen sink model - use 99% signif level
* use vce(robust) as running without & using hettest suggests heteroscedasticity
* regress would throw out collinear variables
* do not include IMD & FP as really need multilvel model for that
regress logEcons i.ba_region i.E7Flag2012 i.MAIN_HEAT_FUEL ///
i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS ///
if year == 2012, level(99) vce(robust)
est store est_logEcons_2012
estout est_logEcons_2012 using "$rpath/regress_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace
* tests
predict logEconsr if e(sample), residuals
di "* test for normality of residuals using graphs"
* see http://www-personal.umich.edu/~bwest/ch3diags.do
* do not use sktest as large n & also see
* http://www.stata.com/statalist/archive/2005-12/msg00238.html
* Plot residuals
* pnormal plot of residuals
pnorm logEconsr, name(pnorm_logEconsr)
graph export "$rpath/graphs/pnorm_logEconsr.png", replace
* qnormal plot
qnorm logEconsr, name(qnorm_logEconsr)
graph export "$rpath/graphs/qnorm_logEconsr.png", replace
* test the null hypothesis that the variance of the residuals is homogenous.
* Therefore, if the p-value is very small, we would have to reject the hypothesis
* and accept the alternative hypothesis that the variance is not homogenous.
* so if p < 0.05 -> heteroscedasticity and so need robust
* BUT check rvfplot (http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm)
/* can't use after cluster
di "* test for Heteroscedasticity"
di "* if p < 0.05 -> heteroscedasticity and so need robust"
estat hettest
*/
di "* test for omitted vars"
* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm
di "* if p < 0.05 -> omitted vars"
estat ovtest
* check for mis-specification
* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm
* The model is then refit using these two variables as predictors.
* _hat should be significant since it is the predicted value.
* On the other hand, _hatsq shouldn't, because if our model is specified correctly,
* the squared predictions should not have much explanatory power.
* That is we wouldn't expect _hatsq to be a significant predictor if our model is specified correctly.
* So we will be looking at the p-value for _hatsq.
* p of _hatsq < 0.05 -> mis-spec
di "* linktest"
di "* if p of _hatsq < 0.05 -> mis-spec"
linktest
* test BIC etc
estat ic
graph box `v' if `v'Valid == "V", over(year) by(EE_BAND) scale(0.75) }
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_ee_valid.png", replace
if `do_longit' {
di "* Running longitudinal analysis"
di "* Check boiler transitions"
xttrans ba_have_BOILER, freq
di "* Check loft insulation transitions"
xttrans ba_have_LI, freq
di "* Check cavity wall insulation transitions"
xttrans ba_have_CWI, freq
foreach v of local descvars {
di "***************"
di "* Longitudinal (panel) analysis"
* set as panel in case it wasn't
* fix format of year so xtset doesn't break
format year %ty
xtset HH_ID year, delta(1 year)
di "* examine panel status"
xtdescribe
di "* examine panel status for valid observations"
xtdescribe if Econs < .
di "* xtsum `v' for all"
xtsum `v'
di "* Check transitions (`v'Validr)"
xttrans `v'Validr, freq
di "* Check `v' consumption decile transitions"
xttrans `v'_dec, freq
di "* Check `v' outlier transitions"
bysort MAIN_HEAT_FUEL: xttrans u99_`v', freq
bysort MAIN_HEAT_FUEL: xttrans l99_`v', freq
di "* check lag model"
bysort HH_ID (year): gen logEconslag = logEcons[_n-1]
gen yeart = year - 2005
regress logEcons logEconslag i.ba_region i.E7Flag2012 ///
i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS ///
yeart ba_have_BOILER ba_have_LI ba_have_CWI
est store lag_model
estout lag_model using "$rpath/lag_model_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace
di "* xt regression model of change (use log Econs)"
xtreg logEcons i.ba_region i.E7Flag2012 ///
i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS ///
yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) fe
est store xtr_fe_logEcons
estout xtr_fe_logEcons using "$rpath/xtr_fe_logEcons_`version'.txt", replace
di "* now including time constant variables"
xtreg logEcons i.ba_region i.E7Flag2012 ///
i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS ///
yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) re
est store xtr_re_logEcons
estout xtr_re_logEcons using "$rpath/xtr_re_logEcons_`version'.txt", replace
} }
} }
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment