From 5d05c34ee15684602393a22a0db4011ecae572e7 Mon Sep 17 00:00:00 2001 From: Ben Anderson <b.anderson@soton.ac.uk> Date: Wed, 17 Sep 2014 15:42:35 +0100 Subject: [PATCH] updated readme and models --- NEED/analyse-NEED-EULF-2014-descriptives.do | 9 +- ... => analyse-NEED-EULF-2014-models-v1.1.do} | 10 +- NEED/analyse-NEED-EULF-2014-models-v2.0.do | 238 ++++++++++++++++++ 3 files changed, 253 insertions(+), 4 deletions(-) rename NEED/{analyse-NEED-EULF-2014-models.do => analyse-NEED-EULF-2014-models-v1.1.do} (96%) create mode 100644 NEED/analyse-NEED-EULF-2014-models-v2.0.do diff --git a/NEED/analyse-NEED-EULF-2014-descriptives.do b/NEED/analyse-NEED-EULF-2014-descriptives.do index 858cc1b..4b0b538 100644 --- a/NEED/analyse-NEED-EULF-2014-descriptives.do +++ b/NEED/analyse-NEED-EULF-2014-descriptives.do @@ -38,8 +38,13 @@ table EconsValid year, c(count Econs min Econs mean Econs max Econs) table GconsValid year, c(count Gcons min Gcons mean Gcons max Gcons) * distributions -histogram Econs, by(year) name(histo_econs) -histogram Gcons, by(year) name(histo_gcons) +local vars "Econs Gcons" +foreach v of local vars { + histogram `v', by(year) name(histo_`v') + graph export using "`rpath'/NEED-EULF-2014-histo_`v'_by_year.png", replace + graph box `v', by(year) name(box_`v') + graph export using "`rpath'/NEED-EULF-2014-box_`v'_by_year.png", replace +} di "* Done!" diff --git a/NEED/analyse-NEED-EULF-2014-models.do b/NEED/analyse-NEED-EULF-2014-models-v1.1.do similarity index 96% rename from NEED/analyse-NEED-EULF-2014-models.do rename to NEED/analyse-NEED-EULF-2014-models-v1.1.do index afa9f78..cf12a97 100644 --- a/NEED/analyse-NEED-EULF-2014-models.do +++ b/NEED/analyse-NEED-EULF-2014-models-v1.1.do @@ -25,7 +25,13 @@ local proot "`home'/Work/Data/Social Science Datatsets/DECC" local dpath "`proot'/NEED/End User Licence File 2014/processed" local rpath "`proot'/results/NEED/" -local version "v1.1" +* local verrsion "1.0" +* initial models - all households for electricity models + +local verrsion "1.1" +* restrict to gas only households to avoid complications of: +* - primary electric heating (presumably) +* - oil heating set more off @@ -34,7 +40,7 @@ log using "`rpath'/analyse-NEED-EULF-2014-models-`version'-$S_DATE.smcl", replac * use the pre-processed wide form file which contains all years of consumption data but not the constant values which are in the xwave file use "`dpath'/need_eul_may2014_consumptionfile_wide.dta", clear -* we're goinmg to use 2012 data only +* we're going to use 2012 data only keep HH_ID *2012* diff --git a/NEED/analyse-NEED-EULF-2014-models-v2.0.do b/NEED/analyse-NEED-EULF-2014-models-v2.0.do new file mode 100644 index 0000000..8cb9c0b --- /dev/null +++ b/NEED/analyse-NEED-EULF-2014-models-v2.0.do @@ -0,0 +1,238 @@ +* Script to analyse DECC's NEED data to: +* investigate % variance of energy consumption due to dwelling type variables as a way to infer the % of variance due to people + +* NB this script uses 2 data files derived from the original data using the 'process' script + +* Original data available from: UK DATA ARCHIVE: Study Number 7518 - National Energy Efficiency Data-Framework, 2014 +* http://discover.ukdataservice.ac.uk/catalogue/?sn=7518 + +* Ben Anderson, Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton +* b.anderson@soton.ac.uk +* (c) University of Southampton + +* Unless there is a different license file in the folder in which this script is found, the Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) license applies +* http://creativecommons.org/licenses/by-nc/4.0/ + +clear all + +capture noisily log close + +* written for Mac OSX - remember to change filesystem delimiter for other platforms +local home "/Users/ben/Documents" +local proot "`home'/Work/Data/Social Science Datatsets/DECC" +* for clam +* local proot "`home'/Work/NEED" +local dpath "`proot'/NEED/End User Licence File 2014/processed" +local rpath "`proot'/results/NEED" + +*local verrsion "1.0" +* initial models - all households for electricity models + +*local verrsion "1.1" +* restrict to gas only households to avoid complications of: +* - primary electric heating (presumably) +* - oil heating + +*local version "v2a_1pc" +*local sample 1 +*local sampleby "EE_BAND PROP_TYPE" +* changed from using log consumption to consumption decile to avoid complications due to variable rounding ranges in original data (see readme) +* restricted analysis to households where gas is main heat source as it is better predicted by variables included & is more relevant to EPC (heat) +* uses 1% sample (c 30k) making sure keep proportions of property type and EE_Band to see if linktest fails with smaller n + +*local version "v2b_10pc" +*local sample 10 +*local sampleby "EE_BAND PROP_TYPE" +* uses 10% sample (c 300k) making sure keep proportions of property type and EE_Band to see if margin plots and co-efficients are the same +* (linktest etc will probably now fail due to larger n) + +local version "v2c_full" +local sample 10 +local sampleby "EE_BAND PROP_TYPE" +* uses 10% sample (c 300k) making sure keep proportions of property type and EE_Band to see if margin plots and co-efficients are the same +* (linktest etc will probably now fail due to larger n) + +set more off + +log using "`rpath'/analyse-NEED-EULF-2014-models-`version'-$S_DATE.smcl", replace + +* use the pre-processed wide form file which contains all years of consumption data but not the constant values which are in the xwave file +use "`dpath'/need_eul_may2014_consumptionfile_wide.dta", clear + +* we're going to use 2012 data only + +keep HH_ID *2012* + +* merge in the pre-processed cross-year fixed values file +merge 1:1 HH_ID using "`dpath'/need_eul_may2014_xwavefile.dta" + +* check what's valid +tab Gcons2012Valid Econs2012Valid, mi // O = off gas, V = valid, L = too low, G = too big, M = missing +tabstat Gcons2012, by(Gcons2012Valid) s(mean min max n) +* do off-gas use a lot more electricty (heating)? +tabstat Econs2012, by(Gcons2012Valid) s(mean min max n) + +histogram Gcons2012, by(MAIN_HEAT_FUEL, total) name(histo_Gcons2012) +graph export "`rpath'/histo_Gcons2012_by_main_heating_fuel.png", replace + +tabstat Gcons2012, by(MAIN_HEAT_FUEL) s(n mean min max) + +* keep if valid gas & gas = main heat fuel +keep if Gcons2012Valid == "V" & MAIN_HEAT_FUEL == 1 + +***** random sample **** +* select a random sample but ensure proportions of sampleby are kept +di "* Keeping `sample'% sample by `sampleby'" +sample `sample', by(`sampleby') + +tab `sampleby', mi + +* log the consumption as it's very skewed -> becomes semi-normal & OK for linear regression +* Gcons = gas +* Econs = Electricity + +* create log & deciles +* log - creates a normal distribution +* deciles - avoids the consumption rounding range differences (hopefully) +gen log_Gcons2012 = log(Gcons2012) +egen Gcons2012_dec = cut(Gcons2012), group(10) +gen log_Econs2012 = log(Econs2012) +egen Econs2012_dec = cut(Econs2012), group(10) + +* combine consumption +* treat missing (gas) as 0 +egen Allcons2012 = rowtotal(Gcons2012 Econs2012) + +*gen log_Allcons2012 = log(Allcons2012) +egen Allcons2012_dec = cut(Allcons2012), group(10) + +* create log consumption quintiles +*egen quinlog_Allcons2012 = cut(log_Allcons2012), group(5) +*egen quinlog_Gcons2012 = cut(log_Gcons2012), group(5) +*egen quinlog_Econs2012 = cut(log_Econs2012), group(5) + +* fix some of the variables + +* combine IMD: this is a bit dodgy as they are not strictly comparable +gen ba_imd = IMD_ENG +replace ba_imd = IMD_WALES if ba_imd == . + +* must use as category variables!! +* set unkown to be 10 -> adds to end of contrasts so can see effect +replace LOFT_DEPTH = 10 if LOFT_DEPTH == . + +* set unkown to be 2020 -> adds to end of contrasts so can see effect +replace BOILER_YEAR = 2020 if BOILER_YEAR == . +replace CWI_YEAR = 2020 if CWI_YEAR == . +replace LI_YEAR = 2020 if LI_YEAR == . + +* 0 = no +destring BOILER, force replace +replace BOILER = 0 if BOILER == . + +* household level vars +local generic_hvars "i.BOILER_YEAR i.MAIN_HEAT_FUEL i.LI_YEAR i.LOFT_DEPTH i.FLOOR_AREA_BAND WALL_CONS i.CWI_YEAR i.PROP_TYPE i.PROP_AGE i.EE_BAND" +local generic_hvarsnp "i.BOILER_YEAR i.MAIN_HEAT_FUEL i.LI_YEAR i.LOFT_DEPTH i.FLOOR_AREA_BAND WALL_CONS i.CWI_YEAR i.PROP_AGE i.EE_BAND" + +* area level vars +local generic_rvars "i.ba_region i.ba_imd" + +* define different property types +local ptypes "101 102 103 104 105 106" +local pt101 "detached" +local pt102 "semi" +local pt103 "end_terr" +local pt104 "mid_terr" +local pt105 "bung" +local pt106 "flat" + +* now loop over the energy types & run linear regression models +* NB - the rounding of the consumption values may lead to modelling problems + +* add Econs Allcons for electricity & sum of both +* rename so graph names don't break +rename log_Gcons2012 lg2012 +rename Gcons2012_dec g2012dec +local vars "lg2012 g2012dec" +foreach v of local vars { + * check distributions of original consumption values + + * all hhs model + qui: regress `v' `generic_hvars' /// + `generic_rvars' /// + i.BOILER_YEAR + + est store `v' + di "* -> `v' estat to test for heteroskedasticity & omitted vars" + estat ovtest + estat hettest + + * we ought to be testing for linearity too + di "* -> `v' linktest to test for model specification" + di "* if p of _hatsq < 0.05 -> mis-spec" + di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm" + linktest + + di "* test EPC margins for `v'" + margins EE_BAND + marginsplot, name(mplot_`v'_EE_BAND) + graph export "`rpath'/mplot_`v'_EE_BAND-`version'.png", replace + + * models by property type - to see if rsq & coefficients vary + foreach p of local ptypes { + di "* -> testing `v' for `pt`p''" + qui: regress `v' `generic_hvarsnp' /// + `generic_rvars' /// + i.BOILER_YEAR /// + if PROP_TYPE == `p' + est store `v'_`pt`p'' + + di "* -> `v' 2012 `pt`p'' - estat to test for heteroskedasticity & omitted vars" + estat ovtest + estat hettest + + * we ought to be testing for linearity too + di "* -> `v' `pt`p'' linktest to test for model specification" + di "* if p of _hatsq < 0.05 -> mis-spec" + di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm" + linktest + di "* test EPC margins for `v' (`pt`p'')" + margins EE_BAND + marginsplot, name(mplot_`v'_EE_BAND_`pt`p'') + graph export "`rpath'/mplot_`v'_EE_BAND_`pt`p''-`version'.png", replace + + } + * models for different consumption quintiles - to see if rsq & coefficients vary + /* doesn't make much sense to do this if using deciles as dependent variable + foreach q of numlist 0/4 { + di "* -> testing log_`v'2012 for quintile: `q'" + qui: regress log_`v'2012 `generic_hvars' /// + `generic_rvars' /// + i.BOILER_YEAR /// + if quinlog_`v'2012 == `q' + est store rlog_`v'2012q`q' + + di "* -> quintile: `q' - estat to test for heteroskedasticity & omitted vars" + estat ovtest + estat hettest + + * we ought to be testing for linearity too + di "* -> quintile: `q' - linktest" + di "* if p of _hatsq < 0.05 -> mis-spec" + di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm" + linktest + } + */ +} + +* output all the results - that's a lot of t tests! +* we could put them all out in one file but it would be really hard to find the ones you want! +estout lg2012 using "`rpath'/NEED-EULF-2014-log-gas-model-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll) +estout lg2012_* using "`rpath'/NEED-EULF-2014-log-gas-models-by-property-type-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll) + +estout g2012dec using "`rpath'/NEED-EULF-2014-gas-deciles-model-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll) +estout g2012dec_* using "`rpath'/NEED-EULF-2014-gas-deciles-models-by-property-type-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll) + +di "* Done!" + +log close -- GitLab