Skip to content
Snippets Groups Projects
Commit 5d05c34e authored by Ben Anderson's avatar Ben Anderson
Browse files

updated readme and models

parent 4637dc68
No related branches found
No related tags found
No related merge requests found
...@@ -38,8 +38,13 @@ table EconsValid year, c(count Econs min Econs mean Econs max Econs) ...@@ -38,8 +38,13 @@ table EconsValid year, c(count Econs min Econs mean Econs max Econs)
table GconsValid year, c(count Gcons min Gcons mean Gcons max Gcons) table GconsValid year, c(count Gcons min Gcons mean Gcons max Gcons)
* distributions * distributions
histogram Econs, by(year) name(histo_econs) local vars "Econs Gcons"
histogram Gcons, by(year) name(histo_gcons) foreach v of local vars {
histogram `v', by(year) name(histo_`v')
graph export using "`rpath'/NEED-EULF-2014-histo_`v'_by_year.png", replace
graph box `v', by(year) name(box_`v')
graph export using "`rpath'/NEED-EULF-2014-box_`v'_by_year.png", replace
}
di "* Done!" di "* Done!"
......
...@@ -25,7 +25,13 @@ local proot "`home'/Work/Data/Social Science Datatsets/DECC" ...@@ -25,7 +25,13 @@ local proot "`home'/Work/Data/Social Science Datatsets/DECC"
local dpath "`proot'/NEED/End User Licence File 2014/processed" local dpath "`proot'/NEED/End User Licence File 2014/processed"
local rpath "`proot'/results/NEED/" local rpath "`proot'/results/NEED/"
local version "v1.1" * local verrsion "1.0"
* initial models - all households for electricity models
local verrsion "1.1"
* restrict to gas only households to avoid complications of:
* - primary electric heating (presumably)
* - oil heating
set more off set more off
...@@ -34,7 +40,7 @@ log using "`rpath'/analyse-NEED-EULF-2014-models-`version'-$S_DATE.smcl", replac ...@@ -34,7 +40,7 @@ log using "`rpath'/analyse-NEED-EULF-2014-models-`version'-$S_DATE.smcl", replac
* use the pre-processed wide form file which contains all years of consumption data but not the constant values which are in the xwave file * use the pre-processed wide form file which contains all years of consumption data but not the constant values which are in the xwave file
use "`dpath'/need_eul_may2014_consumptionfile_wide.dta", clear use "`dpath'/need_eul_may2014_consumptionfile_wide.dta", clear
* we're goinmg to use 2012 data only * we're going to use 2012 data only
keep HH_ID *2012* keep HH_ID *2012*
......
* Script to analyse DECC's NEED data to:
* investigate % variance of energy consumption due to dwelling type variables as a way to infer the % of variance due to people
* NB this script uses 2 data files derived from the original data using the 'process' script
* Original data available from: UK DATA ARCHIVE: Study Number 7518 - National Energy Efficiency Data-Framework, 2014
* http://discover.ukdataservice.ac.uk/catalogue/?sn=7518
* Ben Anderson, Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton
* b.anderson@soton.ac.uk
* (c) University of Southampton
* Unless there is a different license file in the folder in which this script is found, the Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) license applies
* http://creativecommons.org/licenses/by-nc/4.0/
clear all
capture noisily log close
* written for Mac OSX - remember to change filesystem delimiter for other platforms
local home "/Users/ben/Documents"
local proot "`home'/Work/Data/Social Science Datatsets/DECC"
* for clam
* local proot "`home'/Work/NEED"
local dpath "`proot'/NEED/End User Licence File 2014/processed"
local rpath "`proot'/results/NEED"
*local verrsion "1.0"
* initial models - all households for electricity models
*local verrsion "1.1"
* restrict to gas only households to avoid complications of:
* - primary electric heating (presumably)
* - oil heating
*local version "v2a_1pc"
*local sample 1
*local sampleby "EE_BAND PROP_TYPE"
* changed from using log consumption to consumption decile to avoid complications due to variable rounding ranges in original data (see readme)
* restricted analysis to households where gas is main heat source as it is better predicted by variables included & is more relevant to EPC (heat)
* uses 1% sample (c 30k) making sure keep proportions of property type and EE_Band to see if linktest fails with smaller n
*local version "v2b_10pc"
*local sample 10
*local sampleby "EE_BAND PROP_TYPE"
* uses 10% sample (c 300k) making sure keep proportions of property type and EE_Band to see if margin plots and co-efficients are the same
* (linktest etc will probably now fail due to larger n)
local version "v2c_full"
local sample 10
local sampleby "EE_BAND PROP_TYPE"
* uses 10% sample (c 300k) making sure keep proportions of property type and EE_Band to see if margin plots and co-efficients are the same
* (linktest etc will probably now fail due to larger n)
set more off
log using "`rpath'/analyse-NEED-EULF-2014-models-`version'-$S_DATE.smcl", replace
* use the pre-processed wide form file which contains all years of consumption data but not the constant values which are in the xwave file
use "`dpath'/need_eul_may2014_consumptionfile_wide.dta", clear
* we're going to use 2012 data only
keep HH_ID *2012*
* merge in the pre-processed cross-year fixed values file
merge 1:1 HH_ID using "`dpath'/need_eul_may2014_xwavefile.dta"
* check what's valid
tab Gcons2012Valid Econs2012Valid, mi // O = off gas, V = valid, L = too low, G = too big, M = missing
tabstat Gcons2012, by(Gcons2012Valid) s(mean min max n)
* do off-gas use a lot more electricty (heating)?
tabstat Econs2012, by(Gcons2012Valid) s(mean min max n)
histogram Gcons2012, by(MAIN_HEAT_FUEL, total) name(histo_Gcons2012)
graph export "`rpath'/histo_Gcons2012_by_main_heating_fuel.png", replace
tabstat Gcons2012, by(MAIN_HEAT_FUEL) s(n mean min max)
* keep if valid gas & gas = main heat fuel
keep if Gcons2012Valid == "V" & MAIN_HEAT_FUEL == 1
***** random sample ****
* select a random sample but ensure proportions of sampleby are kept
di "* Keeping `sample'% sample by `sampleby'"
sample `sample', by(`sampleby')
tab `sampleby', mi
* log the consumption as it's very skewed -> becomes semi-normal & OK for linear regression
* Gcons = gas
* Econs = Electricity
* create log & deciles
* log - creates a normal distribution
* deciles - avoids the consumption rounding range differences (hopefully)
gen log_Gcons2012 = log(Gcons2012)
egen Gcons2012_dec = cut(Gcons2012), group(10)
gen log_Econs2012 = log(Econs2012)
egen Econs2012_dec = cut(Econs2012), group(10)
* combine consumption
* treat missing (gas) as 0
egen Allcons2012 = rowtotal(Gcons2012 Econs2012)
*gen log_Allcons2012 = log(Allcons2012)
egen Allcons2012_dec = cut(Allcons2012), group(10)
* create log consumption quintiles
*egen quinlog_Allcons2012 = cut(log_Allcons2012), group(5)
*egen quinlog_Gcons2012 = cut(log_Gcons2012), group(5)
*egen quinlog_Econs2012 = cut(log_Econs2012), group(5)
* fix some of the variables
* combine IMD: this is a bit dodgy as they are not strictly comparable
gen ba_imd = IMD_ENG
replace ba_imd = IMD_WALES if ba_imd == .
* must use as category variables!!
* set unkown to be 10 -> adds to end of contrasts so can see effect
replace LOFT_DEPTH = 10 if LOFT_DEPTH == .
* set unkown to be 2020 -> adds to end of contrasts so can see effect
replace BOILER_YEAR = 2020 if BOILER_YEAR == .
replace CWI_YEAR = 2020 if CWI_YEAR == .
replace LI_YEAR = 2020 if LI_YEAR == .
* 0 = no
destring BOILER, force replace
replace BOILER = 0 if BOILER == .
* household level vars
local generic_hvars "i.BOILER_YEAR i.MAIN_HEAT_FUEL i.LI_YEAR i.LOFT_DEPTH i.FLOOR_AREA_BAND WALL_CONS i.CWI_YEAR i.PROP_TYPE i.PROP_AGE i.EE_BAND"
local generic_hvarsnp "i.BOILER_YEAR i.MAIN_HEAT_FUEL i.LI_YEAR i.LOFT_DEPTH i.FLOOR_AREA_BAND WALL_CONS i.CWI_YEAR i.PROP_AGE i.EE_BAND"
* area level vars
local generic_rvars "i.ba_region i.ba_imd"
* define different property types
local ptypes "101 102 103 104 105 106"
local pt101 "detached"
local pt102 "semi"
local pt103 "end_terr"
local pt104 "mid_terr"
local pt105 "bung"
local pt106 "flat"
* now loop over the energy types & run linear regression models
* NB - the rounding of the consumption values may lead to modelling problems
* add Econs Allcons for electricity & sum of both
* rename so graph names don't break
rename log_Gcons2012 lg2012
rename Gcons2012_dec g2012dec
local vars "lg2012 g2012dec"
foreach v of local vars {
* check distributions of original consumption values
* all hhs model
qui: regress `v' `generic_hvars' ///
`generic_rvars' ///
i.BOILER_YEAR
est store `v'
di "* -> `v' estat to test for heteroskedasticity & omitted vars"
estat ovtest
estat hettest
* we ought to be testing for linearity too
di "* -> `v' linktest to test for model specification"
di "* if p of _hatsq < 0.05 -> mis-spec"
di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm"
linktest
di "* test EPC margins for `v'"
margins EE_BAND
marginsplot, name(mplot_`v'_EE_BAND)
graph export "`rpath'/mplot_`v'_EE_BAND-`version'.png", replace
* models by property type - to see if rsq & coefficients vary
foreach p of local ptypes {
di "* -> testing `v' for `pt`p''"
qui: regress `v' `generic_hvarsnp' ///
`generic_rvars' ///
i.BOILER_YEAR ///
if PROP_TYPE == `p'
est store `v'_`pt`p''
di "* -> `v' 2012 `pt`p'' - estat to test for heteroskedasticity & omitted vars"
estat ovtest
estat hettest
* we ought to be testing for linearity too
di "* -> `v' `pt`p'' linktest to test for model specification"
di "* if p of _hatsq < 0.05 -> mis-spec"
di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm"
linktest
di "* test EPC margins for `v' (`pt`p'')"
margins EE_BAND
marginsplot, name(mplot_`v'_EE_BAND_`pt`p'')
graph export "`rpath'/mplot_`v'_EE_BAND_`pt`p''-`version'.png", replace
}
* models for different consumption quintiles - to see if rsq & coefficients vary
/* doesn't make much sense to do this if using deciles as dependent variable
foreach q of numlist 0/4 {
di "* -> testing log_`v'2012 for quintile: `q'"
qui: regress log_`v'2012 `generic_hvars' ///
`generic_rvars' ///
i.BOILER_YEAR ///
if quinlog_`v'2012 == `q'
est store rlog_`v'2012q`q'
di "* -> quintile: `q' - estat to test for heteroskedasticity & omitted vars"
estat ovtest
estat hettest
* we ought to be testing for linearity too
di "* -> quintile: `q' - linktest"
di "* if p of _hatsq < 0.05 -> mis-spec"
di "* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm"
linktest
}
*/
}
* output all the results - that's a lot of t tests!
* we could put them all out in one file but it would be really hard to find the ones you want!
estout lg2012 using "`rpath'/NEED-EULF-2014-log-gas-model-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
estout lg2012_* using "`rpath'/NEED-EULF-2014-log-gas-models-by-property-type-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
estout g2012dec using "`rpath'/NEED-EULF-2014-gas-deciles-model-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
estout g2012dec_* using "`rpath'/NEED-EULF-2014-gas-deciles-models-by-property-type-`version'-$S_DATE.txt", replace cells("b se p _star") stats(r2 r2_a N ll)
di "* Done!"
log close
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment