Administrator approval is now required for registering new accounts. If you are registering a new account, and are external to the University, please ask the repository owner to contact ServiceLine to request your account be approved. Repository owners must include the newly registered email address, and specific repository in the request for approval.

Commit 36611cf4 authored by Ben Anderson's avatar Ben Anderson
Browse files

added valid data checks

parent ee5a243d
*******************************************
* Script to:
* - analyse DECC's EULF 2014 NEED data to examine distributions etc
* - analyse DECC's EULF 2014 NEED data to examine distributions etc
* Original data available from: UK DATA ARCHIVE: Study Number 7518 - National Energy Efficiency Data-Framework, 2014
* http://discover.ukdataservice.ac.uk/catalogue/?sn=7518
......@@ -9,16 +9,16 @@
* The script requires the following to have been run first:
* https://github.com/dataknut/DECC-data/blob/master/NEED/process-NEED-EULF-2014.do
/*
/*
Copyright (C) 2014 University of Southampton
Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut, https://github.com/dataknut)
Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut, https://github.com/dataknut)
[Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License
the Free Software Foundation; either version 2 of the License
(http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.
This program is distributed in the hope that it will be useful,
......@@ -39,8 +39,8 @@ set more off
* written for Mac OSX - remember to change filesystem delimiter for other platforms
global home "/Users/ben/Documents"
local dpath "$home/Work/Data/Social Science Datatsets/DECC/NEED/End User Licence File 2014/processed"
local rpath "$home/Work/Papers and Conferences/RSS-2015/results"
global dpath "$home/Work/Data/Social Science Datatsets/DECC/NEED/End User Licence File 2014/processed"
global rpath "$home/Work/Papers and Conferences/RSS-2015/results"
local version "v1"
* set sample
......@@ -59,28 +59,80 @@ lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,0
* NB DECC look up table says max elec = 50,000
lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)"
* also be aware that the consumption is rounded in buckets:
/*
GconsYEAR . Missing, off gas or invalid consumption 100 7,999 Gas consumption kWh rounded to nearest 500 kWh 8,000- 15,999 Gas consumption kWh rounded to nearest 100 kWh 16,000 24,999 Gas consumption kWh rounded to nearest 500 kWh 25,000 34,999 Gas consumption kWh rounded to nearest 1,000 kWh 35,000 50,000 Gas consumption kWh rounded to nearest 5,000 kWh EconsYEAR . Missing or invalid consumption 100 - 9,999 Electricity consumption kWh rounded to nearest 50 kWh 10,000 - 11,999 Electricity consumption kWh rounded to nearest 100 kWh 12,000 - 14,999 Electricity consumption kWh rounded to nearest 500 kWh 15,000 - 19,999 Electricity consumption kWh rounded to nearest 1,000 kWh 20,000 - 25,000 Electricity consumption kWh rounded to nearest 5,000 kWh
GconsYEAR . Missing, off gas or invalid consumption 100 7,999 Gas consumption kWh rounded to nearest 500 kWh 8,000- 15,999 Gas consumption kWh rounded to nearest 100 kWh 16,000 24,999 Gas consumption kWh rounded to nearest 500 kWh 25,000 34,999 Gas consumption kWh rounded to nearest 1,000 kWh 35,000 50,000 Gas consumption kWh rounded to nearest 5,000 kWh EconsYEAR . Missing or invalid consumption 100 - 9,999 Electricity consumption kWh rounded to nearest 50 kWh 10,000 - 11,999 Electricity consumption kWh rounded to nearest 100 kWh 12,000 - 14,999 Electricity consumption kWh rounded to nearest 500 kWh 15,000 - 19,999 Electricity consumption kWh rounded to nearest 1,000 kWh 20,000 - 25,000 Electricity consumption kWh rounded to nearest 5,000 kWh
set more off
*/
log using "`rpath'/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace
log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace
if `do_desc' {
di "************************"
di "* Using `sample'% sample"
use "`dpath'/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear
* load the yearly consumption data
use "$dpath/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear
* merge in the xwave file (fixed data - we assume!)
merge m:1 HH_ID using "$dpath/need_eul_may2014_xwavefile_100pc.dta"
* set as panel in case it wasn't
xtset HH_ID year
* fix format of year so xtset doesn't break
format year %ty
xtset HH_ID year, delta(1 year)
* examine panel status
xtdescribe
* distributions for valid obs
* Gcons
local vars "Econs Gcons"
xtdescribe
* set up
local vars "Econs Gcons"
foreach v of local vars {
di "***************"
di "* Testing `v' for `sample'% sample"
di "* check the panel transitions for each valid"
gen `v'Validr = 1 if `v'Valid == "V"
replace `v'Validr = 2 if `v'Valid == "O" // off gas (from EPC) only relevant for gas
replace `v'Validr = 3 if `v'Valid == "L"
replace `v'Validr = 4 if `v'Valid == "G"
replace `v'Validr = 5 if `v'Valid == "M"
lab var `v'Validr "Recoded `v'Valid"
lab val `v'Validr `v'Validr
* set up consumption deciles
levelsof(year), local(levels)
foreach l of local levels {
di "* Calculating consumption deciles for `v' for `l'"
* creates missing for other years have to do this as egen does not allow by
egen `v'_dec_`l' = cut(`v') if year == `l', group(10)
}
* now combine them - set missing option otherwise it counts a row where all are missing as 0
egen `v'_dec = rowtotal(`v'_dec_*), missing
* remove temporary ones
drop `v'_dec_*
* check
tab `v'_dec year
}
stop
* flag dwellings which are off gas for electricity
* NB - in this dataset we don't know if they use electricity as main heat (could be oil)
gen ba_off_gas = 0
replace ba_off_gas = 1 if GconsValidr == 2
lab def ba_off_gas 0 "On gas (GconsValid!=O)" 1 "Off gas (GconsValid=O, from EPC)"
lab val ba_off_gas ba_off_gas
* check
tabstat Gcons Econs, by(ba_off_gas)
di "* MAIN_HEAT_FUEL - Description of main heating fuel (gas or other). EPC - but NB could be 'other' but still be 'on gas'"
tab ba_off_gas MAIN_HEAT_FUEL, mi // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'?
table year MAIN_HEAT_FUEL, by(ba_off_gas)
* roughly constant rate throughout years
table year MAIN_HEAT_FUEL, by(ba_off_gas) c(mean Gcons n Gcons)
* but off gas have no gas readings as you'd expect (DECC filter)
foreach v of local vars {
di "***************"
di "* Testing `v' for `sample'% sample"
......@@ -91,30 +143,32 @@ if `do_desc' {
* 100 < gcons < 250 so included but rounded to nearest 500 = 0
* elec always rounded to nearest 50 so min should always be 100
tabstat `v', by(`v'Valid) s(n mean semean min max)
tabstat `v', by(`v'Valid) s(n mean semean min max)
* by year
di "* check `v' for 0s (`s'% sample)"
table `v' year if `v' < 1000
table `v'Valid year, c(count `v' min `v' mean `v' max `v')
if `do_graphs' {
histogram `v' if `v'Valid == "V", by(year) name(histo_`s'pc_`v')
graph export "`rpath'/NEED-EULF-2014-`s'pc-histo_`v'_by_year_valid.png", replace
graph box `v' if `v'Valid == "V", over(year) name(box_`s'pc_`v')
graph export "`rpath'/NEED-EULF-2014-`s'pc-box_`v'_over_year_valid.png", replace
di "* Running graphs - do not keep in memory, just save out"
di "* Running graphs: histo"
histogram `v' if `v'Valid == "V", by(year)
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-histo_`v'_by_year_valid.png", replace
di "* Running graphs: boxes"
graph box `v' if `v'Valid == "V", over(year)
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_over_year_valid.png", replace
graph box `v' if `v'Valid == "V", over(year) by(FLOOR_AREA_BAND)
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_floor_valid.png", replace
graph box `v' if `v'Valid == "V", over(year) by(EE_BAND)
graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_ee_valid.png", replace
}
di "* check the panel transitions for each valid"
gen `v'Validr = 1 if `v'Valid == "V"
replace `v'Validr = 2 if `v'Valid == "O"
replace `v'Validr = 3 if `v'Valid == "L"
replace `v'Validr = 4 if `v'Valid == "G"
replace `v'Validr = 5 if `v'Valid == "M"
lab var `v'Validr "Recoded `v'Valid"
lab val `v'Validr `v'Validr
* di "Check transitions (`v'Validr)"
di "* Check transitions (`v'Validr)"
xttrans `v'Validr, freq
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment