From 36611cf4cfc6c5e4c52c6ffdf4ae7cab6fe771d8 Mon Sep 17 00:00:00 2001 From: Ben Anderson <b.anderson@soton.ac.uk> Date: Mon, 29 Jun 2015 19:20:43 +0100 Subject: [PATCH] added valid data checks --- ...-NEED-EULF-2014-electricity-consumption.do | 120 +++++++++++++----- 1 file changed, 87 insertions(+), 33 deletions(-) diff --git a/NEED/analyse-NEED-EULF-2014-electricity-consumption.do b/NEED/analyse-NEED-EULF-2014-electricity-consumption.do index 3c96669..1e05ac6 100644 --- a/NEED/analyse-NEED-EULF-2014-electricity-consumption.do +++ b/NEED/analyse-NEED-EULF-2014-electricity-consumption.do @@ -1,6 +1,6 @@ ******************************************* * Script to: -* - analyse DECC's EULF 2014 NEED data to examine distributions etc +* - analyse DECC's EULF 2014 NEED data to examine distributions etc * Original data available from: UK DATA ARCHIVE: Study Number 7518 - National Energy Efficiency Data-Framework, 2014 * http://discover.ukdataservice.ac.uk/catalogue/?sn=7518 @@ -9,16 +9,16 @@ * The script requires the following to have been run first: * https://github.com/dataknut/DECC-data/blob/master/NEED/process-NEED-EULF-2014.do -/* +/* Copyright (C) 2014 University of Southampton -Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut, https://github.com/dataknut) +Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut, https://github.com/dataknut) [Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton] This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License +the Free Software Foundation; either version 2 of the License (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version. 
This program is distributed in the hope that it will be useful, @@ -39,8 +39,8 @@ set more off * written for Mac OSX - remember to change filesystem delimiter for other platforms global home "/Users/ben/Documents" -local dpath "$home/Work/Data/Social Science Datatsets/DECC/NEED/End User Licence File 2014/processed" -local rpath "$home/Work/Papers and Conferences/RSS-2015/results" +global dpath "$home/Work/Data/Social Science Datatsets/DECC/NEED/End User Licence File 2014/processed" +global rpath "$home/Work/Papers and Conferences/RSS-2015/results" local version "v1" * set sample @@ -59,28 +59,80 @@ lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,0 * NB DECC look up table says max elec = 50,000 lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)" + * also be aware that the consumption is rounded in buckets: /* -GconsYEAR . Missing, off gas or invalid consumption 100 Р7,999 Gas consumption kWh rounded to nearest 500 kWh 8,000- 15,999 Gas consumption kWh rounded to nearest 100 kWh 16,000 Р24,999 Gas consumption kWh rounded to nearest 500 kWh 25,000 Р34,999 Gas consumption kWh rounded to nearest 1,000 kWh 35,000 Р50,000 Gas consumption kWh rounded to nearest 5,000 kWh EconsYEAR . Missing or invalid consumption 100 - 9,999 Electricity consumption kWh rounded to nearest 50 kWh 10,000 - 11,999 Electricity consumption kWh rounded to nearest 100 kWh 12,000 - 14,999 Electricity consumption kWh rounded to nearest 500 kWh 15,000 - 19,999 Electricity consumption kWh rounded to nearest 1,000 kWh 20,000 - 25,000 Electricity consumption kWh rounded to nearest 5,000 kWh +GconsYEAR . 
Missing, off gas or invalid consumption 100 - 7,999 Gas consumption kWh rounded to nearest 500 kWh 8,000- 15,999 Gas consumption kWh rounded to nearest 100 kWh 16,000 - 24,999 Gas consumption kWh rounded to nearest 500 kWh 25,000 - 34,999 Gas consumption kWh rounded to nearest 1,000 kWh 35,000 - 50,000 Gas consumption kWh rounded to nearest 5,000 kWh EconsYEAR . Missing or invalid consumption 100 - 9,999 Electricity consumption kWh rounded to nearest 50 kWh 10,000 - 11,999 Electricity consumption kWh rounded to nearest 100 kWh 12,000 - 14,999 Electricity consumption kWh rounded to nearest 500 kWh 15,000 - 19,999 Electricity consumption kWh rounded to nearest 1,000 kWh 20,000 - 25,000 Electricity consumption kWh rounded to nearest 5,000 kWh set more off */ -log using "`rpath'/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace +log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace if `do_desc' { di "************************" di "* Using `sample'% sample" - use "`dpath'/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear + * load the yearly consumption data + use "$dpath/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear + + * merge in the xwave file (fixed data - we assume!) 
+ merge m:1 HH_ID using "$dpath/need_eul_may2014_xwavefile_100pc.dta" * set as panel in case it wasn't - xtset HH_ID year + * fix format of year so xtset doesn't break + format year %ty + xtset HH_ID year, delta(1 year) * examine panel status - xtdescribe - - * distributions for valid obs - * Gcons - local vars "Econs Gcons" + xtdescribe + + * set up +local vars "Econs Gcons" +foreach v of local vars { + di "***************" + di "* Testing `v' for `sample'% sample" + + di "* check the panel transitions for each valid" + gen `v'Validr = 1 if `v'Valid == "V" + replace `v'Validr = 2 if `v'Valid == "O" // off gas (from EPC) only relevant for gas + replace `v'Validr = 3 if `v'Valid == "L" + replace `v'Validr = 4 if `v'Valid == "G" + replace `v'Validr = 5 if `v'Valid == "M" + + lab var `v'Validr "Recoded `v'Valid" + lab val `v'Validr `v'Validr + + * set up consumption deciles + levelsof(year), local(levels) + foreach l of local levels { + di "* Calculating consumption deciles for `v' for `l'" + * creates missing for other years have to do this as egen does not allow by + egen `v'_dec_`l' = cut(`v') if year == `l', group(10) + } + * now combine them - set missing option otherwise it counts a row where all are missing as 0 + egen `v'_dec = rowtotal(`v'_dec_*), missing + * remove temporary ones + drop `v'_dec_* + * check + tab `v'_dec year +} + +stop + * flag dwellings which are off gas for electricity + * NB - in this dataset we don't know if they use electricity as main heat (could be oil) + gen ba_off_gas = 0 + replace ba_off_gas = 1 if GconsValidr == 2 + lab def ba_off_gas 0 "On gas (GconsValid!=O)" 1 "Off gas (GconsValid=O, from EPC)" + lab val ba_off_gas ba_off_gas + * check + tabstat Gcons Econs, by(ba_off_gas) + di "* MAIN_HEAT_FUEL - Description of main heating fuel (gas or other). EPC - but NB could be 'other' but still be 'on gas'" + + tab ba_off_gas MAIN_HEAT_FUEL, mi // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'? 
+ table year MAIN_HEAT_FUEL, by(ba_off_gas) + * roughly constant rate throughout years + table year MAIN_HEAT_FUEL, by(ba_off_gas) c(mean Gcons n Gcons) + * but off gas have no gas readings as you'd expect (DECC filter) + foreach v of local vars { di "***************" di "* Testing `v' for `sample'% sample" @@ -91,30 +143,32 @@ if `do_desc' { * 100 < gcons < 250 so included but rounded to nearest 500 = 0 * elec always rounded to nearest 50 so min should always be 100 - - tabstat `v', by(`v'Valid) s(n mean semean min max) + + tabstat `v', by(`v'Valid) s(n mean semean min max) * by year di "* check `v' for 0s (`s'% sample)" table `v' year if `v' < 1000 table `v'Valid year, c(count `v' min `v' mean `v' max `v') - + if `do_graphs' { - histogram `v' if `v'Valid == "V", by(year) name(histo_`s'pc_`v') - graph export "`rpath'/NEED-EULF-2014-`s'pc-histo_`v'_by_year_valid.png", replace - graph box `v' if `v'Valid == "V", over(year) name(box_`s'pc_`v') - graph export "`rpath'/NEED-EULF-2014-`s'pc-box_`v'_over_year_valid.png", replace + di "* Running graphs - do not keep in memory, just save out" + di "* Running graphs: histo" + histogram `v' if `v'Valid == "V", by(year) + graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-histo_`v'_by_year_valid.png", replace + + di "* Running graphs: boxes" + graph box `v' if `v'Valid == "V", over(year) + graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_over_year_valid.png", replace + + graph box `v' if `v'Valid == "V", over(year) by(FLOOR_AREA_BAND) + graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_floor_valid.png", replace + + graph box `v' if `v'Valid == "V", over(year) by(EE_BAND) + graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_ee_valid.png", replace + } - - di "* check the panel transitions for each valid" - gen `v'Validr = 1 if `v'Valid == "V" - replace `v'Validr = 2 if `v'Valid == "O" - replace `v'Validr = 3 if `v'Valid == "L" - replace `v'Validr = 4 if `v'Valid == "G" - replace `v'Validr = 5 if 
`v'Valid == "M" - - lab var `v'Validr "Recoded `v'Valid" - lab val `v'Validr `v'Validr - * di "Check transitions (`v'Validr)" + + di "* Check transitions (`v'Validr)" xttrans `v'Validr, freq } } -- GitLab