From 36611cf4cfc6c5e4c52c6ffdf4ae7cab6fe771d8 Mon Sep 17 00:00:00 2001
From: Ben Anderson <b.anderson@soton.ac.uk>
Date: Mon, 29 Jun 2015 19:20:43 +0100
Subject: [PATCH] added valid data checks

---
 ...-NEED-EULF-2014-electricity-consumption.do | 120 +++++++++++++-----
 1 file changed, 87 insertions(+), 33 deletions(-)

diff --git a/NEED/analyse-NEED-EULF-2014-electricity-consumption.do b/NEED/analyse-NEED-EULF-2014-electricity-consumption.do
index 3c96669..1e05ac6 100644
--- a/NEED/analyse-NEED-EULF-2014-electricity-consumption.do
+++ b/NEED/analyse-NEED-EULF-2014-electricity-consumption.do
@@ -1,6 +1,6 @@
 *******************************************
 * Script to:
-* - analyse DECC's EULF 2014 NEED data to examine distributions etc 
+* - analyse DECC's EULF 2014 NEED data to examine distributions etc
 
 * Original data available from: UK DATA ARCHIVE: Study Number 7518 - National Energy Efficiency Data-Framework, 2014
 * http://discover.ukdataservice.ac.uk/catalogue/?sn=7518
@@ -9,16 +9,16 @@
 * The script requires the following to have been run first:
 * https://github.com/dataknut/DECC-data/blob/master/NEED/process-NEED-EULF-2014.do
 
-/*   
+/*
 
 Copyright (C) 2014  University of Southampton
 
-Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut, https://github.com/dataknut) 
+Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut, https://github.com/dataknut)
 	[Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License 
+the Free Software Foundation; either version 2 of the License
 (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.
 
 This program is distributed in the hope that it will be useful,
@@ -39,8 +39,8 @@ set more off
 * written for Mac OSX - remember to change filesystem delimiter for other platforms
 global home "/Users/ben/Documents"
 
-local dpath "$home/Work/Data/Social Science Datatsets/DECC/NEED/End User Licence File 2014/processed"
-local rpath "$home/Work/Papers and Conferences/RSS-2015/results"
+global dpath "$home/Work/Data/Social Science Datatsets/DECC/NEED/End User Licence File 2014/processed"
+global rpath "$home/Work/Papers and Conferences/RSS-2015/results"
 
 local version "v1"
 * set sample
@@ -59,28 +59,80 @@ lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,0
 * NB DECC look up table says max elec = 50,000
 lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)"
 
+
 * also be aware that the consumption is rounded in buckets:
 /*
-GconsYEAR	.	Missing, off gas or invalid consumption
	100 Ð 7,999	Gas consumption kWh rounded to nearest 500 kWh
	8,000- 15,999	Gas consumption kWh rounded to nearest 100 kWh
	16,000 Ð 24,999	Gas consumption kWh rounded to nearest 500 kWh
	25,000 Ð 34,999	Gas consumption kWh rounded to nearest 1,000 kWh
	35,000 Ð 50,000	Gas consumption kWh rounded to nearest 5,000 kWh
		
		
EconsYEAR	.	Missing or invalid consumption
	100 - 9,999	Electricity consumption kWh rounded to nearest 50 kWh
	10,000 - 11,999	Electricity consumption kWh rounded to nearest 100 kWh
	12,000 - 14,999	Electricity consumption kWh rounded to nearest 500 kWh
	15,000 - 19,999	Electricity consumption kWh rounded to nearest 1,000 kWh
	20,000 - 25,000	Electricity consumption kWh rounded to nearest 5,000 kWh
+GconsYEAR	.	Missing, off gas or invalid consumption
	100 - 7,999	Gas consumption kWh rounded to nearest 500 kWh
	8,000- 15,999	Gas consumption kWh rounded to nearest 100 kWh
	16,000 - 24,999	Gas consumption kWh rounded to nearest 500 kWh
	25,000 - 34,999	Gas consumption kWh rounded to nearest 1,000 kWh
	35,000 - 50,000	Gas consumption kWh rounded to nearest 5,000 kWh


EconsYEAR	.	Missing or invalid consumption
	100 - 9,999	Electricity consumption kWh rounded to nearest 50 kWh
	10,000 - 11,999	Electricity consumption kWh rounded to nearest 100 kWh
	12,000 - 14,999	Electricity consumption kWh rounded to nearest 500 kWh
	15,000 - 19,999	Electricity consumption kWh rounded to nearest 1,000 kWh
	20,000 - 25,000	Electricity consumption kWh rounded to nearest 5,000 kWh
 set more off
 */
 
-log using "`rpath'/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace
+log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace
 
 if `do_desc' {
 	di "************************"
 	di "* Using `sample'% sample"
-	use "`dpath'/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear
+	* load the yearly consumption data
+	use "$dpath/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear
+
+	* merge in the xwave file (fixed data - we assume!)
+	merge m:1 HH_ID using "$dpath/need_eul_may2014_xwavefile_100pc.dta"
 
 	* set as panel in case it wasn't
-	xtset HH_ID year
+	* fix format of year so xtset doesn't break
+	format year %ty
+	xtset HH_ID year, delta(1 year)
 
 	* examine panel status
-	xtdescribe 
-	
-	* distributions for valid obs
-	* Gcons
-	local vars "Econs Gcons"
+	xtdescribe
+
+	* set up
+local vars "Econs Gcons"
+foreach v of local vars {
+	di "***************"
+	di "* Testing `v' for `sample'% sample"
+
+	di "* check the panel transitions for each valid"
+	gen `v'Validr = 1 if `v'Valid == "V"
+	replace `v'Validr = 2 if `v'Valid == "O" // off gas (from EPC) only relevant for gas
+	replace `v'Validr = 3 if `v'Valid == "L"
+	replace `v'Validr = 4 if `v'Valid == "G"
+	replace `v'Validr = 5 if `v'Valid == "M"
+
+	lab var `v'Validr "Recoded `v'Valid"
+	lab val `v'Validr `v'Validr
+
+	* set up consumption deciles
+	levelsof(year), local(levels)
+	foreach l of local levels {
+		di "* Calculating consumption deciles for `v' for `l'"
+		* one variable per year (missing for all other years) - needed because egen's cut() does not allow by
+		egen `v'_dec_`l' = cut(`v') if year == `l', group(10)
+	}
+	* now combine them - set missing option otherwise it counts a row where all are missing as 0
+	egen `v'_dec = rowtotal(`v'_dec_*), missing
+	* remove temporary ones
+	drop `v'_dec_*
+	* check
+	tab `v'_dec year
+}
+
+stop
+	* flag dwellings which are off gas for electricity
+	* NB - in this dataset we don't know if they use electricity as main heat (could be oil)
+	gen ba_off_gas = 0
+	replace ba_off_gas = 1 if  GconsValidr == 2
+	lab def ba_off_gas 0 "On gas (GconsValid!=O)" 1 "Off gas (GconsValid=O, from EPC)"
+	lab val ba_off_gas ba_off_gas
+	* check
+	tabstat Gcons Econs, by(ba_off_gas)
+	di "* MAIN_HEAT_FUEL - Description of main heating fuel (gas or other). EPC - but NB could be 'other' but still be 'on gas'"
+
+	tab ba_off_gas MAIN_HEAT_FUEL, mi // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'?
+	table year MAIN_HEAT_FUEL, by(ba_off_gas)
+	* roughly constant rate throughout years
+	table year MAIN_HEAT_FUEL, by(ba_off_gas) c(mean Gcons n Gcons)
+	* but off gas have no gas readings as you'd expect (DECC filter)
+
 	foreach v of local vars {
 		di "***************"
 		di "* Testing `v' for `sample'% sample"
@@ -91,30 +143,32 @@ if `do_desc' {
 		* 100 < gcons < 250 so included but rounded to nearest 500 = 0
 
 		* elec always rounded to nearest 50 so min should always be 100
-		
-		tabstat `v', by(`v'Valid) s(n mean semean min max) 
+
+		tabstat `v', by(`v'Valid) s(n mean semean min max)
 		* by year
 		di "* check `v' for 0s (`s'% sample)"
 		table `v' year if `v' < 1000
 		table `v'Valid year, c(count `v' min `v' mean `v' max `v')
-		 
+
 		if `do_graphs' {
-			histogram `v' if `v'Valid == "V", by(year) name(histo_`s'pc_`v')
-			graph export "`rpath'/NEED-EULF-2014-`s'pc-histo_`v'_by_year_valid.png", replace 
-			graph box `v' if `v'Valid == "V", over(year) name(box_`s'pc_`v')
-			graph export "`rpath'/NEED-EULF-2014-`s'pc-box_`v'_over_year_valid.png", replace 
+			di "* Running graphs - do not keep in memory, just save out"
+			di "* Running graphs: histo"
+			histogram `v' if `v'Valid == "V", by(year)
+			graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-histo_`v'_by_year_valid.png", replace
+
+			di "* Running graphs: boxes"
+			graph box `v' if `v'Valid == "V", over(year)
+			graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_over_year_valid.png", replace
+
+			graph box `v' if `v'Valid == "V", over(year) by(FLOOR_AREA_BAND)
+			graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_floor_valid.png", replace
+
+			graph box `v' if `v'Valid == "V", over(year) by(EE_BAND)
+			graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_ee_valid.png", replace
+
 		}
-		
-		di "* check the panel transitions for each valid"
-		gen `v'Validr = 1 if `v'Valid == "V"
-		replace `v'Validr = 2 if `v'Valid == "O"
-		replace `v'Validr = 3 if `v'Valid == "L"
-		replace `v'Validr = 4 if `v'Valid == "G"
-		replace `v'Validr = 5 if `v'Valid == "M"
-		
-		lab var `v'Validr "Recoded `v'Valid"
-		lab val `v'Validr `v'Validr
-		* di "Check transitions (`v'Validr)"
+
+	di "* Check transitions (`v'Validr)"
 		xttrans `v'Validr, freq
 	}
 }
-- 
GitLab