/*
Script to turn the original wide 2014 EULF version of DECC's NEED data into:
1. a Stata wide form xwave file containing the fixed value variables
2. a Stata wide form file containing just the yearly consumption variables (linked to 1. via HH_ID)
3. a Stata long form file containing just the yearly consumption variables (linked to 1. via HH_ID)
4. codebooks for each of the above
Original data available from: UK DATA ARCHIVE: Study Number 7518 - National Energy Efficiency Data-Framework, 2014
http://discover.ukdataservice.ac.uk/catalogue/?sn=7518
Copyright (C) 2014 University of Southampton
Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut, https://github.com/dataknut)
[Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License
(http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
#YMMV - http://en.wiktionary.org/wiki/YMMV
******************
Notes:
This dataset is a sample of just over 4 million households drawn from the full NEED 'all dwellings' dataset, all of which have had an Energy Performance Certificate (EPC)
Is this all those who have had an EPC or a random sample of all those who've had an EPC?
Sample bias is unknown - which kinds of dwellings have an EPC?
Ideally DECC should set missing to -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
The Gcons*valid variable codes:
G = Gas consumption invalid, greater than 50,000
L = Gas consumption invalid, less than 100
M = Gas consumption data is missing in source data
0 = Property does not have a gas connection
V = Valid gas consumption (between 100 and 50,000 inclusive)
NB - there are 'valid' gas readings of 0 which presumably were >= 100 but <= 249 before heaping (the first gas 'heap' rounds to the nearest 500)
The Econs*valid variable codes:
G = Electricity consumption invalid, greater than 25,000 (DECC lookup table says 50,000)
L = Electricity consumption invalid, less than 100
M = Electricity consumption data is missing in source dataset
V = Valid electricity consumption (between 100 and 25,000 inclusive)
(a quick sanity check of these validity codes is sketched just after the data is loaded, below)
*/
clear all
capture noisily log close _all
* written for Mac OS X - remember to change the filesystem delimiter for other platforms
local home "~/Documents"
local proot "`home'/Work/Data/Social Science Datatsets/DECC"
* for clam
* local proot "`home'/Work/NEED"
local dpath "`proot'/NEED/End User Licence File 2014/"
* NB this is the 2014 EULF we're using
local ifile "need_eul_may2014"
* original data file
local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'"
* loop over a list of sample percentages to create a set of random sub-samples
* smaller sub-samples make testing models etc easier and quicker
* 10 = 10% sample, 50 = 50% sample, 100 = 100% sample etc
local sample "10 50 100"
local sampleby "EE_BAND PROP_TYPE"
local version "v1.1"
* includes production of % samples which maintain the original dimensions used to
* produce the EULF samples: EE_BAND PROP_TYPE
*local version "v1"
set more off
log using "`dpath'/processed/process-NEED-EULF-2014-`version'-$S_DATE.smcl", replace name(main)
* use these locals to control what happens (set to 0 to skip the code)
* create codebook & some descriptives
local create_codebook = 1
* create wide form fixed file with (supposedly) unchanging data & a separate 'wide' consumption data file for cross-sectional analysis
local create_xwavefile = 1
* create long form file with wave (yearly) data - be careful, this takes a long time due to large memory use!
local create_longfile = 1
* load the original file
use "`dfile_orig'.dta", clear
if `create_codebook' {
* create original EULF codebook
* not much point running this for each % sample - note that the counts etc reported in this codebook refer to the full file so won't match the samples
log off main
log using "`dfile_orig'-codebook-$S_DATE.smcl", replace name(cb)
desc
codebook
log close cb
log on main
}
foreach s of local sample {
local samplet "`s'pc"
***** random sample ****
* sample drops observations permanently, so re-load the original file each time
* to ensure each sub-sample is drawn from the full dataset (not from the previous sample)
use "`dfile_orig'.dta", clear
* select a random sample but ensure proportions of sampleby are kept
di "* Keeping `s'% sample by `sampleby'"
sample `s', by(`sampleby')
tab `sampleby', mi
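* Illustrative check (sketch, not in the original script): report the cell proportions of
* the sampling dimensions so they can be compared with the full EULF to confirm that
* sample, by() has preserved them
tab `sampleby', cell nofreq mi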
if `create_xwavefile' {
* first create a wide consumption file (HH_ID + yearly Gcons/Econs variables)
* then create the xwave file with data that (notionally) doesn't change
preserve
keep HH_ID Gcons* Econs*
compress
save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
restore
preserve
drop Gcons* Econs*
* fix some mis-codings (or lack of coding of missing)
local vars "E7Flag2012 CWI LI BOILER"
foreach v of local vars {
destring `v', force replace
replace `v' = 0 if `v' !=1
label def `v' 0 "No or N/A" 1 "Yes"
label val `v' `v'
}
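* Illustrative check (sketch, not in the original script): confirm the recoded flags above
* are now strictly 0/1 with the "No or N/A"/"Yes" labels attached
tab1 `vars', mi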
* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
* recode category vars into strings to avoid confusion & enable 'unknown' to be retained in models etc (might matter)
replace FP_ENG = -99 if FP_ENG == 99
replace LOFT_DEPTH = -99 if LOFT_DEPTH == 99
* comes in as missing?
replace LI_YEAR = -99 if LI_YEAR == .
replace BOILER_YEAR = -99 if BOILER_YEAR == .
replace CWI_YEAR = -99 if CWI_YEAR == .
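* Sketch of the 'retain unknown as a visible category' idea mentioned above (an assumption,
* not implemented in the original script): attach or extend a value label so -99 displays
* as 'Unknown' in tables and models. Guarded by a toggle so it is skipped by default.
local label_unknowns = 0
if `label_unknowns' {
foreach v in FP_ENG LOFT_DEPTH LI_YEAR BOILER_YEAR CWI_YEAR {
* use the value label already attached to `v' if there is one, else create one
local lname : value label `v'
if "`lname'" == "" local lname "`v'_lbl"
* add -99 -> "Unknown" (modify creates the label if it does not already exist)
lab def `lname' -99 "Unknown", modify
lab val `v' `lname'
}
}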
tabstat IMD_WALES, by(REGION) s(mean min max n)
* there seem to be some Welsh LSOAs allocated to English GORs?
tabstat IMD_ENG, by(REGION) s(mean min max n)
* there seem to be some English LSOAs allocated to Wales?
tab FP_ENG REGION
* REGION holds ONS administrative region codes
* create a new variable with meaningful labels
gen ba_region = 1 if REGION == "E12000001"
replace ba_region = 2 if REGION == "E12000002"
replace ba_region = 3 if REGION == "E12000003"
replace ba_region = 4 if REGION == "E12000004"
replace ba_region = 5 if REGION == "E12000005"
replace ba_region = 6 if REGION == "E12000006"
replace ba_region = 7 if REGION == "E12000007"
replace ba_region = 8 if REGION == "E12000008"
replace ba_region = 9 if REGION == "E12000009"
replace ba_region = 10 if REGION == "W99999999"
lab var ba_region "former Govt Office region (labelled)"
* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
lab val ba_region ba_region
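* Illustrative sanity check (sketch, not in the original script): count any REGION codes
* not covered by the hard-coded mapping above - should be 0 if every code was matched
count if missing(ba_region)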
compress
log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
desc
di "** urban/rural would be helpful"
codebook
compress
save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
log close cb_xwave
log on main
restore
}
if `create_longfile' {
* create the long file with as few vars as possible (quicker)
* still takes a while...
preserve
keep HH_ID Gcons* Econs*
* panel vars:
* Gcons2005 Gcons2005Valid Econs2005 Econs2005Valid -> 2012
local vars "Gcons Econs"
foreach v of local vars {
di "* Renaming -> `v'"
foreach y of numlist 2005/2012 {
di "* -> `v' (`y')"
* put year on the end so reshape works
gen `v'Valid`y' = `v'`y'Valid
* remove old variable to save time in reshape & space
drop `v'`y'Valid
}
}
* this takes a LONG time for the full dataset
reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
rename _j year
* set as panel
xtset HH_ID year
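* Quick illustrative look at the reshaped panel layout (sketch, not in the original script):
* the first household should now have one row per year from 2005 to 2012
list HH_ID year Gcons GconsValid Econs EconsValid if HH_ID == HH_ID[1], noobs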
compress
save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'-long-`version'-`samplet'-$S_DATE.smcl", replace name(cbl)
desc
codebook
log close cbl
log on main
restore
* this leaves the current % sample in memory; the full file is re-loaded at the top of the next loop iteration
}
}
* done!
log close _all