From 15d12e2f48bb22ae985b321dcae9d21085c8ede3 Mon Sep 17 00:00:00 2001
From: Ben Anderson <b.anderson@soton.ac.uk>
Date: Wed, 29 Apr 2015 12:12:26 +0100
Subject: [PATCH] simplified extraction code

Only keep a basic core of variables, producing a smaller file, on the
assumption that it will be linked back to the source expenditure data
for specific analyses.
---
 ONS-UK-EFS-time-series-extract.do | 101 ++++++++++++++----------------
 1 file changed, 48 insertions(+), 53 deletions(-)

diff --git a/ONS-UK-EFS-time-series-extract.do b/ONS-UK-EFS-time-series-extract.do
index de319e4..64da14e 100755
--- a/ONS-UK-EFS-time-series-extract.do
+++ b/ONS-UK-EFS-time-series-extract.do
@@ -35,9 +35,11 @@ GNU General Public License for more details.
 
 * History
 * 15/8/2012 moved creation of ba_quarter etc to per-year processing
-
-* TO DO
-* update to 2012
+* 28/4/2015 - adding 2011 & 2012
+* 	LCFS database changes:
+* 		2010-2011 - 7272volume_h_changes_database_2011.xls
+* 		2011-2012 - 7472_volume_h_changes_database_2012.xls
+* 29/4/2015 - changed to only produce basic file on the assumption that this will be linked back to source expenditure data for specific analyses
 
 * NB - the script assumes a certain folder structure for the source EFS data like so:
 * `efsd'/<year>/stata/<datafile>.dta
@@ -52,7 +54,7 @@ local place = "/Users/ben/Documents/Work"
local efsd = "`place'/Data/Social Scie
 local logd = "`efsd'/log_files"
local outd = "`efsd'/processed"
 
 * Years to be extracted
-* 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010
+* 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012
 
 * NB: from 2006 the EFS is collected on a calendar year basis. 
 * This means that 2005-6_Q4 and 2006_Q1 are exactly the same cases
@@ -61,15 +63,18 @@ local logd = "`efsd'/log_files"
local outd = "`efsd'/processed"
 * To save time you can leave out years you have already processed
 * just paste the ones you want into the allyears local variable below
 
-local years "2001-2010" // just a label
-local all_years "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010" // years to process
local do_years = "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010"
+local years "2001-2012" // just a label
+* 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012
+local all_years "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012" // years to process
local do_years = "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012"
 
 * set to 1 to refresh each yearly extract you listed in do_years
 local do_extracts "1"
 
 * drop the first survey for the merge as it is loaded first
-local mergeyears = "2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010"
-
+local mergeyears = "2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012"
+
+* be sure to keep only these from the dv hh file - last ones needed for income equivalisation later
+local dvhh_keepvars = "incanon weight* a055 g018 g019 p116* p344* p389*"  	
 
 capture log close
 log using "`logd'/ONS-UK-EFS-time-series-extract-$S_DATE.smcl", replace
@@ -121,8 +126,8 @@ if `do_extracts' {
 		 
 		 * c_white_0	c_white_1	
		 * 0 "White HRP" 1 "Non-white HRP"	 
 		 
		 */
		
		*********
		di "* dv household file"
		use "`efsd'/`y'/stata/dvhh.dta", clear
		
-		* 2010 data = mixed/uppercase
-		if "`y'" === "2010" {
+		* 2010 onwards data = mixed/uppercase
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
 			rename *, lower	
 		}
		** sex of HRP
		gen c_gender = -1
		replace c_gender = 0 if sexhrp == 1
		replace c_gender = 1 if sexhrp == 2
		lab var c_gender "Constraint: Gender of HRP"
		lab def c_gender 0 "Male" 1 "Female"
		lab val c_gender c_gender
		
		** age of HRP
 		* need to use 75+ as few 80+ after 2001-2
		recode p396p (min/15= . ) (16/24 = 0) (25/34 = 1) (35/44 = 2) (45/54 = 3) ///
			(55/64 = 4) (65/74 = 5) (75/max = 6), gen(c_age)
		lab var c_age "Constraint: Age of HRP"
@@ -134,29 +139,24 @@ if `do_extracts' {
 		
 		* has 1, 2 and more than 2 - so needs recoding.
		recode c_nchild (0=0) (1=1) (2/max=2)
		lab var c_nchild "Constraint: number of children"
		lab define c_nchild 0 None 1 One 2 "Two or more"
		lab val c_nchild c_nchild
		
		** Sorting
		sort case
 		
-		* keep all variables and base final merge on it
-		
+		* keep only basic variables
+		keep case* c_* region `dvhh_keepvars'
 		qui: compress
		* save all vars
		save "`efsd'/`y'/stata/dvhh-temp.dta", replace
		
		******************************
		di "*  raw household file for c_comp and c_accom"
 		
 		* also to pick up electricity water payments periodicity etc for error analysis
		
		use "`efsd'/`y'/stata/rawhh.dta", clear
		
-		* 2010 data = mixed/uppercase
-		if "`y'" === "2010" {
+		* 2010 onwards data = mixed/uppercase
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
 			rename *, lower	
 		}

		* accomodation type
		* Object=
		* 0 Detached
		* 1 Semi
		* 2 Terrace
		* 3 flat/maisontte
		* 4 other
		gen ba_c_accom = -1
		replace ba_c_accom = 0 if hsetype == 1
		replace ba_c_accom = 1 if hsetype == 2
		replace ba_c_accom = 2 if hsetype == 3
 		if survyr > 2001 {
 			* grr, why can't var names stay the same?!
 			gen acomtype = accom 
-		}
		replace ba_c_accom = 3 if acomtype == 2
		replace ba_c_accom = 4 if acomtype == 3
		replace ba_c_accom = 4 if acomtype == 4
		
		lab var ba_c_accom "Constraint: accommodation type"
		lab define c_accom 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other"
		lab val ba_c_accom c_accom
		
		** Composition.
		* Object =
		* 0 'married/partnered'
		* 1 'single parent'
		* 2 'single person'
		* 3 'other'.
		
		* co-habiting.
		gen ba_c_comp=1 if numcpart>0
		* married.
		replace ba_c_comp=0 if nummpart>0 
		* single parent - assumes a single adult living with 1 or more
		* children is a single parent.
		replace ba_c_comp=1 if (numadult==1 & numchild>0)
		* single person.
		replace ba_c_comp=2 if (numadult==1 & numchild==0)
		* the rest - this is a cheat!
		recode ba_c_comp (missing=3)
		lab var ba_c_comp "Constraint: household composition"
		lab define c_comp 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other"
		lab val ba_c_comp c_comp
		tab ba_c_comp numcpart
		tab ba_c_comp nummpart
		
		
		* rawhh derived cable tv dummies
		* if tvtype = 3 = cable, if = 4 is cable + phone
		gen tvtype_2 = tvtype2
		gen tvtype_3 = tvtype3
		gen tvtype_4 = tvtype4
		* if exists - gen tvtype_5 = tvtype5
		egen ba_c_has_cable_rawhh = anymatch(tvtype_*), values(3 4)
		
		* check for tvtype(1) as well
		replace ba_c_has_cable_rawhh = 1 if tvtype == 3
		
		lab var ba_c_has_cable_rawhh "Sim: Has cable based on tvtype in rawhh"
-		
-		gen ba_calyear = -1
+		}
		replace ba_c_accom = 3 if acomtype == 2
		replace ba_c_accom = 4 if acomtype == 3
		replace ba_c_accom = 4 if acomtype == 4
		
		lab var ba_c_accom "Constraint: accommodation type"
		lab define c_accom 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other"
		lab val ba_c_accom c_accom
		
		** Composition.
		* Object =
		* 0 'married/partnered'
		* 1 'single parent'
		* 2 'single person'
		* 3 'other'.
		
		* co-habiting.
		gen ba_c_comp=1 if numcpart>0
		* married.
		replace ba_c_comp=0 if nummpart>0 
		* single parent - assumes a single adult living with 1 or more
		* children is a single parent.
		replace ba_c_comp=1 if (numadult==1 & numchild>0)
		* single person.
		replace ba_c_comp=2 if (numadult==1 & numchild==0)
		* the rest - this is a cheat!
		recode ba_c_comp (missing=3)
		lab var ba_c_comp "Constraint: household composition"
		lab define c_comp 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other"
		lab val ba_c_comp c_comp
		tab ba_c_comp numcpart
		tab ba_c_comp nummpart
		
		gen ba_calyear = -1
 		
-		local keepvars_orig "waterpay watermet elecpay eacamt eacper elecpayo dveac estndordamt estndord estndo_1 estndo_2 dvestndo dsselecf dsselecp dwpelecf dwpelecp dwpper"
-		local keepvars ""
-	
 		if "`y'" == "2001-2002" | "`y'" == "2002-2003" | "`y'" == "2003-2004" | "`y'" == "2004-2005" | "`y'" == "2005-2006" | "`y'" == "2006" | "`y'" == "2007" {
 			di "* Setting up cal year for `y'"
 			tab survyr sampyear, mi
-			replace ba_sampyear = sampyear
+			gen ba_sampyear = sampyear
 			* ba_calyear removed as was actually sample year
 			
 			* construct list of vars to keep based on ideal
@@ -171,18 +171,14 @@ if `do_extracts' {
 					*di in red "weight does not exist"
                }
             }
-			di "***"
-			di "* Want: `keepvars_orig'"
-			di "* Have: `keepvars'"
-			di "***"
-			keep case ba_* wsinc water* sewsep ctwat percwat percsew percwsew ctwat `keepvars'
+			keep case* ba_*
 			di "* Done setting up sample year for `y'"
 		}
 
-		if  "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" {
+		if  "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
 			di "* Setting up cal year for `y'"
 			* sampyear variable removed
-			replace ba_sampyear = survyr
+			gen ba_sampyear = survyr
 		    * construct list of vars to keep based on ideal
 			foreach v of local keepvars_orig {
 				capture confirm variable `v'
@@ -194,12 +190,8 @@ if `do_extracts' {
 					*di in red "weight does not exist"
                }
             }
-			di "***"	
-			di "* Want: `keepvars_orig'"
-			di "* Have: `keepvars'"
-			di "***"
 			lab var ba_sampyear "Sample year"
-			keep case ba_* wsinc water* sewsep ctwat percwat percsew percwsew ctwat `keepvars'
+			keep case* ba_*
 			di "* Done setting up sample year for `y'"
 		}
 				
@@ -215,7 +207,7 @@ if `do_extracts' {
 		use "`efsd'/`y'/stata/rawper.dta", clear
 		
 		* 2010 data = mixed/uppercase
-		if "`y'" === "2010" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
 			rename *, lower	
 		}
 		* keep HRPs
@@ -233,7 +225,7 @@ if `do_extracts' {
 			di "* Age error: `y'"
 			drop if dvage18 == 2
 		}
-		else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" {
+		else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
 			di "Removing HRPs where age < 16 for `y'"
 			drop if dvage_p < 16
 		}
@@ -242,12 +234,21 @@ if `do_extracts' {
 			drop if dvage < 16
 		}
 		
-		* ethnic detail
+		di "* ethnic detail"
 		
 		if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" {
+			di "* year = `y'"
 			gen c_ethnicd = eth01p
 		}
+		else if "`y'" == "2011" | "`y'" == "2012" {
+			di "* year = `y'"
+			gen c_ethnicd = ethep
+			replace c_ethnicd = ethwp if c_ethnicd == .
+			replace c_ethnicd = ethsp if c_ethnicd == .
+			replace c_ethnicd = ethnip if c_ethnicd == .
+		}
 		else {
+			di "* year = `y'"
 			* before 2008
 			gen c_ethnicd = ethnic_p
 		}
@@ -256,20 +257,20 @@ if `do_extracts' {
 		lab val c_ethnicd c_ethnicd
 	
 		* keep only new variables
-		keep case c_*
+		keep case* c_*
 		save "`efsd'/`y'/stata/rawper-temp.dta", replace
 		
 		************
		di "* Need dvper file to count n children aged under 14 (for OECD equivalisation weight)"
 		
 
		use "`efsd'/`y'/stata/dvper.dta", clear
		
 		* 2010 data = mixed/uppercase
-		if "`y'" === "2010" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
 			rename *, lower	
 		}

		* keep only hrps
		keep if a003 == 1
 		
 		* Ethnicity 
 		* NB a012 and a013 changed categories 2007->2008 to 0 = n/a, 1 = white, 2 = everyone else
-		* this still works, you can get detail from rawper.dta
		recode a012p (1=0) (2/max=1), gen(c_white)
		lab var c_white "Constraint: non-white HRP"
		lab def c_white 0 "White HRP" 1 "Non-white HRP"
		lab val c_white c_white
		* NB - 20 HRPs are not classified - they will show up as missing
		keep case c_white a010
+		* this still works, you can get detail from rawper.dta
		recode a012p (1=0) (2/max=1), gen(c_white)
		lab var c_white "Constraint: non-white HRP"
		lab def c_white 0 "White HRP" 1 "Non-white HRP"
		lab val c_white c_white
		* if HRPs are not classified - they will show up as missing
		keep case* c_white
 		
 		qui: compress
 		
		save "`efsd'/`y'/stata/dvper-temp.dta", replace
@@ -278,8 +279,8 @@ if `do_extracts' {
 		* and count the number of children of various ages
		
 		use "`efsd'/`y'/stata/dvper.dta", clear
 	
-		* 2010 data = mixed/uppercase
-		if "`y'" === "2010" {
+		* 2010 onwards: variable names are mixed/upper case, so lower-case them all
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
 			rename *, lower	
 		}
 		
@@ -309,7 +310,7 @@ if `do_extracts' {
 		lab var ba_under14 "Number of children aged under 14"
 
 		* keep only new variables
-		keep case ba_* c_*
+		keep case* ba_* c_*
 		qui: compress
 		save "`efsd'/`y'/stata/dvper-lli.dta", replace
 	
		
		di "*** MERGE ALL FILES for year = `y'"
@@ -334,7 +335,7 @@ if `do_extracts' {
 		* keep all (makes life easier below as some vars are in some years and not others) 
 		
 		***********************************************
-		* calculate OECD equivaisation weight in order to equivalise income or expenditure
+		* calculate OECD equivalisation weight in order to equivalise income or expenditure
 		* see e.g. DWP HBAI reports
 		* can then calculate n over 14 by substracting from g019 
 		
@@ -419,7 +420,7 @@ foreach y of local mergeyears {
 
 tabstat c_*, c(s) s(mean min max)
 		
-lab var year "Calendar year"
lab var survey_year "EFS/FES year"
+lab var survey_year "EFS/FES year"
 
 * finally check for duplicate months 2005-6 -> 2006 samples
 tab ba_calyear ba_month
@@ -436,17 +437,9 @@ tab ba_calyear ba_month
 di "Compressing"
 qui: compress 
 
-* drop fs* as only exist in 2005-6
-drop fs*
  
 aorder
 
-* full version
-save "`outd'/EFS-`years'-extract-BA.dta", replace
-
-* use this version to match to case in older files for specific variable input
-keep case survey_year year gorx incanon c_* ba_* weight*
-
 gen caseno = case
 
 if `do_extracts' {
@@ -457,7 +450,9 @@ else
 	di "do_extracts = `do_extracts', years not extracted so individual files not refreshed"
 }
 
-su c_*
+
+* sanity check: summarise the core constraint variables by sample year
+tabstat c_ncars c_nchild c_npersons c_nearners c_nrooms,  by(ba_sampyear)
 
 save "`outd'/EFS-`years'-extract-reduced-BA.dta", replace
 
di "Job ended at $S_DATE"
-- 
GitLab