From fa52d4d8fdb72b059427a9ed82da55fd4b782b3f Mon Sep 17 00:00:00 2001
From: Ben Anderson <b.anderson@soton.ac.uk>
Date: Mon, 20 Jul 2015 11:10:22 +0100
Subject: [PATCH] updated extraction script to include 2013

---
 ONS-UK-EFS-LCFS-time-series-extract.do |  25 +-
 sublime2stata.do                       | 733 +++++++++++++++++++++++++
 2 files changed, 746 insertions(+), 12 deletions(-)
 create mode 100644 sublime2stata.do

diff --git a/ONS-UK-EFS-LCFS-time-series-extract.do b/ONS-UK-EFS-LCFS-time-series-extract.do
index 70025b8..3a84046 100755
--- a/ONS-UK-EFS-LCFS-time-series-extract.do
+++ b/ONS-UK-EFS-LCFS-time-series-extract.do
@@ -62,12 +62,12 @@ local outd = "`efsd'/processed"
 * `efsd'/processed/
 ***********************
 
-local extract_years "2001-2012" // just a name for the FINAL extracted file
+local extract_years "2001-2013" // just a name for the FINAL extracted file
 
 * To save time you can leave out years you have already processed
 * just paste the ones you want into the do_years local variable below
 * choose any of 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012 
-local do_years = "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012" // years to process
+local do_years = "2013" // years to process
 
 * set to 1 to refresh each yearly extract you listed in do_years & append the files
 * set to 0 to just append previously extracted files
@@ -174,7 +174,7 @@ if `do_extracts' {
 		use "`efsd'/`y'/stata/dvhh.dta", clear
 		
 		* 2010 onwards data = mixed/uppercase
-		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			rename *, lower	
 		}
 		** sex of HRP
@@ -298,11 +298,12 @@ if `do_extracts' {
 		* has 1, 2 and more than 2 - so needs recoding.
 		recode c_nchild (0=0) (1=1) (2/max=2)
 		lab var c_nchild "Constraint: number of children"
-		lab define c_nchild 0 None 1 One 2 "Two or more"
+		lab define c_nchild 0 "None" 1 "One" 2 "Two or more"
 		lab val c_nchild c_nchild
 		
+		di "Year = `y'"
 		gen ba_year = `y'
-		
+
 		* construct list of vars to keep
 		* if dvhh_keepvars is empty STATA will skip
 		local keepvars = ""
@@ -345,7 +346,7 @@ if `do_extracts' {
 		use "`efsd'/`y'/stata/rawhh.dta", clear
 		
 		* 2010 onwards data = mixed/uppercase
-		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			rename *, lower	
 		}
 
@@ -404,7 +405,7 @@ if `do_extracts' {
 			lab var ba_sampyear "Sample year"
 		}
 
-		if  "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		if  "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			di "* Setting up cal year for `y'"
 			* sampyear variable removed
 			* tab survyr sampyear, mi
@@ -444,7 +445,7 @@ if `do_extracts' {
 		use "`efsd'/`y'/stata/rawper.dta", clear
 		
 		* 2010 data = mixed/uppercase
-		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			rename *, lower	
 		}
 		* keep HRPs
@@ -462,7 +463,7 @@ if `do_extracts' {
 			di "* Age error: `y'"
 			drop if dvage18 == 2
 		}
-		else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			di "Removing HRPs where age < 16 for `y'"
 			drop if dvage_p < 16
 		}
@@ -477,7 +478,7 @@ if `do_extracts' {
 			di "* year = `y'"
 			gen c_ethnicd = eth01p
 		}
-		else if "`y'" == "2011" | "`y'" == "2012" {
+		else if "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			di "* year = `y'"
 			gen c_ethnicd = ethep
 			replace c_ethnicd = ethwp if c_ethnicd == .
@@ -519,7 +520,7 @@ if `do_extracts' {
 		use "`efsd'/`y'/stata/dvper.dta", clear
 		
 		* 2010 data = mixed/uppercase
-		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			rename *, lower	
 		}
 
@@ -562,7 +563,7 @@ if `do_extracts' {
 		use "`efsd'/`y'/stata/dvper.dta", clear
 	
 		* 2010 onwards data = mixed/uppercase
-		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2012" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			rename *, lower	
 		}
 		
diff --git a/sublime2stata.do b/sublime2stata.do
new file mode 100644
index 0000000..6dce73d
--- /dev/null
+++ b/sublime2stata.do
@@ -0,0 +1,733 @@
+
+
+* run without waiting for user
+set more off
+
+* set these to what works for you
+local place = "~/Documents/Work"
+local efsd = "`place'/Data/Social Science Datatsets/Expenditure and Food Survey"
+local logd = "`efsd'/log_files"
+local outd = "`efsd'/processed"
+
+************************
+* NB - the script assumes a certain folder structure for the source EFS/LCFS data like so:
+* `efsd'/<year>/stata/<datafile>.dta
+* You may have to rename some of the downloaded & unzipped UKDA data folders to make this work
+* The script also assumes that this folder exists for the final results:
+* `efsd'/processed/
+***********************
+
+local extract_years "2001-2013" 
+
+* To save time you can leave out years you have already processed
+* just paste the ones you want into the do_years local variable below
+* choose any of 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012 
+local do_years = "2013" 
+
+* set to 1 to refresh each yearly extract you listed in do_years & append the files
+* set to 0 to just append previously extracted files
+local do_extracts 1
+
+************************
+* Set the variables to be extracted here
+* there is error checking below to make sure that they exist in each year or to skip if not
+* put case in each keep var if otherwise empty
+
+* -> dvhh file
+* basic weighting & consumption expenditure codes
+* p60*t p61*t p630* 
+local dvhh_keepvars = ""
+* needed for income equivalisation later
+local dvhh_keepvars = "`dvhh_keepvars' incanon a055 g018 g019 p116* p344* p389* p396*"  	
+* DEMAND 2.3 (older people mobile lives)
+*local dvhh_keepvars = "`dvhh_keepvars' b480 b481 b485 cc5413t c73312t" 
+* DEMAND 3.1 (adapting infrastructures)
+*  a1701 a1711
+local dvhh_keepvars = "`dvhh_keepvars' a103 a108 a128 a130 a15* a16*" 
+
+* -> dvper file
+local dvper_keepvars = "case"
+
+* -> rawhh file
+* DEMAND 2.3 (older people mobile lives)
+local rawhh_keepvars = "flydes*" 
+
+* -> rawper file
+local rawper_keepvars = "case"
+************************
+
+************************
+* set logging
+capture log close
+log using "`logd'/ONS-UK-EFS-time-series-extract-$S_DATE.smcl", replace
+clear all
+
+* increase default number of variables allowed
+set maxvar 10000, perm
+
+di "*******************************************************"
+di "* 	This script will process all of the EFS files for:"
+di "*	`do_years'" // the list the extraction loop iterates over (local years was never defined)
+di "*	It will keep the following variables:"
+di "*	dvhh: `dvhh_keepvars' "
+di "*	rawhh: `rawhh_keepvars'"
+di "*	This could take some time. "
+di "*	I suggest you check it is running and then get a cup of tea...."
+di "*******************************************************"
+if `do_extracts' {
+	di "do_extracts = `do_extracts', all years (`do_years') to be extracted and refreshed"
+
+	foreach y of local do_years {
+		di "* * * * * * * * * "
+		di "* -> Processing `y'"
+		/* census vars/labels
+	
+		 * c_accom_0 c_accom_1 c_accom_2 c_accom_3 c_accom_4
+		 * 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other"
+	
+		 * c_age_0	c_age_1	c_age_2	c_age_3	c_age_4	c_age_5	c_age_6	c_age_7
+		 * 0 "16-24" 1 "25-34" 2 "35-44" 3 "45-54" 4 "55-64" 5 "65-74" 6 "75+" 
+	
+		 * c_comp_0 c_comp_1 c_comp_2 c_comp_3 
+		 * 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other"
+		 
+	 	 * c_lli_0	c_lli_1
+		 * 0 "No person with lli" 1 "At least 1 person with lli"
+	
+		 * c_ncars_0 c_ncars_1 c_ncars_2
+		 * 0 None 1 One 2 "Two or more"
+		 
+		 * c_nchild_0 c_nchild_1 c_nchild_2
+		 * 0 None 1 One 2 "Two or more"
+	
+		 * c_nearners_0	c_nearners_1	c_nearners_2	c_nearners_3	
+		 * 0 "0" 1 "1" 2 "2" 3 "3+"
+		 
+		  * c_npersons_0	c_npersons_1	c_npersons_2	c_npersons_3	c_npersons_4	
+		 * 0 "1" 1 "2" 2 "3" 3 "4" 4 "5+"
+		 
+		 * c_nrooms_0	c_nrooms_1	c_nrooms_2 c_nrooms_3	
+		 * 0 "1" 1 "2" 2 "3" 3 "4" 4 "5+" 
+	
+		 * c_empl_0 c_empl_1 c_empl_2 c_empl_3 c_empl_4
+		 * 0 "NS-SEC 1" 1 "NS-SEC 2" 2 "NS-SEC 3" 3 "Inactive" 4 "Retired"
+	
+	 	 * c_gender_0 c_gender_1
+		 * 0 "Male" 1 "Female"
+		
+		 * c_tenure_0	c_tenure_1	c_tenure_2	c_tenure_3 
+		 * 0 "Owned" 1 "Rent from council" 2 "Social rent" 3 "Private rent incl rent free"
+		 
+		 * c_white_0	c_white_1	
+		 * 0 "White HRP" 1 "Non-white HRP"	 
+		 
+		 */
+		
+		******************************
+		* dvhh
+		di "* dv household file"
+		use "`efsd'/`y'/stata/dvhh.dta", clear
+		
+		* 2010 onwards data = mixed/uppercase; lower-case the names so the references below resolve
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			rename *, lower	
+		}
+		** sex of HRP
+		gen c_gender = -1
+		replace c_gender = 0 if sexhrp == 1
+		replace c_gender = 1 if sexhrp == 2
+		lab var c_gender "Constraint: Gender of HRP"
+		lab def c_gender 0 "Male" 1 "Female"
+		lab val c_gender c_gender
+		
+		** age of HRP
+		* need to use 75+ as few 80+ after 2001-2
+		recode p396p (min/15= . ) (16/24 = 0) (25/34 = 1) (35/44 = 2) (45/54 = 3)  			(55/64 = 4) (65/74 = 5) (75/max = 6), gen(c_age)
+		lab var c_age "Constraint: Age of HRP"
+		* NB for NI need to change these as Census categories are different. Why why why!?
+		label define c_age  			0 "16-24"  			1 "25-34"  			2 "35-44"  			3 "45-54"  			4 "55-64"  			5 "65-74"  			6 "75+" 
+			
+		lab val c_age c_age
+		
+		** number of rooms
+		*1,2,3,4+
+		recode a114 (1=0) (2=1) (3=2) (4=3) (5/max=4), gen(c_nrooms)
+		lab var c_nrooms "Constraint: number of rooms"
+		lab def c_nrooms 0 "1" 1 "2" 2 "3" 3 "4" 4 "5+" 
+		lab val c_nrooms c_nrooms
+	
+		** Number of residents per household
+		recode a049 (1=0) (2=1) (3=2) (4=3) (5/max=4), gen(c_npersons)
+		lab var c_npersons "Constraint: number of persons in household (all ages)"
+		lab def c_npersons 0 "1" 1 "2" 2 "3" 3 "4" 4 "5+"
+		lab val c_npersons c_npersons
+		
+		** Number of earners
+		recode a054 (0=0) (1=1) (2=2) (3/max=3), gen(c_nearners)
+		lab var c_nearners "Constraint: number of earners in household"
+		lab def c_nearners 0 "0" 1 "1" 2 "2" 3 "3+"
+		lab val c_nearners c_nearners
+		
+		** Cars and vans.
+		* Object=
+		* 0 None
+		* 1 One
+		* 2 'Two or more'.
+		 
+		*has 1-12 so need to recode.
+		
+		recode a124 (0=0) (1=1) (2/12=2), gen(c_ncars)
+		lab var c_ncars "Constraint: cars and vans"
+		lab define c_ncars 0 None 1 One 2 "Two or more"
+		lab val c_ncars c_ncars 
+		*tab a124 c_cars
+		
+		** Tenure.
+		* Object =
+		* 0 'Owned'
+		* 1 'Rent from council'
+		* 2 'Social rent'
+		* 3 'Private rent' - incl rent-free
+		
+		*use a121.
+		recode a121 (5/7=0) (1=1) (2=2) (3/4 8=3), gen(c_tenure)
+		lab var c_tenure "Constraint: tenure"
+		lab define c_tenure 0 "Owned" 1 "Rent from council" 2 "Social rent" 3 "Private rent incl rent free"
+		lab val c_tenure c_tenure
+		*tab a121 c_tenure
+		
+		** employment status.
+		* Object =
+		* 0 'NS-SEC 1'
+		* 1 'NS-SEC 2'
+		* 2 'NS-SEC 3'
+		* 3 'Inactive'
+		* 4 'Retired'.
+		
+		* need to combine these - a093 = activity, a094 = NS-SEC.
+		* ref ONS website.
+		
+		* need to put a094 = 9,10,11 ('Never worked and long term unemployed',students, not stated) into 'inactive'
+		recode a094 (0/2=0) (3/4=1) (5/8=2) (9/12=3), gen(c_empl)
+		* the crosstab of a094 against a093 shows that some who are coded as
+		* retired/unoccupied (a093=4/5) have an NS-SEC code as they are recently
+		* retired/unoccupied (?). In this case we use the activity code not the NS-SEC code.
+		
+		replace c_empl=3 if a093==7
+		replace c_empl=4 if a093==6
+		
+		lab var c_empl "Constraint: employment status of HRP"
+		lab define c_empl 0 "NS-SEC 1" 1 "NS-SEC 2" 2 "NS-SEC 3" 3 "Inactive" 4 "Retired"
+		lab val c_empl c_empl
+		*tab a093 c_empl 
+		*tab a094 c_empl
+				
+		** Region.
+		* use gorx.
+		
+		gen region = gorx
+		label define region 	1    "North East"	2    "North West & Merseyside"	3    "Yorkshire and the Humber"  			4    "East Midlands" 	5    "West Midlands" 	6    "Eastern"	7    "London" 	8    "South East"  			9    "South West"	10    "Wales"	11    "Scotland" 	12    "Northern Ireland"
+		lab var region "Govt. Office Region"
+		lab val region region
+		
+		
+		** Number of children - 16 or younger.
+		*Object =
+		* 0 0
+		* 1 1
+		* 2 2+.
+		gen c_nchild = a040+a041+a042
+		
+		* could use g019?
+		
+		* has 1, 2 and more than 2 - so needs recoding.
+		recode c_nchild (0=0) (1=1) (2/max=2)
+		lab var c_nchild "Constraint: number of children"
+		lab define c_nchild 0 None 1 One 2 "Two or more"
+		lab val c_nchild c_nchild
+		* a split year such as 2001-2002 would be evaluated as a subtraction (= -1): keep the first 4 digits
+		gen ba_year = real(substr("`y'", 1, 4))
+		
+		* construct list of vars to keep
+		* if dvhh_keepvars is empty STATA will skip
+		local keepvars = ""
+		foreach v of local dvhh_keepvars {
+			di "* Testing for existence of `v'"
+			capture confirm variable `v'
+			if !_rc {
+				di "* found `v'"
+				local keepvars = "`keepvars' `v'"
+            	}
+			else {
+				* exact string not found as a variable, could be because it needs expansion
+				* try as a varlist (forces expansion)
+				di "* Not found, expanding `v'"
+				capture noisily {
+					* if variable really doesn't exist this throws an exception & moves on
+					foreach vt of varlist `v' {	
+						capture confirm variable `vt'
+						if !_rc {
+							di "* -> found `vt'"
+							local keepvars = "`keepvars' `vt'"
+            			}
+             		}
+            	}
+            }
+        }
+        
+		di "* dvhh: keeping case* ba_* c_* region weight* `keepvars'"
+		keep case* ba_* c_* region weight* `keepvars'
+		qui: compress
+		* save kept dvhh vars
+		save "`efsd'/`y'/stata/dvhh-temp.dta", replace
+		******************************
+		
+		******************************
+		* rawhh
+		di "* raw household file for: c_comp and c_accom"
+		di "* also to pick up: `rawhh_keepvars'"
+		
+		use "`efsd'/`y'/stata/rawhh.dta", clear
+		
+		* 2010 onwards data = mixed/uppercase; lower-case the names so the references below resolve
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			rename *, lower	
+		}
+
+		* accommodation type
+		* Object=
+		* 0 Detached
+		* 1 Semi
+		* 2 Terrace
+		* 3 flat/maisonette
+		* 4 other
+		gen ba_c_accom = -1
+		replace ba_c_accom = 0 if hsetype == 1
+		replace ba_c_accom = 1 if hsetype == 2
+		replace ba_c_accom = 2 if hsetype == 3
+		* grr, why can't var names stay the same?! acomtype is called accom after 2001
+		* test for the variable itself: -if survyr > 2001- only looks at the first observation
+		capture confirm variable acomtype
+		if _rc gen acomtype = accom
+		replace ba_c_accom = 3 if acomtype == 2
+		replace ba_c_accom = 4 if acomtype == 3
+		replace ba_c_accom = 4 if acomtype == 4
+		
+		lab var ba_c_accom "Constraint: accommodation type"
+		lab define c_accom 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other"
+		lab val ba_c_accom c_accom
+		
+		** Composition.
+		* Object =
+		* 0 'married/partnered'
+		* 1 'single parent'
+		* 2 'single person'
+		* 3 'other'.
+		
+		* co-habiting.
+		gen ba_c_comp=1 if numcpart>0
+		* married.
+		replace ba_c_comp=0 if nummpart>0 
+		* single parent - assumes a single adult living with 1 or more
+		* children is a single parent.
+		replace ba_c_comp=1 if (numadult==1 & numchild>0)
+		* single person.
+		replace ba_c_comp=2 if (numadult==1 & numchild==0)
+		* the rest - this is a cheat!
+		recode ba_c_comp (missing=3)
+		lab var ba_c_comp "Constraint: household composition"
+		lab define c_comp 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other"
+		lab val ba_c_comp c_comp
+		tab ba_c_comp numcpart
+		tab ba_c_comp nummpart
+		
+		
+		if "`y'" == "2001-2002" | "`y'" == "2002-2003" | "`y'" == "2003-2004" | "`y'" == "2004-2005" | "`y'" == "2005-2006" | "`y'" == "2006" | "`y'" == "2007" {
+			di "* Setting up cal year for `y'"
+			* tab survyr sampyear, mi
+			gen ba_sampyear = sampyear
+			lab var ba_sampyear "Sample year"
+		}
+
+		if  "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			di "* Setting up cal year for `y'"
+			* sampyear variable removed from these files, so survyr is used instead
+			* tab survyr sampyear, mi
+			gen ba_sampyear = survyr
+			lab var ba_sampyear "Sample year"
+		}
+		if  "`y'" == "2012" {
+			di "* fixing issue with multiple flights varnames differently for `y'"
+			renpfix flydes_ flydest1
+		} 
+		* construct list of vars to keep
+		* if rawhh_keepvars is empty STATA will skip; unab expands wildcards (e.g. flydes*) and fails quietly if nothing matches
+		local keepvars ""
+		foreach v of local rawhh_keepvars {
+			capture unab matched : `v'
+			if !_rc {
+				* di "* found `matched'"
+				local keepvars `keepvars' `matched'
+            	}
+			else {
+				di in red "`v' does not exist in `y' - will be missing"
+            }
+        }
+
+		di "* rawhh: keeping case* ba_* c_* `keepvars'"
+		renpfix ba_c_ c_
+		keep case* ba_* c_* `keepvars'		
+		
+		qui: compress
+
+		save "`efsd'/`y'/stata/rawhh-temp.dta", replace
+		******************************
+				
+		******************************
+		* rawper
+		di "* Need rawper file for ethnicity detail"
+		use "`efsd'/`y'/stata/rawper.dta", clear
+		
+		* 2010 onwards data = mixed/uppercase; lower-case the names so the references below resolve
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			rename *, lower	
+		}
+		* keep HRPs
+		
+		tab hrpid, nol
+		
+		keep if hrpid == 1
+		duplicates tag case, gen(tag)
+		
+		di "* -> Any duplicates in `y' ?"
+		
+		li case person hrpid sex dvage* if tag == 1
+		
+		if "`y'" == "2005-2006" | "`y'" == "2006-2007" {
+			di "* Age error: `y'"
+			drop if dvage18 == 2
+		}
+		else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			di "Removing HRPs where age < 16 for `y'"
+			drop if dvage_p < 16
+		}
+		else {
+			di "Removing HRPs where age < 16 for `y'"
+			drop if dvage < 16
+		}
+		
+		di "* ethnic detail"
+		
+		if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" {
+			di "* year = `y'"
+			gen c_ethnicd = eth01p
+		}
+		else if "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			di "* year = `y'"
+			gen c_ethnicd = ethep
+			replace c_ethnicd = ethwp if c_ethnicd == .
+			replace c_ethnicd = ethsp if c_ethnicd == .
+			replace c_ethnicd = ethnip if c_ethnicd == .
+		}
+		else {
+			di "* year = `y'"
+			* before 2008
+			gen c_ethnicd = ethnic_p
+		}
+		lab var c_ethnicd "Detailed ethnic group"
+		lab def c_ethnicd 0 "Missing/inapplicable" 1 "White" 2 "Mixed" 3 "Asian" 4 "Black" 5 "Other"
+		lab val c_ethnicd c_ethnicd
+	
+		* construct list of vars to keep
+		* if rawper_keepvars is empty STATA will skip; unab expands wildcards and fails quietly if nothing matches
+		local keepvars ""
+		foreach v of local rawper_keepvars {
+			capture unab matched : `v'
+			if !_rc {
+				* di "* found `matched'"
+				local keepvars `keepvars' `matched'
+            	}
+			else {
+				di in red "`v' does not exist in `y' - will be missing"
+            }
+        }
+
+		* keep rawper variables
+		keep case* c_* `keepvars'
+		save "`efsd'/`y'/stata/rawper-temp.dta", replace
+		******************************
+		
+		******************************
+		* dvper
+		di "* Need dvper file to count n children aged under 14 (for OECD equivalisation weight)"
+		
+		use "`efsd'/`y'/stata/dvper.dta", clear
+		
+		* 2010 onwards data = mixed/uppercase; lower-case the names so the references below resolve
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			rename *, lower	
+		}
+
+		* keep only hrps
+		keep if a003 == 1
+		
+		* Ethnicity 
+		* NB a012 and a013 changed categories 2007->2008 to 0 = n/a, 1 = white, 2 = everyone else
+		* this still works, you can get detail from rawper.dta
+		recode a012p (1=0) (2/max=1), gen(c_white)
+		lab var c_white "Constraint: non-white HRP"
+		lab def c_white 0 "White HRP" 1 "Non-white HRP"
+		lab val c_white c_white
+		* if HRPs are not classified - they will show up as missing
+		
+		* construct list of vars to keep
+		* if dvper_keepvars is empty STATA will skip; unab expands wildcards and fails quietly if nothing matches
+		local keepvars ""
+		foreach v of local dvper_keepvars {
+			capture unab matched : `v'
+			if !_rc {
+				* di "* found `matched'"
+				local keepvars `keepvars' `matched'
+            	}
+			else {
+				di in red "`v' does not exist in `y' - will be missing"
+            }
+        }
+		keep case* c_white `keepvars'
+		
+		qui: compress
+		
+		save "`efsd'/`y'/stata/dvper-temp.dta", replace
+		******************************
+		
+		************
+		* Now c_lli but this time need to collapse it so we count the number in the household with/out lli
+		* and count the number of children of various ages
+		
+		use "`efsd'/`y'/stata/dvper.dta", clear
+	
+		* 2010 onwards data = mixed/uppercase; lower-case the names so the references below resolve
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			rename *, lower	
+		}
+		
+		* assume if someone is eligible for incapacity benefit then = lli
+		* even if not actually receiving
+		* in rawper dvincap = a + b
+		* in dvper a227 (1 = rec) = a a227 (2=not rec) = b
+		* ought to check prevalence with FRS
+		gen c_lli_sum = 0
+		replace c_lli_sum = 1 if a227 > 0
+		
+		**********
+		* need to count children of ages < 14 for OECD equivalence scale
+		* can then calculate n over 14 by subtracting from (a040 + a041 + a042)
+		
+		gen ba_under14 = 0
+		replace ba_under14 = 1 if a005p  < 14
+		
+		* collapse to count the incidence of lli & count the number of children in each age group
+		* case is the household id
+		collapse (sum) c_lli_sum ba_*, by(case)
+		gen c_lli = 0
+		replace c_lli = 1 if c_lli_sum > 0
+		drop c_lli_sum
+		lab var c_lli "Constraint: presence of LLI"
+		lab def c_lli 0 "No person with lli" 1 "At least 1 person with lli"
+		lab var ba_under14 "Number of children aged under 14"
+
+		* keep only new variables as we've collapsed to hh
+		keep case* ba_* c_*
+		qui: compress
+		save "`efsd'/`y'/stata/dvper-lli.dta", replace
+	
+		
+		di "*** MERGE ALL FILES for year = `y'"
+		
+		*
+			
+		qui: merge case using  			"`efsd'/`y'/stata/dvhh-temp.dta"  			"`efsd'/`y'/stata/rawper-temp.dta"  			"`efsd'/`y'/stata/dvper-temp.dta"  			"`efsd'/`y'/stata/dvper-lli.dta"  			"`efsd'/`y'/stata/rawhh-temp.dta" , sort
+		
+		su _merge*
+			
+		* DELETE TEMPORARY FILES
+		erase "`efsd'/`y'/stata/dvhh-temp.dta"
+		erase "`efsd'/`y'/stata/rawper-temp.dta"
+		erase "`efsd'/`y'/stata/dvper-temp.dta"
+		erase "`efsd'/`y'/stata/dvper-lli.dta"
+		erase "`efsd'/`y'/stata/rawhh-temp.dta"
+		
+		gen survey_year = "`y'"
+		tab survey_year
+			
+		aorder
+		* keep all (makes life easier below as some vars are in some years and not others) 
+		
+		***********************************************
+		* calculate OECD equivalisation weight in order to equivalise income or expenditure
+		* see e.g. DWP HBAI reports
+		* can then calculate n over 14 by subtracting from g019 
+		
+		gen ba_over14 = 0
+		replace ba_over14 = g019 - ba_under14
+		
+		gen ba_adults = g018 if g018 > 0
+		* ignore hhs with no adults (how many are there?)
+		
+		/*
+		1st adult = .67
+		spouse = .33
+		other 2nd adult = .33
+		3rd adult = .33
+		subsequent adults = .33
+		children aged < 14 = .2
+		children aged 14+ = .33
+		*/
+		* catch hh with no children
+		replace ba_under14 = 0 if ba_under14 == .
+		replace ba_over14 = 0 if ba_over14 == .
+		
+		gen oecd_equivbhcwt = 0.67 if ba_adults >= 1
+		replace oecd_equivbhcwt = oecd_equivbhcwt + ((ba_adults-1) * 0.33) + (ba_under14 * 0.2) + (ba_over14 * 0.33)
+		di "*-> Calculating equiv income (OECD) and quartiles/deciles"
+		
+		* p344, p389 & p396 changed to *p after 2006 and top coded (!)
+		if `y' > 2005 {
+			rename p344p p344
+			rename p389p p389
+			rename p396p p396
+		}
+		
+		gen equiv_p344 = p344/oecd_equivbhcwt
+		gen equiv_p389bhc = p389/oecd_equivbhcwt
+		gen equiv_p389ahc = (p389-p116t)/oecd_equivbhcwt
+		lab var equiv_p344 "Equivalised normal gross household income (OECD)"
+		lab var equiv_p389bhc "Equivalised normal disposable (net) household income (BHC, OECD)"
+		lab var equiv_p389ahc "Equivalised normal disposable (net) household income (AHC, OECD)"
+		
+		local incomes "incanon p344 p389"
+		local incanonl "anonymised hhold inc + allowances"
+		local p344l "gross normal weekly household income"
+		local p389l "normal weekly disposable hhld income"
+		foreach i of local incomes {
+			egen `i'_dec = cut(`i'), group(10)
+			lab var `i'_dec "Deciles: ``i'l'"
+			egen `i'_quart = cut(`i'), group(4)
+			lab var `i'_quart "Quartiles: ``i'l'"
+		}
+
+		* quarter labels changed in 2006
+		
+		* old:
+		* 	1	april to june
+		* 	2	june to september
+		* 	3	october to december
+		* 	4	january to march
+		
+		* new:
+		* a099:
+		*   1 january to march
+		*   2 april to june
+		*   3 july to september
+		*   4 october to december
+	
+		gen ba_month = a055
+		
+		* derive the HRP's year of birth and a decade birth cohort
+		* remember that after 2005 age is top coded to 80
+		* year of birth
+		gen ba_birthyear = ba_sampyear - p396
+		* decade cohorts; the full variable name is used so this does not depend on variable abbreviation
+		recode ba_birthyear (1900/1909=1 "1900-1909") (1910/1919=2 "1910-1919") (1920/1929=3 "1920-1929") (1930/1939=4 "1930-1939") (1940/1949=5 "1940-1949") (1950/1959=6 "1950-1959") (1960/1969=7 "1960-1969") (1970/1979=8 "1970-1979") (1980/1989=9 "1980-1989") (1990/1999=10 "1990-1999") (2000/2009=11 "2000-2009"), gen(ba_birth_cohort)
+			
+		* tab ba_birth_cohort c_age, mi
+	
+		***********************
+		* End of per-year processing
+		
+		gen ba_quarter = -1 // stays -1 when ba_month is none of 1-12
+		replace ba_quarter = 1 if inlist(ba_month, 1, 2, 3)
+		replace ba_quarter = 2 if inlist(ba_month, 4, 5, 6)
+		replace ba_quarter = 3 if inlist(ba_month, 7, 8, 9)
+		replace ba_quarter = 4 if inlist(ba_month, 10, 11, 12)
+		
+		tab ba_quarter
+		
+		egen ba_sampyear_quarter = concat(ba_sampyear ba_quarter), punct("_Q")
+		lab var ba_sampyear_quarter "EFS/FES calendar year & quarter"
+		
+		egen ba_sampyear_month = concat(ba_sampyear ba_month), punct("_")
+		lab var ba_sampyear_month "EFS/FES calendar year & month"
+		
+		gen survey_name = "efs"
+
+		gen uk_country = 1 if region > 0 & region < 10
+		replace uk_country = 2 if region == 10
+		replace uk_country = 3 if region == 11
+		replace uk_country = 4 if region == 12
+		lab def uk_country 1 "England" 2 "Wales" 3 "Scotland" 4 "Northern Ireland"
+		lab val uk_country uk_country
+
+		
+		qui: compress
+		save "`outd'/EFS-`y'-extract-BA.dta", replace
+	}
+}
+
+****************************
+* now merge them all into one big file
+
+clear // start with nothing
+
+foreach y of local do_years {
+	di "Appending `y'"
+	qui: append using "`outd'/EFS-`y'-extract-BA.dta", force
+	*erase "`efsd'/`y'/FES-`y'-extract-BA.dta"
+}
+
+* the above code 
+tabstat c_*, c(s) s(mean min max)
+		
+lab var survey_year "EFS/FES year"
+
+* finally check for duplicate months 2005-6 -> 2006 samples
+tab ba_sampyear ba_month
+tab survey_year ba_month
+
+* Jan/Feb/Mar 2006 are duplicates of Jan/Feb/Mar 2005-6
+* so the copies that came in with the 2006 survey year are removed,
+* all three months in a single pass
+drop if survey_year == "2006" & inlist(ba_month, 1, 2, 3)
+
+* check
+tab ba_sampyear ba_month
+
+
+gen caseno = case
+
+* to test
+tabstat c_ncars c_nchild c_npersons c_nearners c_nrooms,  by(ba_sampyear)
+
+drop _merge* a055 g* p116t caseno
+
+di "*-> Compressing"
+qui: compress 
+
+aorder
+
+order case* survey*
+
+save "`outd'/EFS-`extract_years'-extract-BA.dta", replace
+
+di "************************************************************************"
+if `do_extracts' {
+	di "*-> do_extracts = `do_extracts', all years (`do_years') extracted and refreshed"
+}
+else {
+	* the open brace has to sit on the same line as -else-
+	di "*-> do_extracts = `do_extracts', years not extracted so individual files not refreshed"
+}
+di "*-> Job ended at $S_DATE"
+
+log close
+
-- 
GitLab