From 15d12e2f48bb22ae985b321dcae9d21085c8ede3 Mon Sep 17 00:00:00 2001 From: Ben Anderson <b.anderson@soton.ac.uk> Date: Wed, 29 Apr 2015 12:12:26 +0100 Subject: [PATCH] simplified extraction code only keeps a basic core of variables to produce a smaller file on the assumption that this will be linked back to source expenditure data for specific analyses --- ONS-UK-EFS-time-series-extract.do | 101 ++++++++++++++---------------- 1 file changed, 48 insertions(+), 53 deletions(-) diff --git a/ONS-UK-EFS-time-series-extract.do b/ONS-UK-EFS-time-series-extract.do index de319e4..64da14e 100755 --- a/ONS-UK-EFS-time-series-extract.do +++ b/ONS-UK-EFS-time-series-extract.do @@ -35,9 +35,11 @@ GNU General Public License for more details. * History * 15/8/2012 moved creation of ba_quarter etc to per-year processing - -* TO DO -* update to 2012 +* 28/4/2015 - adding 2011 & 2012 +* LCFS database changes: +* 2010-2011 - 7272volume_h_changes_database_2011.xls +* 2011-2012 - 7472_volume_h_changes_database_2012.xls +* 29/4/2015 - changed to only produce basic file on the assumption that this will be linked back to source expenditure data for specific analyses * NB - the script assumes a certain folder structure for the source EFS data like so: * `efsd'/<year>/stata/<datafile>.dta @@ -52,7 +54,7 @@ local place = "/Users/ben/Documents/Work" local efsd = "`place'/Data/Social Scie local logd = "`efsd'/log_files" local outd = "`efsd'/processed" * Years to be extracted -* 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 +* 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 * NB: from 2006 the EFS is collected on a calendar year basis. 
* This means that 2005-6_Q4 and 2006_Q1 are exactly the same cases @@ -61,15 +63,18 @@ local logd = "`efsd'/log_files" local outd = "`efsd'/processed" * To save time you can leave out years you have already processed * just paste the ones you want into the allyears local variable below -local years "2001-2010" // just a label -local all_years "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010" // years to process local do_years = "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010" +local years "2001-2012" // just a label +* 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 +local all_years "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012" // years to process local do_years = "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012" * set to 1 to refresh each yearly extract you listed in do_years local do_extracts "1" * drop the first survey for the merge as it is loaded first -local mergeyears = "2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010" - +local mergeyears = "2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012" + +* be sure to keep only these from the dv hh file - last ones needed for income equivalisation later +local dvhh_keepvars = "incanon weight* a055 g018 g019 p116* p344* p389*" capture log close log using "`logd'/ONS-UK-EFS-time-series-extract-$S_DATE.smcl", replace @@ -121,8 +126,8 @@ if `do_extracts' { * c_white_0 c_white_1 * 0 "White HRP" 1 "Non-white HRP" */ ********* di "* dv household file" use "`efsd'/`y'/stata/dvhh.dta", clear - * 2010 data = mixed/uppercase - if "`y'" === "2010" { + * 2010 onwards data = mixed/uppercase + if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" { rename *, lower } ** sex of HRP gen c_gender = -1 replace c_gender = 0 if sexhrp == 1 replace c_gender = 1 if sexhrp == 2 lab var c_gender "Constraint: Gender of HRP" lab def c_gender 
0 "Male" 1 "Female" lab val c_gender c_gender ** age of HRP * need to use 75+ as few 80+ after 2001-2 recode p396p (min/15= . ) (16/24 = 0) (25/34 = 1) (35/44 = 2) (45/54 = 3) /// (55/64 = 4) (65/74 = 5) (75/max = 6), gen(c_age) lab var c_age "Constraint: Age of HRP" @@ -134,29 +139,24 @@ if `do_extracts' { * has 1, 2 and more than 2 - so needs recoding. recode c_nchild (0=0) (1=1) (2/max=2) lab var c_nchild "Constraint: number of children" lab define c_nchild 0 None 1 One 2 "Two or more" lab val c_nchild c_nchild ** Sorting sort case - * keep all variables and base final merge on it - + * keep only basic variables + keep case* c_* region `dvhh_keepvars' qui: compress * save all vars save "`efsd'/`y'/stata/dvhh-temp.dta", replace ****************************** di "* raw household file for c_comp and c_accom" * also to pick up electricity water payments periodicity etc for error analysis use "`efsd'/`y'/stata/rawhh.dta", clear - * 2010 data = mixed/uppercase - if "`y'" === "2010" { + * 2010 onwards data = mixed/uppercase + if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" { rename *, lower } * accomodation type * Object= * 0 Detached * 1 Semi * 2 Terrace * 3 flat/maisontte * 4 other gen ba_c_accom = -1 replace ba_c_accom = 0 if hsetype == 1 replace ba_c_accom = 1 if hsetype == 2 replace ba_c_accom = 2 if hsetype == 3 if survyr > 2001 { * grr, why can't var names stay the same?! gen acomtype = accom - } replace ba_c_accom = 3 if acomtype == 2 replace ba_c_accom = 4 if acomtype == 3 replace ba_c_accom = 4 if acomtype == 4 lab var ba_c_accom "Constraint: accommodation type" lab define c_accom 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other" lab val ba_c_accom c_accom ** Composition. * Object = * 0 'married/partnered' * 1 'single parent' * 2 'single person' * 3 'other'. * co-habiting. gen ba_c_comp=1 if numcpart>0 * married. 
replace ba_c_comp=0 if nummpart>0 * single parent - assumes a single adult living with 1 or more * children is a single parent. replace ba_c_comp=1 if (numadult==1 & numchild>0) * single person. replace ba_c_comp=2 if (numadult==1 & numchild==0) * the rest - this is a cheat! recode ba_c_comp (missing=3) lab var ba_c_comp "Constraint: household composition" lab define c_comp 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other" lab val ba_c_comp c_comp tab ba_c_comp numcpart tab ba_c_comp nummpart * rawhh derived cable tv dummies * if tvtype = 3 = cable, if = 4 is cable + phone gen tvtype_2 = tvtype2 gen tvtype_3 = tvtype3 gen tvtype_4 = tvtype4 * if exists - gen tvtype_5 = tvtype5 egen ba_c_has_cable_rawhh = anymatch(tvtype_*), values(3 4) * check for tvtype(1) as well replace ba_c_has_cable_rawhh = 1 if tvtype == 3 lab var ba_c_has_cable_rawhh "Sim: Has cable based on tvtype in rawhh" - - gen ba_calyear = -1 + } replace ba_c_accom = 3 if acomtype == 2 replace ba_c_accom = 4 if acomtype == 3 replace ba_c_accom = 4 if acomtype == 4 lab var ba_c_accom "Constraint: accommodation type" lab define c_accom 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other" lab val ba_c_accom c_accom ** Composition. * Object = * 0 'married/partnered' * 1 'single parent' * 2 'single person' * 3 'other'. * co-habiting. gen ba_c_comp=1 if numcpart>0 * married. replace ba_c_comp=0 if nummpart>0 * single parent - assumes a single adult living with 1 or more * children is a single parent. replace ba_c_comp=1 if (numadult==1 & numchild>0) * single person. replace ba_c_comp=2 if (numadult==1 & numchild==0) * the rest - this is a cheat! 
recode ba_c_comp (missing=3) lab var ba_c_comp "Constraint: household composition" lab define c_comp 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other" lab val ba_c_comp c_comp tab ba_c_comp numcpart tab ba_c_comp nummpart gen ba_calyear = -1 - local keepvars_orig "waterpay watermet elecpay eacamt eacper elecpayo dveac estndordamt estndord estndo_1 estndo_2 dvestndo dsselecf dsselecp dwpelecf dwpelecp dwpper" - local keepvars "" - if "`y'" == "2001-2002" | "`y'" == "2002-2003" | "`y'" == "2003-2004" | "`y'" == "2004-2005" | "`y'" == "2005-2006" | "`y'" == "2006" | "`y'" == "2007" { di "* Setting up cal year for `y'" tab survyr sampyear, mi - replace ba_sampyear = sampyear + gen ba_sampyear = sampyear * ba_calyear removed as was actually sample year * construct list of vars to keep based on ideal @@ -171,18 +171,14 @@ if `do_extracts' { *di in red "weight does not exist" } } - di "***" - di "* Want: `keepvars_orig'" - di "* Have: `keepvars'" - di "***" - keep case ba_* wsinc water* sewsep ctwat percwat percsew percwsew ctwat `keepvars' + keep case* ba_* di "* Done setting up sample year for `y'" } - if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" { + if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" { di "* Setting up cal year for `y'" * sampyear variable removed - replace ba_sampyear = survyr + gen ba_sampyear = survyr * construct list of vars to keep based on ideal foreach v of local keepvars_orig { capture confirm variable `v' @@ -194,12 +190,8 @@ if `do_extracts' { *di in red "weight does not exist" } } - di "***" - di "* Want: `keepvars_orig'" - di "* Have: `keepvars'" - di "***" lab var ba_sampyear "Sample year" - keep case ba_* wsinc water* sewsep ctwat percwat percsew percwsew ctwat `keepvars' + keep case* ba_* di "* Done setting up sample year for `y'" } @@ -215,7 +207,7 @@ if `do_extracts' { use "`efsd'/`y'/stata/rawper.dta", clear * 2010 data = mixed/uppercase - if "`y'" === "2010" { + if 
"`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" { rename *, lower } * keep HRPs @@ -233,7 +225,7 @@ if `do_extracts' { di "* Age error: `y'" drop if dvage18 == 2 } - else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" { + else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" { di "Removing HRPs where age < 16 for `y'" drop if dvage_p < 16 } @@ -242,12 +234,21 @@ if `do_extracts' { drop if dvage < 16 } - * ethnic detail + di "* ethnic detail" if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" { + di "* year = `y'" gen c_ethnicd = eth01p } + else if "`y'" == "2011" | "`y'" == "2012" { + di "* year = `y'" + gen c_ethnicd = ethep + replace c_ethnicd = ethwp if c_ethnicd == . + replace c_ethnicd = ethsp if c_ethnicd == . + replace c_ethnicd = ethnip if c_ethnicd == . + } else { + di "* year = `y'" * before 2008 gen c_ethnicd = ethnic_p } @@ -256,20 +257,20 @@ if `do_extracts' { lab val c_ethnicd c_ethnicd * keep only new variables - keep case c_* + keep case* c_* save "`efsd'/`y'/stata/rawper-temp.dta", replace ************ di "* Need dvper file to count n children aged under 14 (for OECD equivalisation weight)" use "`efsd'/`y'/stata/dvper.dta", clear * 2010 data = mixed/uppercase - if "`y'" === "2010" { + if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" { rename *, lower } * keep only hrps keep if a003 == 1 * Ethnicity * NB a012 and a013 changed categories 2007->2008 to 0 = n/a, 1 = white, 2 = everyone else - * this still works, you can get detail from rawper.dta recode a012p (1=0) (2/max=1), gen(c_white) lab var c_white "Constraint: non-white HRP" lab def c_white 0 "White HRP" 1 "Non-white HRP" lab val c_white c_white * NB - 20 HRPs are not classified - they will show up as missing keep case c_white a010 + * this still works, you can get detail from rawper.dta recode a012p (1=0) (2/max=1), gen(c_white) lab var c_white "Constraint: 
non-white HRP" lab def c_white 0 "White HRP" 1 "Non-white HRP" lab val c_white c_white * if HRPs are not classified - they will show up as missing keep case* c_white qui: compress save "`efsd'/`y'/stata/dvper-temp.dta", replace @@ -278,8 +279,8 @@ if `do_extracts' { * and count the number of children of various ages use "`efsd'/`y'/stata/dvper.dta", clear - * 2010 data = mixed/uppercase - if "`y'" === "2010" { + * 2010 onwards data = mixed/uppercase + if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" { rename *, lower } @@ -309,7 +310,7 @@ if `do_extracts' { lab var ba_under14 "Number of children aged under 14" * keep only new variables - keep case ba_* c_* + keep case* ba_* c_* qui: compress save "`efsd'/`y'/stata/dvper-lli.dta", replace di "*** MERGE ALL FILES for year = `y'" @@ -334,7 +335,7 @@ if `do_extracts' { * keep all (makes life easier below as some vars are in some years and not others) *********************************************** - * calculate OECD equivaisation weight in order to equivalise income or expenditure + * calculate OECD equivalisation weight in order to equivalise income or expenditure * see e.g. 
DWP HBAI reports * can then calculate n over 14 by substracting from g019 @@ -419,7 +420,7 @@ foreach y of local mergeyears { tabstat c_*, c(s) s(mean min max) -lab var year "Calendar year" lab var survey_year "EFS/FES year" +lab var survey_year "EFS/FES year" * finally check for duplicate months 2005-6 -> 2006 samples tab ba_calyear ba_month @@ -436,17 +437,9 @@ tab ba_calyear ba_month di "Compressing" qui: compress -* drop fs* as only exist in 2005-6 -drop fs* aorder -* full version -save "`outd'/EFS-`years'-extract-BA.dta", replace - -* use this version to match to case in older files for specific variable input -keep case survey_year year gorx incanon c_* ba_* weight* - gen caseno = case if `do_extracts' { @@ -457,7 +450,9 @@ else di "do_extracts = `do_extracts', years not extracted so individual files not refreshed" } -su c_* + +* to test +tabstat c_ncars c_nchild c_npersons c_nearners c_nrooms, by(ba_sampyear) save "`outd'/EFS-`years'-extract-reduced-BA.dta", replace di "Job ended at $S_DATE" -- GitLab