diff --git a/ONS-UK-EFS-time-series-extract.do b/ONS-UK-EFS-time-series-extract.do index 555f80973f4b8c2a201934d4b02c66a530430f4a..a68ec3b4e17cdee12742507acdb5a3f5dd0304e0 100755 --- a/ONS-UK-EFS-time-series-extract.do +++ b/ONS-UK-EFS-time-series-extract.do @@ -31,15 +31,17 @@ GNU General Public License for more details. *********************** * Processes the EFS/LCFS into a set of identical files and then merges them -* NB it treats the Living Costs and Food Survey 2008 as simply another EFS (which it is) +* NB it treats the Living Costs and Food Survey 2008 onwards as simply another EFS (which it is) * History -* 15/8/2012 moved creation of ba_quarter etc to per-year processing +* 15/8/2012 moved creation of ba_quarter & income equivalisation, quartiles/deciles to per-year processing * 28/4/2015 - adding 2011 & 2012 * LCFS database changes: * 2010-2011 - 7272volume_h_changes_database_2011.xls * 2011-2012 - 7472_volume_h_changes_database_2012.xls * 29/4/2015 - changed to only produce basic file on the assumption that this will be linked back to source expenditure data for specific analyses +* 2/5/2015 - this does not really work that well as (for example) variable names went to capital letters in 2010 (why why why) +* so a mechanism has been included to allow the extraction of bespoke variables sets once this script has set all variable names to lower case * NB - the script assumes a certain folder structure for the source EFS data like so: * `efsd'/<year>/stata/<datafile>.dta @@ -53,38 +55,49 @@ set more off local place = "/Users/ben/Documents/Work" local efsd = "`place'/Data/Social Science Datatsets/Expenditure and Food Survey" local logd = "`efsd'/log_files" local outd = "`efsd'/processed" -* Years to be extracted -* 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 * NB: from 2006 the EFS is collected on a calendar year basis. 
* This means that 2005-6_Q4 and 2006_Q1 are exactly the same cases * This script removes 2006_Q1 later on to avoid duplication -* To save time you can leave out years you have already processed -* just paste the ones you want into the allyears local variable below - local years "2001-2012" // just a label -* 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 -local all_years "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012" // years to process local do_years = "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012" -* set to 1 to refresh each yearly extract you listed in do_years -local do_extracts 1 +* To save time you can leave out years you have already processed +* just paste the ones you want into the do_years local variable below +* choose any of 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012 +local do_years = "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012" // years to process + +* set to 1 to refresh each yearly extract you listed in do_years & append the files +* set to 0 to just append previously extracted files +local do_extracts 0 -* drop the first survey for the merge as it is loaded first -local mergeyears = "2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012" +************ +* Set the variables to be extracted here +* there is error checking below to make sure that they exist in each year or to skip if not +* put case in each keep var if otherwise empty -* be sure to keep only these from the dv hh file - these will be 'kept' (along with case and ba_* or c_* after the var names have been reduced to lower case +* -> dvhh file * basic weighting & consumption expenditure codes -local dvhh_keepvars = "weight* p60*t p61*t p630* p396*" +local dvhh_keepvars = "p60*t p61*t p630* p396*" * needed for income equivalisation later local dvhh_keepvars = "`dvhh_keepvars' 
incanon a055 g018 g019 p116* p344* p389*" * DEMAND 2.3 (older people mobile lives) -local dvhh_keepvars = "`dvhh_keepvars' b480 b481 b485 c96111* c96112* cc5413* c73311* c73312*" +local dvhh_keepvars = "`dvhh_keepvars' b480 b481 b485 cc5413t c73312t" // overseas travel expenditures + +* -> dvper file +local dvper_keepvars = "case" -* keep these from rawhh file +* -> rawhh file * DEMAND 2.3 (older people mobile lives) -local rawhh_keepvars = "flydes*" +local rawhh_keepvars = "flydes*" // flights - NB see 2012 fix below in rawhh section + +* -> rawper file +local rawper_keepvars = "case" +************ + +************ +* set logging capture log close log using "`logd'/ONS-UK-EFS-time-series-extract-$S_DATE.smcl", replace clear all @@ -95,18 +108,19 @@ set maxvar 10000, perm di "*******************************************************" di "* This script will process all of the EFS files for:" di "* `years'" -di "* -di "* This could take some time. -di "* I suggest you check it is running and then get a cup of tea.... +di "* It will keep the following variables:" +di "* dvhh: `dvhh_keepvars' " +di "* rawhh: `rawhh_keepvars'" +di "* This could take some time. " +di "* I suggest you check it is running and then get a cup of tea...." 
di "*******************************************************" if `do_extracts' { di "do_extracts = `do_extracts', all years (`years') to be extracted and refreshed" foreach y of local do_years { - * create new constraints in FES data di "* * * * * * * * * " di "* -> Processing `y'" - /* constraint vars/labels + /* census vars/labels * c_accom_0 c_accom_1 c_accom_2 c_accom_3 c_accom_4 * 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other" @@ -134,7 +148,8 @@ if `do_extracts' { * c_tenure_0 c_tenure_1 c_tenure_2 c_tenure_3 * 0 "Owned" 1 "Rent from council" 2 "Social rent" 3 "Private rent incl rent free" * c_white_0 c_white_1 * 0 "White HRP" 1 "Non-white HRP" - */ ********* di "* dv household file" use "`efsd'/`y'/stata/dvhh.dta", clear + */ ****************************** + * dvhh di "* dv household file" use "`efsd'/`y'/stata/dvhh.dta", clear * 2010 onwards data = mixed/uppercase if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" { rename *, lower @@ -148,13 +163,27 @@ if `do_extracts' { * has 1, 2 and more than 2 - so needs recoding. 
recode c_nchild (0=0) (1=1) (2/max=2) lab var c_nchild "Constraint: number of children" lab define c_nchild 0 None 1 One 2 "Two or more" lab val c_nchild c_nchild gen ba_year = `y' - ** Sorting sort case - * keep only basic variables - - keep case* ba_* c_* region `dvhh_keepvars' - qui: compress * save all vars save "`efsd'/`y'/stata/dvhh-temp.dta", replace ****************************** di "* raw household file for c_comp and c_accom" - di "* also to pick up `rawhh_keepvars'" use "`efsd'/`y'/stata/rawhh.dta", clear + * construct list of vars to keep + * iterate the tokens of the local itself (not an expanded varlist) so a name or pattern absent in this year is caught by confirm and reported below; an empty list skips the loop + local keepvars = "" + foreach v of local dvhh_keepvars { + capture confirm variable `v' + if !_rc { + *di "* found `v'" + local keepvars = "`keepvars' `v'" + } + else { + di in red "`v' does not exist in `y' - will be missing" + } + } + + di "* dvhh: keeping case* ba_* c_* region weight* `keepvars'" + keep case* ba_* c_* region weight* `keepvars' + qui: compress * save kept dvhh vars save "`efsd'/`y'/stata/dvhh-temp.dta", replace ****************************** + ****************************** + * rawhh di "* raw household file for: c_comp and c_accom" + di "* also to pick up: `rawhh_keepvars'" use "`efsd'/`y'/stata/rawhh.dta", clear * 2010 onwards data = mixed/uppercase if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" { rename *, lower @@ -162,61 +191,48 @@ if `do_extracts' { if survyr > 2001 { * grr, why can't var names stay the same?! gen acomtype = accom - } replace ba_c_accom = 3 if acomtype == 2 replace ba_c_accom = 4 if acomtype == 3 replace ba_c_accom = 4 if acomtype == 4 lab var ba_c_accom "Constraint: accommodation type" lab define c_accom 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other" lab val ba_c_accom c_accom ** Composition. * Object = * 0 'married/partnered' * 1 'single parent' * 2 'single person' * 3 'other'. * co-habiting. gen ba_c_comp=1 if numcpart>0 * married. 
replace ba_c_comp=0 if nummpart>0 * single parent - assumes a single adult living with 1 or more * children is a single parent. replace ba_c_comp=1 if (numadult==1 & numchild>0) * single person. replace ba_c_comp=2 if (numadult==1 & numchild==0) * the rest - this is a cheat! recode ba_c_comp (missing=3) lab var ba_c_comp "Constraint: household composition" lab define c_comp 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other" lab val ba_c_comp c_comp tab ba_c_comp numcpart tab ba_c_comp nummpart gen ba_calyear = -1 - + } replace ba_c_accom = 3 if acomtype == 2 replace ba_c_accom = 4 if acomtype == 3 replace ba_c_accom = 4 if acomtype == 4 lab var ba_c_accom "Constraint: accommodation type" lab define c_accom 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other" lab val ba_c_accom c_accom ** Composition. * Object = * 0 'married/partnered' * 1 'single parent' * 2 'single person' * 3 'other'. * co-habiting. gen ba_c_comp=1 if numcpart>0 * married. replace ba_c_comp=0 if nummpart>0 * single parent - assumes a single adult living with 1 or more * children is a single parent. replace ba_c_comp=1 if (numadult==1 & numchild>0) * single person. replace ba_c_comp=2 if (numadult==1 & numchild==0) * the rest - this is a cheat! 
recode ba_c_comp (missing=3) lab var ba_c_comp "Constraint: household composition" lab define c_comp 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other" lab val ba_c_comp c_comp tab ba_c_comp numcpart tab ba_c_comp nummpart if "`y'" == "2001-2002" | "`y'" == "2002-2003" | "`y'" == "2003-2004" | "`y'" == "2004-2005" | "`y'" == "2005-2006" | "`y'" == "2006" | "`y'" == "2007" { di "* Setting up cal year for `y'" - tab survyr sampyear, mi + * tab survyr sampyear, mi gen ba_sampyear = sampyear - * ba_calyear removed as was actually sample year - - * construct list of vars to keep based on ideal - foreach v of local keepvars_orig { - capture confirm variable `v' - if !_rc { - *di in red "weight exists" - su `v' - local keepvars = "`keepvars' `v'" - } - else { - *di in red "weight does not exist" - } - } - di "* Done setting up sample year for `y'" + lab var ba_sampyear "Sample year" } if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" { di "* Setting up cal year for `y'" * sampyear variable removed + * tab survyr sampyear, mi gen ba_sampyear = survyr - * construct list of vars to keep based on ideal - foreach v of local keepvars_orig { - capture confirm variable `v' - if !_rc { - *di in red "weight exists" - local keepvars = "`keepvars' `v'" - } - else { - *di in red "weight does not exist" - } - } lab var ba_sampyear "Sample year" - - di "* Done setting up sample year for `y'" } if "`y'" == "2012" { - * appear to have set multiple flights varnames differently + di "* fixing issue with multiple flights varnames differently for `y'" renpfix flydes_ flydest1 } - keep case* ba_* `rawhh_keepvars' - * Useful vars already 'kept' - * fix name + * construct list of vars to keep + * iterate the tokens of the local itself (not an expanded varlist) so a name or pattern absent in this year is caught by confirm and reported below; an empty list skips the loop + local keepvars = "" + foreach v of local rawhh_keepvars { + capture confirm variable `v' + if !_rc { + * di "* found `v'" + local keepvars = "`keepvars' `v'" + } + else { + di in red "`v' does not 
exist in `y' - will be missing" + } + } + + di "* rawhh: keeping case* ba_* c_* `keepvars'" renpfix ba_c_ c_ + keep case* ba_* c_* `keepvars' qui: compress - sort case - save "`efsd'/`y'/stata/rawhh-temp.dta", replace ************* - * + + save "`efsd'/`y'/stata/rawhh-temp.dta", replace ****************************** + ****************************** + * rawper di "* Need rawper file for ethnicity detail" use "`efsd'/`y'/stata/rawper.dta", clear @@ -270,11 +286,27 @@ if `do_extracts' { lab def c_ethnicd 0 "Missing/inapplicable" 1 "White" 2 "Mixed" 3 "Asian" 4 "Black" 5 "Other" lab val c_ethnicd c_ethnicd - * keep only new variables - keep case* c_* + * construct list of vars to keep + * iterate the tokens of the local itself (not an expanded varlist) so a name or pattern absent in this year is caught by confirm and reported below; an empty list skips the loop + local keepvars "" + foreach v of local rawper_keepvars { + capture confirm variable `v' + if !_rc { + *di "* found `v'" + local keepvars = "`keepvars' `v'" + } + else { + di in red "`v' does not exist in `y' - will be missing" + } + } + + * keep rawper variables + keep case* c_* `keepvars' save "`efsd'/`y'/stata/rawper-temp.dta", replace + ****************************** - ************ di "* Need dvper file to count n children aged under 14 (for OECD equivalisation weight)" + ****************************** + * dvper di "* Need dvper file to count n children aged under 14 (for OECD equivalisation weight)" use "`efsd'/`y'/stata/dvper.dta", clear * 2010 data = mixed/uppercase @@ -284,10 +316,26 @@ if `do_extracts' { * Ethnicity * NB a012 and a013 changed categories 2007->2008 to 0 = n/a, 1 = white, 2 = everyone else - * this still works, you can get detail from rawper.dta recode a012p (1=0) (2/max=1), gen(c_white) lab var c_white "Constraint: non-white HRP" lab def c_white 0 "White HRP" 1 "Non-white HRP" lab val c_white c_white * if HRPs are not classified - they will show up as missing keep case* c_white + * this still works, you can get detail from rawper.dta recode a012p (1=0) (2/max=1), gen(c_white) lab var c_white "Constraint: non-white 
HRP" lab def c_white 0 "White HRP" 1 "Non-white HRP" lab val c_white c_white * if HRPs are not classified - they will show up as missing + + * construct list of vars to keep + * iterate the tokens of the local itself (not an expanded varlist) so a name or pattern absent in this year is caught by confirm and reported below; an empty list skips the loop + local keepvars = "" + foreach v of local dvper_keepvars { + capture confirm variable `v' + if !_rc { + *di "* found `v'" + local keepvars = "`keepvars' `v'" + } + else { + di in red "`v' does not exist in `y' - will be missing" + } + } + keep case* c_white `keepvars' qui: compress - save "`efsd'/`y'/stata/dvper-temp.dta", replace + save "`efsd'/`y'/stata/dvper-temp.dta", replace + ****************************** ************ * Now c_lli but this time need to collapse it so we count the number in the household with/out lli * and count the number of children of various ages @@ -323,7 +371,7 @@ if `do_extracts' { lab def c_lli 0 "No person with lli" 1 "At least 1 person with lli" lab var ba_under14 "Number of children aged under 14" - * keep only new variables + * keep only new variables as we've collapsed to hh keep case* ba_* c_* qui: compress save "`efsd'/`y'/stata/dvper-lli.dta", replace @@ -394,8 +442,6 @@ if `do_extracts' { * new: * a099: * 1 january to march * 2 april to june * 3 july to september * 4 october to december - *egen ba_calyear_quarter = concat(ba_calyear ba_quarter), punct("_Q") - gen ba_month = a055 * create a birth cohort variable @@ -421,11 +467,11 @@ if `do_extracts' { tab ba_quarter - egen ba_calyear_quarter = concat(ba_calyear ba_quarter), punct("_Q") - lab var ba_calyear_quarter "EFS/FES calendar year & quarter" + egen ba_sampyear_quarter = concat(ba_sampyear ba_quarter), punct("_Q") + lab var ba_sampyear_quarter "EFS/FES sample year & quarter" - egen ba_calyear_month = concat(ba_calyear ba_month), punct("_") - lab var ba_calyear_month "EFS/FES calendar year & month" + egen ba_sampyear_month = concat(ba_sampyear ba_month), punct("_") + lab var ba_sampyear_month "EFS/FES sample year & month" gen survey_name = "efs" @@ -444,21 
+490,21 @@ if `do_extracts' { **************************** * now merge them all into one big file -* start with 2001/2 -use "`outd'/EFS-2001-2002-extract-BA.dta", clear +clear // start with nothing -foreach y of local mergeyears { +foreach y of local do_years { di "Appending `y'" qui: append using "`outd'/EFS-`y'-extract-BA.dta", force *erase "`efsd'/`y'/FES-`y'-extract-BA.dta" } +* the above code tabstat c_*, c(s) s(mean min max) lab var survey_year "EFS/FES year" * finally check for duplicate months 2005-6 -> 2006 samples -tab ba_calyear ba_month +tab ba_sampyear ba_month tab survey_year ba_month * Jan/Feb/Mar 2006 are duplicates of Jan/Feb/Mar 2005-6 @@ -467,7 +513,7 @@ drop if survey_year == "2006" & ba_month == 2 drop if survey_year == "2006" & ba_month == 3 * check -tab ba_calyear ba_month +tab ba_sampyear ba_month gen caseno = case @@ -475,7 +521,7 @@ gen caseno = case * to test tabstat c_ncars c_nchild c_npersons c_nearners c_nrooms, by(ba_sampyear) -drop _merge* a055 g* p116t +drop _merge* a055 g* p116t caseno di "*-> Compressing" qui: compress