From fa52d4d8fdb72b059427a9ed82da55fd4b782b3f Mon Sep 17 00:00:00 2001
From: Ben Anderson <b.anderson@soton.ac.uk>
Date: Mon, 20 Jul 2015 11:10:22 +0100
Subject: [PATCH] updated extraction script to include 2013

---
 ONS-UK-EFS-LCFS-time-series-extract.do |  25 +-
 sublime2stata.do                       | 733 +++++++++++++++++++++++++
 2 files changed, 746 insertions(+), 12 deletions(-)
 create mode 100644 sublime2stata.do

Review notes (free text; everything before the first "diff --git" header
is ignored when the patch is applied):
 * Purpose: adds "2013" to every per-year switch of the EFS/LCFS extraction
   script, renames the final extract to 2001-2013, sets do_years to 2013,
   quotes the c_nchild value labels and prints the year being processed.
 * NOTE(review): sublime2stata.do repeats the extraction script almost line
   for line and looks like an editor scratch copy - confirm it is meant to
   be tracked.
 * NOTE(review): leading whitespace (tabs assumed) and blank context lines
   were rebuilt from the hunk line counts; if the context does not match
   the target file, apply with --ignore-whitespace.
 * Only "+" lines were edited and every hunk keeps its original line count.

diff --git a/ONS-UK-EFS-LCFS-time-series-extract.do b/ONS-UK-EFS-LCFS-time-series-extract.do
index 70025b8..3a84046 100755
--- a/ONS-UK-EFS-LCFS-time-series-extract.do
+++ b/ONS-UK-EFS-LCFS-time-series-extract.do
@@ -62,12 +62,12 @@ local outd = "`efsd'/processed"
 * `efsd'/processed/
 ***********************
 
-local extract_years "2001-2012" // just a name for the FINAL extracted file
+local extract_years "2001-2013" // just a name for the FINAL extracted file
 
 * To save time you can leave out years you have already processed
 * just paste the ones you want into the do_years local variable below
 * choose any of 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012
-local do_years = "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012" // years to process
+local do_years = "2013" // years to process
 
 * set to 1 to refresh each yearly extract you listed in do_years & append the files
 * set to 0 to just append previously extracted files
@@ -174,7 +174,7 @@ if `do_extracts' {
 		use "`efsd'/`y'/stata/dvhh.dta", clear
 
 		* 2010 onwards data = mixed/uppercase
-		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			rename *, lower
 		}
 		** sex of HRP
@@ -298,11 +298,12 @@ if `do_extracts' {
 		* has 1, 2 and more than 2 - so needs recoding.
 		recode c_nchild (0=0) (1=1) (2/max=2)
 		lab var c_nchild "Constraint: number of children"
-		lab define c_nchild 0 None 1 One 2 "Two or more"
+		lab define c_nchild 0 "None" 1 "One" 2 "Two or more"
 		lab val c_nchild c_nchild
 
+		di "Year = `y'"
 		gen ba_year = `y'
-	
+
 		* construct list of vars to keep
 		* if dvhh_keepvars is empty STATA will skip
 		local keepvars = ""
@@ -345,7 +346,7 @@ if `do_extracts' {
 		use "`efsd'/`y'/stata/rawhh.dta", clear
 
 		* 2010 onwards data = mixed/uppercase
-		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			rename *, lower
 		}
 
@@ -404,7 +405,7 @@ if `do_extracts' {
 			lab var ba_sampyear "Sample year"
 		}
 
-		if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			di "* Setting up cal year for `y'"
 			* sampyear variable removed
 			* tab survyr sampyear, mi
@@ -444,7 +445,7 @@ if `do_extracts' {
 		use "`efsd'/`y'/stata/rawper.dta", clear
 
 		* 2010 data = mixed/uppercase
-		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			rename *, lower
 		}
 		* keep HRPs
@@ -462,7 +463,7 @@ if `do_extracts' {
 			di "* Age error: `y'"
 			drop if dvage18 == 2
 		}
-		else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			di "Removing HRPs where age < 16 for `y'"
 			drop if dvage_p < 16
 		}
@@ -477,7 +478,7 @@ if `do_extracts' {
 			di "* year = `y'"
 			gen c_ethnicd = eth01p
 		}
-		else if "`y'" == "2011" | "`y'" == "2012" {
+		else if "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			di "* year = `y'"
 			gen c_ethnicd = ethep
 			replace c_ethnicd = ethwp if c_ethnicd == .
@@ -519,7 +520,7 @@ if `do_extracts' {
 		use "`efsd'/`y'/stata/dvper.dta", clear
 
 		* 2010 data = mixed/uppercase
-		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			rename *, lower
 		}
 
@@ -562,7 +563,7 @@ if `do_extracts' {
 		use "`efsd'/`y'/stata/dvper.dta", clear
 
 		* 2010 onwards data = mixed/uppercase
-		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2012" {
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
 			rename *, lower
 		}
 
diff --git a/sublime2stata.do b/sublime2stata.do
new file mode 100644
index 0000000..6dce73d
--- /dev/null
+++ b/sublime2stata.do
@@ -0,0 +1,733 @@
+
+
+* run without waiting for user
+set more off
+
+* set these to what works for you
+local place = "~/Documents/Work"
+local efsd = "`place'/Data/Social Science Datatsets/Expenditure and Food Survey"
+local logd = "`efsd'/log_files"
+local outd = "`efsd'/processed"
+
+************************
+* NB - the script assumes a certain folder structure for the source EFS/LCFS data like so:
+* `efsd'/<year>/stata/<datafile>.dta
+* You may have to rename some of the downloaded & unzipped UKDA data folders to make this work
+* The script also assumes that this folder exists for the final results:
+* `efsd'/processed/
+***********************
+
+local extract_years "2001-2013"
+
+* To save time you can leave out years you have already processed
+* just paste the ones you want into the do_years local variable below
+* choose any of 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012 2013
+local do_years = "2013"
+
+* set to 1 to refresh each yearly extract you listed in do_years & append the files
+* set to 0 to just append previously extracted files
+local do_extracts 1
+
+************************
+* Set the variables to be extracted here
+* there is error checking below to make sure that they exist in each year or to skip if not
+* put case in each keep var if otherwise empty
+
+* -> dvhh file
+* basic weighting & consumption expenditure codes
+* p60*t p61*t p630*
+local dvhh_keepvars = ""
+* needed for income equivalisation later
+local dvhh_keepvars = "`dvhh_keepvars' incanon a055 g018 g019 p116* p344* p389* p396*"
+* DEMAND 2.3 (older people mobile lives)
+*local dvhh_keepvars = "`dvhh_keepvars' b480 b481 b485 cc5413t c73312t"
+* DEMAND 3.1 (adapting infrastructures)
+* a1701 a1711
+local dvhh_keepvars = "`dvhh_keepvars' a103 a108 a128 a130 a15* a16*"
+
+* -> dvper file
+local dvper_keepvars = "case"
+
+* -> rawhh file
+* DEMAND 2.3 (older people mobile lives)
+local rawhh_keepvars = "flydes*"
+
+* -> rawper file
+local rawper_keepvars = "case"
+************************
+
+************************
+* set logging
+capture log close
+log using "`logd'/ONS-UK-EFS-time-series-extract-$S_DATE.smcl", replace
+clear all
+
+* increase default number of variables allowed
+set maxvar 10000, perm
+
+di "*******************************************************"
+di "* This script will process all of the EFS files for:"
+di "* `do_years'"
+di "* It will keep the following variables:"
+di "* dvhh: `dvhh_keepvars' "
+di "* rawhh: `rawhh_keepvars'"
+di "* This could take some time. "
+di "* I suggest you check it is running and then get a cup of tea...."
+di "*******************************************************"
+if `do_extracts' {
+	di "do_extracts = `do_extracts', all years (`do_years') to be extracted and refreshed"
+
+	foreach y of local do_years {
+		di "* * * * * * * * * "
+		di "* -> Processing `y'"
+		/* census vars/labels
+
+		* c_accom_0 c_accom_1 c_accom_2 c_accom_3 c_accom_4
+		* 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other"
+
+		* c_age_0 c_age_1 c_age_2 c_age_3 c_age_4 c_age_5 c_age_6 c_age_7
+		* 0 "16-24" 1 "25-34" 2 "35-44" 3 "45-54" 4 "55-64" 5 "65-74" 6 "75+"
+
+		* c_comp_0 c_comp_1 c_comp_2 c_comp_3
+		* 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other"
+
+		* c_lli_0 c_lli_1
+		* 0 "No person with lli" 1 "At least 1 person with lli"
+
+		* c_ncars_0 c_ncars_1 c_ncars_2
+		* 0 None 1 One 2 "Two or more"
+
+		* c_nchild_0 c_nchild_1 c_nchild_2
+		* 0 None 1 One 2 "Two or more"
+
+		* c_nearners_0 c_nearners_1 c_nearners_2 c_nearners_3
+		* 0 "0" 1 "1" 2 "2" 3 "3+"
+
+		* c_npersons_0 c_npersons_1 c_npersons_2 c_npersons_3 c_npersons_4
+		* 0 "1" 1 "2" 2 "3" 3 "4" 4 "5+"
+
+		* c_nrooms_0 c_nrooms_1 c_nrooms_2 c_nrooms_3
+		* 0 "1" 1 "2" 2 "3" 3 "4" 4 "5+"
+
+		* c_empl_0 c_empl_1 c_empl_2 c_empl_3 c_empl_4
+		* 0 "NS-SEC 1" 1 "NS-SEC 2" 2 "NS-SEC 3" 3 "Inactive" 4 "Retired"
+
+		* c_gender_0 c_gender_1
+		* 0 "Male" 1 "Female"
+
+		* c_tenure_0 c_tenure_1 c_tenure_2 c_tenure_3
+		* 0 "Owned" 1 "Rent from council" 2 "Social rent" 3 "Private rent incl rent free"
+
+		* c_white_0 c_white_1
+		* 0 "White HRP" 1 "Non-white HRP"
+
+		*/
+
+		******************************
+		* dvhh
+		di "* dv household file"
+		use "`efsd'/`y'/stata/dvhh.dta", clear
+
+		* 2010 onwards data = mixed/uppercase
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			rename *, lower
+		}
+		** sex of HRP
+		gen c_gender = -1
+		replace c_gender = 0 if sexhrp == 1
+		replace c_gender = 1 if sexhrp == 2
+		lab var c_gender "Constraint: Gender of HRP"
+		lab def c_gender 0 "Male" 1 "Female"
+		lab val c_gender c_gender
+
+		** age of HRP
+		* need to use 75+ as few 80+ after 2001-2
+		recode p396p (min/15= . ) (16/24 = 0) (25/34 = 1) (35/44 = 2) (45/54 = 3) (55/64 = 4) (65/74 = 5) (75/max = 6), gen(c_age)
+		lab var c_age "Constraint: Age of HRP"
+		* NB for NI need to change these as Census categories are different. Why why why!?
+		label define c_age 0 "16-24" 1 "25-34" 2 "35-44" 3 "45-54" 4 "55-64" 5 "65-74" 6 "75+"
+
+		lab val c_age c_age
+
+		** number of rooms
+		*1,2,3,4+
+		recode a114 (1=0) (2=1) (3=2) (4=3) (5/max=4), gen(c_nrooms)
+		lab var c_nrooms "Constraint: number of rooms"
+		lab def c_nrooms 0 "1" 1 "2" 2 "3" 3 "4" 4 "5+"
+		lab val c_nrooms c_nrooms
+
+		** Number of residents per household
+		recode a049 (1=0) (2=1) (3=2) (4=3) (5/max=4), gen(c_npersons)
+		lab var c_npersons "Constraint: number of persons in household (all ages)"
+		lab def c_npersons 0 "1" 1 "2" 2 "3" 3 "4" 4 "5+"
+		lab val c_npersons c_npersons
+
+		** Number of earners
+		recode a054 (0=0) (1=1) (2=2) (3/max=3), gen(c_nearners)
+		lab var c_nearners "Constraint: number of earners in household"
+		lab def c_nearners 0 "0" 1 "1" 2 "2" 3 "3+"
+		lab val c_nearners c_nearners
+
+		** Cars and vans.
+		* Object=
+		* 0 None
+		* 1 One
+		* 2 'Two or more'.
+
+		*has 1-12 so need to recode.
+
+		recode a124 (0=0) (1=1) (2/12=2), gen(c_ncars)
+		lab var c_ncars "Constraint: cars and vans"
+		lab define c_ncars 0 None 1 One 2 "Two or more"
+		lab val c_ncars c_ncars
+		*tab a124 c_cars
+
+		** Tenure.
+		* Object =
+		* 0 'Owned'
+		* 1 'Rent from council'
+		* 2 'Social rent'
+		* 3 'Private rent' - incl rent-free
+
+		*use a121.
+		recode a121 (5/7=0) (1=1) (2=2) (3/4 8=3), gen(c_tenure)
+		lab var c_tenure "Constraint: tenure"
+		lab define c_tenure 0 "Owned" 1 "Rent from council" 2 "Social rent" 3 "Private rent incl rent free"
+		lab val c_tenure c_tenure
+		*tab a121 c_tenure
+
+		** employment status.
+		* Object =
+		* 0 'NS-SEC 1'
+		* 1 'NS-SEC 2'
+		* 2 'NS-SEC 3'
+		* 3 'Inactive'
+		* 4 'Retired'.
+
+		* need to combine these - a093 = activity, a094 = NS-SEC.
+		* ref ONS website.
+
+		* need to put a094 = 9,10,11 ('Never worked and long term unemployed',students, not stated) into 'inactive'
+		recode a094 (0/2=0) (3/4=1) (5/8=2) (9/12=3), gen(c_empl)
+		* the crosstab of a094 against a093 shows that some who are coded as
+		* retired/unoccupied (a093=4/5) have an NS-SEC code as they are recently
+		* retired/unoccupied (?). In this case we use the activity code not the NS-SEC code.
+
+		replace c_empl=3 if a093==7
+		replace c_empl=4 if a093==6
+
+		lab var c_empl "Constraint: employment status of HRP"
+		lab define c_empl 0 "NS-SEC 1" 1 "NS-SEC 2" 2 "NS-SEC 3" 3 "Inactive" 4 "Retired"
+		lab val c_empl c_empl
+		*tab a093 c_empl
+		*tab a094 c_empl
+
+		** Region.
+		* use gorx.
+
+		gen region = gorx
+		label define region 1 "North East" 2 "North West & Merseyside" 3 "Yorkshire and the Humber" 4 "East Midlands" 5 "West Midlands" 6 "Eastern" 7 "London" 8 "South East" 9 "South West" 10 "Wales" 11 "Scotland" 12 "Northern Ireland"
+		lab var region "Govt. Office Region"
+		lab val region region
+
+
+		** Number of children - 16 or younger.
+		*Object =
+		* 0 0
+		* 1 1
+		* 2 2+.
+		gen c_nchild = a040+a041+a042
+
+		* could use g019?
+
+		* has 1, 2 and more than 2 - so needs recoding.
+		recode c_nchild (0=0) (1=1) (2/max=2)
+		lab var c_nchild "Constraint: number of children"
+		lab define c_nchild 0 None 1 One 2 "Two or more"
+		lab val c_nchild c_nchild
+
+		gen ba_year = `y' // NOTE(review): hyphenated years (e.g. 2001-2002) evaluate as a subtraction here
+
+		* construct list of vars to keep
+		* if dvhh_keepvars is empty STATA will skip
+		local keepvars = ""
+		foreach v of local dvhh_keepvars {
+			di "* Testing for existence of `v'"
+			capture confirm variable `v'
+			if !_rc {
+				di "* found `v'"
+				local keepvars = "`keepvars' `v'"
+			}
+			else {
+				* exact string not found as a variable, could be because it needs expansion
+				* try as a varlist (forces expansion)
+				di "* Not found, expanding `v'"
+				capture noisily {
+					* if variable really doesn't exist this throws an exception & moves on
+					foreach vt of varlist `v' {
+						capture confirm variable `vt'
+						if !_rc {
+							di "* -> found `vt'"
+							local keepvars = "`keepvars' `vt'"
+						}
+					}
+				}
+			}
+		}
+
+		di "* dvhh: keeping case* ba_* c_* region weight* `keepvars'"
+		keep case* ba_* c_* region weight* `keepvars'
+		qui: compress
+		* save kept dvhh vars
+		save "`efsd'/`y'/stata/dvhh-temp.dta", replace
+		******************************
+
+		******************************
+		* rawhh
+		di "* raw household file for: c_comp and c_accom"
+		di "* also to pick up: `rawhh_keepvars'"
+
+		use "`efsd'/`y'/stata/rawhh.dta", clear
+
+		* 2010 onwards data = mixed/uppercase
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			rename *, lower
+		}
+
+		* accommodation type
+		* Object=
+		* 0 Detached
+		* 1 Semi
+		* 2 Terrace
+		* 3 flat/maisonette
+		* 4 other
+		gen ba_c_accom = -1
+		replace ba_c_accom = 0 if hsetype == 1
+		replace ba_c_accom = 1 if hsetype == 2
+		replace ba_c_accom = 2 if hsetype == 3
+		if survyr > 2001 { // NOTE(review): the if command tests survyr in observation 1 only
+			* grr, why can't var names stay the same?!
+			gen acomtype = accom
+		}
+		replace ba_c_accom = 3 if acomtype == 2
+		replace ba_c_accom = 4 if acomtype == 3
+		replace ba_c_accom = 4 if acomtype == 4
+
+		lab var ba_c_accom "Constraint: accommodation type"
+		lab define c_accom 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other"
+		lab val ba_c_accom c_accom
+
+		** Composition.
+		* Object =
+		* 0 'married/partnered'
+		* 1 'single parent'
+		* 2 'single person'
+		* 3 'other'.
+
+		* co-habiting.
+		gen ba_c_comp=1 if numcpart>0 // NOTE(review): 1 is labelled "single parent" below - confirm this should not be 0
+		* married.
+		replace ba_c_comp=0 if nummpart>0
+		* single parent - assumes a single adult living with 1 or more
+		* children is a single parent.
+		replace ba_c_comp=1 if (numadult==1 & numchild>0)
+		* single person.
+		replace ba_c_comp=2 if (numadult==1 & numchild==0)
+		* the rest - this is a cheat!
+		recode ba_c_comp (missing=3)
+		lab var ba_c_comp "Constraint: household composition"
+		lab define c_comp 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other"
+		lab val ba_c_comp c_comp
+		tab ba_c_comp numcpart
+		tab ba_c_comp nummpart
+
+
+		if "`y'" == "2001-2002" | "`y'" == "2002-2003" | "`y'" == "2003-2004" | "`y'" == "2004-2005" | "`y'" == "2005-2006" | "`y'" == "2006" | "`y'" == "2007" {
+			di "* Setting up cal year for `y'"
+			* tab survyr sampyear, mi
+			gen ba_sampyear = sampyear
+			lab var ba_sampyear "Sample year"
+		}
+
+		if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			di "* Setting up cal year for `y'"
+			* sampyear variable removed
+			* tab survyr sampyear, mi
+			gen ba_sampyear = survyr
+			lab var ba_sampyear "Sample year"
+		}
+		if "`y'" == "2012" {
+			di "* fixing issue with multiple flights varnames differently for `y'"
+			renpfix flydes_ flydest1
+		}
+		* construct list of vars to keep
+		* if rawhh_keepvars is empty STATA will skip
+		local keepvars = ""
+		foreach v of varlist `rawhh_keepvars' {
+			capture confirm variable `v'
+			if !_rc {
+				* di "* found `v'"
+				local keepvars = "`keepvars' `v'"
+			}
+			else {
+				di in red "`v' does not exist in `y' - will be missing"
+			}
+		}
+
+		di "* rawhh: keeping case* ba_* c_* `keepvars'"
+		renpfix ba_c_ c_
+		keep case* ba_* c_* `keepvars'
+
+		qui: compress
+
+		save "`efsd'/`y'/stata/rawhh-temp.dta", replace
+		******************************
+
+		******************************
+		* rawper
+		di "* Need rawper file for ethnicity detail"
+		use "`efsd'/`y'/stata/rawper.dta", clear
+
+		* 2010 data = mixed/uppercase
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			rename *, lower
+		}
+		* keep HRPs
+
+		tab hrpid, nol
+
+		keep if hrpid == 1
+		duplicates tag case, gen(tag)
+
+		di "* -> Any duplicates in `y' ?"
+
+		li case person hrpid sex dvage* if tag == 1
+
+		if "`y'" == "2005-2006" | "`y'" == "2006-2007" {
+			di "* Age error: `y'"
+			drop if dvage18 == 2
+		}
+		else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			di "Removing HRPs where age < 16 for `y'"
+			drop if dvage_p < 16
+		}
+		else {
+			di "Removing HRPs where age < 16 for `y'"
+			drop if dvage < 16
+		}
+
+		di "* ethnic detail"
+
+		if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" {
+			di "* year = `y'"
+			gen c_ethnicd = eth01p
+		}
+		else if "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			di "* year = `y'"
+			gen c_ethnicd = ethep
+			replace c_ethnicd = ethwp if c_ethnicd == .
+			replace c_ethnicd = ethsp if c_ethnicd == .
+			replace c_ethnicd = ethnip if c_ethnicd == .
+		}
+		else {
+			di "* year = `y'"
+			* before 2008
+			gen c_ethnicd = ethnic_p
+		}
+		lab var c_ethnicd "Detailed ethnic group"
+		lab def c_ethnicd 0 "Missing/inapplicable" 1 "White" 2 "Mixed" 3 "Asian" 4 "Black" 5 "Other"
+		lab val c_ethnicd c_ethnicd
+
+		* construct list of vars to keep
+		* if rawper_keepvars is empty STATA will skip
+		local keepvars ""
+		foreach v of varlist `rawper_keepvars' {
+			capture confirm variable `v'
+			if !_rc {
+				*di "* found `v'"
+				local keepvars = "`keepvars' `v'"
+			}
+			else {
+				di in red "`v' does not exist in `y' - will be missing"
+			}
+		}
+
+		* keep rawper variables
+		keep case* c_* `keepvars'
+		save "`efsd'/`y'/stata/rawper-temp.dta", replace
+		******************************
+
+		******************************
+		* dvper
+		di "* Need dvper file to count n children aged under 14 (for OECD equivalisation weight)"
+
+		use "`efsd'/`y'/stata/dvper.dta", clear
+
+		* 2010 data = mixed/uppercase
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			rename *, lower
+		}
+
+		* keep only hrps
+		keep if a003 == 1
+
+		* Ethnicity
+		* NB a012 and a013 changed categories 2007->2008 to 0 = n/a, 1 = white, 2 = everyone else
+		* this still works, you can get detail from rawper.dta
+		recode a012p (1=0) (2/max=1), gen(c_white)
+		lab var c_white "Constraint: non-white HRP"
+		lab def c_white 0 "White HRP" 1 "Non-white HRP"
+		lab val c_white c_white
+		* if HRPs are not classified - they will show up as missing
+
+		* construct list of vars to keep
+		* if dvper_keepvars is empty STATA will skip
+		local keepvars = ""
+		foreach v of varlist `dvper_keepvars' {
+			capture confirm variable `v'
+			if !_rc {
+				*di "* found `v'"
+				local keepvars = "`keepvars' `v'"
+			}
+			else {
+				di in red "`v' does not exist in `y' - will be missing"
+			}
+		}
+		keep case* c_white `keepvars'
+
+		qui: compress
+
+		save "`efsd'/`y'/stata/dvper-temp.dta", replace
+		******************************
+
+		************
+		* Now c_lli but this time need to collapse it so we count the number in the household with/out lli
+		* and count the number of children of various ages
+
+		use "`efsd'/`y'/stata/dvper.dta", clear
+
+		* 2010 onwards data = mixed/uppercase
+		if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" | "`y'" == "2013" {
+			rename *, lower
+		}
+
+		* assume if someone is eligible for incapacity benefit then = lli
+		* even if not actually receiving
+		* in rawper dvincap = a + b
+		* in dvper a227 (1 = rec) = a a227 (2=not rec) = b
+		* ought to check prevalence with FRS
+		gen c_lli_sum = 0
+		replace c_lli_sum = 1 if a227 > 0
+
+		**********
+		* need to count children of ages < 14 for OECD equivalence scale
+		* can then calculate n over 14 by subtracting from (a040 + a041 + a042)
+
+		gen ba_under14 = 0
+		replace ba_under14 = 1 if a005p < 14
+
+		* collapse to count the incidence of lli & count the number of children in each age group
+		* case is the household id
+		collapse (sum) c_lli_sum ba_*, by(case)
+		gen c_lli = 0
+		replace c_lli = 1 if c_lli_sum > 0
+		drop c_lli_sum
+		lab var c_lli "Constraint: presence of LLI"
+		lab def c_lli 0 "No person with lli" 1 "At least 1 person with lli"
+		lab var ba_under14 "Number of children aged under 14"
+
+		* keep only new variables as we've collapsed to hh
+		keep case* ba_* c_*
+		qui: compress
+		save "`efsd'/`y'/stata/dvper-lli.dta", replace
+
+
+		di "*** MERGE ALL FILES for year = `y'"
+
+		*
+
+		qui: merge case using "`efsd'/`y'/stata/dvhh-temp.dta" "`efsd'/`y'/stata/rawper-temp.dta" "`efsd'/`y'/stata/dvper-temp.dta" "`efsd'/`y'/stata/dvper-lli.dta" "`efsd'/`y'/stata/rawhh-temp.dta" , sort
+
+		su _merge*
+
+		* DELETE TEMPORARY FILES
+		erase "`efsd'/`y'/stata/dvhh-temp.dta"
+		erase "`efsd'/`y'/stata/rawper-temp.dta"
+		erase "`efsd'/`y'/stata/dvper-temp.dta"
+		erase "`efsd'/`y'/stata/dvper-lli.dta"
+		erase "`efsd'/`y'/stata/rawhh-temp.dta"
+
+		gen survey_year = "`y'"
+		tab survey_year
+
+		aorder
+		* keep all (makes life easier below as some vars are in some years and not others)
+
+		***********************************************
+		* calculate OECD equivalisation weight in order to equivalise income or expenditure
+		* see e.g. DWP HBAI reports
+		* can then calculate n over 14 by subtracting from g019
+
+		gen ba_over14 = 0
+		replace ba_over14 = g019 - ba_under14
+
+		gen ba_adults = g018 if g018 > 0
+		* ignore hhs with no adults (how many are there?)
+
+		/*
+		1st adult = .67
+		spouse = .33
+		other 2nd adult = .33
+		3rd adult = .33
+		subsequent adults = .33
+		children aged < 14 = .2
+		children aged 14+ = .33
+		*/
+		* catch hh with no children
+		replace ba_under14 = 0 if ba_under14 == .
+		replace ba_over14 = 0 if ba_over14 == .
+
+		gen oecd_equivbhcwt = 0.67 if ba_adults >= 1
+		replace oecd_equivbhcwt = oecd_equivbhcwt + ((ba_adults-1) * 0.33) + (ba_under14 * 0.2) + (ba_over14 * 0.33)
+		di "*-> Calculating equiv income (OECD) and quartiles/deciles"
+
+		* p344, p389 & p396 changed to *p after 2006 and top coded (!)
+		if `y' > 2005 { // NB: a hyphenated year such as 2005-2006 is evaluated as a subtraction, so it is never > 2005
+			rename p344p p344
+			rename p389p p389
+			rename p396p p396
+		}
+
+		gen equiv_p344 = p344/oecd_equivbhcwt
+		gen equiv_p389bhc = p389/oecd_equivbhcwt
+		gen equiv_p389ahc = (p389-p116t)/oecd_equivbhcwt
+		lab var equiv_p344 "Equivalised normal gross household income (OECD)"
+		lab var equiv_p389bhc "Equivalised normal disposable (net) household income (BHC, OECD)"
+		lab var equiv_p389ahc "Equivalised normal disposable (net) household income (AHC, OECD)"
+
+		local incomes "incanon p344 p389"
+		local incanonl "anonymised hhold inc + allowances"
+		local p344l "gross normal weekly household income"
+		local p389l "normal weekly disposable hhld income"
+		foreach i of local incomes {
+			egen `i'_dec = cut(`i'), group(10)
+			lab var `i'_dec "Deciles: ``i'l'"
+			egen `i'_quart = cut(`i'), group(4)
+			lab var `i'_quart "Quartiles: ``i'l'"
+		}
+
+		* quarter labels changed in 2006
+
+		* old:
+		* 1 april to june
+		* 2 june to september
+		* 3 october to december
+		* 4 january to march
+
+		* new:
+		* a099:
+		* 1 january to march
+		* 2 april to june
+		* 3 july to september
+		* 4 october to december
+
+		gen ba_month = a055
+
+		* create a birth cohort variable
+		* remember that after 2005 age is top coded to 80
+		* year of birth
+		gen ba_birthyear = ba_sampyear - p396
+		* create a birth cohort variable
+		recode ba_birthyear (1900/1909=1 "1900-1909") (1910/1919=2 "1910-1919") (1920/1929=3 "1920-1929") (1930/1939=4 "1930-1939") (1940/1949=5 "1940-1949") (1950/1959=6 "1950-1959") (1960/1969=7 "1960-1969") (1970/1979=8 "1970-1979") (1980/1989=9 "1980-1989") (1990/1999=10 "1990-1999") (2000/2009=11 "2000-2009"), gen(ba_birth_cohort)
+
+		* tab ba_birth_cohort c_age, mi
+
+		***********************
+		* End of per-year processing
+
+		gen ba_quarter = -1
+		replace ba_quarter = 1 if ba_month == 1 | ba_month == 2 | ba_month == 3
+		replace ba_quarter = 2 if ba_month == 4 | ba_month == 5 | ba_month == 6
+		replace ba_quarter = 3 if ba_month == 7 | ba_month == 8 | ba_month == 9
+		replace ba_quarter = 4 if ba_month == 10 | ba_month == 11 | ba_month == 12
+
+		tab ba_quarter
+
+		egen ba_sampyear_quarter = concat(ba_sampyear ba_quarter), punct("_Q")
+		lab var ba_sampyear_quarter "EFS/FES calendar year & quarter"
+
+		egen ba_sampyear_month = concat(ba_sampyear ba_month), punct("_")
+		lab var ba_sampyear_month "EFS/FES calendar year & month"
+
+		gen survey_name = "efs"
+
+		gen uk_country = 1 if region > 0 & region < 10
+		replace uk_country = 2 if region == 10
+		replace uk_country = 3 if region == 11
+		replace uk_country = 4 if region == 12
+		lab def uk_country 1 "England" 2 "Wales" 3 "Scotland" 4 "Northern Ireland"
+		lab val uk_country uk_country
+
+
+		qui: compress
+		save "`outd'/EFS-`y'-extract-BA.dta", replace
+	}
+}
+
+****************************
+* now merge them all into one big file
+
+clear // start with nothing
+
+foreach y of local do_years {
+	di "Appending `y'"
+	qui: append using "`outd'/EFS-`y'-extract-BA.dta", force
+	*erase "`efsd'/`y'/FES-`y'-extract-BA.dta"
+}
+
+* the above code
+tabstat c_*, c(s) s(mean min max)
+
+lab var survey_year "EFS/FES year"
+
+* finally check for duplicate months 2005-6 -> 2006 samples
+tab ba_sampyear ba_month
+tab survey_year ba_month
+
+* Jan/Feb/Mar 2006 are duplicates of Jan/Feb/Mar 2005-6
+drop if survey_year == "2006" & ba_month == 1
+drop if survey_year == "2006" & ba_month == 2
+drop if survey_year == "2006" & ba_month == 3
+
+* check
+tab ba_sampyear ba_month
+
+
+gen caseno = case
+
+* to test
+tabstat c_ncars c_nchild c_npersons c_nearners c_nrooms, by(ba_sampyear)
+
+drop _merge* a055 g* p116t caseno
+
+di "*-> Compressing"
+qui: compress
+
+aorder
+
+order case* survey*
+
+save "`outd'/EFS-`extract_years'-extract-BA.dta", replace
+
+di "************************************************************************"
+if `do_extracts' {
+	di "*-> do_extracts = `do_extracts', all years (`do_years') extracted and refreshed"
+}
+else {
+	* the open brace sits on the else line because Stata requires it there
+	di "*-> do_extracts = `do_extracts', years not extracted so individual files not refreshed"
+}
+di "*-> Job ended at $S_DATE"
+
+log close
+
-- 
GitLab