Skip to content
Snippets Groups Projects
Commit 15d12e2f authored by Ben Anderson's avatar Ben Anderson
Browse files

simplified extraction code

only keeps a basic core of variables to produce a smaller file on the
assumption that this will be linked back to source expenditure data for
specific analyses
parent 48cb9815
No related branches found
No related tags found
No related merge requests found
......@@ -35,9 +35,11 @@ GNU General Public License for more details.
* History
* 15/8/2012 moved creation of ba_quarter etc to per-year processing
* TO DO
* update to 2012
* 28/4/2015 - adding 2011 & 2012
* LCFS database changes:
* 2010-2011 - 7272volume_h_changes_database_2011.xls
* 2011-2012 - 7472_volume_h_changes_database_2012.xls
* 29/4/2015 - changed to only produce basic file on the assumption that this will be linked back to source expenditure data for specific analyses
* NB - the script assumes a certain folder structure for the source EFS data like so:
* `efsd'/<year>/stata/<datafile>.dta
......@@ -52,7 +54,7 @@ local place = "/Users/ben/Documents/Work" local efsd = "`place'/Data/Social Scie
local logd = "`efsd'/log_files"local outd = "`efsd'/processed"
* Years to be extracted
* 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010
* 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011
* NB: from 2006 the EFS is collected on a calendar year basis.
* This means that 2005-6_Q4 and 2006_Q1 are exactly the same cases
......@@ -61,15 +63,18 @@ local logd = "`efsd'/log_files" local outd = "`efsd'/processed"
* To save time you can leave out years you have already processed
* just paste the ones you want into the do_years local variable below
local years "2001-2010" // just a label
local all_years "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010" // years to process local do_years = "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010"
local years "2001-2012" // just a label
* 2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011
local all_years "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012" // years to process local do_years = "2001-2002 2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012"
* set to 1 to refresh each yearly extract you listed in do_years
local do_extracts "1"
* drop the first survey for the merge as it is loaded first
local mergeyears = "2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010"
local mergeyears = "2002-2003 2003-2004 2004-2005 2005-2006 2006 2007 2008 2009 2010 2011 2012"
* be sure to keep only these from the dv hh file - last ones needed for income equivalisation later
local dvhh_keepvars = "incanon weight* a055 g018 g019 p116* p344* p389*"
capture log close
log using "`logd'/ONS-UK-EFS-time-series-extract-$S_DATE.smcl", replace
......@@ -121,8 +126,8 @@ if `do_extracts' {
* c_white_0 c_white_1 * 0 "White HRP" 1 "Non-white HRP"
*/ ********* di "* dv household file" use "`efsd'/`y'/stata/dvhh.dta", clear
* 2010 data = mixed/uppercase
if "`y'" === "2010" {
* 2010 onwards data = mixed/uppercase
if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
rename *, lower
} ** sex of HRP gen c_gender = -1 replace c_gender = 0 if sexhrp == 1 replace c_gender = 1 if sexhrp == 2 lab var c_gender "Constraint: Gender of HRP" lab def c_gender 0 "Male" 1 "Female" lab val c_gender c_gender ** age of HRP
* need to use 75+ as few 80+ after 2001-2 recode p396p (min/15= . ) (16/24 = 0) (25/34 = 1) (35/44 = 2) (45/54 = 3) /// (55/64 = 4) (65/74 = 5) (75/max = 6), gen(c_age) lab var c_age "Constraint: Age of HRP"
......@@ -134,29 +139,24 @@ if `do_extracts' {
* has 1, 2 and more than 2 - so needs recoding. recode c_nchild (0=0) (1=1) (2/max=2) lab var c_nchild "Constraint: number of children" lab define c_nchild 0 None 1 One 2 "Two or more" lab val c_nchild c_nchild ** Sorting sort case
* keep all variables and base final merge on it
* keep only basic variables
keep case* c_* region `dvhh_keepvars'
qui: compress * save all vars save "`efsd'/`y'/stata/dvhh-temp.dta", replace ****************************** di "* raw household file for c_comp and c_accom"
* also to pick up electricity water payments periodicity etc for error analysis use "`efsd'/`y'/stata/rawhh.dta", clear
* 2010 data = mixed/uppercase
if "`y'" === "2010" {
* 2010 onwards data = mixed/uppercase
if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
rename *, lower
} * accomodation type * Object= * 0 Detached * 1 Semi * 2 Terrace * 3 flat/maisontte * 4 other gen ba_c_accom = -1 replace ba_c_accom = 0 if hsetype == 1 replace ba_c_accom = 1 if hsetype == 2 replace ba_c_accom = 2 if hsetype == 3
if survyr > 2001 {
* grr, why can't var names stay the same?!
gen acomtype = accom
} replace ba_c_accom = 3 if acomtype == 2 replace ba_c_accom = 4 if acomtype == 3 replace ba_c_accom = 4 if acomtype == 4 lab var ba_c_accom "Constraint: accommodation type" lab define c_accom 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other" lab val ba_c_accom c_accom ** Composition. * Object = * 0 'married/partnered' * 1 'single parent' * 2 'single person' * 3 'other'. * co-habiting. gen ba_c_comp=1 if numcpart>0 * married. replace ba_c_comp=0 if nummpart>0 * single parent - assumes a single adult living with 1 or more * children is a single parent. replace ba_c_comp=1 if (numadult==1 & numchild>0) * single person. replace ba_c_comp=2 if (numadult==1 & numchild==0) * the rest - this is a cheat! recode ba_c_comp (missing=3) lab var ba_c_comp "Constraint: household composition" lab define c_comp 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other" lab val ba_c_comp c_comp tab ba_c_comp numcpart tab ba_c_comp nummpart * rawhh derived cable tv dummies * if tvtype = 3 = cable, if = 4 is cable + phone gen tvtype_2 = tvtype2 gen tvtype_3 = tvtype3 gen tvtype_4 = tvtype4 * if exists - gen tvtype_5 = tvtype5 egen ba_c_has_cable_rawhh = anymatch(tvtype_*), values(3 4) * check for tvtype(1) as well replace ba_c_has_cable_rawhh = 1 if tvtype == 3 lab var ba_c_has_cable_rawhh "Sim: Has cable based on tvtype in rawhh"
gen ba_calyear = -1
} replace ba_c_accom = 3 if acomtype == 2 replace ba_c_accom = 4 if acomtype == 3 replace ba_c_accom = 4 if acomtype == 4 lab var ba_c_accom "Constraint: accommodation type" lab define c_accom 0 "Detached" 1 "Semi-detached" 2 "Terraced" 3 "flat/maisonette" 4 "other" lab val ba_c_accom c_accom ** Composition. * Object = * 0 'married/partnered' * 1 'single parent' * 2 'single person' * 3 'other'. * co-habiting. gen ba_c_comp=1 if numcpart>0 * married. replace ba_c_comp=0 if nummpart>0 * single parent - assumes a single adult living with 1 or more * children is a single parent. replace ba_c_comp=1 if (numadult==1 & numchild>0) * single person. replace ba_c_comp=2 if (numadult==1 & numchild==0) * the rest - this is a cheat! recode ba_c_comp (missing=3) lab var ba_c_comp "Constraint: household composition" lab define c_comp 0 "married/partnered" 1 "single parent" 2 "single person" 3 "other" lab val ba_c_comp c_comp tab ba_c_comp numcpart tab ba_c_comp nummpart gen ba_calyear = -1
local keepvars_orig "waterpay watermet elecpay eacamt eacper elecpayo dveac estndordamt estndord estndo_1 estndo_2 dvestndo dsselecf dsselecp dwpelecf dwpelecp dwpper"
local keepvars ""
if "`y'" == "2001-2002" | "`y'" == "2002-2003" | "`y'" == "2003-2004" | "`y'" == "2004-2005" | "`y'" == "2005-2006" | "`y'" == "2006" | "`y'" == "2007" {
di "* Setting up cal year for `y'"
tab survyr sampyear, mi
replace ba_sampyear = sampyear
gen ba_sampyear = sampyear
* ba_calyear removed as was actually sample year
* construct list of vars to keep based on ideal
......@@ -171,18 +171,14 @@ if `do_extracts' {
*di in red "weight does not exist"
}
}
di "***"
di "* Want: `keepvars_orig'"
di "* Have: `keepvars'"
di "***"
keep case ba_* wsinc water* sewsep ctwat percwat percsew percwsew ctwat `keepvars'
keep case* ba_*
di "* Done setting up sample year for `y'"
}
if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" {
if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
di "* Setting up cal year for `y'"
* sampyear variable removed
replace ba_sampyear = survyr
gen ba_sampyear = survyr
* construct list of vars to keep based on ideal
foreach v of local keepvars_orig {
capture confirm variable `v'
......@@ -194,12 +190,8 @@ if `do_extracts' {
*di in red "weight does not exist"
}
}
di "***"
di "* Want: `keepvars_orig'"
di "* Have: `keepvars'"
di "***"
lab var ba_sampyear "Sample year"
keep case ba_* wsinc water* sewsep ctwat percwat percsew percwsew ctwat `keepvars'
keep case* ba_*
di "* Done setting up sample year for `y'"
}
......@@ -215,7 +207,7 @@ if `do_extracts' {
use "`efsd'/`y'/stata/rawper.dta", clear
* 2010 onwards data = mixed/uppercase
if "`y'" === "2010" {
if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
rename *, lower
}
* keep HRPs
......@@ -233,7 +225,7 @@ if `do_extracts' {
di "* Age error: `y'"
drop if dvage18 == 2
}
else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" {
else if "`y'" == "2006" | "`y'" == "2007" | "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
di "Removing HRPs where age < 16 for `y'"
drop if dvage_p < 16
}
......@@ -242,12 +234,21 @@ if `do_extracts' {
drop if dvage < 16
}
* ethnic detail
di "* ethnic detail"
* Build c_ethnicd from whichever source ethnicity variable is used for survey year `y'
if "`y'" == "2008" | "`y'" == "2009" | "`y'" == "2010" {
di "* year = `y'"
* 2008-2010: copy eth01p
gen c_ethnicd = eth01p
}
else if "`y'" == "2011" | "`y'" == "2012" {
di "* year = `y'"
* 2011-2012: start from ethep, then fill values that are still missing
* from ethwp, ethsp and ethnip, in that order
* NOTE(review): presumably these are country-specific ethnicity variables - confirm against the LCFS changes database
* NOTE(review): "== ." matches system missing only; extended missing codes (.a-.z) would not be filled - confirm this is intended
gen c_ethnicd = ethep
replace c_ethnicd = ethwp if c_ethnicd == .
replace c_ethnicd = ethsp if c_ethnicd == .
replace c_ethnicd = ethnip if c_ethnicd == .
}
else {
di "* year = `y'"
* before 2008
gen c_ethnicd = ethnic_p
}
......@@ -256,20 +257,20 @@ if `do_extracts' {
lab val c_ethnicd c_ethnicd
* keep only new variables
keep case c_*
keep case* c_*
save "`efsd'/`y'/stata/rawper-temp.dta", replace
************ di "* Need dvper file to count n children aged under 14 (for OECD equivalisation weight)"
use "`efsd'/`y'/stata/dvper.dta", clear
* 2010 onwards data = mixed/uppercase
if "`y'" === "2010" {
if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
rename *, lower
} * keep only hrps keep if a003 == 1
* Ethnicity
* NB a012 and a013 changed categories 2007->2008 to 0 = n/a, 1 = white, 2 = everyone else
* this still works, you can get detail from rawper.dta recode a012p (1=0) (2/max=1), gen(c_white) lab var c_white "Constraint: non-white HRP" lab def c_white 0 "White HRP" 1 "Non-white HRP" lab val c_white c_white * NB - 20 HRPs are not classified - they will show up as missing keep case c_white a010
* this still works, you can get detail from rawper.dta recode a012p (1=0) (2/max=1), gen(c_white) lab var c_white "Constraint: non-white HRP" lab def c_white 0 "White HRP" 1 "Non-white HRP" lab val c_white c_white * if HRPs are not classified - they will show up as missing keep case* c_white
qui: compress
save "`efsd'/`y'/stata/dvper-temp.dta", replace
......@@ -278,8 +279,8 @@ if `do_extracts' {
* and count the number of children of various ages
use "`efsd'/`y'/stata/dvper.dta", clear
* 2010 data = mixed/uppercase
if "`y'" === "2010" {
* 2010 onwards data = mixed/uppercase
if "`y'" == "2010" | "`y'" == "2011" | "`y'" == "2012" {
* variable names in these years' files are mixed/upper case:
* lower-case every variable so the references below match
rename *, lower
}
......@@ -309,7 +310,7 @@ if `do_extracts' {
lab var ba_under14 "Number of children aged under 14"
* keep only new variables
keep case ba_* c_*
keep case* ba_* c_*
qui: compress
save "`efsd'/`y'/stata/dvper-lli.dta", replace
di "*** MERGE ALL FILES for year = `y'"
......@@ -334,7 +335,7 @@ if `do_extracts' {
* keep all (makes life easier below as some vars are in some years and not others)
***********************************************
* calculate OECD equivaisation weight in order to equivalise income or expenditure
* calculate OECD equivalisation weight in order to equivalise income or expenditure
* see e.g. DWP HBAI reports
* can then calculate n over 14 by subtracting from g019
......@@ -419,7 +420,7 @@ foreach y of local mergeyears {
tabstat c_*, c(s) s(mean min max)
lab var year "Calendar year"lab var survey_year "EFS/FES year"
lab var survey_year "EFS/FES year"
* finally check for duplicate months 2005-6 -> 2006 samples
tab ba_calyear ba_month
......@@ -436,17 +437,9 @@ tab ba_calyear ba_month
di "Compressing"
qui: compress
* drop fs* as only exist in 2005-6
drop fs*
aorder
* full version
save "`outd'/EFS-`years'-extract-BA.dta", replace
* use this version to match to case in older files for specific variable input
keep case survey_year year gorx incanon c_* ba_* weight*
gen caseno = case
if `do_extracts' {
......@@ -457,7 +450,9 @@ else
di "do_extracts = `do_extracts', years not extracted so individual files not refreshed"
}
su c_*
* to test
tabstat c_ncars c_nchild c_npersons c_nearners c_nrooms, by(ba_sampyear)
save "`outd'/EFS-`years'-extract-reduced-BA.dta", replace
di "Job ended at $S_DATE"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment