diff --git a/NEED/process-NEED-EULF-2014.do b/NEED/process-NEED-EULF-2014.do
index a627b959203458745b07ba5683143a99a22050a3..e8357702d3f240c1ddaed0f49b6acd63d3ed222c 100644
--- a/NEED/process-NEED-EULF-2014.do
+++ b/NEED/process-NEED-EULF-2014.do
@@ -60,14 +60,14 @@ local dpath "`proot'/NEED/End User Licence File 2014/"
 
 * NB this is the 2014 EULF we're using
 local ifile "need_eul_may2014"
 
 * original data file
-local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'.dta"
+local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'"
 
 * If you really wanted to you could set up a loop to iterate over a list of sample values to create a set of random sub-samples
 * For now we'll just create a random sub-sample of sample% to make testing models etc easier
 * 10 = 10% sample, 50 = 50% sample, 100 = 100% sample etc
-local sample 20
-local samplet "`sample'pc"
+local sample "10 50 100"
+
 local sampleby "EE_BAND PROP_TYPE"
 
 local version "v1.1"
@@ -82,130 +82,147 @@ log using "`dpath'/processed/process-NEED-EULF-2014-`version'-$S_DATE.smcl", rep
 
 * use these locals to control what happens (set to 0 to skip the code)
 * create codebook & some descriptives
-local create_codebook = 0
+local create_codebook = 1
 
 * create wide form fixed file with (supposedly) unchanging data & a seperate 'wide' consumption data file for cross-sectional analysis
 local create_xwavefile = 1
 
 * create long form file with wave (yearly) data - be careful, this take a long time due to large memory use!
 local create_longfile = 1
 
 * load the original file
-use "`dfile_orig'", clear
+use "`dfile_orig'.dta", clear
 
 if `create_codebook' {
 	* create original EULF codebook
-	* not much point running thid for each % sample although the counts etc reported in the codebook won't match
+	* not much point running this for each % sample although the counts etc reported in the codebook won't match
 	log off main
-	log using "`dpath'/processed/codebook-NEED-EULF-2014-`version'-`sample'pc-$S_DATE.smcl", replace name(cb)
+	log using "`dfile_orig'-codebook-$S_DATE.smcl", replace name(cb)
 	desc
 	codebook
 	log close cb
 	log on main
 }
 
-***** random sample ****
-* select a random sample but ensure proportions of sampleby are kept
-di "* Keeping `sample'% sample by `sampleby'"
-sample `sample', by(`sampleby')
-
-tab `sampleby', mi
-
-
-if `create_xwavefile' {
-	* create the file with data that (notionally) doesn't change
-
-	* create a wide consumption file
-	preserve
-		keep HH_ID Gcons* Econs*
-		compress
-		save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
-	restore
+foreach s of local sample {
+	local samplet "`s'pc"
+	***** random sample ****
+	* select a random sample but ensure proportions of sampleby are kept
+	di "* Keeping `s'% sample by `sampleby'"
+	sample `s', by(`sampleby')
+	tab `sampleby', mi
 
-	preserve
-		drop Gcons* Econs*
-		* fix some mis-codings (or lack of coding of missing)
-		local vars "E7Flag2012 CWI LI"
-		foreach v of local vars {
-			destring `v', force replace
-			replace `v' = 0 if `v' !=1
-			label def `v' 0 "No or N/A" 1 "Yes"
-			label val `v' `v'
-		}
-
-		* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
-		replace FP_ENG = . if LOFT_DEPTH == 99
-		replace LOFT_DEPTH = . if LOFT_DEPTH == 99
+
+	if `create_xwavefile' {
+		* create the file with data that (notionally) doesn't change
+
+		* create a wide consumption file
+		preserve
+			keep HH_ID Gcons* Econs*
+			compress
+			save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
+		restore
 
-	* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
-	tabstat IMD_WALES, by(REGION) s(mean min max n)
-	* there seem to be some welsh LSOAs allocated to English GORs?
+		preserve
+			drop Gcons* Econs*
+			* fix some mis-codings (or lack of coding of missing)
+			local vars "E7Flag2012 CWI LI BOILER"
+			foreach v of local vars {
+				destring `v', force replace
+				replace `v' = 0 if `v' !=1
+				label def `v' 0 "No or N/A" 1 "Yes"
+				label val `v' `v'
+			}
+
+			* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
+			* recode category vars into strings to avoid confusion & enable 'unknown' to be retained in models etc (might matter)
+			tostring LOFT_DEPTH FP_ENG LI_YEAR BOILER_YEAR CWI_YEAR, force replace
+			replace FP_ENG = "Unknown" if FP_ENG == "99"
+			replace LOFT_DEPTH = "Unknown" if LOFT_DEPTH == "99"
+			* comes in as missing?
+			replace LI_YEAR = "Unknown" if LI_YEAR == "."
+			replace BOILER_YEAR = "Unknown" if BOILER_YEAR == "."
+			replace CWI_YEAR = "Unknown" if CWI_YEAR == "."
+
+			* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
+
+			tabstat IMD_WALES, by(REGION) s(mean min max n)
+			* there seem to be some welsh LSOAs allocated to English GORs?
+
+			tabstat IMD_ENG, by(REGION) s(mean min max n)
+			* there seem to be some English LSOAs allocated to Wales?
 
-	tabstat IMD_ENG, by(REGION) s(mean min max n)
-	* there seem to be some English LSOAs allocated to Wales?
+			tab FP_ENG REGION
+
+			* REGION is ONS admin codes
+			* create a new variable with meaningful labels
+			gen ba_region = 1 if REGION == "E12000001"
+			replace ba_region = 2 if REGION == "E12000002"
+			replace ba_region = 3 if REGION == "E12000003"
+			replace ba_region = 4 if REGION == "E12000004"
+			replace ba_region = 5 if REGION == "E12000005"
+			replace ba_region = 6 if REGION == "E12000006"
+			replace ba_region = 7 if REGION == "E12000007"
+			replace ba_region = 8 if REGION == "E12000008"
+			replace ba_region = 9 if REGION == "E12000009"
+			replace ba_region = 10 if REGION == "W99999999"
+
+			lab var ba_region "former Govt Office region (labelled)"
+			* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
+			lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
+				5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
+			lab val ba_region ba_region
+
+			compress
+			log off main
+			log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
+			desc
+			di "** urban/rural would be helpful"
+			codebook
+			compress
+			save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
+			log close cb_xwave
+			log on main
+		restore
+	}
 
-	tabstat FP_ENG, by(REGION)
-	* REGION is ONS admin codes
-	* create a new variable with meaningful labels
-	gen ba_region = 1 if REGION == "E12000001"
-	replace ba_region = 2 if REGION == "E12000002"
-	replace ba_region = 3 if REGION == "E12000003"
-	replace ba_region = 4 if REGION == "E12000004"
-	replace ba_region = 5 if REGION == "E12000005"
-	replace ba_region = 6 if REGION == "E12000006"
-	replace ba_region = 7 if REGION == "E12000007"
-	replace ba_region = 8 if REGION == "E12000008"
-	replace ba_region = 9 if REGION == "E12000009"
-	replace ba_region = 10 if REGION == "W99999999"
-
-	lab var ba_region "former Govt Office region (labelled)"
-	* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
-	lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
-		5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
-	lab val ba_region ba_region
-
+	if `create_longfile' {
+		* create the long file with as few vars as possible (quicker)
+		* still takes a while...
+
+		keep HH_ID Gcons* Econs*
+
+		* panel vars:
+		* Gcons2005 Gcons2005Valid Econs2005 Econs2005Valid -> 2012
+		local vars "Gcons Econs"
+		foreach v of local vars {
+			di "* Renaming -> `v'"
+			foreach y of numlist 2005/2012 {
+				di "* -> `v' (`y')"
+				* put year on the end so reshape works
+				gen `v'Valid`y' = `v'`y'Valid
+				* remove old variable to save time in reshape & space
+				drop `v'`y'Valid
+			}
+		}
+		* this takes a LONG time for the full dataset
+		reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
+		rename _j year
+		* set as panel
+		xtset HH_ID year
 		compress
+		save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
+
 		log off main
-		log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
+		log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'-long-`version'-`sample'pc-$S_DATE.smcl", replace name(cbl)
 		desc
-		di "** urban/rural would be helpful"
 		codebook
-		compress
-		save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
-		log close cb_xwave
+		log close cbl
 		log on main
-	restore
-}
-
-if `create_longfile' {
-	* create the long file with as few vars as possible (quicker)
-	* still takes a while...
-
-	keep HH_ID Gcons* Econs*
-
-	* panel vars:
-	* Gcons2005 Gcons2005Valid Econs2005 Econs2005Valid -> 2012
-	local vars "Gcons Econs"
-	foreach v of local vars {
-		di "* Renaming -> `v'"
-		foreach y of numlist 2005/2012 {
-			di "* -> `v' (`y')"
-			* put year on the end so reshape works
-			gen `v'Valid`y' = `v'`y'Valid
-			* remove old variable to save time in reshape & space
-			drop `v'`y'Valid
-		}
+
+		* this leaves us with the long form file in memory
 	}
-	* this takes a LONG time for the full dataset
-	reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
-	rename _j year
-	* set as panel
-	xtset HH_ID year
-	compress
-	save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
-	* this leaves us with the long form file in memory
 }
-
 
 /*
 * Link xwave data to long form file
 * THIS TAKES AGES and creates a 1.5 GB file for the full dataset - use with care
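A note on the new `foreach s of local sample {...}` loop above: Stata's sample command permanently drops the unsampled observations from the data in memory, and the long-file branch then keeps and reshapes whatever is left. As written, the 50% and 100% passes therefore appear to be drawn from the previous pass's output rather than from the full EULF, and by then the EE_BAND and PROP_TYPE variables used in by() have been dropped. A minimal sketch of one way round this, assuming each percentage is meant to be an independent sub-sample of the full file, is to re-load `dfile_orig' (or preserve/restore) at the top of each iteration:

* illustrative sketch only - not part of the patch above
foreach s of local sample {
	local samplet "`s'pc"
	* start each iteration from the full EULF so the sub-samples are drawn independently
	use "`dfile_orig'.dta", clear
	di "* Keeping `s'% sample by `sampleby'"
	sample `s', by(`sampleby')
	* ... xwave and long-form processing for this `samplet' as above ...
}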