Administrator approval is now required for registering new accounts. If you are registering a new account and are external to the University, please ask the repository owner to contact ServiceLine to request that your account be approved. Repository owners must include the newly registered email address and the specific repository in the request for approval.

Commit f9597617 authored by Ben Anderson
Browse files

altered script to create random samples using loop

10%, 50%, 100% (not a sample!!)
parent 6c03a3c5
...@@ -60,14 +60,14 @@ local dpath "`proot'/NEED/End User Licence File 2014/" ...@@ -60,14 +60,14 @@ local dpath "`proot'/NEED/End User Licence File 2014/"
* NB this is the 2014 EULF we're using * NB this is the 2014 EULF we're using
local ifile "need_eul_may2014" local ifile "need_eul_may2014"
* original data file * original data file
local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'.dta" local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'"
* If you really wanted to you could set up a loop to iterate over a list of sample values to create a set of random sub-samples * If you really wanted to you could set up a loop to iterate over a list of sample values to create a set of random sub-samples
* For now we'll just create a random sub-sample of sample% to make testing models etc easier * For now we'll just create a random sub-sample of sample% to make testing models etc easier
* 10 = 10% sample, 50 = 50% sample, 100 = 100% sample etc * 10 = 10% sample, 50 = 50% sample, 100 = 100% sample etc
local sample 20 local sample "10 50 100"
local samplet "`sample'pc"
local sampleby "EE_BAND PROP_TYPE" local sampleby "EE_BAND PROP_TYPE"
local version "v1.1" local version "v1.1"
...@@ -82,130 +82,147 @@ log using "`dpath'/processed/process-NEED-EULF-2014-`version'-$S_DATE.smcl", rep ...@@ -82,130 +82,147 @@ log using "`dpath'/processed/process-NEED-EULF-2014-`version'-$S_DATE.smcl", rep
* use these locals to control what happens (set to 0 to skip the code) * use these locals to control what happens (set to 0 to skip the code)
* create codebook & some descriptives * create codebook & some descriptives
local create_codebook = 0 local create_codebook = 1
* create wide form fixed file with (supposedly) unchanging data & a separate 'wide' consumption data file for cross-sectional analysis * create wide form fixed file with (supposedly) unchanging data & a separate 'wide' consumption data file for cross-sectional analysis
local create_xwavefile = 1 local create_xwavefile = 1
* create long form file with wave (yearly) data - be careful, this takes a long time due to large memory use! * create long form file with wave (yearly) data - be careful, this takes a long time due to large memory use!
local create_longfile = 1 local create_longfile = 1
* load the original file * load the original file
use "`dfile_orig'", clear use "`dfile_orig'.dta", clear
if `create_codebook' { if `create_codebook' {
* create original EULF codebook * create original EULF codebook
* not much point running thid for each % sample although the counts etc reported in the codebook won't match * not much point running this for each % sample although the counts etc reported in the codebook won't match
log off main log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`version'-`sample'pc-$S_DATE.smcl", replace name(cb) log using "`dfile_orig'-codebook-$S_DATE.smcl", replace name(cb)
desc desc
codebook codebook
log close cb log close cb
log on main log on main
} }
***** random sample **** foreach s of local sample {
* select a random sample but ensure proportions of sampleby are kept local samplet "`s'pc"
di "* Keeping `sample'% sample by `sampleby'" ***** random sample ****
sample `sample', by(`sampleby') * select a random sample but ensure proportions of sampleby are kept
di "* Keeping `s'% sample by `sampleby'"
tab `sampleby', mi sample `s', by(`sampleby')
if `create_xwavefile' {
* create the file with data that (notionally) doesn't change
* create a wide consumption file
preserve
keep HH_ID Gcons* Econs*
compress
save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
restore
tab `sampleby', mi
preserve
drop Gcons* Econs* if `create_xwavefile' {
* fix some mis-codings (or lack of coding of missing) * create the file with data that (notionally) doesn't change
local vars "E7Flag2012 CWI LI"
foreach v of local vars { * create a wide consumption file
destring `v', force replace preserve
replace `v' = 0 if `v' !=1 keep HH_ID Gcons* Econs*
label def `v' 0 "No or N/A" 1 "Yes" compress
label val `v' `v' save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
} restore
* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
replace FP_ENG = . if LOFT_DEPTH == 99
replace LOFT_DEPTH = . if LOFT_DEPTH == 99
* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
tabstat IMD_WALES, by(REGION) s(mean min max n) preserve
* there seem to be some welsh LSOAs allocated to English GORs? drop Gcons* Econs*
* fix some mis-codings (or lack of coding of missing)
local vars "E7Flag2012 CWI LI BOILER"
foreach v of local vars {
destring `v', force replace
replace `v' = 0 if `v' !=1
label def `v' 0 "No or N/A" 1 "Yes"
label val `v' `v'
}
* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
* recode category vars into strings to avoid confusion & enable 'unknown' to be retained in models etc (might matter)
tostring LOFT_DEPTH FP_ENG LI_YEAR BOILER_YEAR CWI_YEAR, force replace
replace FP_ENG = "Unknown" if FP_ENG == "99"
replace LOFT_DEPTH = "Unknown" if LOFT_DEPTH == "99"
* comes in as missing?
replace LI_YEAR = "Unknown" if LI_YEAR == "."
replace BOILER_YEAR = "Unknown" if BOILER_YEAR == "."
replace CWI_YEAR = "Unknown" if CWI_YEAR == "."
* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
tabstat IMD_WALES, by(REGION) s(mean min max n)
* there seem to be some welsh LSOAs allocated to English GORs?
tabstat IMD_ENG, by(REGION) s(mean min max n)
* there seem to be some English LSOAs allocated to Wales?
tabstat IMD_ENG, by(REGION) s(mean min max n) tab FP_ENG REGION
* there seem to be some English LSOAs allocated to Wales?
* REGION is ONS admin codes
* create a new variable with meaningful labels
gen ba_region = 1 if REGION == "E12000001"
replace ba_region = 2 if REGION == "E12000002"
replace ba_region = 3 if REGION == "E12000003"
replace ba_region = 4 if REGION == "E12000004"
replace ba_region = 5 if REGION == "E12000005"
replace ba_region = 6 if REGION == "E12000006"
replace ba_region = 7 if REGION == "E12000007"
replace ba_region = 8 if REGION == "E12000008"
replace ba_region = 9 if REGION == "E12000009"
replace ba_region = 10 if REGION == "W99999999"
lab var ba_region "former Govt Office region (labelled)"
* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
lab val ba_region ba_region
compress
log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
desc
di "** urban/rural would be helpful"
codebook
compress
save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
log close cb_xwave
log on main
restore
}
tabstat FP_ENG, by(REGION) if `create_longfile' {
* REGION is ONS admin codes * create the long file with as few vars as possible (quicker)
* create a new variable with meaningful labels * still takes a while...
gen ba_region = 1 if REGION == "E12000001"
replace ba_region = 2 if REGION == "E12000002" keep HH_ID Gcons* Econs*
replace ba_region = 3 if REGION == "E12000003"
replace ba_region = 4 if REGION == "E12000004" * panel vars:
replace ba_region = 5 if REGION == "E12000005" * Gcons2005 Gcons2005Valid Econs2005 Econs2005Valid -> 2012
replace ba_region = 6 if REGION == "E12000006" local vars "Gcons Econs"
replace ba_region = 7 if REGION == "E12000007" foreach v of local vars {
replace ba_region = 8 if REGION == "E12000008" di "* Renaming -> `v'"
replace ba_region = 9 if REGION == "E12000009" foreach y of numlist 2005/2012 {
replace ba_region = 10 if REGION == "W99999999" di "* -> `v' (`y')"
* put year on the end so reshape works
lab var ba_region "former Govt Office region (labelled)" gen `v'Valid`y' = `v'`y'Valid
* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html * remove old variable to save time in reshape & space
lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" /// drop `v'`y'Valid
5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales" }
lab val ba_region ba_region }
* this takes a LONG time for the full dataset
reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
rename _j year
* set as panel
xtset HH_ID year
compress compress
save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
log off main log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave) log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'-long-`version'-`sample'pc-$S_DATE.smcl", replace name(cbl)
desc desc
di "** urban/rural would be helpful"
codebook codebook
compress log close cbl
save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
log close cb_xwave
log on main log on main
restore
} * this leaves us with the long form file in memory
if `create_longfile' {
* create the long file with as few vars as possible (quicker)
* still takes a while...
keep HH_ID Gcons* Econs*
* panel vars:
* Gcons2005 Gcons2005Valid Econs2005 Econs2005Valid -> 2012
local vars "Gcons Econs"
foreach v of local vars {
di "* Renaming -> `v'"
foreach y of numlist 2005/2012 {
di "* -> `v' (`y')"
* put year on the end so reshape works
gen `v'Valid`y' = `v'`y'Valid
* remove old variable to save time in reshape & space
drop `v'`y'Valid
}
} }
* this takes a LONG time for the full dataset
reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
rename _j year
* set as panel
xtset HH_ID year
compress
save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
* this leaves us with the long form file in memory
} }
/* /*
* Link xwave data to long form file * Link xwave data to long form file
* THIS TAKES AGES and creates a 1.5 GB file for the full dataset - use with care * THIS TAKES AGES and creates a 1.5 GB file for the full dataset - use with care
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment