Administrator approval is now required for registering new accounts. If you are registering a new account, and are external to the University, please ask the repository owner to contact ServiceLine to request your account be approved. Repository owners must include the newly registered email address, and specific repository in the request for approval.

Commit f9597617 authored by Ben Anderson's avatar Ben Anderson
Browse files

altered script to create random samples using loop

10%, 50%, 100% (not a sample!!)
parent 6c03a3c5
......@@ -60,14 +60,14 @@ local dpath "`proot'/NEED/End User Licence File 2014/"
* NB this is the 2014 EULF we're using
local ifile "need_eul_may2014"
* original data file
local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'.dta"
local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'"
* If you really wanted to you could set up a loop to iterate over a list of sample values to create a set of random sub-samples
* For now we'll just create a random sub-sample of sample% to make testing models etc easier
* 10 = 10% sample, 50 = 50% sample, 100 = 100% sample etc
local sample 20
local samplet "`sample'pc"
local sample "10 50 100"
local sampleby "EE_BAND PROP_TYPE"
local version "v1.1"
......@@ -82,130 +82,147 @@ log using "`dpath'/processed/process-NEED-EULF-2014-`version'-$S_DATE.smcl", rep
* use these locals to control what happens (set to 0 to skip the code)
* create codebook & some descriptives
local create_codebook = 0
local create_codebook = 1
* create wide form fixed file with (supposedly) unchanging data & a seperate 'wide' consumption data file for cross-sectional analysis
local create_xwavefile = 1
* create long form file with wave (yearly) data - be careful, this take a long time due to large memory use!
local create_longfile = 1
* load the original file
use "`dfile_orig'", clear
use "`dfile_orig'.dta", clear
if `create_codebook' {
* create original EULF codebook
* not much point running thid for each % sample although the counts etc reported in the codebook won't match
* not much point running this for each % sample although the counts etc reported in the codebook won't match
log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`version'-`sample'pc-$S_DATE.smcl", replace name(cb)
log using "`dfile_orig'-codebook-$S_DATE.smcl", replace name(cb)
desc
codebook
log close cb
log on main
}
***** random sample ****
* select a random sample but ensure proportions of sampleby are kept
di "* Keeping `sample'% sample by `sampleby'"
sample `sample', by(`sampleby')
tab `sampleby', mi
if `create_xwavefile' {
* create the file with data that (notionally) doesn't change
* create a wide consumption file
preserve
keep HH_ID Gcons* Econs*
compress
save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
restore
foreach s of local sample {
local samplet "`s'pc"
***** random sample ****
* select a random sample but ensure proportions of sampleby are kept
di "* Keeping `s'% sample by `sampleby'"
sample `s', by(`sampleby')
tab `sampleby', mi
preserve
drop Gcons* Econs*
* fix some mis-codings (or lack of coding of missing)
local vars "E7Flag2012 CWI LI"
foreach v of local vars {
destring `v', force replace
replace `v' = 0 if `v' !=1
label def `v' 0 "No or N/A" 1 "Yes"
label val `v' `v'
}
* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
replace FP_ENG = . if LOFT_DEPTH == 99
replace LOFT_DEPTH = . if LOFT_DEPTH == 99
if `create_xwavefile' {
* create the file with data that (notionally) doesn't change
* create a wide consumption file
preserve
keep HH_ID Gcons* Econs*
compress
save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
restore
* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
tabstat IMD_WALES, by(REGION) s(mean min max n)
* there seem to be some welsh LSOAs allocated to English GORs?
preserve
drop Gcons* Econs*
* fix some mis-codings (or lack of coding of missing)
local vars "E7Flag2012 CWI LI BOILER"
foreach v of local vars {
destring `v', force replace
replace `v' = 0 if `v' !=1
label def `v' 0 "No or N/A" 1 "Yes"
label val `v' `v'
}
* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
* recode category vars into strings to avoid confusion & enable 'unknown' to be retained in models etc (might matter)
tostring LOFT_DEPTH FP_ENG LI_YEAR BOILER_YEAR CWI_YEAR, force replace
replace FP_ENG = "Unknown" if FP_ENG == "99"
replace LOFT_DEPTH = "Unknown" if LOFT_DEPTH == "99"
* comes in as missing?
replace LI_YEAR = "Unknown" if LI_YEAR == "."
replace BOILER_YEAR = "Unknown" if BOILER_YEAR == "."
replace CWI_YEAR = "Unknown" if CWI_YEAR == "."
* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
tabstat IMD_WALES, by(REGION) s(mean min max n)
* there seem to be some welsh LSOAs allocated to English GORs?
tabstat IMD_ENG, by(REGION) s(mean min max n)
* there seem to be some English LSOAs allocated to Wales?
tabstat IMD_ENG, by(REGION) s(mean min max n)
* there seem to be some English LSOAs allocated to Wales?
tab FP_ENG REGION
* REGION is ONS admin codes
* create a new variable with meaningful labels
gen ba_region = 1 if REGION == "E12000001"
replace ba_region = 2 if REGION == "E12000002"
replace ba_region = 3 if REGION == "E12000003"
replace ba_region = 4 if REGION == "E12000004"
replace ba_region = 5 if REGION == "E12000005"
replace ba_region = 6 if REGION == "E12000006"
replace ba_region = 7 if REGION == "E12000007"
replace ba_region = 8 if REGION == "E12000008"
replace ba_region = 9 if REGION == "E12000009"
replace ba_region = 10 if REGION == "W99999999"
lab var ba_region "former Govt Office region (labelled)"
* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
lab val ba_region ba_region
compress
log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
desc
di "** urban/rural would be helpful"
codebook
compress
save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
log close cb_xwave
log on main
restore
}
tabstat FP_ENG, by(REGION)
* REGION is ONS admin codes
* create a new variable with meaningful labels
gen ba_region = 1 if REGION == "E12000001"
replace ba_region = 2 if REGION == "E12000002"
replace ba_region = 3 if REGION == "E12000003"
replace ba_region = 4 if REGION == "E12000004"
replace ba_region = 5 if REGION == "E12000005"
replace ba_region = 6 if REGION == "E12000006"
replace ba_region = 7 if REGION == "E12000007"
replace ba_region = 8 if REGION == "E12000008"
replace ba_region = 9 if REGION == "E12000009"
replace ba_region = 10 if REGION == "W99999999"
lab var ba_region "former Govt Office region (labelled)"
* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
lab val ba_region ba_region
if `create_longfile' {
* create the long file with as few vars as possible (quicker)
* still takes a while...
keep HH_ID Gcons* Econs*
* panel vars:
* Gcons2005 Gcons2005Valid Econs2005 Econs2005Valid -> 2012
local vars "Gcons Econs"
foreach v of local vars {
di "* Renaming -> `v'"
foreach y of numlist 2005/2012 {
di "* -> `v' (`y')"
* put year on the end so reshape works
gen `v'Valid`y' = `v'`y'Valid
* remove old variable to save time in reshape & space
drop `v'`y'Valid
}
}
* this takes a LONG time for the full dataset
reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
rename _j year
* set as panel
xtset HH_ID year
compress
save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'-long-`version'-`sample'pc-$S_DATE.smcl", replace name(cbl)
desc
di "** urban/rural would be helpful"
codebook
compress
save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
log close cb_xwave
log close cbl
log on main
restore
}
if `create_longfile' {
* create the long file with as few vars as possible (quicker)
* still takes a while...
keep HH_ID Gcons* Econs*
* panel vars:
* Gcons2005 Gcons2005Valid Econs2005 Econs2005Valid -> 2012
local vars "Gcons Econs"
foreach v of local vars {
di "* Renaming -> `v'"
foreach y of numlist 2005/2012 {
di "* -> `v' (`y')"
* put year on the end so reshape works
gen `v'Valid`y' = `v'`y'Valid
* remove old variable to save time in reshape & space
drop `v'`y'Valid
}
* this leaves us with the long form file in memory
}
* this takes a LONG time for the full dataset
reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
rename _j year
* set as panel
xtset HH_ID year
compress
save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
* this leaves us with the long form file in memory
}
/*
* Link xwave data to long form file
* THIS TAKES AGES and creates a 1.5 GB file for the full dataset - use with care
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment