Commit f9597617 authored by Ben Anderson
Browse files

altered script to create random samples using loop

10%, 50%, 100% (not a sample!!)
parent 6c03a3c5
......@@ -60,14 +60,14 @@ local dpath "`proot'/NEED/End User Licence File 2014/"
* NB this is the 2014 EULF we're using
local ifile "need_eul_may2014"
* original data file
local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'.dta"
local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'"
* If you really wanted to you could set up a loop to iterate over a list of sample values to create a set of random sub-samples
* For now we'll just create a random sub-sample of sample% to make testing models etc easier
* 10 = 10% sample, 50 = 50% sample, 100 = 100% sample etc
local sample 20
local samplet "`sample'pc"
local sample "10 50 100"
local sampleby "EE_BAND PROP_TYPE"
local version "v1.1"
......@@ -82,130 +82,147 @@ log using "`dpath'/processed/process-NEED-EULF-2014-`version'-$S_DATE.smcl", rep
* use these locals to control what happens (set to 0 to skip the code)
* create codebook & some descriptives
local create_codebook = 0
local create_codebook = 1
* create wide form fixed file with (supposedly) unchanging data & a separate 'wide' consumption data file for cross-sectional analysis
local create_xwavefile = 1
* create long form file with wave (yearly) data - be careful, this takes a long time due to large memory use!
local create_longfile = 1
* load the original file
use "`dfile_orig'", clear
use "`dfile_orig'.dta", clear
if `create_codebook' {
* create original EULF codebook
* not much point running thid for each % sample although the counts etc reported in the codebook won't match
* not much point running this for each % sample although the counts etc reported in the codebook won't match
log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`version'-`sample'pc-$S_DATE.smcl", replace name(cb)
log using "`dfile_orig'-codebook-$S_DATE.smcl", replace name(cb)
desc
codebook
log close cb
log on main
}
***** random sample ****
* select a random sample but ensure proportions of sampleby are kept
di "* Keeping `sample'% sample by `sampleby'"
sample `sample', by(`sampleby')
tab `sampleby', mi
if `create_xwavefile' {
* create the file with data that (notionally) doesn't change
* create a wide consumption file
preserve
keep HH_ID Gcons* Econs*
compress
save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
restore
foreach s of local sample {
local samplet "`s'pc"
***** random sample ****
* select a random sample but ensure proportions of sampleby are kept
di "* Keeping `s'% sample by `sampleby'"
sample `s', by(`sampleby')
tab `sampleby', mi
preserve
drop Gcons* Econs*
* fix some mis-codings (or lack of coding of missing)
local vars "E7Flag2012 CWI LI"
foreach v of local vars {
destring `v', force replace
replace `v' = 0 if `v' !=1
label def `v' 0 "No or N/A" 1 "Yes"
label val `v' `v'
}
* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
replace FP_ENG = . if LOFT_DEPTH == 99
replace LOFT_DEPTH = . if LOFT_DEPTH == 99
if `create_xwavefile' {
* create the file with data that (notionally) doesn't change
* create a wide consumption file
preserve
keep HH_ID Gcons* Econs*
compress
save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
restore
* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
tabstat IMD_WALES, by(REGION) s(mean min max n)
* there seem to be some welsh LSOAs allocated to English GORs?
preserve
drop Gcons* Econs*
* fix some mis-codings (or lack of coding of missing)
local vars "E7Flag2012 CWI LI BOILER"
foreach v of local vars {
destring `v', force replace
replace `v' = 0 if `v' !=1
label def `v' 0 "No or N/A" 1 "Yes"
label val `v' `v'
}
* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
* recode category vars into strings to avoid confusion & enable 'unknown' to be retained in models etc (might matter)
tostring LOFT_DEPTH FP_ENG LI_YEAR BOILER_YEAR CWI_YEAR, force replace
replace FP_ENG = "Unknown" if FP_ENG == "99"
replace LOFT_DEPTH = "Unknown" if LOFT_DEPTH == "99"
* comes in as missing?
replace LI_YEAR = "Unknown" if LI_YEAR == "."
replace BOILER_YEAR = "Unknown" if BOILER_YEAR == "."
replace CWI_YEAR = "Unknown" if CWI_YEAR == "."
* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
tabstat IMD_WALES, by(REGION) s(mean min max n)
* there seem to be some welsh LSOAs allocated to English GORs?
tabstat IMD_ENG, by(REGION) s(mean min max n)
* there seem to be some English LSOAs allocated to Wales?
tabstat IMD_ENG, by(REGION) s(mean min max n)
* there seem to be some English LSOAs allocated to Wales?
tab FP_ENG REGION
* REGION is ONS admin codes
* create a new variable with meaningful labels
gen ba_region = 1 if REGION == "E12000001"
replace ba_region = 2 if REGION == "E12000002"
replace ba_region = 3 if REGION == "E12000003"
replace ba_region = 4 if REGION == "E12000004"
replace ba_region = 5 if REGION == "E12000005"
replace ba_region = 6 if REGION == "E12000006"
replace ba_region = 7 if REGION == "E12000007"
replace ba_region = 8 if REGION == "E12000008"
replace ba_region = 9 if REGION == "E12000009"
replace ba_region = 10 if REGION == "W99999999"
lab var ba_region "former Govt Office region (labelled)"
* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
lab val ba_region ba_region
compress
log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
desc
di "** urban/rural would be helpful"
codebook
compress
save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
log close cb_xwave
log on main
restore
}
tabstat FP_ENG, by(REGION)
* REGION is ONS admin codes
* create a new variable with meaningful labels
gen ba_region = 1 if REGION == "E12000001"
replace ba_region = 2 if REGION == "E12000002"
replace ba_region = 3 if REGION == "E12000003"
replace ba_region = 4 if REGION == "E12000004"
replace ba_region = 5 if REGION == "E12000005"
replace ba_region = 6 if REGION == "E12000006"
replace ba_region = 7 if REGION == "E12000007"
replace ba_region = 8 if REGION == "E12000008"
replace ba_region = 9 if REGION == "E12000009"
replace ba_region = 10 if REGION == "W99999999"
lab var ba_region "former Govt Office region (labelled)"
* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
lab val ba_region ba_region
if `create_longfile' {
* create the long file with as few vars as possible (quicker)
* still takes a while...
keep HH_ID Gcons* Econs*
* panel vars:
* Gcons2005 Gcons2005Valid Econs2005 Econs2005Valid -> 2012
local vars "Gcons Econs"
foreach v of local vars {
di "* Renaming -> `v'"
foreach y of numlist 2005/2012 {
di "* -> `v' (`y')"
* put year on the end so reshape works
gen `v'Valid`y' = `v'`y'Valid
* remove old variable to save time in reshape & space
drop `v'`y'Valid
}
}
* this takes a LONG time for the full dataset
reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
rename _j year
* set as panel
xtset HH_ID year
compress
save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
* name the long-file codebook log after this iteration's sample tag (`samplet'): `sample' now holds the whole list "10 50 100", so using it here would give every iteration the same misnamed log and overwrite it
log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'-long-`version'-`samplet'-$S_DATE.smcl", replace name(cbl)
desc
di "** urban/rural would be helpful"
codebook
compress
save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
log close cb_xwave
log close cbl
log on main
restore
}
if `create_longfile' {
* create the long file with as few vars as possible (quicker)
* still takes a while...
keep HH_ID Gcons* Econs*
* panel vars:
* Gcons2005 Gcons2005Valid Econs2005 Econs2005Valid -> 2012
local vars "Gcons Econs"
foreach v of local vars {
di "* Renaming -> `v'"
foreach y of numlist 2005/2012 {
di "* -> `v' (`y')"
* put year on the end so reshape works
gen `v'Valid`y' = `v'`y'Valid
* remove old variable to save time in reshape & space
drop `v'`y'Valid
}
* this leaves us with the long form file in memory
}
* this takes a LONG time for the full dataset
reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
rename _j year
* set as panel
xtset HH_ID year
compress
save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
* this leaves us with the long form file in memory
}
/*
* Link xwave data to long form file
* THIS TAKES AGES and creates a 1.5 GB file for the full dataset - use with care
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment