Administrator approval is now required for registering new accounts. If you are registering a new account and are external to the University, please ask the repository owner to contact ServiceLine to request that your account be approved. Repository owners must include the newly registered email address and the specific repository in the request for approval.

Commit f9597617 authored by Ben Anderson
Browse files

altered script to create random samples using loop

10%, 50%, 100% (not a sample!!)
parent 6c03a3c5
......@@ -60,14 +60,14 @@ local dpath "`proot'/NEED/End User Licence File 2014/"
* NB this is the 2014 EULF we're using
local ifile "need_eul_may2014"
* original data file
local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'.dta"
local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'"
* If you really wanted to you could set up a loop to iterate over a list of sample values to create a set of random sub-samples
* For now we'll just create a random sub-sample of sample% to make testing models etc easier
* 10 = 10% sample, 50 = 50% sample, 100 = 100% sample etc
local sample 20
local samplet "`sample'pc"
local sample "10 50 100"
local sampleby "EE_BAND PROP_TYPE"
local version "v1.1"
......@@ -82,35 +82,37 @@ log using "`dpath'/processed/process-NEED-EULF-2014-`version'-$S_DATE.smcl", rep
* use these locals to control what happens (set to 0 to skip the code)
* create codebook & some descriptives
local create_codebook = 0
local create_codebook = 1
* create wide form fixed file with (supposedly) unchanging data & a separate 'wide' consumption data file for cross-sectional analysis
local create_xwavefile = 1
* create long form file with wave (yearly) data - be careful, this takes a long time due to large memory use!
local create_longfile = 1
* load the original file
use "`dfile_orig'", clear
use "`dfile_orig'.dta", clear
if `create_codebook' {
* create original EULF codebook
* not much point running thid for each % sample although the counts etc reported in the codebook won't match
* not much point running this for each % sample although the counts etc reported in the codebook won't match
log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`version'-`sample'pc-$S_DATE.smcl", replace name(cb)
log using "`dfile_orig'-codebook-$S_DATE.smcl", replace name(cb)
desc
codebook
log close cb
log on main
}
***** random sample ****
* select a random sample but ensure proportions of sampleby are kept
di "* Keeping `sample'% sample by `sampleby'"
sample `sample', by(`sampleby')
foreach s of local sample {
local samplet "`s'pc"
***** random sample ****
* select a random sample but ensure proportions of sampleby are kept
di "* Keeping `s'% sample by `sampleby'"
sample `s', by(`sampleby')
tab `sampleby', mi
tab `sampleby', mi
if `create_xwavefile' {
if `create_xwavefile' {
* create the file with data that (notionally) doesn't change
* create a wide consumption file
......@@ -124,7 +126,7 @@ if `create_xwavefile' {
preserve
drop Gcons* Econs*
* fix some mis-codings (or lack of coding of missing)
local vars "E7Flag2012 CWI LI"
local vars "E7Flag2012 CWI LI BOILER"
foreach v of local vars {
destring `v', force replace
replace `v' = 0 if `v' !=1
......@@ -133,8 +135,14 @@ if `create_xwavefile' {
}
* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
replace FP_ENG = . if LOFT_DEPTH == 99
replace LOFT_DEPTH = . if LOFT_DEPTH == 99
* recode category vars into strings to avoid confusion & enable 'unknown' to be retained in models etc (might matter)
tostring LOFT_DEPTH FP_ENG LI_YEAR BOILER_YEAR CWI_YEAR, force replace
replace FP_ENG = "Unknown" if FP_ENG == "99"
replace LOFT_DEPTH = "Unknown" if LOFT_DEPTH == "99"
* comes in as missing?
replace LI_YEAR = "Unknown" if LI_YEAR == "."
replace BOILER_YEAR = "Unknown" if BOILER_YEAR == "."
replace CWI_YEAR = "Unknown" if CWI_YEAR == "."
* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
......@@ -144,7 +152,8 @@ if `create_xwavefile' {
tabstat IMD_ENG, by(REGION) s(mean min max n)
* there seem to be some English LSOAs allocated to Wales?
tabstat FP_ENG, by(REGION)
tab FP_ENG REGION
* REGION is ONS admin codes
* create a new variable with meaningful labels
gen ba_region = 1 if REGION == "E12000001"
......@@ -175,9 +184,9 @@ if `create_xwavefile' {
log close cb_xwave
log on main
restore
}
}
if `create_longfile' {
if `create_longfile' {
* create the long file with as few vars as possible (quicker)
* still takes a while...
......@@ -203,9 +212,17 @@ if `create_longfile' {
xtset HH_ID year
compress
save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
* write a codebook for the long form file just saved, in its own log (cbl), pausing the main log while we do so
* NB: the log name uses samplet (the current iteration's label, e.g. 10pc) to match the saved .dta file name;
* the sample local holds the whole list of sample sizes, so using it here would give every iteration
* the same (mislabelled) log file name and each pass would overwrite the previous codebook
log off main
log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'-long-`version'-`samplet'-$S_DATE.smcl", replace name(cbl)
desc
codebook
log close cbl
* resume the main log
log on main
* this leaves us with the long form file in memory
}
}
/*
* Link xwave data to long form file
* THIS TAKES AGES and creates a 1.5 GB file for the full dataset - use with care
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment