altered script to create random samples using loop

Sample sizes: 10%, 50% and 100% (the 100% run is the full file, not a sample!)

altered script to create random samples using loop
f9597617 · Ben Anderson · 6c03a3c5 · f9597617
Commit f9597617 authored 10 years ago by Ben Anderson
--- a/NEED/process-NEED-EULF-2014.do
+++ b/NEED/process-NEED-EULF-2014.do
@@ -60,14 +60,14 @@ local dpath "`proot'/NEED/End User Licence File 2014/"
 * NB this is the 2014 EULF we're using
 local ifile "need_eul_may2014"
 * original data file
-local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'.dta"
+local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'"


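-* If you really wanted to you could set up a loop to iterate over a list of sample values to create a set of random sub-samples
-* For now we'll just create a random sub-sample of sample% to make testing models etc easier
+* We loop over the list of sample values below to create a set of random sub-samples
+* The smaller sub-samples make testing models etc easier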
 * 10 = 10% sample, 50 = 50% sample, 100 = 100% sample etc
-local sample 20
-local samplet "`sample'pc"
+local sample "10 50 100"
+
 local sampleby "EE_BAND PROP_TYPE"

 local version "v1.1"
@@ -82,30 +82,34 @@ log using "`dpath'/processed/process-NEED-EULF-2014-`version'-$S_DATE.smcl", rep

 * use these locals to control what happens (set to 0 to skip the code)
 * create codebook & some descriptives
-local create_codebook = 0
+local create_codebook = 1
-* create wide form fixed file with (supposedly) unchanging data & a seperate 'wide' consumption data file for cross-sectional analysis
+* create wide form fixed file with (supposedly) unchanging data & a separate 'wide' consumption data file for cross-sectional analysis
 local create_xwavefile = 1
-* create long form file with wave (yearly) data - be careful, this take a long time due to large memory use!
+* create long form file with wave (yearly) data - be careful, this takes a long time due to large memory use!
 local create_longfile = 1

 * load the original file
-use "`dfile_orig'", clear
+use "`dfile_orig'.dta", clear

 if `create_codebook' {
 	* create original EULF codebook
-	* not much point running thid for each % sample although the counts etc reported in the codebook won't match
+	* not much point running this for each % sample although the counts etc reported in the codebook won't match
 	log off main
-	log using "`dpath'/processed/codebook-NEED-EULF-2014-`version'-`sample'pc-$S_DATE.smcl", replace name(cb)
+	log using "`dfile_orig'-codebook-$S_DATE.smcl", replace name(cb)
 	desc
 	codebook
 	log close cb
 	log on main
 }

+foreach s of local sample {
+	* re-load the original file on every pass - otherwise each sample would be drawn from whatever the previous pass left in memory (a smaller sample and/or the long form file)
+	use "`dfile_orig'.dta", clear
+	local samplet "`s'pc"
 	***** random sample ****
 	* select a random sample but ensure proportions of sampleby are kept
-di "* Keeping `sample'% sample by `sampleby'"
-sample `sample', by(`sampleby')
+	di "* Keeping `s'% sample by `sampleby'"
+	sample `s', by(`sampleby')
 	
 	tab `sampleby', mi
 	
@@ -124,7 +126,7 @@ if `create_xwavefile' {
 		preserve
 			drop Gcons* Econs*
 			* fix some mis-codings (or lack of coding of missing)
-		local vars "E7Flag2012 CWI LI"
+			local vars "E7Flag2012 CWI LI BOILER"
 			foreach v of local vars {
 				destring `v', force replace
 				replace `v' = 0 if `v' !=1
@@ -133,8 +135,14 @@ if `create_xwavefile' {
 			}
 			
 			* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
-		replace FP_ENG = . if LOFT_DEPTH  == 99
-		replace LOFT_DEPTH = . if LOFT_DEPTH  == 99
+			* recode category vars into strings to avoid confusion & enable 'unknown' to be retained in models etc (might matter)
+			tostring LOFT_DEPTH FP_ENG LI_YEAR BOILER_YEAR CWI_YEAR, force replace
+			replace FP_ENG = "Unknown" if FP_ENG  == "99"
+			replace LOFT_DEPTH = "Unknown" if LOFT_DEPTH  == "99"
+			* comes in as missing?
+			replace LI_YEAR = "Unknown" if LI_YEAR == "."
+			replace BOILER_YEAR = "Unknown" if BOILER_YEAR == "."
+			replace CWI_YEAR = "Unknown" if CWI_YEAR == "."
 					
 			* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
 			
@@ -144,7 +152,8 @@ if `create_xwavefile' {
 			tabstat IMD_ENG, by(REGION) s(mean min max n)
 			* there seem to be some English LSOAs allocated to Wales?
 		
-		tabstat FP_ENG, by(REGION)
+			tab FP_ENG REGION
+			
 			* REGION is ONS admin codes
 			* create a new variable with meaningful labels
 			gen ba_region = 1 if REGION == "E12000001"
@@ -203,9 +212,17 @@ if `create_longfile' {
 		xtset HH_ID year
 		compress
 		save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
+		
+		* write a codebook for this pass's long form file; use samplet (e.g. "10pc") as `sample' is now the whole list of sample values
+		log off main
+		log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'-long-`version'-`samplet'-$S_DATE.smcl", replace name(cbl)
+		desc
+		codebook
+		log close cbl
+		log on main
 		* this leaves us with the long form file in memory
 	}
-
+}
 /*
 * Link xwave data to long form file
 * THIS TAKES AGES and creates a 1.5 GB file for the full dataset - use with care