From 0762906c56d62f09307ada0ab24961b51ffcd4ef Mon Sep 17 00:00:00 2001
From: Ben Anderson <b.anderson@soton.ac.uk>
Date: Fri, 10 Oct 2014 17:06:24 +0100
Subject: [PATCH] fixed processing script to give random samples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Set the % you want using the ‘sample’ local. By default this
constrains the selection to keep the overall ratios of EE_BAND
and PROP_TYPE constant (as in the original sampling).
---
 NEED/process-NEED-EULF-2014.do | 112 ++++++++++++++++++---------------
 1 file changed, 62 insertions(+), 50 deletions(-)

diff --git a/NEED/process-NEED-EULF-2014.do b/NEED/process-NEED-EULF-2014.do
index c182c73..177dfe3 100644
--- a/NEED/process-NEED-EULF-2014.do
+++ b/NEED/process-NEED-EULF-2014.do
@@ -88,61 +88,66 @@ if `create_xwavefile' {
 	* create a wide consumption file
 	preserve
 		keep HH_ID Gcons* Econs*
+		compress
 		save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
 	restore
-	drop Gcons* Econs*
 	
-	* fix some mis-codings (or lack of coding of missing)
-	local vars "E7Flag2012 CWI LI"
-	foreach v of local vars {
-		destring `v', force replace
-		replace `v' = 0 if `v' !=1
-		label def `v' 0 "No or N/A" 1 "Yes"
-		label val `v' `v'
-	}
-	
-	* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
-	replace FP_ENG = . if LOFT_DEPTH  == 99
-	replace LOFT_DEPTH = . if LOFT_DEPTH  == 99
-	
-	* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
 	
-	tabstat IMD_WALES, by(REGION) s(mean min max n)
-	* there seem to be some welsh LSOAs allocated to English GORs?
-	
-	tabstat IMD_ENG, by(REGION) s(mean min max n)
-	* there seem to be some English LSOAs allocated to Wales?
-
-	tabstat FP_ENG, by(REGION)
-	* REGION is ONS admin codes
-	* create a new variable with meaningful labels
-	gen ba_region = 1 if REGION == "E12000001"
-	replace ba_region = 2 if REGION == "E12000002"
-	replace ba_region = 3 if REGION == "E12000003"
-	replace ba_region = 4 if REGION == "E12000004"
-	replace ba_region = 5 if REGION == "E12000005"
-	replace ba_region = 6 if REGION == "E12000006"
-	replace ba_region = 7 if REGION == "E12000007"
-	replace ba_region = 8 if REGION == "E12000008"
-	replace ba_region = 9 if REGION == "E12000009"
-	replace ba_region = 10 if REGION == "W99999999"
-	
-	lab var ba_region "former Govt Office region (labelled)"
-	* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
-	lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
-		5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
-	lab val ba_region ba_region
+	preserve
+		drop Gcons* Econs*
+		* fix some mis-codings in the yes/no flags: destring (force) turns non-numeric codes into missing, then anything that is not 1 (missing included) is set to 0
+		local vars "E7Flag2012 CWI LI"
+		foreach v of local vars {
+			destring `v', force replace
+			replace `v' = 0 if `v' !=1
+			label def `v' 0 "No or N/A" 1 "Yes"
+			label val `v' `v'
+		}
+		
+		* turn '99' into missing, testing each variable against its own value so an unknown loft depth does not wipe valid FP_ENG values (ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!)
+		replace FP_ENG = . if FP_ENG == 99
+		replace LOFT_DEPTH = . if LOFT_DEPTH  == 99
+		
+		* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
+		
+		tabstat IMD_WALES, by(REGION) s(mean min max n)
+		* there seem to be some welsh LSOAs allocated to English GORs?
+		
+		tabstat IMD_ENG, by(REGION) s(mean min max n)
+		* there seem to be some English LSOAs allocated to Wales?
 	
-	compress
-	log off main
-	log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
-	desc
-	di "** no idea what G, L, M mean in the 'valid' variables - presumably 0 = off gas & V = valid?"
-	di "** urban/rural would be helpful"
-	codebook
-	save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
-	log close cb_xwave
-	log on main
+		tabstat FP_ENG, by(REGION)
+		* REGION holds ONS admin codes as strings (E12000001 to E12000009, plus W99999999)
+		* create a numeric ba_region (1-10) with meaningful labels - any other REGION code leaves ba_region missing
+		gen ba_region = 1 if REGION == "E12000001"
+		replace ba_region = 2 if REGION == "E12000002"
+		replace ba_region = 3 if REGION == "E12000003"
+		replace ba_region = 4 if REGION == "E12000004"
+		replace ba_region = 5 if REGION == "E12000005"
+		replace ba_region = 6 if REGION == "E12000006"
+		replace ba_region = 7 if REGION == "E12000007"
+		replace ba_region = 8 if REGION == "E12000008"
+		replace ba_region = 9 if REGION == "E12000009"
+		replace ba_region = 10 if REGION == "W99999999"
+		
+		lab var ba_region "former Govt Office region (labelled)"
+		* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
+		lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
+			5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
+		lab val ba_region ba_region
+		
+		compress
+		log off main
+		log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
+		desc
+		di "** no idea what G, L, M mean in the 'valid' variables - presumably 0 = off gas & V = valid?"
+		di "** urban/rural would be helpful"
+		codebook
+		compress
+		save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
+		log close cb_xwave
+		log on main
+	restore
 }
 
 if `create_longfile' {
@@ -169,11 +174,16 @@ if `create_longfile' {
 	rename _j year
 	* set as panel
 	xtset HH_ID year
+	di "* check distributions for `samplet' sample"
+	xtdescribe
+	xtsum Gcons Econs
 	compress
 	save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
+	* this leaves us with the long form file in memory
 }
 
 /*
+* Link xwave data to long form file
 * THIS TAKES AGES and creates a 1.5 GB file for the full dataset - use with care
 * now just merge them
 * start with long file which may or may not have just been re-created
@@ -184,4 +194,6 @@ merge m:1 HH_ID using "`dpath'/`dfile'_xwavefile.dta"
 save "`dpath'/`dfile'_consumptionfile_long_complete_`samplet'.dta", replace
 */
 
+* done!
+
 log close _all
-- 
GitLab