diff --git a/NEED/process-NEED-EULF-2014.do b/NEED/process-NEED-EULF-2014.do
index a627b959203458745b07ba5683143a99a22050a3..e8357702d3f240c1ddaed0f49b6acd63d3ed222c 100644
--- a/NEED/process-NEED-EULF-2014.do
+++ b/NEED/process-NEED-EULF-2014.do
@@ -60,14 +60,14 @@ local dpath "`proot'/NEED/End User Licence File 2014/"
 * NB this is the 2014 EULF we're using
 local ifile "need_eul_may2014"
 * original data file
-local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'.dta"
+local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'"
 
 
 * If you really wanted to you could set up a loop to iterate over a list of sample values to create a set of random sub-samples
 * For now we'll just create a random sub-sample of sample% to make testing models etc easier
 * 10 = 10% sample, 50 = 50% sample, 100 = 100% sample etc
-local sample 20
-local samplet "`sample'pc"
+* space-separated list of % sample sizes - the sampling & file creation below is repeated once for each value
+local sample "10 50 100"
 local sampleby "EE_BAND PROP_TYPE"
 
 local version "v1.1"
@@ -82,130 +82,147 @@ log using "`dpath'/processed/process-NEED-EULF-2014-`version'-$S_DATE.smcl", rep
 
 * use these locals to control what happens (set to 0 to skip the code)
 * create codebook & some descriptives
-local create_codebook = 0
+local create_codebook = 1
 * create wide form fixed file with (supposedly) unchanging data & a seperate 'wide' consumption data file for cross-sectional analysis
 local create_xwavefile = 1
 * create long form file with wave (yearly) data - be careful, this take a long time due to large memory use!
 local create_longfile = 1
 
 * load the original file
-use "`dfile_orig'", clear
+use "`dfile_orig'.dta", clear
 
 if `create_codebook' {
 	* create original EULF codebook
-	* not much point running thid for each % sample although the counts etc reported in the codebook won't match
+	* run once on the full file before any sampling - no point repeating it for each % sample, but note the counts etc reported in the codebook won't match the sub-samples
 	log off main
-	log using "`dpath'/processed/codebook-NEED-EULF-2014-`version'-`sample'pc-$S_DATE.smcl", replace name(cb)
+	log using "`dfile_orig'-codebook-$S_DATE.smcl", replace name(cb)
 	desc
 	codebook
 	log close cb
 	log on main
 }
 
-***** random sample ****
-* select a random sample but ensure proportions of sampleby are kept
-di "* Keeping `sample'% sample by `sampleby'"
-sample `sample', by(`sampleby')
-
-tab `sampleby', mi
-
-
-if `create_xwavefile' {
-	* create the file with data that (notionally) doesn't change
-
-	* create a wide consumption file
-	preserve
-		keep HH_ID Gcons* Econs*
-		compress
-		save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
-	restore
+foreach s of local sample {
+	local samplet "`s'pc"
+	* reload the original file on every pass: sample (below) and the long-file keep/reshape both destroy the data in memory
+	use "`dfile_orig'.dta", clear
+	di "* Keeping `s'% sample by `sampleby'"
+	sample `s', by(`sampleby')	// random sample that keeps the proportions of sampleby
 	
+	tab `sampleby', mi
 	
-	preserve
-		drop Gcons* Econs*
-		* fix some mis-codings (or lack of coding of missing)
-		local vars "E7Flag2012 CWI LI"
-		foreach v of local vars {
-			destring `v', force replace
-			replace `v' = 0 if `v' !=1
-			label def `v' 0 "No or N/A" 1 "Yes"
-			label val `v' `v'
-		}
-		
-		* turn '99' into missing - ideally missing should be -99 to aid re-coding and avoid unpleasant surprises in naive analysis!
-		replace FP_ENG = . if LOFT_DEPTH  == 99
-		replace LOFT_DEPTH = . if LOFT_DEPTH  == 99
+	
+	if `create_xwavefile' {
+		* create the file with data that (notionally) doesn't change
+	
+		* create a wide consumption file
+		preserve
+			keep HH_ID Gcons* Econs*
+			compress
+			save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
+		restore
 		
-		* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
 		
-		tabstat IMD_WALES, by(REGION) s(mean min max n)
-		* there seem to be some welsh LSOAs allocated to English GORs?
+		preserve
+			drop Gcons* Econs*
+			* fix some mis-codings (or lack of coding of missing)
+			local vars "E7Flag2012 CWI LI BOILER"
+			foreach v of local vars {
+				destring `v', force replace
+				replace `v' = 0 if `v' !=1
+				label def `v' 0 "No or N/A" 1 "Yes"
+				label val `v' `v'
+			}
+			
+			* '99' = unknown: recode it to the string "Unknown" rather than missing (ideally the source would use -99 to aid re-coding and avoid unpleasant surprises in naive analysis!)
+			* category vars are converted to strings first to avoid confusion & enable 'unknown' to be retained in models etc (might matter)
+			tostring LOFT_DEPTH FP_ENG LI_YEAR BOILER_YEAR CWI_YEAR, force replace
+			replace FP_ENG = "Unknown" if FP_ENG  == "99"
+			replace LOFT_DEPTH = "Unknown" if LOFT_DEPTH  == "99"
+			* NOTE(review): the year vars appear to arrive as numeric missing, which shows up as "." after tostring - confirm against the raw data
+			replace LI_YEAR = "Unknown" if LI_YEAR == "."
+			replace BOILER_YEAR = "Unknown" if BOILER_YEAR == "."
+			replace CWI_YEAR = "Unknown" if CWI_YEAR == "."
+					
+			* what do G, L, M mean in the gas 'valid' variables - presumably 0 = off gas & V = valid?
+			
+			tabstat IMD_WALES, by(REGION) s(mean min max n)
+			* there seem to be some welsh LSOAs allocated to English GORs?
+			
+			tabstat IMD_ENG, by(REGION) s(mean min max n)
+			* there seem to be some English LSOAs allocated to Wales?
 		
-		tabstat IMD_ENG, by(REGION) s(mean min max n)
-		* there seem to be some English LSOAs allocated to Wales?
+			tab FP_ENG REGION
+			
+			* REGION is ONS admin codes
+			* create a new variable with meaningful labels
+			gen ba_region = 1 if REGION == "E12000001"
+			replace ba_region = 2 if REGION == "E12000002"
+			replace ba_region = 3 if REGION == "E12000003"
+			replace ba_region = 4 if REGION == "E12000004"
+			replace ba_region = 5 if REGION == "E12000005"
+			replace ba_region = 6 if REGION == "E12000006"
+			replace ba_region = 7 if REGION == "E12000007"
+			replace ba_region = 8 if REGION == "E12000008"
+			replace ba_region = 9 if REGION == "E12000009"
+			replace ba_region = 10 if REGION == "W99999999"
+			
+			lab var ba_region "former Govt Office region (labelled)"
+			* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
+			lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
+				5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
+			lab val ba_region ba_region
+			
+			compress
+			log off main
+			log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
+			desc
+			di "** urban/rural would be helpful"
+			codebook
+			compress
+			save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
+			log close cb_xwave
+			log on main
+		restore
+	}
 	
-		tabstat FP_ENG, by(REGION)
-		* REGION is ONS admin codes
-		* create a new variable with meaningful labels
-		gen ba_region = 1 if REGION == "E12000001"
-		replace ba_region = 2 if REGION == "E12000002"
-		replace ba_region = 3 if REGION == "E12000003"
-		replace ba_region = 4 if REGION == "E12000004"
-		replace ba_region = 5 if REGION == "E12000005"
-		replace ba_region = 6 if REGION == "E12000006"
-		replace ba_region = 7 if REGION == "E12000007"
-		replace ba_region = 8 if REGION == "E12000008"
-		replace ba_region = 9 if REGION == "E12000009"
-		replace ba_region = 10 if REGION == "W99999999"
-		
-		lab var ba_region "former Govt Office region (labelled)"
-		* http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html
-		lab def ba_region 1 "North East" 2 "North West" 3 "Yorkshire & The Humber" 4 "East Midlands" ///
-			5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 9 "South West" 10 "Wales"
-		lab val ba_region ba_region
-		
+	if `create_longfile' {
+		* create the long file with as few vars as possible (quicker)
+		* still takes a while...
+	
+		keep HH_ID Gcons* Econs*
+	
+		* panel vars:
+		* Gcons2005 Gcons2005Valid Econs2005 Econs2005Valid -> 2012
+		local vars "Gcons Econs"
+		foreach v of local vars {
+			di "* Renaming -> `v'"
+			foreach y of numlist 2005/2012 {
+				di "* -> `v' (`y')"
+				* put year on the end so reshape works
+				* rename in place rather than gen + drop: no copy of the column is made (quicker, less memory)
+				* and the variable's storage type, labels & format are carried over
+				rename `v'`y'Valid `v'Valid`y'
+			}
+		}
+		* this takes a LONG time for the full dataset
+		reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
+		rename _j year
+		* set as panel
+		xtset HH_ID year
 		compress
+		save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
+		
 		log off main
-		log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
+		log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'-long-`version'-`samplet'-$S_DATE.smcl", replace name(cbl)
 		desc
-		di "** urban/rural would be helpful"
 		codebook
-		compress
-		save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
-		log close cb_xwave
+		log close cbl
 		log on main
-	restore
-}
-
-if `create_longfile' {
-	* create the long file with as few vars as possible (quicker)
-	* still takes a while...
-
-	keep HH_ID Gcons* Econs*
-
-	* panel vars:
-	* Gcons2005 Gcons2005Valid Econs2005 Econs2005Valid -> 2012
-	local vars "Gcons Econs"
-	foreach v of local vars {
-		di "* Renaming -> `v'"
-		foreach y of numlist 2005/2012 {
-			di "* -> `v' (`y')"
-			* put year on the end so reshape works
-			gen `v'Valid`y' = `v'`y'Valid
-			* remove old variable to save time in reshape & space
-			drop `v'`y'Valid
-		}
+	
+		* this leaves us with the long form file in memory
 	}
-	* this takes a LONG time for the full dataset
-	reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
-	rename _j year
-	* set as panel
-	xtset HH_ID year
-	compress
-	save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
-	* this leaves us with the long form file in memory
 }
-
 /*
 * Link xwave data to long form file
 * THIS TAKES AGES and creates a 1.5 GB file for the full dataset - use with care