diff --git a/NEED/analyse-NEED-EULF-2014-electricity-consumption.do b/NEED/analyse-NEED-EULF-2014-electricity-consumption.do
index 08c4100e1aed67579937d521d66b4e1f6832fd91..8ec70935e13d0c955a610c16c70c84af2dee9f7c 100644
--- a/NEED/analyse-NEED-EULF-2014-electricity-consumption.do
+++ b/NEED/analyse-NEED-EULF-2014-electricity-consumption.do
@@ -37,37 +37,43 @@ capture noisily log close
 set more off
 
 * written for Mac OSX - remember to change filesystem delimiter for other platforms
-global home "~/Documents"
+global home "~/Documents/Work"
 
-global dpath "$home/Work/Data/Social Science Datatsets/DECC/NEED/End User Licence File 2014/processed"
-global rpath "$home/Work/Papers and Conferences/RSS-2015/results"
+global dpath "$home/Data/Social Science Datasets/DECC/NEED/End User Licence File 2014/processed"
+global rpath "$home/Papers and Conferences/RSS-2015/results"
 
 local version "v1"
+
+log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace
+
 * set sample
 * 100 = 100pc
 * etc
-local sample "100"
-
-* control what happens
-local do_desc = 1
+local sample = 100
 
-* toggle graph drawing
-local do_graphs = 1
+* use weights (leave blank to ignore)
+/*
+"Weighting based on Region, property age, property type and floor area band. 
+Summing all weights gives (approximate) total number of households in England and Wales 2011."
+*/
+local need_weight = "[pw = WEIGHT]" // use probability weights
 
-* ref DECC look up table
-lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,000" 5 "M(issing in source)"
-* NB DECC look up table says max elec = 50,000
-lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)"
+* control what happens
+local do_desc = 0 // do descriptives
+local do_histo = 0 // toggle graph drawing
+local do_box = 0 // toggle graph drawing
+local do_xsec = 1 // run cross-sectional analysis
+local do_longit = 1 // run longitudinal analysis
 
 
-* also be aware that the consumption is rounded in buckets:
+* Be aware that the consumption is rounded in buckets:
 /*
 GconsYEAR	.	Missing, off gas or invalid consumption
-	100 � 7,999	Gas consumption kWh rounded to nearest 500 kWh
-	8,000- 15,999	Gas consumption kWh rounded to nearest 100 kWh
-	16,000 � 24,999	Gas consumption kWh rounded to nearest 500 kWh
-	25,000 � 34,999	Gas consumption kWh rounded to nearest 1,000 kWh
-	35,000 � 50,000	Gas consumption kWh rounded to nearest 5,000 kWh
+	100 - 7,999	Gas consumption kWh rounded to nearest 500 kWh
+	8,000 - 15,999	Gas consumption kWh rounded to nearest 100 kWh
+	16,000 - 24,999	Gas consumption kWh rounded to nearest 500 kWh
+	25,000 - 34,999	Gas consumption kWh rounded to nearest 1,000 kWh
+	35,000 - 50,000	Gas consumption kWh rounded to nearest 5,000 kWh
 
 
 EconsYEAR	.	Missing or invalid consumption
@@ -79,7 +85,6 @@ EconsYEAR	.	Missing or invalid consumption
 set more off
 */
 
-log using "$rpath/analyse-NEED-EULF-2014-electricity-consumption-`version'.smcl", replace
 
 
 di "************************"
@@ -90,20 +95,30 @@ use "$dpath/need_eul_may2014_consumptionfile_long_`sample'pc.dta", clear
 * merge in the xwave file (fixed data - we assume!)
 merge m:1 HH_ID using "$dpath/need_eul_may2014_xwavefile_100pc.dta"
 
-lab var Econs "Electricity (KwH/year)"
-lab var Gcons "Gas (KwH/year)"
-
-* set as panel in case it wasn't
-* fix format of year so xtset doesn't break
-format year %ty
-xtset HH_ID year, delta(1 year)
+* set variable labels
+* ref DECC look up table
+lab def GconsValidr 1 "(V)alid" 2 "(O)ff-gas" 3 "(L)Gas < 100" 4 "(G) Gas > 50,000" 5 "M(issing in source)"
+* NB DECC look up table says max elec = 50,000
+lab def EconsValidr 1 "(V)alid" 2 "not set" 3 "(L)Elec < 100" 4 "(G) Elec > 25,000" 5 "M(issing in source)"
 
-* examine panel status
-xtdescribe
+* set up correct long form 'is X present in year' using year (if known)
+local vars "BOILER LI CWI"
 
-* set up
-local vars "Econs Gcons"
+* what will hapen if there are multiple replacements in a household
 foreach v of local vars {
+	gen ba_have_`v' = 0
+	destring `v'_YEAR, force replace
+	replace ba_have_`v' = 1 if `v'_YEAR <= year
+}
+
+* labels
+lab var Econs "Electricity (kWh/year)"
+lab var Gcons "Gas (kWh/year)"
+
+* set up data
+* include Gcons if you want gas too
+local setupvars "Econs Gcons"
+foreach v of local setupvars {
 	di "***************"
 	di "* Testing `v' for `sample'% sample"
 
@@ -116,25 +131,61 @@ foreach v of local vars {
 
 	lab var `v'Validr "Recoded `v'Valid"
 	lab val `v'Validr `v'Validr
+	tab `v'Validr `v'Valid
 
-	di "* Check transitions (`v'Validr)"
-	xttrans `v'Validr, freq
+	* set up consumption deciles and outlier flags
+	gen u99_`v' = 0
+	gen l99_`v' = 0
+	gen m98_`v' = 0
+	gen m90_`v' = 0
 
-	* set up consumption deciles
 	levelsof(year), local(levels)
 	foreach l of local levels {
-		di "* Calculating consumption deciles for `v' for `l'"
-		* creates missing for other years have to do this as egen does not allow by
+		di "* Calculating consumption deciles and outlier flags for `v' for `l'"
+		
+		* creates missing for other years have to do this as egen does not allow by & can't 'replace'
+		di "* `v' deciles for `l'"
 		egen `v'_dec_`l' = cut(`v') if year == `l', group(10)
+
+		di "* `v' outlier flags for `l'"
+		qui: su `v' if year == `l', de
+		
+		di "* define outliers"
+		di "* -> Upper: valid = G & in highest 1% of valid"
+		replace u99_`v' = 1 if (`v' > `r(p99)' | `v'Validr == 4) & year == `l' //  upper 1% 'valid'
+		
+		di "* -> Lower: valid = L & in lowest 1% of valid"
+		replace l99_`v' = 1 if (`v' < `r(p1)' | `v'Validr == 3) & year == `l' // lower 1% 'valid'
+		replace m98_`v' = 1 if u99_`v' == 0 & l99_`v' == 0 & year == `l' // middle 98% 'valid'	
+		replace m90_`v' = 1 if `v' > `r(p5)' & `v' < `r(p95)' & year == `l' // middle 90% 'valid'
+		
+		* do checks here otherwise tab puts things into the return list
+		*tab u99_`v' if year == `l', mi 
+		*tab l99_`v' if year == `l', mi 
+		*tab m98_`v' if year == `l', mi
+		*tab m90_`v' if year == `l', mi
+		
 	}
-	* now combine them - set missing option otherwise it counts a row where all are missing as 0
+	* now combine the deciles - set missing option otherwise it counts a row where all are missing as 0
 	egen `v'_dec = rowtotal(`v'_dec_*), missing
 	* remove temporary ones
 	drop `v'_dec_*
-	* check
-	tab `v'_dec year
+	* check unweighted
+	table `v'_dec year
+
+	tabstat u* l* m*, by(year)
+
+	tab l99_`v' `v'Validr
+	tab u99_`v' `v'Validr
+
+	* log vars
+	gen log`v' = log(`v')
+	lab var log`v' "Log `v'"
 }
 
+* set 'survey' weight
+svyset `need_weight'
+
 * flag dwellings which are off gas for electricity
 * NB - in this dataset we don't know if they use electricity as main heat (could be oil)
 gen ba_off_gas = 0
@@ -143,49 +194,219 @@ lab def ba_off_gas 0 "On gas (GconsValid!=O)" 1 "Off gas (GconsValid=O, from EPC
 lab val ba_off_gas ba_off_gas
 
 * check
-tabstat Gcons Econs, by(ba_off_gas)
+svy: mean Gcons Econs, over(ba_off_gas)
 di "* MAIN_HEAT_FUEL - Description of main heating fuel (gas or other). EPC - but NB could be 'other' but still be 'on gas'"
 
-tab ba_off_gas MAIN_HEAT_FUEL, mi // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'?
+table ba_off_gas MAIN_HEAT_FUEL `need_weight', missing // suggests EPC says 'off gas' (via GconsValid) but main heat fuel still says 'gas'?
 
-table year MAIN_HEAT_FUEL, by(ba_off_gas)
+table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas)
 * roughly constant rate throughout years
-table year MAIN_HEAT_FUEL, by(ba_off_gas) c(mean Gcons n Gcons)
-* but off gas have no gas readings as you'd expect (DECC filter)
-
-foreach v of local vars {
-	di "***************"
-	di "* Testing `v' for `sample'% sample"
-
-	* overall
-	xtsum `v' if `v'Valid == "V"
-	* test values for valid - check for valid 0s for example. This only happens for gas where:
-	* 100 < gcons < 250 so included but rounded to nearest 500 = 0
-
-	* elec always rounded to nearest 50 so min should always be 100
-
-	tabstat `v', by(`v'Valid) s(n mean semean min max)
-	* by year
-	di "* check `v' for 0s (`s'% sample)"
-	table `v' year if `v' < 1000
-	table `v'Valid year, c(count `v' min `v' mean `v' max `v')
+table year MAIN_HEAT_FUEL `need_weight', by(ba_off_gas) c(mean Gcons n Gcons)
+* but off gas have no gas readings as you'd expect (DECC applied filter)
+
+local descvars "Econs"
+if `do_desc' {
+	foreach v of local descvars {
+		di "***************"
+		di "* Testing `v' for `sample'% sample"
+
+		di "* Check `v' by year"
+		di "* Check `v' for 0s (`sample'% sample)"
+		if "`v'" == "Econs" {
+			di "* -> `v' always rounded to nearest 50 so min should always be 100"
+			local min = 500
+		}
+		if "`v'" == "Gcons" {
+			di "* -> 100 < `v' < 250 so included but rounded to nearest 500 which in this case = 0"
+			local min = 1000
+		}
+		table `v' year if `v' < `min' `need_weight'
+
+		di "* Check no values of `v' appear for 'non valid' codes"
+		di "* test values for all valid codes"
+		table `v'Valid `need_weight', c(count `v' mean `v')
+
+		di "* test `v' per year for valid codes (use pw)"
+		local table_stats "count `v' min `v' mean `v' p50 `v' max `v'" // set table stats - 5 max!!
+		table year if `v'Valid == "V" `need_weight', c(`table_stats`n'')
+
+		di "* table won't allow semean for iw so use svy: mean `v' instead"
+		svy: mean `v' if `v'Valid == "V", over(year)
+		
+		di "* use table to force sd `v' calculation using iw not pw, include mean to check"
+		table year [iw=WEIGHT], c(mean `v' sd `v')
+
+		if `do_histo' {
+			di "* Running graphs: histo for `v' (can't use `need_weight')"
+			histogram `v' if `v'Valid == "V", frequency by(year) ylab(,angle(0)) scale(0.75)
+			graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-histo_`v'_by_year_valid.png", replace
+		}
+
+		if `do_box' {
+			di "* Running graphs: boxes for `v'"
+			di "* `v' over year"
+			graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical))) ///
+				ylab(,angle(0)) scale(0.75)
+			graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_over_year_valid.png", replace
+
+			di "* `v' over year by floor area"
+			svy: mean `v', over(FLOOR_AREA_BAND)
+			graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical)))  ///
+				ylab(,angle(0)) by(FLOOR_AREA_BAND) scale(0.75)
+			graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_floor_valid.png", replace
+
+			di "* `v' over year by EE band"
+			graph box `v' if `v'Valid == "V" `need_weight', over(year, label(angle(vertical)))  ///
+				ylab(,angle(0)) by(EE_BAND) scale(0.75)
+			graph export "$rpath/graphs/NEED-EULF-2014-`sample'pc-box_`v'_yr_ee_valid.png", replace
+
+		}
+		
+		di "* check the distributions of the outliers"
+		local tvars "MAIN_HEAT_FUEL E7Flag2012 ba_off_gas FLOOR_AREA_BAND EE_BAND IMD_ENG"
+		foreach tv of local tvars {
+			di "* Checking top 1% against `tv'"
+			tab `tv' u99_Econs , col
+		}  
+		
 
-	if `do_graphs' {
-		di "* Running graphs - do not keep in memory, just save out"
-		di "* Running graphs: histo"
-		histogram `v' if `v'Valid == "V", by(year) scale(0.75)
-		graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-histo_`v'_by_year_valid.png", replace
 
-		di "* Running graphs: boxes"
-		graph box `v' if `v'Valid == "V", over(year) scale(0.75)
-		graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_over_year_valid.png", replace
+	}
+}
 
-		graph box `v' if `v'Valid == "V", over(year) by(FLOOR_AREA_BAND) scale(0.75)
-		graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_floor_valid.png", replace
+if `do_xsec' {
+	di "* Running cross sectional analysis using 2012 (all valid cases)"
+	* make sure test vars are destringed
+	local tvars "IMD_ENG FP_ENG E7Flag2012 MAIN_HEAT_FUEL PROP_AGE PROP_TYPE FLOOR_AREA_BAND EE_BAND LOFT_DEPTH WALL_CONS CWI BOILER"
+	foreach tv of local tvars {
+		destring `tv', force replace
+	}
+	* kitchen sink model - use 99% signif level
+	* use vce(robust) as running without & using hettest suggests heteroscedasticity
+	* regress would throw out collinear variables
+	* do not include IMD & FP as really need multilvel model for that
+	regress logEcons i.ba_region i.E7Flag2012 i.MAIN_HEAT_FUEL ///
+		i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS  ///
+		if year == 2012, level(99) vce(robust)
+	est store est_logEcons_2012
+
+	estout est_logEcons_2012 using "$rpath/regress_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace
+
+	* tests
+	predict logEconsr if e(sample), residuals
+	di "* test for normality of residuals using graphs"
+	* see http://www-personal.umich.edu/~bwest/ch3diags.do
+	* do not use sktest as large n & also see 
+	* http://www.stata.com/statalist/archive/2005-12/msg00238.html
+	
+	* Plot residuals
+	* pnormal plot of residuals
+	pnorm logEconsr, name(pnorm_logEconsr)
+	graph export "$rpath/graphs/pnorm_logEconsr.png", replace
+	
+	* qnormal plot
+	qnorm logEconsr, name(qnorm_logEconsr)
+	graph export "$rpath/graphs/qnorm_logEconsr.png", replace
+
+	* test the null hypothesis that the variance of the residuals is homogenous. 
+	* Therefore, if the p-value is very small, we would have to reject the hypothesis 
+	* and accept the alternative hypothesis that the variance is not homogenous. 
+	* so if p < 0.05 -> heteroscedasticity and so need robust
+	* BUT check rvfplot (http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm)
+	
+	/* can't use after cluster
+	di "* test for Heteroscedasticity"
+	di "* if p < 0.05 -> heteroscedasticity and so need robust"
+	estat hettest
+	*/
+
+	di "* test for omitted vars"
+	* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm
+	di "* if p < 0.05 -> omitted vars"
+	estat ovtest
+
+	* check for mis-specification
+	* http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter2/statareg2.htm
+	* The model is then refit using these two variables as predictors. 
+	* _hat should be significant since it is the predicted value. 
+	* On the other hand, _hatsq shouldn't, because if our model is specified correctly, 
+	* the squared predictions should not have much explanatory power. 
+	* That is we wouldn't  expect  _hatsq to be a significant predictor if our model is specified correctly. 
+	* So we will be looking at the p-value for _hatsq.
+	* p of _hatsq < 0.05 -> mis-spec
+		
+	di "* linktest"
+	di "* if p of _hatsq < 0.05 -> mis-spec"
+	linktest
+	
+	* test BIC etc
+	estat ic
 
-		graph box `v' if `v'Valid == "V", over(year) by(EE_BAND) scale(0.75)
-		graph export "$rpath/graphs/NEED-EULF-2014-`s'pc-box_`v'_yr_ee_valid.png", replace
+}
 
+if `do_longit' {
+	di "* Running longitudinal analysis"
+	di "* Check boiler transitions"
+	xttrans ba_have_BOILER, freq
+	
+	di "* Check loft insulation transitions"
+	xttrans ba_have_LI, freq
+	
+	di "* Check cavity wall insulation transitions"
+	xttrans ba_have_CWI, freq
+
+	foreach v of local descvars {
+		di "***************"
+		di "* Longitudinal (panel) analysis"
+		* set as panel in case it wasn't
+		* fix format of year so xtset doesn't break
+		format year %ty
+		xtset HH_ID year, delta(1 year)
+
+		di "* examine panel status"
+		xtdescribe
+
+		di "* examine panel status for valid observations"
+		xtdescribe if Econs < .
+		
+		di "* xtsum `v' for all"
+		xtsum `v'
+		
+		di "* Check transitions (`v'Validr)"
+		xttrans `v'Validr, freq
+
+		di "* Check `v' consumption decile transitions"
+		xttrans `v'_dec, freq
+
+		di "* Check `v' outlier transitions"
+		bysort MAIN_HEAT_FUEL: xttrans u99_`v', freq
+		bysort MAIN_HEAT_FUEL: xttrans l99_`v', freq
+
+		di "* check lag model"
+		bysort HH_ID (year): gen logEconslag = logEcons[_n-1]
+
+		gen yeart = year - 2005
+		regress logEcons logEconslag i.ba_region i.E7Flag2012 ///
+			 i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS  ///
+		 	yeart ba_have_BOILER ba_have_LI ba_have_CWI
+		est store lag_model
+		estout lag_model using "$rpath/lag_model_logEcons_`version'.txt", cells("b se ci_u ci_l _star ") stats(r2 rmse ll N, fmt(%9.3f %9.0g)) replace
+		
+		di "* xt regression model of change (use log Econs)"
+		xtreg  logEcons i.ba_region i.E7Flag2012 ///
+			 i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS  ///
+		 	yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) fe
+		 
+		est store xtr_fe_logEcons
+		estout xtr_fe_logEcons using "$rpath/xtr_fe_logEcons_`version'.txt",  replace
+		 
+		di "* now including time constant variables"
+		xtreg  logEcons i.ba_region i.E7Flag2012 ///
+			 i.MAIN_HEAT_FUEL i.PROP_AGE i.PROP_TYPE i.FLOOR_AREA_BAND i.EE_BAND i.LOFT_DEPTH i.WALL_CONS  ///
+		 	yeart ba_have_BOILER ba_have_LI ba_have_CWI, i(HH_ID) re
+		 
+		est store xtr_re_logEcons
+		estout xtr_re_logEcons using "$rpath/xtr_re_logEcons_`version'.txt",  replace
 	}
 }