Review notes (git apply ignores all text before the first "diff --git" line).

Changes relative to the patch as submitted:
  1. process-NEED-EULF-2014.do: "set seed" is issued before "sample", so the
     same households are drawn on every run and the `samplet' output files are
     reproducible.
  2. process-NEED-EULF-2014.do: the sampled data are saved to a tempfile and
     reloaded at the top of the create_longfile block. The submitted patch
     removed the "use" from that block, but the create_xwavefile block ends
     with "drop Gcons* Econs*", so with both switches set to 1 the following
     "keep HH_ID Gcons* Econs*" has nothing to keep. (Assumes the lines not
     shown between hunks do not reload the data - please confirm.)
  3. analyse-NEED-EULF-2014-descriptives.do: xtset no longer passes
     delta(1 year), matching the change this same patch makes to the xtset
     call in process-NEED-EULF-2014.do.
  4. Typo "thid" fixed in an added comment; the codebook log name uses
     `samplet' like every other output name (expands to the same string).

Caveats:
  - Hunk headers were recomputed for the added lines (+74,26 / +140 / +153,8 /
    +170). The "index" blob hashes still describe the patch as submitted.
  - Indentation and the position of blank context lines were not recoverable
    from the flattened text and were reconstructed from the hunk line counts;
    regenerate with "git diff" from a working tree before applying.
  - Typos in unchanged context lines are deliberately left alone: context must
    match the target file byte for byte.

diff --git a/NEED/analyse-NEED-EULF-2014-descriptives.do b/NEED/analyse-NEED-EULF-2014-descriptives.do
index 6350117c77c0a39773a72fdfe62c9607e48e99a3..0aa7a3ca12e53844f4235d06204e02b3e3f8e463 100644
--- a/NEED/analyse-NEED-EULF-2014-descriptives.do
+++ b/NEED/analyse-NEED-EULF-2014-descriptives.do
@@ -39,6 +39,7 @@ if `do_2014_desc' {
 	use "`proot'/NEED/End User Licence File 2014/UKDA-7518-stata11/stata11/need_eul_may2014.dta", clear
 
 	* distributions for 2012 (to test)
+	* processor intensive
 	local vars "Econs2012 Gcons2012"
 	local tvars "EE_BAND FLOOR_AREA_BAND PROP_AGE"
 	foreach v of local vars {
@@ -56,6 +57,13 @@ if `do_long_desc' {
 	* Now use the pre-processed long form file which contains all years of consumption data but not the constant values (housing charactersitics etc) which are in the xwave file
 	use "`dpath'/need_eul_may2014_longfile.dta", clear
 
+	* set as panel (same call as in process-NEED-EULF-2014.do; the default period is 1)
+	xtset HH_ID year
+
+	xtdescribe
+
+	xtsum Econs Gcons
+
 	* summarise Electricity
 	table EconsValid year, c(count Econs min Econs mean Econs max Econs)
 	* summarise Gas
diff --git a/NEED/process-NEED-EULF-2014.do b/NEED/process-NEED-EULF-2014.do
index 8c7a4efb59114d1cfdb218b0a4a8a0ecc1ba7cc0..c182c73a3fef258262532077585e14e3fa5f99dd 100644
--- a/NEED/process-NEED-EULF-2014.do
+++ b/NEED/process-NEED-EULF-2014.do
@@ -35,7 +35,17 @@ local dpath "`proot'/NEED/End User Licence File 2014/"
 local ifile "need_eul_may2014"
 * original data file
 local dfile_orig "`dpath'UKDA-7518-stata11/stata11/`ifile'.dta"
-local version "v1"
+
+* 10 = 10% sample, 50 = 50% sample, 100 = 100% sample
+local sample 10
+local samplet "`sample'pc"
+local sampleby "EE_BAND PROP_TYPE"
+
+local version "v1.1"
+* includes production of % samples which maintain the original dimensions used to
+* produce the EULF samples: EE_BAND PROP_TYPE
+
+*local version "v1"
 
 set more off
 
@@ -43,17 +53,20 @@ log using "`dpath'/processed/process-NEED-EULF-2014-`version'-$S_DATE.smcl", rep
 
 * use these locals to control what happens (set to 0 to skip the code)
 * create codebook & some descriptives
-local create_codebook = 1
+local create_codebook = 0
 * create wide form fixed file with (supposedly) unchanging data & a seperate 'wide' consumption data file for cross-sectional analysis
 local create_xwavefile = 1
 * create long form file with wave (yearly) data - be careful, this take a long time due to large memory use!
-local create_longfile = 0
+local create_longfile = 1
+
+* load the original file
+use "`dfile_orig'", clear
 
 if `create_codebook' {
-	* create the codebook
+	* create original EULF codebook
+	* not much point running this for each % sample although the counts etc reported in the codebook won't match
 	log off main
-	log using "`dpath'/processed/codebook-NEED-EULF-2014-`version'-$S_DATE.smcl", replace name(cb)
-	use "`dfile_orig'", clear
+	log using "`dpath'/processed/codebook-NEED-EULF-2014-`version'-`samplet'-$S_DATE.smcl", replace name(cb)
 	desc
 	di "** no idea what G, L, M mean in the 'valid' variables - presumably 0 = off gas & V = valid?"
 	codebook
@@ -61,13 +74,26 @@ if `create_codebook' {
 	log on main
 }
 
+***** random sample ****
+* select a random sample but ensure proportions of sampleby are kept
+* fix the seed first so that every run draws the same households
+set seed 20140501
+di "* Keeping `sample'% sample by `sampleby'"
+sample `sample', by(`sampleby')
+
+tab `sampleby', mi
+
+* keep a copy of the sampled data so that later blocks can reload it
+tempfile sampled
+save "`sampled'"
+
 if `create_xwavefile' {
 	* create the file with data that (notionally) doesn't change
-	use "`dfile_orig'", clear
+
 	* create a wide consumption file
 	preserve
 		keep HH_ID Gcons* Econs*
-		save "`dpath'/processed/`ifile'_consumptionfile_wide.dta", replace
+		save "`dpath'/processed/`ifile'_consumptionfile_wide_`samplet'.dta", replace
 	restore
 
 	drop Gcons* Econs*
@@ -114,12 +140,12 @@ if `create_xwavefile' {
 
 	compress
 	log off main
-	log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-$S_DATE.smcl", replace name(cb_xwave)
+	log using "`dpath'/processed/codebook-NEED-EULF-2014-`ifile'_xwavefile-`version'-`samplet'-$S_DATE.smcl", replace name(cb_xwave)
 	desc
 	di "** no idea what G, L, M mean in the 'valid' variables - presumably 0 = off gas & V = valid?"
 	di "** urban/rural would be helpful"
 	codebook
-	save "`dpath'/processed/`ifile'_xwavefile.dta", replace
+	save "`dpath'/processed/`ifile'_xwavefile_`samplet'.dta", replace
 	log close cb_xwave
 	log on main
 }
@@ -127,7 +153,8 @@ if `create_xwavefile' {
 if `create_longfile' {
 	* create the long file with as few vars as possible (quicker)
 	* still takes a while...
-	use "`dfile_orig'.dta", clear
+	* reload the sampled data - the xwave block above drops the Gcons and Econs variables from memory
+	use "`sampled'", clear
 	keep HH_ID Gcons* Econs*
 
 	* panel vars:
@@ -143,24 +170,24 @@ if `create_longfile' {
 			drop `v'`y'Valid
 		}
 	}
-	* this takes a LONG time - avoid running many times!
+	* this takes a LONG time for the full dataset
 	reshape long Gcons GconsValid Econs EconsValid, i(HH_ID)
 	rename _j year
 
 	* set as panel
-	xtset HH_ID year, delta(1 year)
+	xtset HH_ID year
 
 	compress
-	save "`dpath'/processed/`ifile'_consumptionfile_long.dta", replace
+	save "`dpath'/processed/`ifile'_consumptionfile_long_`samplet'.dta", replace
 }
 
 /*
-* THIS TAKES AGES and creates a 1.5 GB file - use with care
+* THIS TAKES AGES and creates a 1.5 GB file for the full dataset - use with care
 * now just merge them
 * start with long file which may or may not have just been re-created
 use "`dpath'/`dfile'_consumptionfile_long.dta", clear
 merge m:1 HH_ID using "`dpath'/`dfile'_xwavefile.dta"
-save "`dpath'/`dfile'_consumptionfile_long_complete.dta", replace
+save "`dpath'/`dfile'_consumptionfile_long_complete_`samplet'.dta", replace
 */
 
 log close _all