diff --git a/CER-data-processing-original.do b/CER-data-processing-original.do index 9942c35f7190d51cf2372b6c5d8fea07b5632c3d..5627245b5d80e4b17ead49c4b19230942f3b8038 100644 --- a/CER-data-processing-original.do +++ b/CER-data-processing-original.do @@ -4,6 +4,8 @@ * - Using the Commission for Energy Regulation (CER)'s Irish Smart Meter Trial data * - http://www.ucd.ie/issda/data/commissionforenergyregulationcer/ +* processes the original data for further use + * This work was funded by RCUK through the ESRC's Transformative Social Science Programme via the * "Census 2022: Transforming Small Area Socio-Economic Indicators through 'Big Data'" Project * - http://gtr.rcuk.ac.uk/project/2D2CD798-4F04-4399-B1AF-D810A233DD21 @@ -32,7 +34,7 @@ global where "/Users/ben/Documents/Work" global proot "$where/Projects/ESRC-Transformative-Census2022" global rfiles "$proot/results/CER-Irish-SM-Trial" * original files -global odfiles "$where/Data/Social Science Datatsets/CER Smart Metering Project" +global odfiles "$where/Data/Social Science Datatsets/CER Smart Metering Project/data" * processed files global pdfiles "$proot/data/cer" @@ -53,7 +55,7 @@ timer on 1 ************************************ ************************************ * start with the pre-trial survey -use "$odfiles/data/processed/Smart meters Residential pre-trial survey data.dta" +import excel "$odfiles/original/Smart meters Residential pre-trial survey data.xlsx", sheet("Sheet1") firstrow clear ******** * test age, sex, employment status of chief income earner @@ -162,162 +164,27 @@ recode Question310Whatistheemploym (1/3=1) (4/5=2) (6=3) (7=4), gen(ba_empl) lab def ba_empl 1 "In work" 2 "Unemployed" 3 "Retired" 4 "Caring for relative or family" lab val ba_empl ba_empl -save "$pdfiles/Smart meters Residential pre-trial survey data-$version.dta", replace +save "$odfiles/processed/Smart meters Residential pre-trial survey data-$version.dta", replace ************************************ 
************************************ -* load in the two cluster files, merge and save -insheet using "$pdfiles/October 2009 summaries/OctHH_wkend_clusterID.txt", tab clear -rename fitcluster wkend_fitcluster -lab var wkend_fitcluster "Weekend clusters" -rename id ID -compress -save "$pdfiles/October 2009 summaries/OctHH_wkend_clusterID.dta", replace - -insheet using "$pdfiles/October 2009 summaries/OctHH_midwk_clusterID.txt", tab clear -rename fitcluster midwk_fitcluster -lab var midwk_fitcluster "Mid-week clusters" -rename id ID -compress -save "$pdfiles/October 2009 summaries/OctHH_midwk_clusterID.dta", replace - -merge 1:1 ID using "$pdfiles/October 2009 summaries/OctHH_wkend_clusterID.dta", nogen - -* overlap between clusters? -tab wkend_fitcluster midwk_fitcluster, mi - -save "$pdfiles/October 2009 summaries/OctHH_clusterIDs.dta", replace - -merge 1:1 ID using "$pdfiles/Smart meters Residential pre-trial survey data-$version.dta" - -* so 746 households don't match to the Oct 2009 sample leaving us with 3,486 -gen oct_sample = 0 -replace oct_sample = 1 if _merge == 3 - -save "$pdfiles/Oct-2009-summaries-survey-$version.dta", replace - -****************************** -* load in Sharon's daily summaries for weekdays (derived from the raw data) -* this one has spaces as delimiter -insheet using "$pdfiles/October 2009 summaries/CER_OctHH_midwk_long.txt", delim(" ") clear -destring ecf lf, replace force -gen midweek = 1 -compress -save "$pdfiles/October 2009 summaries/CER_OctHH_midwk_long.dta", replace - -* this one has tabs! -insheet using "$pdfiles/October 2009 summaries/CER_OctHH_wkend_long.txt", tab clear -destring ecf lf, replace force -gen midweek = 0 -compress -save "$pdfiles/October 2009 summaries/CER_OctHH_wkend_long.dta", replace - -* append mid-week -append using "$pdfiles/October 2009 summaries/CER_OctHH_midwk_long.dta" - -rename id ID - -* remove the dates that are NOT October 2009 (why are they in there anyway??) 
-drop if dateoct > 300 - -* add survey & cluster data -merge m:1 ID using "$pdfiles/Oct-2009-summaries-survey-$version.dta", gen(m_survey) - -* some survey respondents not in the October data, some in October data not in survey -* keep what matches -keep if m_survey == 3 -* save -save "$pdfiles/Oct-2009-daily-summaries-survey-$version.dta", replace - -********************* * Switch to 1/2 hour level data * raw data -insheet using "$odfiles/data/original/HH2009_long.txt", delimiter(" ") names clear -rename HHID ID +insheet using "$odfiles/original/HH2009_long.txt", delimiter(" ") names clear +rename hhid ID rename kw kwh tostring ds, force generate(ts_ds) gen date = substr(ts_ds,1,3) gen halfhour = substr(ts_ds,4,5) - -* mid-week -insheet using "$pdfiles/CER_OctHH_data/CER_OctHH_mdwk_30min.txt", tab clear -li in 1/5 -* the columns are munched -drop id -rename ds ID -lab var ID "ID" -rename kw timestamp -lab var timestamp "timestamp (original format)" -rename dateoct kwh -lab var kwh "kWh" -rename v5 date -lab var date "date (original format)" -li in 1/5 -* need to weed out the October 2010 cases -keep if date < 365 - -tostring timestamp, gen(tmp_timestamp) force - -gen halfhour = substr(tmp_timestamp,4,5) - -tab date - -* how many households do we have in this sample? 
-* should be same as from the clustering -preserve - collapse (mean) kwh , by(ID) - desc -restore - -gen midweek = 1 -lab def midweek 0 "Saturday/Sunday" 1 "Tuesday-Thursday" -lab val midweek midweek -drop tmp_timestamp - -save "$pdfiles/CER_OctHH_data/CER_Oct2009HH_mdwk_30min.dta", replace - -******************************* -* load in weekends -insheet using "$pdfiles/CER_OctHH_data/CER_OctHH_wkend_30min.txt", tab clear -li in 1/5 -* the columns are munched again -drop id -rename ds ID -lab var ID "ID" -rename kw timestamp -lab var timestamp "timestamp (original format)" -rename dateoct kwh -lab var kwh "kWh" -rename v5 date -lab var date "date (original format)" -li in 1/5 -* need to weed out the October 2010 cases -keep if date < 365 - -tostring timestamp, gen(tmp_timestamp) force - -gen halfhour = substr(tmp_timestamp,4,5) -gen midweek = 0 -lab val midweek midweek -drop tmp_timestamp - -save "$pdfiles/CER_OctHH_data/CER_Oct2009HH_wkend_30min.dta", replace - -********************************* -* append mid week to weekend -append using "$pdfiles/CER_OctHH_data/CER_Oct2009HH_mdwk_30min.dta" - -* add the clustering results -merge m:1 ID using "$pdfiles/October 2009 summaries/OctHH_clusterIDs.dta", gen(m_cluster) - * fix dates properly * we know date = 1 = Jan 1st 2009 gen double s_date = mdy(1, 1, 2009) format s_date %td * add the number of days but subtract 1 otherwise we will start on 2/1/2009! 
+destring date, force replace replace s_date = s_date + (date - 1) * create day of week (remember in stata 0 = Sunday) @@ -344,7 +211,7 @@ gen double s_datetime = dhms(s_date, hour, mins, sec) format s_datetime %tc * add the survey data (makes big file) but only keep what we need -merge m:1 ID using "$pdfiles/Smart meters Residential pre-trial survey data-$version.dta", gen(m_survey) /// +merge m:1 ID using "$odfiles/processed/Smart meters Residential pre-trial survey data-$version.dta", gen(m_survey) /// keepusing(ba_*) sort ID s_datetime @@ -352,9 +219,11 @@ sort ID s_datetime * check li ID date halfhour s_* in 1/12, sep(2) -drop m_cluster timestamp date ds_halfhour halfhour hour mins sec +* timestamp was created only by the removed mid-week/weekend imports, so tolerate its absence +capture drop timestamp +drop date ds_halfhour halfhour hour mins sec -save "$pdfiles/CER_OctHH_data/CER_Oct2009HH_30min_survey.dta", replace +save "$odfiles/processed/HH2009_long_survey.dta", replace timer off 1