altered globals

f513fd3d · Ben Anderson · d1d24f0e · f513fd3d
Commit f513fd3d authored 10 years ago by Ben Anderson
--- a/CER-data-processing-original.do
+++ b/CER-data-processing-original.do
@@ -4,6 +4,8 @@
 * - Using the Commission for Energy Regulation (CER)'s Irish Smart Meter Trial data
 *   - http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
+* - Processes the original data for further use
 * This work was funded by RCUK through the ESRC's Transformative Social Science Programme via the
 * "Census 2022: Transforming Small Area Socio-Economic Indicators through 'Big Data'" Project 
 * - http://gtr.rcuk.ac.uk/project/2D2CD798-4F04-4399-B1AF-D810A233DD21
@@ -32,7 +34,7 @@ global where "/Users/ben/Documents/Work"
 global proot "$where/Projects/ESRC-Transformative-Census2022"
 global rfiles "$proot/results/CER-Irish-SM-Trial"
 * original files
-global odfiles "$where/Data/Social Science Datatsets/CER Smart Metering Project"
+global odfiles "$where/Data/Social Science Datatsets/CER Smart Metering Project/data"
 * processed files
 global pdfiles "$proot/data/cer"
@@ -53,7 +55,7 @@ timer on 1
 ************************************
 ************************************
 * start with the pre-trial survey
-use "$odfiles/data/processed/Smart meters Residential pre-trial survey data.dta"
+import excel "$odfiles/original/Smart meters Residential pre-trial survey data.xlsx", sheet("Sheet1") firstrow clear
 ********
 * test age, sex, employment status of chief income earner
@@ -162,162 +164,27 @@ recode Question310Whatistheemploym (1/3=1) (4/5=2) (6=3) (7=4), gen(ba_empl)
 lab def ba_empl 1 "In work" 2 "Unemployed" 3 "Retired" 4 "Caring for relative or family"
 lab val ba_empl ba_empl
-save "$pdfiles/Smart meters Residential pre-trial survey data-$version.dta", replace
+save "$odfiles/processed/Smart meters Residential pre-trial survey data-$version.dta", replace
 ************************************
 ************************************
-* load in the two cluster files, merge and save
-insheet using "$pdfiles/October 2009 summaries/OctHH_wkend_clusterID.txt", tab clear
-rename fitcluster wkend_fitcluster
-lab var wkend_fitcluster "Weekend clusters" 
-rename id ID
-compress
-save "$pdfiles/October 2009 summaries/OctHH_wkend_clusterID.dta", replace
-insheet using "$pdfiles/October 2009 summaries/OctHH_midwk_clusterID.txt", tab clear
-rename fitcluster midwk_fitcluster
-lab var midwk_fitcluster "Mid-week clusters"
-rename id ID
-compress
-save "$pdfiles/October 2009 summaries/OctHH_midwk_clusterID.dta", replace
-merge 1:1 ID using "$pdfiles/October 2009 summaries/OctHH_wkend_clusterID.dta", nogen
-* overlap between clusters?
-tab wkend_fitcluster midwk_fitcluster, mi
-save "$pdfiles/October 2009 summaries/OctHH_clusterIDs.dta", replace
-merge 1:1 ID using "$pdfiles/Smart meters Residential pre-trial survey data-$version.dta"
-* so 746 households don't match to the Oct 2009 sample leaving us with 3,486
-gen oct_sample = 0
-replace oct_sample = 1 if _merge == 3
-save "$pdfiles/Oct-2009-summaries-survey-$version.dta", replace
-******************************
-* load in Sharon's daily summaries for weekdays (derived from the raw data)
-* this one has spaces as delimiter
-insheet using "$pdfiles/October 2009 summaries/CER_OctHH_midwk_long.txt", delim(" ") clear
-destring ecf lf, replace force
-gen midweek = 1
-compress
-save "$pdfiles/October 2009 summaries/CER_OctHH_midwk_long.dta", replace
-* this one has tabs!
-insheet using "$pdfiles/October 2009 summaries/CER_OctHH_wkend_long.txt", tab clear
-destring ecf lf, replace force
-gen midweek = 0
-compress
-save "$pdfiles/October 2009 summaries/CER_OctHH_wkend_long.dta", replace
-* append mid-week
-append using "$pdfiles/October 2009 summaries/CER_OctHH_midwk_long.dta"
-rename id ID
-* remove the dates that are NOT October 2009 (why are they in there anyway??)
-drop if dateoct > 300
-* add survey & cluster data
-merge m:1 ID using "$pdfiles/Oct-2009-summaries-survey-$version.dta", gen(m_survey)
-* some survey respondents not in the October data, some in October data not in survey
-* keep what matches
-keep if m_survey == 3
-* save
-save "$pdfiles/Oct-2009-daily-summaries-survey-$version.dta", replace
-*********************
 * Switch to 1/2 hour level data
 * raw data
-insheet using "$odfiles/data/original/HH2009_long.txt", delimiter(" ") names clear
+insheet using "$odfiles/original/HH2009_long.txt", delimiter(" ") names clear
-rename HHID ID
+rename hhid ID
 rename kw kwh
 tostring ds, force generate(ts_ds)
 gen date = substr(ts_ds,1,3)
 gen halfhour = substr(ts_ds,4,5)
-* mid-week
-insheet using "$pdfiles/CER_OctHH_data/CER_OctHH_mdwk_30min.txt", tab clear
-li in 1/5
-* the columns are munched
-drop id
-rename ds ID
-lab var ID "ID"
-rename kw timestamp
-lab var timestamp "timestamp (original format)"
-rename dateoct kwh
-lab var kwh "kWh"
-rename v5 date
-lab var date "date (original format)"
-li in 1/5
-* need to weed out the October 2010 cases
-keep if date < 365
-tostring timestamp, gen(tmp_timestamp) force
-gen halfhour = substr(tmp_timestamp,4,5)
-tab date
-* how many households do we have in this sample?
-* should be same as from the clustering
-preserve
-	collapse (mean) kwh , by(ID)
-	desc
-restore
-gen midweek = 1
-lab def midweek 0 "Saturday/Sunday" 1 "Tuesday-Thursday"
-lab val midweek midweek
-drop tmp_timestamp
-save "$pdfiles/CER_OctHH_data/CER_Oct2009HH_mdwk_30min.dta", replace
-*******************************
-* load in weekends
-insheet using "$pdfiles/CER_OctHH_data/CER_OctHH_wkend_30min.txt", tab clear
-li in 1/5
-* the columns are munched again
-drop id
-rename ds ID
-lab var ID "ID"
-rename kw timestamp
-lab var timestamp "timestamp (original format)"
-rename dateoct kwh
-lab var kwh "kWh"
-rename v5 date
-lab var date "date (original format)"
-li in 1/5
-* need to weed out the October 2010 cases
-keep if date < 365
-tostring timestamp, gen(tmp_timestamp) force
-gen halfhour = substr(tmp_timestamp,4,5)
-gen midweek = 0
-lab val midweek midweek
-drop tmp_timestamp
-save "$pdfiles/CER_OctHH_data/CER_Oct2009HH_wkend_30min.dta", replace
-*********************************
-* append mid week to weekend
-append using "$pdfiles/CER_OctHH_data/CER_Oct2009HH_mdwk_30min.dta"
-* add the clustering results
-merge m:1 ID using "$pdfiles/October 2009 summaries/OctHH_clusterIDs.dta", gen(m_cluster)
 * fix dates properly
 * we know date = 1 = Jan 1st 2009
 gen double s_date = mdy(1, 1, 2009)
 format s_date %td
 * add the number of days but subtract 1 otherwise we will start on 2/1/2009!
+destring date, force replace
 replace s_date = s_date + (date - 1)
 * create day of week (remember in stata 0 = Sunday)
@@ -344,7 +211,7 @@ gen double s_datetime = dhms(s_date, hour, mins, sec)
 format s_datetime %tc
 * add the survey data (makes big file) but only keep what we need
-merge m:1 ID using "$pdfiles/Smart meters Residential pre-trial survey data-$version.dta", gen(m_survey) ///
+merge m:1 ID using "$odfiles/processed/Smart meters Residential pre-trial survey data-$version.dta", gen(m_survey) ///
 	keepusing(ba_*)
 sort ID s_datetime
@@ -352,9 +219,9 @@ sort ID s_datetime
 * check
 li ID date halfhour s_* in 1/12, sep(2)
-drop m_cluster timestamp date ds_halfhour halfhour hour mins sec
+drop timestamp date ds_halfhour halfhour hour mins sec
-save "$pdfiles/CER_OctHH_data/CER_Oct2009HH_30min_survey.dta", replace
+save "$odfiles/processed/HH2009_long_survey.dta", replace
 timer off 1