Skip to content
Snippets Groups Projects
Commit 980e2ddf authored by Ben Anderson's avatar Ben Anderson
Browse files

moved code tree

parent 8a1fee1b
No related branches found
No related tags found
No related merge requests found
.Rproj.user
.Rhistory
.RData
...@@ -55,9 +55,9 @@ set more off ...@@ -55,9 +55,9 @@ set more off
clear all clear all
capture log close capture log close _all
log using "$logpath/CER-data-processing-samples-$version.smcl", replace log using "$logpath/CER-data-processing-samples-$version.smcl", replace name(main)
timer clear timer clear
...@@ -72,11 +72,43 @@ use "$dpath/processed/SME and Residential allocations.dta" ...@@ -72,11 +72,43 @@ use "$dpath/processed/SME and Residential allocations.dta"
* merge pre-trial residential survey data * merge pre-trial residential survey data
merge 1:1 ID using "$dpath/processed/Smart meters Residential pre-trial survey data-$version.dta" merge 1:1 ID using "$dpath/processed/Smart meters Residential pre-trial survey data-$version.dta"
drop _merge
* keep the residential IDs only * keep the residential IDs only
keep if sample == 1 keep if sample == 1
* keep basic info * set labels properly
lab var ba_npeople "Number of residents"
lab var ba_nchildren "Number of children"
lab var ba_nadults "Number of adults"
lab var ba_floorarea "Self-reported floor area in m2"
lab var ba_empl "HRP employment status"
log off main
log using "$dpath/processed/CER-pre-trial-survey-residential-sample-$version-codebook.smcl", ///
replace name(cb)
desc
codebook
log off cb
log on main
* create a detailed version of the survey
outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-$version.csv", comma replace
* keep basic info to form a simplified version of the survey
keep ID sample res_stim res_tariff ba_* keep ID sample res_stim res_tariff ba_*
log off main
log using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version-codebook.smcl", ///
replace name(cb_wf)
desc
codebook
log off cb_wf
log on main
save "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version.dta", replace save "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version.dta", replace
outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version.csv", comma replace outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version.csv", comma replace
...@@ -84,7 +116,7 @@ outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$ver ...@@ -84,7 +116,7 @@ outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$ver
************************************ ************************************
************************************ ************************************
* Create 4 samples - Oct/Dec 2009 and Oct/Dec 2010 * Create 2 samples - Oct 2009-2010 and Dec 2009-2010
* Make sure collect all half hours * Make sure collect all half hours
local Oct2009 "tin(01oct2009 00:00, 31oct2009 23:45)" local Oct2009 "tin(01oct2009 00:00, 31oct2009 23:45)"
local Dec2009 "tin(01dec2009 00:00, 31dec2009 23:45)" local Dec2009 "tin(01dec2009 00:00, 31dec2009 23:45)"
...@@ -110,6 +142,8 @@ foreach s of local samples { ...@@ -110,6 +142,8 @@ foreach s of local samples {
* restrict to the matched households * restrict to the matched households
keep if _merge == 3 keep if _merge == 3
drop _merge sample // reduce size of files
xtset ID s_datetime, delta(30 mins) xtset ID s_datetime, delta(30 mins)
* keep the periods we care about * keep the periods we care about
keep if `Oct2009' || `Oct2010' || `Dec2009' || `Dec2010' keep if `Oct2009' || `Oct2010' || `Dec2009' || `Dec2010'
......
########
# Data preparation of the Commission for Energy Regulation (CER)'s Irish Smart Meter Trial data
# - http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
# - original electricity smart meter data = 6 File<n>.txt files (space delimited)
# Easiest way to merge them: cat *.txt > catFiles.txt on a (fast) unix box
# processes the original data for further use:
# - create subsamples for residential data for Oct 2009 - Oct 2010 & Dec 2009 - Dec 2010
# This work was funded by RCUK through the ESRC's Transformative Social Science Programme via the
# "Census 2022: Transforming Small Area Socio-Economic Indicators through 'Big Data'" Project
# - http://gtr.rcuk.ac.uk/project/2D2CD798-4F04-4399-B1AF-D810A233DD21
# - http://www.energy.soton.ac.uk/tag/census2022/
# Copyright (C) 2014 University of Southampton
# Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut,
# https://github.com/dataknut)
# [Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License
# (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#YMMV - http://en.wiktionary.org/wiki/YMMV
# Housekeeping ----
# Clear the workspace so the script always starts from a clean state.
# NOTE(review): rm(list = ls()) does not unload packages or reset options();
# running in a fresh R session is the safer way to get a clean start.
rm(list=ls())
# data.table provides the keyed as.data.table()/merge()/subset operations used below
library(data.table)
# Root of the local copy of the CER data.
# NOTE(review): machine-specific path - confirm before running on another machine.
setwd("~/Documents/Work/Data/CER Smart Metering Project/data/")
# need to unzip and concatenate the data first - it comes as 6 zipped text files
# do this using "cat File1.txt File2.txt File3.txt File4.txt File5.txt File6.txt > catFiles.txt"
# Load data and fix variables ----
inpath <- "original/CER_both/CER Electricity Revised March 2012/"

# Sample allocation data: one row per meter ID with its trial allocation code
infile <- "SME and Residential allocations.csv"
CER_HH_sample_DT <- as.data.table(read.csv(paste0(inpath,infile)), key = "ID")

# Allocation codes in the source file:
#   1 = residential, 2 = SME, 3 = other
# Convert the numeric code to a labelled factor, in place
CER_HH_sample_DT[, Code := factor(Code,
                                  levels = c("1", "2", "3"),
                                  labels = c("Residential", "SME", "Other"))]
# Quick sanity check: counts per allocation group
table(CER_HH_sample_DT$Code)
# Merge in kWh data ----
# Test with File1.txt until we know it works
infile <- "File1"
# then switch to the full data file when we know it works!
#infile <- "catFiles"
# Construct the data file name - it is a .txt file
dataf <- paste0(inpath,infile,".txt")
print(paste0("Loading: ", dataf))
CER_HH_kWh_DT <- as.data.table(
  read.table(dataf, col.names = c("ID","timestamp", "kWh")),
  key = "ID"
)
# Sort by household (ID) and then by timestamp within household
setorder(CER_HH_kWh_DT, ID, timestamp)
# Attach each household's allocation code
CER_HH_kWh_DT <- merge(CER_HH_kWh_DT, CER_HH_sample_DT, by = "ID")
# Keep only the columns needed downstream
CER_HH_kWh_DT <- CER_HH_kWh_DT[, .(ID, Code, timestamp, kWh)]
# Create real date/time variables ----
# The supplied timestamp encodes time as:
#   digits 1-3: day code (day 1 = 1 January 2009)
#   digits 4-5: half hour 1-48 (1 = 00:00:00 - 00:29:59)
# Fixed: read.table() loads the timestamp as an integer, so leading zeros are
# lost and substr() mis-splits any day code below 100. Zero-pad back to the
# documented 5-character form before splitting.
ts_chr <- sprintf("%05d", CER_HH_kWh_DT$timestamp)
CER_HH_kWh_DT$day <- as.numeric(substr(ts_chr, 1, 3))
CER_HH_kWh_DT$halfhour <- as.numeric(substr(ts_chr, 4, 5))
# Fixed: the original call (`tz = , "", "%d/%m/%Y %H:%M:%S"`) left `tz` as an
# empty named argument, so the format string was never bound to `format`.
CER_HH_kWh_DT$datetime_z <- as.POSIXct("01/01/2009 00:00:00",
                                       tz = "",
                                       format = "%d/%m/%Y %H:%M:%S")
# Fixed off-by-one: day 1 must map onto 1 Jan 2009 itself, so add (day - 1)
# days - not `day` days - to the day-one datetime; likewise halfhour 1 adds 0.
# NOTE(review): adding fixed seconds ignores local clock changes (cf. the
# Stata code's halfhour 49/50 DST note) - confirm this is acceptable.
CER_HH_kWh_DT$datetime_start <- CER_HH_kWh_DT$datetime_z +
  ((CER_HH_kWh_DT$day - 1) * 24 * 60 * 60) +
  ((CER_HH_kWh_DT$halfhour - 1) * 30 * 60)
# Remove working variables to save memory
rm(ts_chr)
CER_HH_kWh_DT$timestamp <- NULL
CER_HH_kWh_DT$day <- NULL
CER_HH_kWh_DT$halfhour <- NULL
CER_HH_kWh_DT$datetime_z <- NULL
# Save outputs ----
outpath <- "processed"
# Save the month x sector subsamples only - the whole file would be very large.

# Write one month's half-hourly readings for one sector to CSV.
#   dt         : the merged half-hourly data.table (needs datetime_start, Code)
#   y          : year as a string, e.g. "2009"
#   month_num  : calendar month number (10 or 12 here)
#   month_name : month name used in the output file name
#   s          : sector label matching dt$Code ("Residential" or "SME")
# Fixed: the original filter `datetime_start %in% date_start:date_end`
# (a) materialised a one-second-resolution sequence of ~2.6m values per
# iteration, and (b) ended at 00:00 on the last day of the month, silently
# dropping almost all of that day's readings. Use half-open range
# comparisons with an exclusive bound at the start of the following month.
save_month_sample <- function(dt, y, month_num, month_name, s) {
  window_start <- as.POSIXct(sprintf("%s-%02d-01", y, month_num), tz = "")
  # exclusive upper bound: first instant of the following month
  window_end <- seq(window_start, by = "1 month", length.out = 2)[2]
  outfile <- paste0(outpath, "/", "CER_", month_name, "_", y, "_", s, ".csv")
  print(paste0("Saving: ", outfile))
  write.csv(
    dt[dt$datetime_start >= window_start &
         dt$datetime_start < window_end &
         dt$Code == s],
    row.names = FALSE,
    file = outfile)
}

# Use a pair of loops to prevent typing errors!
years <- c("2009", "2010")
samples <- c("Residential", "SME")
for (y in years) {
  for (s in samples) {
    print(paste0("Saving ", s, " in ", y))
    # October samples
    save_month_sample(CER_HH_kWh_DT, y, 10L, "October", s)
    # December samples
    save_month_sample(CER_HH_kWh_DT, y, 12L, "December", s)
  }
}
...@@ -37,7 +37,7 @@ GNU General Public License for more details. ...@@ -37,7 +37,7 @@ GNU General Public License for more details.
global where "~/Documents/Work" global where "~/Documents/Work"
* project * project
global proot "$where/Data/Social Science Datasets/CER Smart Metering Project" global proot "$where/Data/CER Smart Metering Project"
* data * data
global dpath "$proot/data" global dpath "$proot/data"
...@@ -68,8 +68,8 @@ local fill_option "full" ...@@ -68,8 +68,8 @@ local fill_option "full"
* only set this to 1 if you want to refresh ALL the input files! * only set this to 1 if you want to refresh ALL the input files!
local do_raw 0 // process big half hour data file - need to unzip the txt files, takes ages to run local do_raw 0 // process big half hour data file - need to unzip the txt files, takes ages to run
local do_summaries 0 // create summary files at ID level local do_summaries 1 // create summary files at ID level - takes even longer
local do_census2022 1 // create Census2022 Oct 2009 dataset local do_census2022 0 // create Census2022 Oct 2009 dataset
************************************ ************************************
************************************ ************************************
...@@ -113,30 +113,41 @@ if `do_raw' { ...@@ -113,30 +113,41 @@ if `do_raw' {
gen mins = 0 gen mins = 0
replace mins = 30 if mod(halfhour,2) == 1 replace mins = 30 if mod(halfhour,2) == 1
gen sec = 0 gen secs = 0
di "* create stata datetime - this breaks for those halfhours which are within clock changes - i.e. halfhour == 49 or 50" di "* create stata datetime - this breaks for those halfhours which are within clock changes - i.e. halfhour == 49 or 50"
gen double s_datetime = dhms(s_date, hour, mins, sec) gen double stata_datetime = dhms(s_date, hour, mins, secs)
format s_datetime %tc format stata_datetime %tc
di "* drop those where s_datetime = missing - XX this will mean dropping the 49 & 50 halfhours when the clocks changed" di "* drop those where s_datetime = missing - XX this will mean dropping the 49 & 50 halfhours when the clocks changed"
drop if s_datetime == . drop if stata_datetime == .
di "* check first 6 hours" di "* check first 6 hours"
li ID date halfhour s_* in 1/12, sep(2) li ID date halfhour s_* in 1/12, sep(2)
* which day does the data start/end? * which day does the data start/end?
qui: su s_datetime qui: su stata_datetime
di "Data start: " %tc `r(min)' di "Data start: " %tc `r(min)'
di "Data end: " %tc `r(max)' di "Data end: " %tc `r(max)'
* create an R-friendly date time
gen double t_date = dofc(stata_datetime)
gen double t_day = day(t_date)
gen double t_month = month(t_date)
gen double t_year = year(t_date)
egen r_date = concat(t_year t_month t_day), punct("/")
egen r_time = concat(hour mins secs), punct(":")
egen r_datetime = concat(r_date r_time), punct(" ")
di "* Drop all variables we don't need" di "* Drop all variables we don't need"
drop date s_date halfhour hour mins sec drop date s_date halfhour hour mins sec date t_* r_date r_time
di "* test for duplicates (will break xtset if there are)" di "* test for duplicates (will break xtset if there are)"
duplicates tag ID s_datetime, gen(dups) duplicates tag ID stata_datetime, gen(dups)
tabout dups using "$logpath/File`f'-dups-test.txt" tabout dups using "$logpath/File`f'-dups-test.txt", replace
di "* usually a few - list them" di "* usually a few - list them"
li if dups == 1 li if dups == 1
...@@ -145,7 +156,7 @@ if `do_raw' { ...@@ -145,7 +156,7 @@ if `do_raw' {
lab var ID "Household id - links to survey" lab var ID "Household id - links to survey"
lab var kwh "kWh consumed" lab var kwh "kWh consumed"
lab var s_datetime "Time stamp (STATA SIF format)" lab var stata_datetime "Time stamp (STATA SIF format)"
drop dups drop dups
compress compress
...@@ -166,11 +177,15 @@ if `do_raw' { ...@@ -166,11 +177,15 @@ if `do_raw' {
di "* Skip testing for missing for now due to file size" di "* Skip testing for missing for now due to file size"
di "* setting the xt so we don't have to do it again (takes ages)" di "* setting the xt so we don't have to do it again (takes ages)"
xtset ID s_datetime, delta(30 minutes) xtset ID stata_datetime, delta(30 minutes)
di "* save out the whole enormous file" di "* save out the whole enormous file"
preserve
drop r_datetime
save "$dpath/processed/CER-halfhour-electricity-all.dta", replace save "$dpath/processed/CER-halfhour-electricity-all.dta", replace
restore
* as CSV for R etc * as CSV for R etc
drop stata_datetime
outsheet using "$dpath/processed/CER-halfhour-electricity-all.csv", comma replace outsheet using "$dpath/processed/CER-halfhour-electricity-all.csv", comma replace
} }
else { else {
...@@ -184,11 +199,13 @@ if `do_summaries' { ...@@ -184,11 +199,13 @@ if `do_summaries' {
di "*******************************************************" di "*******************************************************"
di "* Creating a summary file by ID" di "* Creating a summary file by ID"
use "$dpath/processed/CER-halfhour-electricity-all.dta", clear
gen obs = 1 gen obs = 1
collapse (min) min_s_datetime = s_datetime (max) max_s_datetime = s_datetime (sum) obs kwh, by(ID) collapse (min) min_s_datetime = stata_datetime (max) max_s_datetime = stata_datetime (sum) obs kwh, by(ID)
gen kwh_mean = kwh_sum/obs gen kwh_mean = kwh/obs
lab var kwh_mean "Mean kwh per half hour across all obs for this ID" lab var kwh_mean "Mean kwh per half hour across all obs for this ID"
su obs kwh_mean, de su obs kwh_mean, de
...@@ -273,19 +290,21 @@ if `do_census2022' { ...@@ -273,19 +290,21 @@ if `do_census2022' {
di "* select 4 weeks around October 2009 as it matches to pre-trial survey closely" di "* select 4 weeks around October 2009 as it matches to pre-trial survey closely"
di "* -> Census2022 'sample'" di "* -> Census2022 'sample'"
use "$dpath/processed/CER-halfhour-electricity-all.dta", clear
* should already be xtset from 'do_raw' * should already be xtset from 'do_raw'
keep if tin(28sept2009 00:00, 25oct2009 23:55) keep if tin(28sept2009 00:00, 25oct2009 23:55)
* check * check
* which day does the data start/end? * which day does the data start/end?
qui: su s_datetime qui: su stata_datetime
di "Data start: " %tc `r(min)' di "Data start: " %tc `r(min)'
di "Data end: " %tc `r(max)' di "Data end: " %tc `r(max)'
di "* create day of week (remember in stata 0 = Sunday)" di "* create day of week (remember in stata 0 = Sunday)"
gen s_dow = dow(dofc(s_datetime)) gen s_dow = dow(dofc(stata_datetime))
di "* how many are missing?" di "* how many are missing?"
tab s_dow, mi tab s_dow, mi
...@@ -311,11 +330,11 @@ if `do_census2022' { ...@@ -311,11 +330,11 @@ if `do_census2022' {
* NB: if we did not specify the 'full' option this will only impute missing datetimes * NB: if we did not specify the 'full' option this will only impute missing datetimes
* between the first & last ID observation - so it fills in gaps, it does NOT full up all * between the first & last ID observation - so it fills in gaps, it does NOT full up all
* possible datetimes from the start to the end of the sample * possible datetimes from the start to the end of the sample
gen double s_dow = dow(dofc(s_datetime)) gen double s_dow = dow(dofc(stata_datetime))
gen double s_date = mdy(month(dofc(s_datetime)), day(dofc(s_datetime)), year(dofc(s_datetime))) gen double s_date = mdy(month(dofc(stata_datetime)), day(dofc(stata_datetime)), year(dofc(stata_datetime)))
format s_date %td format s_date %td
gen hours = hh(s_datetime) gen hours = hh(stata_datetime)
gen mins = mm(s_datetime) gen mins = mm(stata_datetime)
gen secs = 0 gen secs = 0
gen double s_halfhour = hms(hours, mins, secs) gen double s_halfhour = hms(hours, mins, secs)
format s_halfhour %tcHH:MM format s_halfhour %tcHH:MM
......
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment