diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..807ea251739a053faee6d72fe9dc3ae80d80cc15 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.Rproj.user +.Rhistory +.RData diff --git a/CER-data-create-wfs.do b/CER-data-create-wfs.do index 8d7ea2e11eee9dac47b2f2137a71caf989f5946e..37882b8ac4c1383017c8a55be0bd4e77069b5787 100644 --- a/CER-data-create-wfs.do +++ b/CER-data-create-wfs.do @@ -55,9 +55,9 @@ set more off clear all -capture log close +capture log close _all -log using "$logpath/CER-data-processing-samples-$version.smcl", replace +log using "$logpath/CER-data-processing-samples-$version.smcl", replace name(main) timer clear @@ -72,11 +72,43 @@ use "$dpath/processed/SME and Residential allocations.dta" * merge pre-trial residential survey data merge 1:1 ID using "$dpath/processed/Smart meters Residential pre-trial survey data-$version.dta" +drop _merge + * keep the residential IDs only keep if sample == 1 -* keep basic info +* set labels properly +lab var ba_npeople "Number of residents" +lab var ba_nchildren "Number of children" +lab var ba_nadults "Number of adults" +lab var ba_floorarea "Self-reported floor area in m2" +lab var ba_empl "HRP employment status" + +log off main + +log using "$dpath/processed/CER-pre-trial-survey-residential-sample-$version-codebook.smcl", /// + replace name(cb) +desc +codebook + +log off cb + +log on main +* create a detailed version of the survey +outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-$version.csv", comma replace + +* keep basic info to form a simplified version of the survey keep ID sample res_stim res_tariff ba_* +log off main + +log using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version-codebook.smcl", /// + replace name(cb_wf) +desc +codebook + +log off cb_wf + +log on main save "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version.dta", replace outsheet using
"$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version.csv", comma replace @@ -84,7 +116,7 @@ outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$ver ************************************ ************************************ -* Create 4 samples - Oct/Dec 2009 and Oct/Dec 2010 +* Create 2 samples - Oct 2009-2010 and Dec 2009-2010 * Make sure collect all half hours local Oct2009 "tin(01oct2009 00:00, 31oct2009 23:45)" local Dec2009 "tin(01dec2009 00:00, 31dec2009 23:45)" @@ -110,6 +142,8 @@ foreach s of local samples { * restrict to the matched households keep if _merge == 3 + drop _merge sample // reduce size of files + xtset ID s_datetime, delta(30 mins) * keep the periods we care about keep if `Oct2009' || `Oct2010' || `Dec2009' || `Dec2010' diff --git a/CER-data-processing-electricity.R b/CER-data-processing-electricity.R new file mode 100644 index 0000000000000000000000000000000000000000..ced3caeda7c407702de2c09c06b8eea578867cc7 --- /dev/null +++ b/CER-data-processing-electricity.R @@ -0,0 +1,141 @@ +######## +# Data preparation of the Commission for Energy Regulation (CER)'s Irish Smart Meter Trial data +# - http://www.ucd.ie/issda/data/commissionforenergyregulationcer/ + +# - original electricity smart meter data = 6 File<n>.txt files (space delimited) +# Easiest way to merge them: cat *.txt > catFiles.txt on a (fast) unix box + +# processes the original data for further use: +# - create subsamples for residential data for Oct 2009 - Oct 2010 & Dec 2009 - Dec 2010 + +# This work was funded by RCUK through the ESRC's Transformative Social Science Programme via the +# "Census 2022: Transforming Small Area Socio-Economic Indicators through 'Big Data'" Project +# - http://gtr.rcuk.ac.uk/project/2D2CD798-4F04-4399-B1AF-D810A233DD21 +# - http://www.energy.soton.ac.uk/tag/census2022/ + +# Copyright (C) 2014 University of Southampton + +# Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut, +# https://github.com/dataknut) +# 
[Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton] + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License +# (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +#YMMV - http://en.wiktionary.org/wiki/YMMV + +# Housekeeping ---- +# clear the workspace +rm(list=ls()) + +# load any libraries +library(data.table) +# where's the data? +setwd("~/Documents/Work/Data/CER Smart Metering Project/data/") + +# need to unzip and concatenate the data first - it comes as 6 zipped text files +# do this using "cat File1.txt File2.txt File3.txt File4.txt File5.txt File6.txt > catFiles.txt" + +# load data and fix variables ---- +inpath <- "original/CER_both/CER Electricity Revised March 2012/" + +# sample allocation data +infile <- "SME and Residential allocations.csv" +CER_HH_sample_DT <- as.data.table(read.csv(paste0(inpath,infile)), key = "ID") + +# CER_HH_sample_DT$code: +# 1 = residential +# 2 = SME +# 3 = other +CER_HH_sample_DT$Code <- factor(CER_HH_sample_DT$Code, + levels = c("1","2","3"), + labels = c("Residential", "SME","Other")) +table(CER_HH_sample_DT$Code) + + +# merge in kWh data ---- +# test with File1.txt until we know it works +infile <- "File1" +# then switch to the full data file when we know it works! 
+#infile <- "catFiles" +# construct the data file name - it is a .txt file +dataf <- paste0(inpath,infile,".txt") +print(paste0("Loading: ", dataf)) +CER_HH_kWh_DT <- as.data.table(read.table(dataf, + col.names = c("ID","timestamp", "kWh")), key = "ID") + +# now sort the data by ID (household) and then timestamp within ID +CER_HH_kWh_DT <- CER_HH_kWh_DT[with(CER_HH_kWh_DT, order(ID, timestamp)),] + +CER_HH_kWh_DT <- merge(CER_HH_kWh_DT, CER_HH_sample_DT, by = "ID") +# how do we specify the columns we want to keep when we merge? + +# subset to keep the ones we want (must be a better way) +CER_HH_kWh_DT <- subset( + CER_HH_kWh_DT, + select = c("ID","Code","timestamp", "kWh")) + +# create real date/time variables ---- +# For some reason the supplied timestamp is: +# Day code: digits 1-3 (day 1 = 1 January 2009) +# digits 4-5 (half hour 1 - 48) 1= 00:00:00 – 00:29:59 + +CER_HH_kWh_DT$day <- as.numeric(substr(CER_HH_kWh_DT$timestamp, 1, 3)) +CER_HH_kWh_DT$halfhour <- as.numeric(substr(CER_HH_kWh_DT$timestamp, 4, 5)) +CER_HH_kWh_DT$datetime_z <- as.POSIXct("01/01/2009 00:00:00", tz = "", format = "%d/%m/%Y %H:%M:%S") +CER_HH_kWh_DT$datetime_start <- CER_HH_kWh_DT$datetime_z + # start with date zero + ((CER_HH_kWh_DT$day-1)*24*60*60) + # add days but subtract 1 as day 1 = the zero date (1 Jan 2009) + ((CER_HH_kWh_DT$halfhour-1)*30*60) # add halfhours but subtract 1 as first needs to be '0' + +# remove unwanted variables to save memory +CER_HH_kWh_DT$timestamp <- NULL +CER_HH_kWh_DT$day <- NULL +CER_HH_kWh_DT$halfhour <- NULL +CER_HH_kWh_DT$datetime_z <- NULL + +# save outputs ---- +outpath <- "processed" +# save the subsamples, do not save the whole file as it will be very large +# use a pair of loops to prevent typing errors!
+years <- c("2009", "2010") +samples <- c("Residential", "SME") + +for (y in years) { + for (s in samples) { + print(paste0("Saving ", s, " in ", y)) + # October samples + dateSt <- paste0(y,"-10-01") + dateEn <- paste0(y,"-10-31") + date_start<-as.POSIXct(dateSt,tz="") + date_end<-as.POSIXct(dateEn,tz="") + outfile <- paste0(outpath,"/","CER_October_",y,"_",s,".csv") + print(paste0("Saving: ", outfile)) + write.csv( + CER_HH_kWh_DT[ + CER_HH_kWh_DT$datetime_start >= date_start & CER_HH_kWh_DT$datetime_start < (date_end + 86400) & + CER_HH_kWh_DT$Code == s], + row.names = FALSE, + file = outfile) + + # December samples + dateSt <- paste0(y,"-12-01") + dateEn <- paste0(y,"-12-31") + date_start<-as.POSIXct(dateSt,tz="") + date_end<-as.POSIXct(dateEn,tz="") + outfile <- paste0(outpath,"/","CER_December_",y,"_",s,".csv") + print(paste0("Saving: ", outfile)) + write.csv( + CER_HH_kWh_DT[ + CER_HH_kWh_DT$datetime_start >= date_start & CER_HH_kWh_DT$datetime_start < (date_end + 86400) & + CER_HH_kWh_DT$Code == s], + row.names = FALSE, + file = outfile) + } +} diff --git a/CER-data-processing-electricity.do b/CER-data-processing-electricity.do index f01ff1cf72c9db8e66444cd42105c76b6e7194fa..64f7504f43771befe92eee027431cbca672db60b 100644 --- a/CER-data-processing-electricity.do +++ b/CER-data-processing-electricity.do @@ -37,7 +37,7 @@ GNU General Public License for more details. global where "~/Documents/Work" * project -global proot "$where/Data/Social Science Datasets/CER Smart Metering Project" +global proot "$where/Data/CER Smart Metering Project" * data global dpath "$proot/data" @@ -68,8 +68,8 @@ local fill_option "full" * only set this to 1 if you want to refresh ALL the input files!
local do_raw 0 // process big half hour data file - need to unzip the txt files, takes ages to run -local do_summaries 0 // create summary files at ID level -local do_census2022 1 // create Census2022 Oct 2009 dataset +local do_summaries 1 // create summary files at ID level - takes even longer +local do_census2022 0 // create Census2022 Oct 2009 dataset ************************************ ************************************ @@ -113,30 +113,41 @@ if `do_raw' { gen mins = 0 replace mins = 30 if mod(halfhour,2) == 1 - gen sec = 0 + gen secs = 0 di "* create stata datetime - this breaks for those halfhours which are within clock changes - i.e. halfhour == 49 or 50" - gen double s_datetime = dhms(s_date, hour, mins, sec) - format s_datetime %tc + gen double stata_datetime = dhms(s_date, hour, mins, secs) + format stata_datetime %tc di "* drop those where s_datetime = missing - XX this will mean dropping the 49 & 50 halfhours when the clocks changed" - drop if s_datetime == . + drop if stata_datetime == . di "* check first 6 hours" li ID date halfhour s_* in 1/12, sep(2) * which day does the data start/end? 
- qui: su s_datetime + qui: su stata_datetime di "Data start: " %tc `r(min)' di "Data end: " %tc `r(max)' + * create an R-friendly date time + + gen double t_date = dofc(stata_datetime) + gen double t_day = day(t_date) + gen double t_month = month(t_date) + gen double t_year = year(t_date) + + egen r_date = concat(t_year t_month t_day), punct("/") + egen r_time = concat(hour mins secs), punct(":") + egen r_datetime = concat(r_date r_time), punct(" ") + di "* Drop all variables we don't need" - drop date s_date halfhour hour mins sec + drop date s_date halfhour hour mins secs t_* r_date r_time di "* test for duplicates (will break xtset if there are)" - duplicates tag ID s_datetime, gen(dups) - tabout dups using "$logpath/File`f'-dups-test.txt" + duplicates tag ID stata_datetime, gen(dups) + tabout dups using "$logpath/File`f'-dups-test.txt", replace di "* usually a few - list them" li if dups == 1 @@ -145,7 +156,7 @@ if `do_raw' { lab var ID "Household id - links to survey" lab var kwh "kWh consumed" - lab var s_datetime "Time stamp (STATA SIF format)" + lab var stata_datetime "Time stamp (STATA SIF format)" drop dups compress @@ -166,11 +177,15 @@ if `do_raw' { di "* Skip testing for missing for now due to file size" di "* setting the xt so we don't have to do it again (takes ages)" - xtset ID s_datetime, delta(30 minutes) + xtset ID stata_datetime, delta(30 minutes) di "* save out the whole enormous file" - save "$dpath/processed/CER-halfhour-electricity-all.dta", replace + preserve + drop r_datetime + save "$dpath/processed/CER-halfhour-electricity-all.dta", replace + restore * as CSV for R etc + drop stata_datetime outsheet using "$dpath/processed/CER-halfhour-electricity-all.csv", comma replace } else { @@ -184,11 +199,13 @@ if `do_summaries' { di "*******************************************************" di "* Creating a summary file by ID" + use "$dpath/processed/CER-halfhour-electricity-all.dta", clear + gen obs = 1 - collapse (min) min_s_datetime =
s_datetime (max) max_s_datetime = s_datetime (sum) obs kwh, by(ID) + collapse (min) min_s_datetime = stata_datetime (max) max_s_datetime = stata_datetime (sum) obs kwh, by(ID) - gen kwh_mean = kwh_sum/obs + gen kwh_mean = kwh/obs lab var kwh_mean "Mean kwh per half hour across all obs for this ID" su obs kwh_mean, de @@ -273,19 +290,21 @@ if `do_census2022' { di "* select 4 weeks around October 2009 as it matches to pre-trial survey closely" di "* -> Census2022 'sample'" + use "$dpath/processed/CER-halfhour-electricity-all.dta", clear + * should already be xtset from 'do_raw' keep if tin(28sept2009 00:00, 25oct2009 23:55) * check * which day does the data start/end? - qui: su s_datetime + qui: su stata_datetime di "Data start: " %tc `r(min)' di "Data end: " %tc `r(max)' di "* create day of week (remember in stata 0 = Sunday)" - gen s_dow = dow(dofc(s_datetime)) + gen s_dow = dow(dofc(stata_datetime)) di "* how many are missing?" tab s_dow, mi @@ -311,11 +330,11 @@ if `do_census2022' { * NB: if we did not specify the 'full' option this will only impute missing datetimes * between the first & last ID observation - so it fills in gaps, it does NOT full up all * possible datetimes from the start to the end of the sample - gen double s_dow = dow(dofc(s_datetime)) - gen double s_date = mdy(month(dofc(s_datetime)), day(dofc(s_datetime)), year(dofc(s_datetime))) + gen double s_dow = dow(dofc(stata_datetime)) + gen double s_date = mdy(month(dofc(stata_datetime)), day(dofc(stata_datetime)), year(dofc(stata_datetime))) format s_date %td - gen hours = hh(s_datetime) - gen mins = mm(s_datetime) + gen hours = hh(stata_datetime) + gen mins = mm(stata_datetime) gen secs = 0 gen double s_halfhour = hms(hours, mins, secs) format s_halfhour %tcHH:MM diff --git a/github.Rproj b/github.Rproj new file mode 100644 index 0000000000000000000000000000000000000000..8e3c2ebc99e2e337f7d69948b93529a437590b27 --- /dev/null +++ b/github.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: 
Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX