Skip to content
Snippets Groups Projects
Commit e352c168 authored by Ben Anderson
Browse files

major updates to CER data processing code

- uses paths instead of setting working directory
- saves Oct/Dec samples, Census2022 sample and also complete data by
Residential/SME/Other
- gzips output to support fread input (data.table) in other code
- fixes dates to r_datetime in this file so no need to do it again when
analysing data
parent fe9e12a0
No related branches found
No related tags found
No related merge requests found
########
# Header ####
# Data preparation of the Commission for Energy Regulation (CER)'s Irish Smart Meter Trial data
# - http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
......@@ -15,8 +15,8 @@
# Copyright (C) 2014 University of Southampton
# Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut,
# https://github.com/dataknut)
# Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut)
# https://github.com/dataknut
# [Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]
# This program is free software; you can redistribute it and/or modify
......@@ -36,19 +36,29 @@
# Start from a clean workspace: removes all (non-hidden) objects from the current environment
rm(list=ls())
# load any libraries
# NOTE(review): data.table is attached twice below; the second call is redundant
library(data.table)
library(data.table) # for super fast tables (provides fread, setkey)
##
# Set paths and files ----
# NOTE(review): dPath is assigned but not referenced anywhere in the visible code;
# inPath and outPath below are relative paths, so they resolve against the working directory
dPath <- "~/Documents/Work/Data/CER_Smart_Metering_Project/data/"
# where's the data?
# NOTE(review): this directory name uses spaces where dPath uses underscores - confirm which one exists
setwd("~/Documents/Work/Data/CER Smart Metering Project/data/")
inPath <- "original/CER_both/CER Electricity Revised March 2012/"
# where do we want it put?
outPath <- "processed/"
# consumption data
# need to unzip and concatenate the data first - it comes as 6 zipped text files
# do this using "cat File1.txt File2.txt File3.txt File4.txt File5.txt File6.txt > catFiles.txt"
# test with File1.txt until we know it works
# consInfile <- "File1.txt"
# then switch to the full data file when we know it works!
consInfile <- "catFiles.txt"
# load data and fix variables ----
# NOTE(review): inpath (lower case) holds the same value as inPath above
inpath <- "original/CER_both/CER Electricity Revised March 2012/"
# sample allocation file
sampleInfile <- "SME and Residential allocations.csv"
# sample allocation data
# NOTE(review): infile holds the same value as sampleInfile above
infile <- "SME and Residential allocations.csv"
# first read of the sample allocation file; the result is overwritten by the second read below
CER_HH_sampleDT <- fread(paste0(inpath,infile))
# Load & clean sample allocation data ----
# second read of the same file (inPath + sampleInfile builds the same path as inpath + infile)
CER_HH_sampleDT <- fread(paste0(inPath,sampleInfile))
# key the sample table on ID
setkey(CER_HH_sampleDT, ID)
# CER_HH_sample_DT$code:
......@@ -58,16 +68,13 @@ setkey(CER_HH_sampleDT, ID)
# Recode the numeric allocation Code into a labelled factor:
# 1 = Residential, 2 = SME, 3 = Other (any other value becomes NA)
CER_HH_sampleDT$AllocCode <- factor(CER_HH_sampleDT$Code,
levels = c("1","2","3"),
labels = c("Residential", "SME","Other"))
# check what kind of 'dwellings' we have (counts per original Code value)
table(CER_HH_sampleDT$Code)
# merge in kWh data ----
# test with File1.txt until we know it works
# infile <- "File1"
# then switch to the full data file when we know it works!
infile <- "catFiles"
# construct the data file name - it is a .txt file
# NOTE(review): this value of dataf is overwritten a few lines below before it is used
dataf <- paste0(inpath,infile,".txt")
####
# Load & process kWh data ----
# construct the data file name (consInfile already includes the .txt extension)
dataf <- paste0(inPath,consInfile)
# report which file is about to be loaded
print(paste0("Loading: ", dataf))
CER_HH_kWhDT <- fread(dataf,
col.names = c("ID","timestamp", "kWh")
......@@ -77,12 +84,13 @@ setkey(CER_HH_kWhDT, ID,timestamp)
# data.table join of the form X[Y]: bring the sample allocation columns
# (Code, AllocCode) onto the consumption rows
# NOTE(review): presumably matched on ID via the tables' keys - confirm
CER_HH_kWhDT <- CER_HH_kWhDT[CER_HH_sampleDT]
# check types of dwellings we have: cross-tabulate the original Code against
# the labelled AllocCode, including a row/column for NA
with(CER_HH_kWhDT,
table(Code,AllocCode, useNA = "always")
)
# subset to keep the variables we want
# keep the labelled sample allocation code (AllocCode) rather than the original Code,
# as the original is not trusted
CER_HH_kWhDT <- CER_HH_kWhDT[,.(ID,AllocCode,timestamp,kWh)]
# create real date/time variables ----
......@@ -90,21 +98,32 @@ CER_HH_kWhDT <- CER_HH_kWhDT[,.(ID,AllocCode,timestamp,kWh)]
# The CER timestamp is a 5 digit code:
#   digits 1-3: day code (day 1 = 1 January 2009)
#   digits 4-5: half hour of the day (1 - 48), where 1 = 00:00:00 - 00:29:59
#
# Build r_datetime (POSIXct, start of each half hour) from that code.
# Fixes compared with the previous version of this section:
#  - all references now use CER_HH_kWhDT (the table actually created above);
#    the stale CER_HH_kWh_DT name is never defined
#  - the format string is passed by name: previously tz was left empty and ""
#    was passed positionally, so "" was taken as the format
#  - day 1 must map to 1 January 2009, so we add (day - 1) days, not day days
#  - the second as.POSIXct() pass over an already-POSIXct column was a no-op
#    and has been dropped
# NOTE(review): the offsets are added as elapsed seconds to a local-time
# (tz = "") origin, so in a time zone with daylight saving the wall-clock time
# can shift by an hour across clock changes - confirm this is acceptable.

# zero time to start from: midnight at the start of day 1
dateZero <- as.POSIXct("01/01/2009 00:00:00",
                       tz = "",
                       format = "%d/%m/%Y %H:%M:%S")

# best to do this once here than many times over later.
# := adds the column by reference, so the (large) table is not copied and
# no intermediate day/halfhour columns need to be created and then removed.
CER_HH_kWhDT[, r_datetime := dateZero +
  (as.numeric(substr(timestamp, 1, 3)) - 1) * 24 * 60 * 60 + # whole days since day 1
  (as.numeric(substr(timestamp, 4, 5)) - 1) * 30 * 60]       # half hours; first one is offset 0

# remove the raw timestamp code to save memory
CER_HH_kWhDT[, timestamp := NULL]
# Save outputs ----
# each csv is gzipped after writing so that fread (in data.table) can load the
# compressed file directly in other code
# NOTE(review): outpath (lower case, no trailing slash) duplicates outPath ("processed/")
# which is set near the top of the script
outpath <- "processed"
# save the monthly subsamples first
# NOTE(review): an earlier comment here said not to save the whole file because it is
# very large, but a later section of this script does save the complete data per sample
# use a pair of loops to prevent typing errors!
# years for which the monthly sub-samples are extracted
years <- c("2009", "2010")
......@@ -113,32 +132,70 @@ samples <- c("Residential", "SME")
# Save the October and December sub-samples for each year and sample type.
# Uses `years`, `samples`, `outPath` and `CER_HH_kWhDT` (with its `r_datetime`
# and `AllocCode` columns) as created earlier in this script.
# Fixes compared with the previous version of this section:
#  - stale CER_HH_kWh_DT[...] lines removed (that table is never created and
#    they made the write.csv() calls invalid)
#  - the date filter is now a half-open interval [first day 00:00, day after
#    last day 00:00), so the whole of the 31st is included. The previous
#    `%in% date_start:date_end` stopped at 00:00 on the 31st and had to build
#    a vector with one element per second of the month.
#  - October and December share one code path instead of two copies
#  - the file name is shell-quoted before being passed to gzip

# label used in the file name -> month number (both months have 31 days)
monthsToSave <- c(October = "10", December = "12")

for (y in years) {
  for (s in samples) {
    print(paste0("Saving ", s, " in ", y))
    for (m in names(monthsToSave)) {
      # first and last calendar day of the month
      dateSt <- paste0(y, "-", monthsToSave[[m]], "-01")
      dateEn <- paste0(y, "-", monthsToSave[[m]], "-31")
      date_start <- as.POSIXct(dateSt, tz = "")
      date_end <- as.POSIXct(dateEn, tz = "")
      # exclusive upper limit: local midnight at the start of the day after dateEn
      date_limit <- as.POSIXct(format(as.Date(dateEn) + 1), tz = "")

      outfile <- paste0(outPath, "CER_", m, "_", y, "_", s, ".csv")
      print(paste0("Saving: ", outfile))
      write.csv(
        CER_HH_kWhDT[
          CER_HH_kWhDT$r_datetime >= date_start &
            CER_HH_kWhDT$r_datetime < date_limit &
            CER_HH_kWhDT$AllocCode == s],
        row.names = FALSE,
        file = outfile)

      # compress in the background (&), overwriting any existing .gz (-f)
      cmd <- paste0("gzip -f ", shQuote(outfile), " &")
      print(paste0("Compressing: ", outfile))
      system(cmd)
    }
  }
}
# Save the Census2022 October sub-sample ----
# 27th Sept 2009 (Sun) to 24th Oct 2009 (Sat) inclusive: a duration of 4 weeks.
# Residential sample only.
dateSt <- "2009-09-27"
dateEn <- "2009-10-24"
date_start <- as.POSIXct(dateSt, tz = "")
date_end <- as.POSIXct(dateEn, tz = "")
# exclusive upper limit: local midnight at the start of the day after dateEn.
# Filtering on [date_start, date_limit) keeps every half hour of 24th Oct; the
# previous `%in% date_start:date_end` stopped at 00:00 on the 24th and had to
# build a vector with one element per second of the period.
date_limit <- as.POSIXct(format(as.Date(dateEn) + 1), tz = "")

outfile <- paste0(outPath, "CER_Census2022_Autumn_2009.csv")
print(paste0("Saving: ", outfile))
write.csv(
  CER_HH_kWhDT[
    CER_HH_kWhDT$r_datetime >= date_start &
      CER_HH_kWhDT$r_datetime < date_limit &
      CER_HH_kWhDT$AllocCode == "Residential"],
  row.names = FALSE,
  file = outfile)

# compress in the background (&), overwriting any existing .gz (-f);
# the file name is shell-quoted in case the output path contains spaces
cmd <- paste0("gzip -f ", shQuote(outfile), " &")
print(paste0("Compressing: ", outfile))
system(cmd)
# Save the whole lot ----
# Write one complete file (every half hour) per sample type in `samples`,
# then gzip each file in the background.
for (s in samples) {
  # target file: <outPath>CER_all_half_hours_<sample>.csv
  outfile <- sprintf("%sCER_all_half_hours_%s.csv", outPath, s)
  print(paste0("Saving: ", outfile))

  # keep only the rows allocated to this sample type
  write.csv(CER_HH_kWhDT[CER_HH_kWhDT$AllocCode == s],
            file = outfile,
            row.names = FALSE)

  # -f overwrites any existing .gz; the trailing & runs gzip in the background
  cmd <- paste0("gzip -f ", outfile, " &")
  print(paste0("Compressing: ", outfile))
  system(cmd)
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment