#### About ----
# Code to process NZ GREEN Grid grid spy data
#
# For each household found under `fpath`:
#   1. load every 1 minute file that is large enough to plausibly hold data
#   2. stack the files and drop rows duplicated by over-lapping files
#   3. write one csv per household per (year, month) that occurs in the data
# Finally print a summary of the file.info() of every file inspected.

#### Libraries ----
library(data.table)
library(lubridate) # keep here otherwise data.table masks various functions
library(readr)
library(greenGridr)

#### Local parameters ----
fpath <- "/Volumes/hum-csafe/Research Projects/GREEN Grid/_RAW DATA/GridSpyData/" # location of data
#fpath <- "~/Data/NZGreenGrid/gridspy/1min_orig/" # location of data
pattern <- "*at1.csv$" # filters only 1 min data
outPath <- "/Volumes/hum-csafe/Research Projects/GREEN Grid/Clean_data/gridSpy/1min/" # place to save them
#outPath <- "~/Data/NZGreenGrid/gridspy/consolidated/"

dataThreshold <- 3000 # assume any files smaller than this (bytes) = no data

### Code ----

# Get the full 1 minute file listing ----
# NOTE(review): assumes list1mGridSpyFiles() returns a data.table whose V1
# column holds "<hhID>/<fileName>" relative paths - that is what the split
# below relies on; confirm against greenGridr.
filesDT <- list1mGridSpyFiles(fpath, pattern)
filesDT <- filesDT[, c("hhID", "fileName") := tstrsplit(V1, "/")]
# fpath already ends in "/" so paste0 (not file.path) keeps the paths clean
filesDT <- filesDT[, fullPath := paste0(fpath, hhID, "/", fileName)]

print(paste0("Found ", nrow(filesDT), " files from ", uniqueN(filesDT$hhID), " households."))

# check
head(filesDT)

# Load, process & save the ones which probably have data ----

hhIDs <- unique(filesDT$hhID) # list of household ids

# One file.info() record per file inspected. Collected in a preallocated list
# and bound once at the end rather than rbind-ing inside the loop.
fileInfoList <- vector("list", nrow(filesDT))
nFilesChecked <- 0L

for (hh in hhIDs) {
  print(paste0("Loading: ", hh))
  filesToLoad <- filesDT[hhID == hh, fullPath]

  # contents of each usable file; entries left NULL are ignored by rbindlist
  hhDataList <- vector("list", length(filesToLoad))

  for (i in seq_along(filesToLoad)) {
    f <- filesToLoad[i]
    # check file
    rf <- path.expand(f) # just in case of ~ etc
    finfo <- file.info(rf)
    nFilesChecked <- nFilesChecked + 1L
    fileInfoList[[nFilesChecked]] <- as.data.table(finfo)

    # size is NA if the file vanished or cannot be read: treat that as no data
    fsize <- finfo$size
    if (!is.na(fsize) && fsize > dataThreshold) { # set above
      print(paste0("Checking: ", f))
      print(paste0("File size = ", fsize, " so probably OK")) # files under 3kb are probably empty
      # attempt to load the file
      hhDataList[[i]] <- fread(rf)
    }
  }

  # fill = TRUE just in case there are different numbers of columns (quite likely!)
  tempHhDT <- rbindlist(hhDataList, fill = TRUE)

  # nothing usable for this household: skip it rather than fail further down
  if (nrow(tempHhDT) == 0) {
    print(paste0("No files with data found for ", hh, " - skipping"))
    next
  }
  if (!("date NZ" %in% names(tempHhDT))) {
    warning("No 'date NZ' column found for ", hh, " - skipping", call. = FALSE)
    next
  }

  # tidy column names
  # (add + drop rather than setnames() so r_dateTime stays the last of the
  # original columns, as in previously saved files)
  tempHhDT[, r_dateTime := tempHhDT[["date NZ"]]]
  tempHhDT[, "date NZ" := NULL] # to avoid confusion

  # remove duplicates caused by over-lapping files
  nObs <- nrow(tempHhDT)
  print(paste0("N rows before removal of dublicates: ", nObs))
  tempHhDT <- unique(tempHhDT)
  nObs <- nrow(tempHhDT)
  print(paste0("N rows after removal of dublicates: ", nObs))

  # set month & year (explicit namespace: data.table also exports month/year)
  tempHhDT[, month := lubridate::month(r_dateTime)]
  tempHhDT[, year := lubridate::year(r_dateTime)]

  # rows whose date could not be interpreted cannot be assigned to a file
  nBadDates <- tempHhDT[is.na(year) | is.na(month), .N]
  if (nBadDates > 0) {
    warning(nBadDates, " rows for ", hh,
            " have no usable r_dateTime and will not be saved", call. = FALSE)
  }

  # > save out by year & month ----
  # Only loop over the (year, month) pairs that actually occur. Looping over
  # every month x every year would write header-only files for combinations
  # that have no data (e.g. a household observed Nov 2014 - Feb 2015).
  yearMonths <- unique(tempHhDT[, c("year", "month"), with = FALSE])
  yearMonths <- yearMonths[!is.na(year) & !is.na(month)]

  for (j in seq_len(nrow(yearMonths))) {
    thisYear <- yearMonths$year[j]
    thisMonth <- yearMonths$month[j]
    ofile <- paste0(outPath, hh, "_", thisYear, "_", thisMonth, "_all_1min_data.csv")
    write_csv(tempHhDT[month == thisMonth & year == thisYear], ofile)
    print(paste0("Saved ", ofile))
    #cmd <- paste0("gzip -f ", ofile) # gzip it
    #try(system(cmd)) # in case it fails - if it does there will just be .csv files (not gzipped) - e.g. under windows
  }
}

# Summarise the file.info() of every file inspected ----
allFileInfoDT <- rbindlist(fileInfoList)
summary(allFileInfoDT)