moved main processing code to here as it should really only be run once (or a...

Moved the main processing code to here, as it should really only be run once (or a very few times) a month - no need for a function.

moved main processing code to here as it should really only be run once (or a...
254fe07f · Ben Anderson · 5cf49759 · 254fe07f
Commit 254fe07f authored 7 years ago by Ben Anderson
--- a/greenGrid/processGridSpyData.R
+++ b/greenGrid/processGridSpyData.R
-library(greenGridr)
+#### About ----
+# Code to process NZ GREEN Grid grid spy data
+#### Libraries ----
+library(data.table)
+library(lubridate) # keep here otherwise data.table masks various functions
+library(readr)
+library(greenGridr)
-fpath <- "/Volumes/hum-csafe/Research Projects/GREEN Grid/_RAW DATA/GridSpyData" # location of data
+#### Local parameters ----
+fpath <- "/Volumes/hum-csafe/Research Projects/GREEN Grid/_RAW DATA/GridSpyData/" # root folder of the raw data; keep the trailing "/" because full file paths are built from it with paste0()
+#fpath <- "~/Data/NZGreenGrid/gridspy/1min_orig/" # location of data
 pattern <- "*at1.csv$" # file name pattern passed to list1mGridSpyFiles(); filters only 1 min data
+outPath <- "/Volumes/hum-csafe/Research Projects/GREEN Grid/Clean_data/gridSpy/1min/" # folder the per-household year/month csv files are saved to; keep the trailing "/" because file names are appended with paste0()
+#outPath <- "~/Data/NZGreenGrid/gridspy/consolidated/"
+dataThreshold <- 3000 # file size in bytes; only files larger than this are loaded, anything else is assumed to hold no data
+### Code ----
+# Get the full 1 minute file listing ----
 filesDT <- list1mGridSpyFiles(fpath, pattern) # not defined in this script (presumably from greenGridr - TODO confirm); the result is used below as a data.table with a V1 column
+filesDT <- filesDT[, c("hhID","fileName") := tstrsplit(V1, "/")] # split V1 on "/" into household id and file name; assumes V1 looks like "<hhID>/<fileName>" - TODO confirm
+filesDT <- filesDT[, fullPath := paste0(fpath, hhID,"/",fileName)] # rebuild the full path to each file (relies on fpath ending in "/")
+print(paste0("Found ", nrow(filesDT), " files from ", uniqueN(filesDT$hhID), " households."))
 # check: show the first few rows of the listing
 head(filesDT)
-# save the files out
+# Load, process & save the ones which probably have data ----
-outPath <- "~/Data/NZGreenGrid/gridspy/consolidated/" # place to save them
-process1mGridSpyFiles(filesDT)
+# For each household: stat every listed file, load the ones big enough to hold data,
+# stack them, drop rows duplicated by over-lapping files and save one csv per year & month.
+# Reads filesDT (columns hhID, fullPath), dataThreshold and outPath set above.
+# Leaves allFileInfoDT (file.info() for every listed file) in the workspace and prints its summary.
+hhIDs <- unique(filesDT$hhID) # list of household ids
+fileInfoList <- vector("list", length(hhIDs)) # one file.info() table per household, bound once at the end
+for(i in seq_along(hhIDs)){
+  hh <- hhIDs[i]
+  print(paste0("Loading: ", hh))
+  filesToLoad <- filesDT[hhID == hh, fullPath]
+  expandedPaths <- path.expand(filesToLoad) # just in case of ~ etc; used for every file operation below
+  finfo <- file.info(expandedPaths) # vectorised: one row per file, includes the size in bytes
+  fileInfoList[[i]] <- as.data.table(finfo)
+  # an NA size means the file could not be found/stat-ed - report it and carry on rather than failing in if()
+  noInfo <- is.na(finfo$size)
+  if(any(noInfo)){
+    print(paste0("No file info (skipped): ", filesToLoad[noInfo]))
+  }
+  # > load the files which probably have data ----
+  toLoad <- which(!noInfo & finfo$size > dataThreshold) # set above
+  loadedList <- vector("list", length(toLoad)) # collect, then bind once (avoids re-copying on every rbind)
+  for(j in seq_along(toLoad)){
+    k <- toLoad[j]
+    print(paste0("Checking: ", filesToLoad[k]))
+    print(paste0("File size = ", finfo$size[k], " so probably OK")) # files under 3kb are probably empty
+    # attempt to load the file
+    loadedList[[j]] <- fread(expandedPaths[k])
+  }
+  tempHhDT <- rbindlist(loadedList, fill = TRUE) # fill just in case there are different numbers of columns (quite likely!)
+  if(nrow(tempHhDT) == 0){
+    # nothing was loaded, so there is no date column to process and nothing to save
+    print(paste0("No data loaded for ", hh, " - skipping"))
+    next
+  }
+  if(!("date NZ" %in% names(tempHhDT))){
+    stop("No 'date NZ' column found in the files loaded for household ", hh, call. = FALSE)
+  }
+  # tidy column names
+  tempHhDT[, r_dateTime := `date NZ`]
+  tempHhDT[, `date NZ` := NULL] #to avoid confusion
+  # remove duplicates caused by over-lapping files
+  nObs <- nrow(tempHhDT)
+  print(paste0("N rows before removal of dublicates: ", nObs))
+  tempHhDT <- unique(tempHhDT)
+  nObs <- nrow(tempHhDT)
+  print(paste0("N rows after removal of dublicates: ", nObs))
+  # set month & year - called via lubridate:: so the result does not depend on which package masks month()/year()
+  tempHhDT[, month := lubridate::month(r_dateTime)]
+  tempHhDT[, year := lubridate::year(r_dateTime)]
+  # rows without a usable date cannot be assigned to a monthly file - say so instead of dropping them silently
+  nNoDate <- tempHhDT[is.na(month) | is.na(year), .N]
+  if(nNoDate > 0){
+    print(paste0("N rows with no usable date (not saved): ", nNoDate))
+  }
+  # > save out by year & month ----
+  # loop over the year/month pairs actually present so that no header-only files are written
+  # for combinations with no data (e.g. when the data spans a year boundary)
+  periods <- unique(tempHhDT[!is.na(month) & !is.na(year), .(year, month)])
+  for(p in seq_len(nrow(periods))){
+    y <- periods$year[p]
+    m <- periods$month[p]
+    ofile <- paste0(outPath, hh,"_", y, "_", m, "_all_1min_data.csv")
+    write_csv(tempHhDT[month == m & year == y], ofile)
+    print(paste0("Saved ", ofile))
+    #cmd <- paste0("gzip -f ", ofile) # gzip it
+    #try(system(cmd)) # in case it fails - if it does there will just be .csv files (not gzipped) - e.g. under windows
+  }
+}
+allFileInfoDT <- rbindlist(fileInfoList) # file info for every listed file, including the ones too small to load
+summary(allFileInfoDT)