GREENGridModel_v1.0.Rmd

    title: "GREENGrid area unit level electricity demand model"
    subtitle: "v0.01a"
    author: "Ben Anderson (University of Otago)"
    date: 'Last run at: `r Sys.time()`'
    output:
      bookdown::html_document2:
        code_folding: hide
        fig_caption: yes
        number_sections: yes
        self_contained: no
        toc: yes
        toc_depth: 3
        toc_float: yes
    bibliography: '`r path.expand("~/bibliography.bib")`'
    # change default
    knitr::opts_chunk$set(echo = TRUE)
    
    # Log compile time:
    startTime <- proc.time()
    
    library(spatialec)
    
    myPackages <- c(
      "data.table", # data munching
      "dkUtils", # utils - from 
      "ggplot2", # plots
      "hms", # h:m:s functions
      "ipfp", # ipf
      "GREENGridData", # processing green grid data
      "kableExtra", # pretty tables
      "lubridate", # date time functions
      "plotly", # interative plots
      "readr", # data reading
      "skimr", # for skim
      "survey", # weighted data summarising
      "viridis" # colour-blind friendly palettes
    )
    
    spatialec::loadLibraries(myPackages)
    
    # Identify the root directory ----
    root <- spatialec::findParentDirectory("spatialec")
    
    # --- Project Settings ----
    source(paste0(root, "/_scripts/spatialecSettings.R"))   # loads knitr options & assigns file paths and loads system info
    
    # ---- Local Settings ----
    lParams$censusDataSource <- "http://archive.stats.govt.nz/Census/2013-census/data-tables/meshblock-dataset.aspx"
    
    lParams$ggPath <- "~/Data/NZ_GREENGrid" # over-rides default for speed
    
    # Print system information
    outputMessage("Running on", lParams$sysName, "from projLoc =", root)   # sysName is set in spatialecSettings.R

    Report Purpose

    To use:

    • NZ Census 2013 data (from r lParams$censusDataSource) and
    • NZ GREENGrid household power demand data (from [@anderson_new_2018])

    To develop initial local area estimates of temporal (half-hourly) power demand in mesh blocks and/or unit areas in the Hawkes Bay and Taranaki areas. These areas have been chosen as they match the original sampling sites for the NZ GREENGrid household power data.

    Abstract

    This paper will present preliminary results from the development of a neighbourhood level temporal electricity demand model for New Zealand. The model uses a spatial microsimulation approach to combine New Zealand Census data with observed demand patterns derived from a small scale household power demand monitoring dataset to produce local area electricity demand profiles for each New Zealand area unit. The paper will describe the reproducible methods used, the results of baseline models of heat pump and lighting demand for selected areas of New Zealand and the implied need for peak demand mitigation methods that can avoid both costly local network reinforcement and increased demand for GHG-emitting generation. The paper will then discuss the results of using the model to explore the potential impact of energy efficient lighting in different areas. The paper will conclude by outlining ways in which the underlying data sources could be improved to provide a more robust and extensible modelling tool for the analysis of local infrastructure resilience.

    Load data

    NZ Census data

    Mesh blocks

    2013 NZ Census data from NZ Stats at mesh block level (pre-processed long form file).

    Skip for now.

    f <- paste0(lParams$dataPath, "processed/meshblock2013.csv.gz")
    outputMessage(paste0("Loading: ", f))
    mb2013DT <- data.table::fread(f)
    outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(mb2013DT))))
    outputMessage(paste0("N meshblocks loaded: ", dkUtils::tidyNum(uniqueN(mb2013DT$MB2013_code))))
    
    h <- head(mb2013DT)
    kableExtra::kable(h, caption = "Meshblocks data: head")%>%
      kable_styling()
    
    message("Summary of variables:")
    skimr::skim(mb2013DT)

    Area units

    2013 NZ Census data from NZ Stats at area unit level (pre-processed long form file).

    f <- paste0(lParams$dataPath, "processed/areaunits2013.csv.gz")
    outputMessage(paste0("Loading: ", f))
    au2013DT <- data.table::fread(f)
    outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(au2013DT))))
    outputMessage(paste0("N area units loaded: ", dkUtils::tidyNum(uniqueN(au2013DT$AU2013_code))))
    
    h <- head(au2013DT)
    kableExtra::kable(h, caption = "Area units data: head")%>%
      kable_styling()
    
    message("Summary of variables:")
    skimr::skim(au2013DT)
    
    message("List of available variables:")
    t <- table(au2013DT$variable)
    kableExtra::kable(t, caption = "Area units: available variables")%>%
      kable_styling()
    

    Load area labels

    areasDT <- data.table::fread(lParams$areasTable2013)
    
    auListDT <- areasDT[, .(nMBs = .N), keyby = .(AU2013_code, AU2013_label, REGC2013_label)]
    auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier matching
    setkey(auListDT, AU2013_code)

    We focus on households/families/dwellings not individuals as the spatial microsimulation will operate at the household level.

    GREENGrid Survey data

    This comprises the household attributes (survey) data:

    f <- paste0(lParams$ggPath, "/reshare/v1.0/data/ggHouseholdAttributesSafe.csv.zip")
    outputMessage(paste0("Loading: ", f))
    ggHhDT <- data.table::as.data.table(readr::read_csv(f))
    outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(ggHhDT))))
    outputMessage(paste0("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHhDT$linkID))))
    
    h <- head(ggHhDT)
    kableExtra::kable(h, caption = "GREENGrid Household attributes data: head")%>%
      kable_styling()
    
    message("Summary of variables:")
    skimr::skim(ggHhDT)
    

    Create synthetic weighted census

    This step uses the household attributes file and the Census file to create a synthetic (weighted) household dataset for the areas of interest.

    Harmonise categories

    We need household & Census files with just the variables (constraints) we're going to use. This will take a bit of creative re-coding.

    Families: Presence of children

    Survey:

    • nChildren0_12: 0,1,2+
    • nTeenagers13_18: 0,1,2+

    Census:

    • family_type_for_families_in_occupied_private_dwellings_Couple_with_child(ren)
    • family_type_for_families_in_occupied_private_dwellings_One_parent_with_child(ren)
    • family_type_for_families_in_occupied_private_dwellings_Total_families_in_occupied_private_dwellings (subtract those with children to get those without - see the sketch below)
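    The census reports only families with children and total families, so the "no children" count has to be derived as a residual. A minimal sketch of that subtraction, assuming the census data have been reshaped to wide form with one row per area unit (it mirrors the calculation applied later when the IPF constraints are assembled):

    # residual: families without children = total families - families with child(ren)
    censusAuWideDT <- censusAuWideDT[, value.nKids_0 := value.TotalFamilies - value.nKids_1m]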
    # number of kids: categorised as: 0, 1+ ----
    ggHhDT <- ggHhDT[, nKids :=  nChildren0_12 + nTeenagers13_18]
    table(ggHhDT$nKids, useNA = "always")
    message("There are NA - we will need to remove them later")
    ggHhDT <- ggHhDT[, nKids_0 := ifelse(nKids == 0, 1, 0)]
    ggHhDT <- ggHhDT[, nKids_1m := ifelse(nKids > 0 , 1, 0)]
    
    message("Household data: n kids")
    table(ggHhDT$nKids_0)
    table(ggHhDT$nKids_1m)
    
    # create category
    ggHhDT <- ggHhDT[, presenceKids := ifelse(nKids == 0, "None", "1+")]
    
    table(ggHhDT$nKids, ggHhDT$presenceKids, useNA = "always")
    
    # census: ----
    
    au2013DT <- au2013DT[variable == "family_type_for_families_in_occupied_private_dwellings_Couple_with_child(ren)" | 
                           variable == "family_type_for_families_in_occupied_private_dwellings_One_parent_with_child(ren)", censusConstraint  := "nKids_1m"]
    
    au2013DT <- au2013DT[variable == "family_type_for_families_in_occupied_private_dwellings_Total_families_in_occupied_private_dwellings", censusConstraint  := "TotalFamilies"]
    
    # we can't calculate the residual until we have the wide form

    Households: Number of people

    # number of people: categorised as: 1,2,3,4+ ----
    ggHhDT <- ggHhDT[, nPeople := nAdults + nChildren0_12 + nTeenagers13_18]
    table(ggHhDT$nPeople, useNA = "always")
    message("There are NA - we will need to remove them later")
    ggHhDT <- ggHhDT[, nPeople_1 := ifelse(nPeople == 1, 1, 0)]
    ggHhDT <- ggHhDT[, nPeople_2 := ifelse(nPeople == 2, 1, 0)]
    ggHhDT <- ggHhDT[, nPeople_3 := ifelse(nPeople == 3, 1, 0)]
    ggHhDT <- ggHhDT[, nPeople_4m := ifelse(nPeople > 3, 1, 0)]
    
    message("Household data: n people")
    table(ggHhDT$nPeople_1)
    table(ggHhDT$nPeople_2)
    table(ggHhDT$nPeople_3)
    table(ggHhDT$nPeople_4m)
    
    # create category
    ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_1 == 1, "1", NA)]
    ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_2 == 1, "2", nPeopleCat)]
    ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_3 == 1, "3", nPeopleCat)]
    ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_4m == 1, "4+", nPeopleCat)]
    
    table(ggHhDT$nPeopleCat, ggHhDT$nPeople, useNA = "always")
    
    # census: ----
    # number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_One_Usual_Resident up to number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Eight_or_More_Usual_Residents
    
    au2013DT <- au2013DT[variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_One_Usual_Resident", censusConstraint  := "nPeople_1"]
    
    au2013DT <- au2013DT[variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Two_Usual_Residents", censusConstraint  := "nPeople_2"]
    
    au2013DT <- au2013DT[variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Three_Usual_Residents", censusConstraint  := "nPeople_3"]
    
    au2013DT <- au2013DT[variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Four_Usual_Residents" |
                           variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Five_Usual_Residents" |
                           variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Six_Usual_Residents" |
                           variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Seven_Usual_Residents" |
                           variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Eight_or_More_Usual_Residents", censusConstraint  := "nPeople_4m"]
    
    origT <- au2013DT[variable %like% "Usual_Resident", .(nHouseholds = sum(value)), keyby = .(variable)]
    kableExtra::kable(origT, caption = "Census area units: n people (original)")%>%
      kable_styling()
    
    recodeT <- au2013DT[censusConstraint %like% "nPeople", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
    kableExtra::kable(recodeT, caption = "Census area units: n people (recoded)")%>%
      kable_styling()
    
    message("Total households (original) = ", sum(origT$nHouseholds))
    message("Total households (recoded) = ", sum(recodeT$nHouseholds))
    message("Total unit areas (recoded) = ", uniqueN(au2013DT[censusConstraint %like% "nPeople"]$AU2013_code))

    Dwellings: Number of rooms

    # survey ----
    ggHhDT <- ggHhDT[, nRooms := `Q10#1_1_1_TEXT` + # bedrooms
                     `Q10#1_2_1_TEXT` + # living rooms
                     `Q10#1_3_1_TEXT` + # dining rooms-kitchens
                     `Q10#1_4_1_TEXT` + # kitchens
                     `Q10#1_5_1_TEXT`  # studies
                     # Q10#1_6_1_TEXT + # bathrooms - excluded from census http://archive.stats.govt.nz/Census/2013-census/info-about-2013-census-data/2013-census-definitions-forms/definitions/dwelling.aspx
                     # Q10#1_7_1_TEXT + # toilets - excluded from census
                     # Q10#1_8_1_TEXT + # laundries - excluded from census
                     ]
    table(ggHhDT$nRooms, useNA = "always")
    message("We're quite short of responses to nRooms - we will try to impute them from the number of adults and children using a regression model...")
    message("Test the model")
    r <- lm(nRooms ~ nAdults + nKids, data = ggHhDT)
    
    summary(r)
    
    message("that looks reasonable...impute where missing")
    ggHhDT <- ggHhDT[, nRoomsImputed := predict(r, newdata = ggHhDT)] # predict on the full table so the NAs get an estimate
    
    # check what we got
    ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRooms)) + 
      geom_point() # original shape
    
    ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRoomsImputed)) + 
      geom_point()  + # imputed shape
      labs(caption = "Well, it is a linear model...")
    
    message("So we use the imputed values only where we had NA")
    ggHhDT <- ggHhDT[, nRoomsCorrected := ifelse(is.na(nRooms), 
                                                    round(nRoomsImputed),
                                                    nRooms
    )
    ]
    
    message("How bad does that look?")
    ggHhDT <- ggHhDT[, roomsImputed := ifelse(is.na(nRooms), "Imputed",
                                                 "observed")]
    ggplot2::ggplot(ggHhDT, aes(x = nRoomsImputed, y = nRoomsCorrected, colour = roomsImputed)) + 
      geom_jitter() # 
    
    message("Well, we can live with that for now...")
    
    #
    message("There are NA - we will need to remove them later")
    ggHhDT <- ggHhDT[, nRooms1_4 := ifelse(nRoomsCorrected < 5, 1, 0)]
    ggHhDT <- ggHhDT[, nRooms5_6 := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, 1, 0)]
    ggHhDT <- ggHhDT[, nRooms7m := ifelse(nRoomsCorrected > 6, 1, 0)]
    table(ggHhDT$nRooms1_4)
    table(ggHhDT$nRooms5_6)
    table(ggHhDT$nRooms7m)
    
    # create category
    ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected < 5, "1-4", NA)]
    ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, "5-6", nRoomsCat)]
    ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 6, "7+", nRoomsCat)]
    
    message("Check coding")
    table(ggHhDT$nRoomsCat, ggHhDT$nRoomsCorrected, useNA = "always")
    
    # census ----
    au2013DT <- au2013DT[variable == "number_of_rooms_for_occupied_private_dwellings_One_Room" |
                           variable == "number_of_rooms_for_occupied_private_dwellings_Two_Rooms" |
                           variable == "number_of_rooms_for_occupied_private_dwellings_Three_Rooms" |
                           variable == "number_of_rooms_for_occupied_private_dwellings_Four_Rooms",
                         censusConstraint  := "nRooms1_4"]
    
    au2013DT <- au2013DT[variable == "number_of_rooms_for_occupied_private_dwellings_Five_Rooms" |
                         variable == "number_of_rooms_for_occupied_private_dwellings_Six_Rooms" ,
                         censusConstraint  := "nRooms5_6"]
    
    au2013DT <- au2013DT[variable == "number_of_rooms_for_occupied_private_dwellings_Seven_Rooms" |
                           variable == "number_of_rooms_for_occupied_private_dwellings_Eight_or_More_Rooms",
                         censusConstraint  := "nRooms7m"]
    
    origT <- au2013DT[variable %like% "_Rooms" |
                        variable %like% "_Room", .(nHouseholds = sum(value)), keyby = .(variable)]
    kableExtra::kable(origT, caption = "Census area units: n rooms (original)")%>%
      kable_styling()
    
    recodeT <- au2013DT[censusConstraint %like% "nRooms", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
    kableExtra::kable(recodeT, caption = "Census area units: n rooms (recoded)")%>%
      kable_styling()
    
    message("Total households (original) = ", sum(origT$nHouseholds))
    message("Total households (recoded) = ", sum(recodeT$nHouseholds))
    message("Total unit areas (recoded) = ", uniqueN(au2013DT[censusConstraint %like% "nRooms"]$AU2013_code))

    Dwellings: Number of bedrooms

    Probably collinear with n people/n rooms

    
    # survey: Q10#1_1_1_TEXT ----
    
    ggHhDT <- ggHhDT[, nBedrooms := `Q10#1_1_1_TEXT`]
    table(ggHhDT$nBedrooms,  useNA = "always")
    # best to do 1-2, 3, 4m
    
    message("We're quite short of responses to nBedrooms - we will try to impute them from the number of people using a regression model...")
    message("Test the model")
    r <- lm(nBedrooms ~ nAdults + nKids, data = ggHhDT)
    
    summary(r)
    
    message("that looks reasonable...impute where missing")
    ggHhDT <- ggHhDT[, nBedroomsImputed := predict(r, newdata = ggHhDT)] # predict on the full table so the NAs get an estimate
    
    # check what we got
    ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedrooms)) + 
      geom_point() # original shape
    
    ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedroomsImputed)) + 
      geom_point()  + # imputed shape
      labs(caption = "Well, it is a linear model...")
    
    message("So we use the imputed values only where we had NA and we round them to whole numbers")
    ggHhDT <- ggHhDT[, nBedroomsCorrected := ifelse(is.na(nBedrooms), 
                                                    round(nBedroomsImputed),
                                                    nBedrooms
    )
    ]
    
    message("How bad does that look?")
    ggHhDT <- ggHhDT[, bedroomsImputed := ifelse(is.na(nBedrooms), "Imputed",
                                                 "observed")]
    ggplot2::ggplot(ggHhDT, aes(x = nBedroomsImputed, y = nBedroomsCorrected, colour = bedroomsImputed)) + 
      geom_jitter() # 
    
    message("Well, we can live with that for now...")
    
    ggHhDT <- ggHhDT[, nBedrooms_1_2 := ifelse(nBedroomsCorrected < 3, 1, 0)]
    ggHhDT <- ggHhDT[, nBedrooms_3 := ifelse(nBedroomsCorrected == 3, 1, 0)]
    ggHhDT <- ggHhDT[, nBedrooms_4m := ifelse(nBedroomsCorrected > 3, 1, 0)]
    message("Household data: n bedrooms")
    table(ggHhDT$nBedrooms_1_2)
    table(ggHhDT$nBedrooms_3)
    table(ggHhDT$nBedrooms_4m)
    
    # create category
    ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_1_2 == 1, "1-2", NA)]
    ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_3 == 1, "3", nBedroomsCat)]
    ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_4m == 1, "4+", nBedroomsCat)]
    
    table(ggHhDT$nBedroomsCat, ggHhDT$nBedroomsCorrected, useNA = "always")
    
    # census ----
    # census: number_of_bedrooms_for_occupied_private_dwellings_One_Bedroom  to number_of_bedrooms_for_occupied_private_dwellings_Eight_or_More_Bedrooms 
    
    au2013DT <- au2013DT[variable == "number_of_bedrooms_for_occupied_private_dwellings_One_Bedroom" |
                           variable == "number_of_bedrooms_for_occupied_private_dwellings_Two_Bedrooms", censusConstraint  := "nBedrooms_1_2"]
    
    au2013DT <- au2013DT[variable == "number_of_bedrooms_for_occupied_private_dwellings_Three_Bedrooms", censusConstraint  := "nBedrooms_3"]
    
    au2013DT <- au2013DT[variable == "number_of_bedrooms_for_occupied_private_dwellings_Four_Bedrooms" |
                           variable == "number_of_bedrooms_for_occupied_private_dwellings_Five_Bedrooms" |
                           variable == "number_of_bedrooms_for_occupied_private_dwellings_Six_Bedrooms" |
                           variable == "number_of_bedrooms_for_occupied_private_dwellings_Seven_Bedrooms" |
                           variable == "number_of_bedrooms_for_occupied_private_dwellings_Eight_or_More_Bedrooms", censusConstraint  := "nBedrooms_4m"]
    
    origT <- au2013DT[variable %like% "_Bedroom" & 
                        !(variable == "number_of_bedrooms_for_occupied_private_dwellings_Mean_Number_of_Bedrooms"), 
                      .(nHouseholds = sum(value)), keyby = .(variable)]
    kableExtra::kable(origT, caption = "Census area units: n bedrooms (original)")%>%
      kable_styling()
    
    recodeT <- au2013DT[censusConstraint %like% "nBedrooms", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
    kableExtra::kable(recodeT, caption = "Census area units: n bedrooms (recoded)")%>%
      kable_styling()
    
    message("Total households (original) = ", sum(origT$nHouseholds))
    message("Total households (recoded) = ", sum(recodeT$nHouseholds))
    message("Total unit areas (recoded) = ", uniqueN(au2013DT[censusConstraint %like% "nBedrooms"]$AU2013_code))
    
    #au2013DT[!is.na(censusConstraint), .(nHHs = sum(value)), keyby = .(variable, censusConstraint)]

    Notice that the total counts for bedrooms are lower because they are counts of dwellings instead of households.

    Dwellings: Main heat source for hot water

    Survey: known. Census: unknown.

    Dwellings: Main fuel used for heat

    Survey: known, but we do not have the response code list to determine what the response was :-(

    Census: known

    Dwelling: Dwelling type

    Survey: not known. Census: known.

    Fix final survey data

    Test collinearity

    cor.test(ggHhDT$nPeople, ggHhDT$nBedrooms)
    cor.test(ggHhDT$nPeople, ggHhDT$nRooms)
    cor.test(ggHhDT$nRooms, ggHhDT$nBedrooms)
    

    So people & bedrooms are the least collinear of these, which is odd given that we used one to predict the other. Anyway, we will use nBedrooms.

    Filter survey data to include just the variables we want and fix any NAs.

    surveyDT <- ggHhDT[, .(linkID, 
                           nKids_0, nKids_1m,
                           nPeople_1, nPeople_2, nPeople_3, nPeople_4m,
                           #nRooms1_4, nRooms5_6, nRooms7m,
                           nBedrooms_1_2, nBedrooms_3, nBedrooms_4m)]
    
    skimr::skim(surveyDT)
    
    message("There are still a few NAs so we need to remove these households")
    surveyDT <- na.omit(surveyDT)
    
    skimr::skim(surveyDT)
    message("Fixed it?")

    Select areas of interest

    Now select just the census data for:

    • Hawke's Bay Region
    • Taranaki Region
    censusAuDT <- au2013DT[REGC2013_label == "Hawke's Bay Region" |
                             REGC2013_label == "Taranaki Region"] 
    
    # remove variables and rows we don't want
    censusAuDT <- censusAuDT[, variable := NULL]
    censusAuDT <- censusAuDT[!is.na(censusConstraint),]
    
    
    t <- censusAuDT[censusConstraint %like% "nPeople" , .(nHouseholds = sum(value)), keyby = .(REGC2013_label, AU2013_label, AU2013_code)]
    
    kableExtra::kable(t, caption = "Selected area units by region")%>%
      kable_styling()
    message("Total households = ", sum(t$nHouseholds))
    
    nPeopleT <- censusAuDT[censusConstraint %like% "nPeople", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
    kableExtra::kable(nPeopleT, caption = "Census area units: n nPeople (recoded)")%>%
      kable_styling()
    
    message("Total households = ", sum(nPeopleT$nHouseholds))
    
    nBedroomsT <- censusAuDT[censusConstraint %like% "nBedrooms", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
    kableExtra::kable(nBedroomsT, caption = "Census area units: n bedrooms (recoded)")%>%
      kable_styling()
    
    message("Total households = ", sum(nBedroomsT$nHouseholds))
    
    nKidsT <- censusAuDT[censusConstraint %like% "nKids", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
    kableExtra::kable(nKidsT, caption = "Census area units: n kids (recoded) - residual not yet calculated")%>%
      kable_styling()
    
    message("Total households = ", sum(nKidsT$nHouseholds))
    
    
    skimr::skim(censusAuDT)

    Run IPF

    Set up IPF

    
    # relies heavily on: http://robinlovelace.net/spatial-microsim-book/smsim-in-R.html
    # ipfp wants:
    # y = numeric vector of constraints for each area - the names must match the survey, data as counts, no ids or anything
    # which means we have to assume ipfp maintains order
    # A = the transposed survey data (row = var) in dummy variable form - names must match constraints, no ids etc, variables that match constraints only
    # which means we have to assume ipfp maintains order
    # x0 = numeric initial vector of length ncol
    
    # set order of variables - has to match (probably)
    
    # NB: we set the order of constraints here - we set n people as the last constraint so that the ipf algorithm fits it perfectly
    # This is a design choice and it is driven by the size of the coefficients in the lm models above.
    
    constraintsList <- c("nKids_0", "nKids_1m",
                         "nBedrooms_1_2", "nBedrooms_3", "nBedrooms_4m",
                         "nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m"
    ) # we use just these later - useful to have a list in the correct order
    
    # Survey setup - remove linkID - we have to trust that ipfp maintains the order!!! ----
    
    ipfInputSurveyDT <- surveyDT[, constraintsList, with=FALSE]
    
    # Census setup - remove area code - we have to trust that ipfp maintains the order!!! ----
    
    tempDT <- censusAuDT[, .(AU2013_code, censusConstraint, value)]
    
    # have to make wide
    censusAuWideDT <- reshape(tempDT, 
                              idvar = "AU2013_code", 
                              timevar = "censusConstraint",
                              v.names = "value",
                              direction = "wide")
    censusAuWideDT <- censusAuWideDT[, value.nKids_0 := value.TotalFamilies - value.nKids_1m]
    censusAuWideDT$value.TotalFamilies <- NULL
    
    # rename the cols
    old <-  c("value.nKids_1m", "value.nKids_0",
              "value.nBedrooms_1_2", "value.nBedrooms_3", "value.nBedrooms_4m",
              "value.nPeople_1", "value.nPeople_2", "value.nPeople_3", "value.nPeople_4m"
    )
    setnames(censusAuWideDT, old = old, new = constraintsList)
    
    # ipfp just wants counts - no codes or anything - and it cannot handle areas where the total count for any constraint is 0
    
    censusAuWideDT <- censusAuWideDT[, nBedrooms_Total := nBedrooms_1_2 + nBedrooms_3 + nBedrooms_4m]
    censusAuWideDT <- censusAuWideDT[, nPeople_Total := nPeople_1 + nPeople_2 + nPeople_3 + nPeople_4m]
    censusAuWideDT <- censusAuWideDT[, nKids_Total := nKids_0 + nKids_1m]
    
    summary(censusAuWideDT)
    
    message("Removng areas which have total counts for any constraint = 0")
    
    nOrig <- nrow(censusAuWideDT)
    zerosDT <- censusAuWideDT[nPeople_Total == 0 | 
                                nBedrooms_Total == 0 | 
                                nKids_Total == 0, .(AU2013_code, nPeople_Total)]
    zerosDT <- zerosDT[, AU2013_code := as.character(AU2013_code)]
    zerosDT <- auListDT[zerosDT]
    
    censusAuWideDT <- censusAuWideDT[nPeople_Total > 0 & 
                                       nBedrooms_Total > 0 &
                                       nKids_Total > 0]
    nNonZero <- nrow(censusAuWideDT)
    
    message("Removng areas which have total counts for any constraint = 0 dropped ", nOrig - nNonZero ," areas")
    message("These were: ")
    zerosDT$AU2013_label
    
    # check
    summary(censusAuWideDT)
    
    # now keep constraints only - has the side effect of fixing the order
    ipfInputConstraintsDT <- censusAuWideDT[, constraintsList, with=FALSE]
    
    # need list of area codes
    areaCodes <- censusAuWideDT[, AU2013_code]
    # need list of household ids
    hhIdsDT <- surveyDT[, .(linkID)]
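
    Because ipfp matches constraints to survey columns by position only, it may be worth asserting that the two inputs line up before running it. A minimal sketch, using the objects created above:

    # constraint columns must appear in the same order in both inputs
    stopifnot(identical(names(ipfInputSurveyDT), names(ipfInputConstraintsDT)))
    # one area code per row of the census constraints
    stopifnot(length(areaCodes) == nrow(ipfInputConstraintsDT))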

    Run IPF

    # expects:
    #   an area level constraints dt and a survey constraints dt (vars in same order)
    #   a list of area codes & linkIDs
    # returns a long form weights dt with area codes labelled
    
    # transpose the survey for ipf
    surv_dtt <- t(ipfInputSurveyDT)
    
    # set up initial vector of the same length as the number of households as a load of 1s (could be the response weights...)
    x0 <- rep(1, nrow(ipfInputSurveyDT))
    
    # create 2D weights matrix (households by areas) - this will get filled with the weights
    weights = array(NA, dim=c(nrow(ipfInputSurveyDT), nrow(ipfInputConstraintsDT)))
    
    for(i in 1:ncol(weights)){ # for each area (cols of weights)
      print(paste0("Running ipfp on area ", i))
      weights[,i] <- ipfp(as.numeric(ipfInputConstraintsDT[i,]), # only accepts a vector of numbers 
                          surv_dtt, 
                          x0, 
                          maxit = 10, # maxit = 10 iterations (for now), 
                          verbose = T) # careful with verbose - too much output
    }
    
    
    # this is where we have to assume ipfp maintained the area code order
    wDT <- as.data.table(weights) # convert to dt
    wDT <- setnames(wDT, as.character(areaCodes)) # add col names
    
    # this is where we have to assume ipfp maintained the linkID order
    dt <- cbind(hhIdsDT, wDT) # add linkID to each row of weights (fills down, if you get recycling then there is a problem)
    setkey(dt, linkID)
    
    # need to reshape the weights dt into 1 column of weights keyed by area code
    longFormDT <- as.data.table(melt(dt, id=c("linkID")))
    # add correct area code name for future matching
    longFormDT <- setnames(longFormDT, c("variable", "value"), c("AU2013_code", "ipfWeight"))
    wDT <- NULL # save mem
    
    kableExtra::kable(caption = "First few rows/columns of ipf weights table", head(longFormDT))
    
    kableExtra::kable(caption = "Summary of ipf weights table", summary(longFormDT, digits = 2))
    
    message("Dimensions of ipf weights table: ", dkUtils::tidyNum(nrow(longFormDT)), " rows x ", ncol(longFormDT), " cols")

    This produces a long form data file of nrow = n(linkID) x n(AU2013_code) (!) so that linkIDs are repeated and the weights are held in a single column (variable). This makes everything else a lot easier later. The dataset comprises r dkUtils::tidyNum(uniqueN(longFormDT$linkID)) GREENGrid households replicated (with weights) across r dkUtils::tidyNum(uniqueN(longFormDT$AU2013_code)) area units.
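    As a quick structural check, the row count of the long form table should equal the number of households multiplied by the number of area units. A minimal sketch using the renamed AU2013_code column:

    # one ipf weight per (household, area unit) pair
    stopifnot(nrow(longFormDT) == uniqueN(longFormDT$linkID) * uniqueN(longFormDT$AU2013_code))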

    We now need to add the survey-based attributes back (from the GREENGrid survey).

    
    setkey(longFormDT, linkID)
    attDT <- ggHhDT[, .(linkID, `Hot water cylinder`, `Heat pump number`, `PV Inverter`, 
                        `nAdults`, `nChildren0_12`,`nTeenagers13_18`, `nPeople`, 
                        `nBedrooms`, `nBedroomsCorrected`)] # keep the vars we want to use
    setkey(attDT, linkID)
    longFormSvyDT <- merge(longFormDT, attDT)
    
    # set svydesign
    longFormSvyDes <- survey::svydesign(ids = ~linkID, # use synthetic hh_id
                                        weights = ~ipfWeight, 
                                        data = longFormSvyDT[!is.na(ipfWeight)] # make sure no NA weights
    )
    
    svyt <- svytable(~nPeople + round(nBedroomsCorrected), longFormSvyDes, 
                     exclude=NULL, 
                     na.action=na.pass
    )
    kableExtra::kable(caption = "N bedrooms vs n people (all households, all areas selected, weighted ipf results)", round(svyt,2))
    
    message("Calculating weighted household counts - can take some time...")
    svyt <- svytable(~AU2013_code, longFormSvyDes, 
                     exclude=NULL, 
                     na.action=na.pass
    )
    
    ipfHhCountsDT <- data.table::as.data.table(svyt)
    ipfHhCountsDT <- ipfHhCountsDT[, AU2013_code := as.character(AU2013_code)]
    setkey(ipfHhCountsDT, AU2013_code)
    censusAuWideDT <- censusAuWideDT[, AU2013_code := as.character(AU2013_code)]
    setkey(censusAuWideDT, AU2013_code)
    ipfOutputTestDT <- ipfHhCountsDT[censusAuWideDT]
    
    t_dt <- longFormDT[ipfWeight != 0, .(nGGHouseholds = uniqueN(linkID),
                           meanWeight = mean(ipfWeight),
                           minWeight = min(ipfWeight),
                           maxWeight = max(ipfWeight)),
                       keyby = .(AU2013_code)]
    
    ipfOutputTestDT <- ipfOutputTestDT[t_dt]
    ipfOutputTestDT <- ipfOutputTestDT[, AU2013_code := as.character(AU2013_code)]
    ipfOutputTestDT <- auListDT[ipfOutputTestDT]
    
    ipfOutputTestDT <- ipfOutputTestDT[, ratio := N/nGGHouseholds]
    
    ts <- summary(ipfOutputTestDT)
    
    kableExtra::kable(t_dt, caption = "IPF weights by area unit: summary")%>%
      kable_styling()
    
    kableExtra::kable(ts, caption = "IPF output test data: summary")%>%
      kable_styling()
    
    message("How many GREENGrid households are now being used to represent each area unit?")
    
    ggplot2::ggplot(ipfOutputTestDT, aes(x = nPeople_Total, y = N, colour = REGC2013_label)) +
      geom_point() +
      scale_colour_discrete(name = "Region") +
      labs(x = "Number of households (using census n people variable)",
           y = "Number of households (ipf results)",
           caption = "We should expect these to match as the n people constraint is fitted last")
    
    ggplot2::ggplot(ipfOutputTestDT, aes(x = nBedrooms_Total, y = N, colour = REGC2013_label)) +
      geom_point() +
      scale_colour_discrete(name = "Region") +
      labs(x = "Number of households (using census n bedrooms variable)",
           y = "Number of households (ipf results)",
           caption = "We should expect these not to match as the n people constraint \n is fitted last and bedrooms is based on dwellings")
    
    ggplot2::ggplot(ipfOutputTestDT, aes(x = nGGHouseholds, y = N, colour = REGC2013_label)) +
      geom_jitter() +
      scale_colour_discrete(name = "Region") +
      labs(x = "Number of GREENGrid households",
           y = "Number of households represented",
           caption = paste0("Plotting ", uniqueN(ipfOutputTestDT$AU2013_code), " area units"))
    
    totalIpfHHs <- sum(ipfHhCountsDT$N)

    Load and aggregate power demand data

    Do not do this earlier as it takes a lot of memory. For now we're going to use:

    • Heat Pump demand
    • Hot Water demand
    • Lighting demand

    These have been extracted from the GREENGrid power demand data [@anderson_new_2018].

    Hot Water

    
    f <- paste0(lParams$ggPath, "/safe/gridSpy/1min/dataExtracts/Hot Water_2015-04-01_2016-03-31_observations.csv.gz")
    message("Loading: ", f)
    ggHwOrigDT <- data.table::as.data.table(readr::read_csv(f))
    
    message("N rows loaded: ", dkUtils::tidyNum(nrow(ggHwOrigDT)))
    message("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHwOrigDT$linkID)))
    
    ggHwOrigDT$hhID <- NULL # not needed
    
    h <- head(ggHwOrigDT)
    kableExtra::kable(h, caption = "GREENGrid Household hot water demand data: original")%>%
      kable_styling()
    
    message("Summary of variables:")
    skimr::skim(ggHwOrigDT)
    

    Note that there are -ve power values in the above summary tables. According to previous work these are predominantly found in one dwelling (rf_46) due to possible incorrect instrument installation and very briefly in others for the same reason (but quickly corrected). We therefore fix the rf_46 data (see https://cfsotago.github.io/GREENGridData/gridSpy1mOutliersReport_v1.0.html#rf46) and any -ve readings before proceeding...

    You will notice this also reduces the number of circuits so that they match the number of households. We also:

    • convert the UTC date to NZ time
    • set some useful time/date variables
    # remove negative ----
    ggHwDT <- ggHwOrigDT[circuit != "Hot Water - Controlled$4231" & powerW >= 0]
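    # (sketch) sanity check of the claim above, assuming one monitored hot water
    # circuit per household once the problem circuit and -ve readings are removed
    stopifnot(uniqueN(ggHwDT$circuit) == uniqueN(ggHwDT$linkID))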
    
    # set seasons
    ggHwDT <- GREENGridData::addNZSeason(ggHwDT)
    
    # set useful dates and times
    # when loaded the r_dateTime variable is UTC, we need to change this to NZT
    ggHwDT <- ggHwDT[, localDateTime := lubridate::with_tz(r_dateTime, tzone = "Pacific/Auckland")]
    ggHwDT <- ggHwDT[, time := hms::as.hms(localDateTime)]
    ggHwDT <- ggHwDT[, halfHour := hms::trunc_hms(time, 30 * 60)] # trunc takes us back to the start of the period
    ggHwDT <- ggHwDT[, qHour := hms::trunc_hms(time, 15 * 60)] # trunc takes us back to the start of the period
    
    message("Summary of hot water data after cleaning")
    skimr::skim(ggHwDT)
    myCaption <- paste0("GREENGrid household power demand data from ",
                        min(lubridate::date(ggHwDT$localDateTime)), " to ", 
                        max(lubridate::date(ggHwDT$localDateTime)),
                        "\n Hot water: n households = ", uniqueN(ggHwDT$linkID)
                        )
    
    
    # set keys for linkage ----
    setkey(ggHhDT, linkID)
    setkey(ggHwDT, linkID)
    # aggregate plot over all households: by season ----
    plotDT <- ggHhDT[, .(linkID)][ggHwDT][, .(meanW = mean(powerW), # merge on linkID here as we summarise (data.table trick)
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, season)]
    
    ggplot2::ggplot(plotDT, aes(x = qHour, y = meanW/1000, colour = season)) +
      geom_step() +
      scale_colour_discrete(name = "Season") +
      #scale_colour_viridis(discrete = TRUE) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = myCaption
           )
    # aggregate plot over all households: nKids ----
    plotDT <- ggHhDT[, .(linkID, presenceKids)][ggHwDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, presenceKids, season)]
    
    ggplot2::ggplot(plotDT[!is.na(presenceKids)], aes(x = qHour, y = meanW/1000, colour = presenceKids)) +
      geom_step() +
      scale_colour_discrete(name = "Presence of children") +
      #scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) + 
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = myCaption
           )
    
    t <- ggHhDT[, .('N households' = .N), keyby = .('Presence of children' = presenceKids)]
    
    kableExtra::kable(t, caption = "N households (Presence of children)") %>%
      kable_styling()
    # aggregate plot over all households: nBedroomsCat ----
    plotDT <- ggHhDT[, .(linkID, nBedroomsCat)][ggHwDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, nBedroomsCat, season)]
    
    ggplot2::ggplot(plotDT[!is.na(nBedroomsCat)], aes(x = qHour, y = meanW/1000, colour = nBedroomsCat)) +
      geom_step() +
      facet_grid(season ~ .) +
      scale_colour_discrete(name = "N bedrooms") +
      #scale_colour_viridis(discrete = TRUE) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = myCaption
           )
    
    t <- ggHhDT[, .('N households' = .N), keyby = .('N bedrooms' = nBedroomsCat)]
    
    kableExtra::kable(t, caption = "N households (N people)") %>%
      kable_styling()
    # aggregate plot over all households: nPeopleCat ----
    plotDT <- ggHhDT[, .(linkID, nPeopleCat)][ggHwDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, nPeopleCat, season)]
    
    ggplot2::ggplot(plotDT[!is.na(nPeopleCat)], aes(x = qHour, y = meanW/1000, colour = nPeopleCat)) +
      geom_step() +
      scale_colour_discrete(name = "N people") +
      #scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = myCaption
           )
    
    t <- ggHhDT[, .('N households' = .N), keyby = .('N people' = nPeopleCat)]
    
    kableExtra::kable(t, caption = "N households (N people)") %>%
      kable_styling()
    # aggregate to half-hour per hh per season
    plotDT <- ggHwDT[, .(meanW = mean(powerW),
                             sdW = sd(powerW),
                             nObs = .N), keyby = .(linkID, qHour, season)]
    
    # aggregate plot per hh
    ggplot2::ggplot(plotDT, aes(x = qHour, y = meanW/1000, colour = linkID)) +
      geom_step() +
      scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) +
      labs(y = "Mean kW (within household)",
           caption = paste0(myCaption, 
                            ": Hot Water circuits, n = ", 
                            uniqueN(plotDT$linkID)))
    
    # aggregate to half-hour for overall summary plot
    setkey(ggHwDT, linkID)
    plotDT <- ggHhDT[, .(linkID, Location)][ggHwDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(halfHour, Location)]
    
    # aggregate plot over all households
    ggplot2::ggplot(plotDT, aes(x = halfHour, y = meanW/1000, colour = Location)) +
      geom_step() +
      #scale_colour_viridis(discrete = TRUE) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = paste0(myCaption, 
                            ": Hot Water circuits, n = ", 
                            uniqueN(ggHwDT$linkID)))

    Heat Pump

    
    f <- paste0(lParams$ggPath, "/safe/gridSpy/1min/dataExtracts/Heat Pump_2015-04-01_2016-03-31_observations.csv.gz")
    outputMessage(paste0("Loading: ", f))
    ggHpOrigDT <- data.table::as.data.table(readr::read_csv(f))
    outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(ggHpOrigDT))))
    outputMessage(paste0("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHpOrigDT$linkID))))
    
    ggHpOrigDT$hhID <- NULL # not needed
    
    h <- head(ggHpOrigDT)
    kableExtra::kable(h, caption = "GREENGrid Household heat pump demand data: head")%>%
      kable_styling()
    
    message("Summary of variables:")
    skimr::skim(ggHpOrigDT)
    

    Note that there are -ve power values in the above summary tables. According to previous work these are predominantly found in one dwelling (rf_46) due to possible incorrect instrument installation and very briefly in others for the same reason (but quickly corrected). We therefore fix the rf_46 data (see https://cfsotago.github.io/GREENGridData/gridSpy1mOutliersReport_v1.0.html#rf46) and any -ve readings before proceeding...

    You will notice this also reduces the number of circuits so that they match the number of households. We also:

    • convert the UTC date to NZ time
    • set some useful time/date variables
    # remove negative ----
    # rf_27    2015-08-22 10:33:00     Heat Pump$2826  27759
    ggHpDT <- ggHpOrigDT[circuit != "Heat Pumps (2x) & Power$4399" & powerW >= 0 & powerW != 27759]
    
    # set seasons ----
    ggHpDT <- GREENGridData::addNZSeason(ggHpDT)
    
    # set useful dates and times ----
    # when loaded the r_dateTime variable is UTC, we need to change this to NZT
    ggHpDT <- ggHpDT[, localDateTime := lubridate::with_tz(r_dateTime, tzone = "Pacific/Auckland")]
    ggHpDT <- ggHpDT[, time := hms::as.hms(localDateTime)]
    ggHpDT <- ggHpDT[, halfHour := hms::trunc_hms(time, 30 * 60)] # trunc takes us back to the start of the period
    ggHpDT <- ggHpDT[, qHour := hms::trunc_hms(time, 15 * 60)] # trunc takes us back to the start of the period
    
    message("Summary of heat pump data after cleaning")
    skimr::skim(ggHpDT)
    myCaption <- paste0("GREENGrid household power demand data from ",
                        min(lubridate::date(ggHpDT$localDateTime)), " to ", 
                        max(lubridate::date(ggHpDT$localDateTime)),
                        "\n Heat pumps: n households = ", uniqueN(ggHpDT$linkID)
                        )
    
    
    # set keys for linkage ----
    setkey(ggHhDT, linkID)
    setkey(ggHpDT, linkID)
    # aggregate plot over all households ----
    plotDT <- ggHhDT[, .(linkID)][ggHpDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, season)]
    
    ggplot2::ggplot(plotDT, aes(x = qHour, y = meanW/1000, colour = season)) +
      geom_step() +
      scale_colour_discrete(name = "Season") +
      #scale_colour_viridis(discrete = TRUE) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = myCaption
           )
    # aggregate plot over all households: nKids ----
    plotDT <- ggHhDT[, .(linkID, presenceKids)][ggHpDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, presenceKids, season)]
    
    ggplot2::ggplot(plotDT[!is.na(presenceKids)], aes(x = qHour, y = meanW/1000, colour = presenceKids)) +
      geom_step() +
      scale_colour_discrete(name = "Presence of children") +
      #scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = paste0(myCaption, 
                            "\n Heat pumps, n = ", 
                            uniqueN(ggHpDT$linkID)))
      
    t <- ggHhDT[, .('N households' = .N), keyby = .('Presence of children' = presenceKids)]
    
    kableExtra::kable(t, caption = "N households (Presence of children)") %>%
      kable_styling()
    # aggregate plot over all households: nBedroomsCat ----
    plotDT <- ggHhDT[, .(linkID, nBedroomsCat)][ggHpDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, nBedroomsCat, season)]
    
    ggplot2::ggplot(plotDT[!is.na(nBedroomsCat)], aes(x = qHour, y = meanW/1000, colour = nBedroomsCat)) +
      geom_step() +
      scale_colour_discrete(name = "N bedrooms") +
      #scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = myCaption
           )
    
    t <- ggHhDT[, .('N households' = .N), keyby = .('N bedrooms' = nBedroomsCat)]
    
    kableExtra::kable(t, caption = "N households (N bedrooms)") %>%
      kable_styling()
    # aggregate plot over all households: nPeopleCat ----
    plotDT <- ggHhDT[, .(linkID, nPeopleCat)][ggHpDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, nPeopleCat, season)]
    
    ggplot2::ggplot(plotDT[!is.na(nPeopleCat)], aes(x = qHour, y = meanW/1000, colour = nPeopleCat)) +
      geom_step() +
      scale_colour_discrete(name = "N people") +
      #scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = myCaption
           )
    
    t <- ggHhDT[, .('N households' = .N), keyby = .('N people' = nPeopleCat)]
    
    kableExtra::kable(t, caption = "N households (N people)") %>%
      kable_styling()

    Lighting

    
    f <- paste0(lParams$ggPath, "/safe/gridSpy/1min/dataExtracts/Lighting_2015-04-01_2016-03-31_observations.csv.gz")
    outputMessage(paste0("Loading: ", f))
    ggLightingOrigDT <- data.table::as.data.table(readr::read_csv(f))
    outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(ggLightingOrigDT))))
    outputMessage(paste0("N households loaded: ", dkUtils::tidyNum(uniqueN(ggLightingOrigDT$linkID))))
    
    ggLightingOrigDT$hhID <- NULL # not needed
    
    h <- head(ggLightingOrigDT)
    kableExtra::kable(h, caption = "GREENGrid Household lighting demand data: head")%>%
      kable_styling()
    
    message("Summary of variables:")
    skimr::skim(ggLightingOrigDT)
    

    Note that there are -ve power values in the above summary tables. According to previous work these are predominantly found in one dwelling (rf_46) due to possible incorrect instrument installation and very briefly in others for the same reason (but quickly corrected). We therefore fix the rf_46 data (see https://cfsotago.github.io/GREENGridData/gridSpy1mOutliersReport_v1.0.html#rf46) and any -ve readings before proceeding.

    As above you will notice this also reduces the number of circuits so that they match the number of households. We also:

    • convert the UTC date to NZ time
    • set some useful time/date variables
    # remove negative ----
    ggLightingDT <- ggLightingOrigDT[circuit != "Lighting$4404" & powerW >= 0]
    
    # set seasons
    ggLightingDT <- GREENGridData::addNZSeason(ggLightingDT)
    
    # set useful dates and times
    # when loaded the r_dateTime variable is UTC, we need to change this to NZT
    ggLightingDT <- ggLightingDT[, localDateTime := lubridate::with_tz(r_dateTime, tzone = "Pacific/Auckland")]
    ggLightingDT <- ggLightingDT[, time := hms::as.hms(localDateTime)]
    ggLightingDT <- ggLightingDT[, halfHour := hms::trunc_hms(time, 30 * 60)] # trunc takes us back to the start of the period
    ggLightingDT <- ggLightingDT[, qHour := hms::trunc_hms(time, 15 * 60)] # trunc takes us back to the start of the period
    
    message("Summary of lighting data after cleaning")
    skimr::skim(ggLightingDT)
    
    myCaption <- paste0("GREENGrid household power demand data from ",
                        min(lubridate::date(ggLightingDT$localDateTime)), " to ", 
                        max(lubridate::date(ggLightingDT$localDateTime)),
                        "\n Lighting: n households = ", uniqueN(ggLightingDT$linkID)
                        )
    
    # set keys for linkage ----
    setkey(ggHhDT, linkID)
    setkey(ggLightingDT, linkID)
    # aggregate plot over all households ----
    plotDT <- ggHhDT[, .(linkID)][ggLightingDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, season)]
    
    ggplot2::ggplot(plotDT, aes(x = qHour, y = meanW/1000, colour = season)) +
      geom_step() +
      scale_colour_discrete(name = "Season") +
      #scale_colour_viridis(discrete = TRUE) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = myCaption)
    # aggregate plot over all households: nKids ----
    plotDT <- ggHhDT[, .(linkID, presenceKids)][ggLightingDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, presenceKids, season)]
    
    ggplot2::ggplot(plotDT[!is.na(presenceKids)], aes(x = qHour, y = meanW/1000, colour = presenceKids)) +
      geom_step() +
      scale_colour_discrete(name = "Presence of children") +
      #scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = myCaption)
      
    t <- ggHhDT[, .('N households' = .N), keyby = .('Presence of children' = presenceKids)]
    
    kableExtra::kable(t, caption = "N households (Presence of children)") %>%
      kable_styling()
    # aggregate plot over all households: nBedroomsCat ----
    plotDT <- ggHhDT[, .(linkID, nBedroomsCat)][ggLightingDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, nBedroomsCat, season)]
    
    ggplot2::ggplot(plotDT[!is.na(nBedroomsCat)], aes(x = qHour, y = meanW/1000, colour = nBedroomsCat)) +
      geom_step() +
      scale_colour_discrete(name = "N bedrooms") +
      #scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = myCaption
           )
    
    t <- ggHhDT[, .('N households' = .N), keyby = .('N bedrooms' = nBedroomsCat)]
    
    kableExtra::kable(t, caption = "N households (N bedrooms)") %>%
      kable_styling()
    # aggregate plot over all households: nPeopleCat ----
    plotDT <- ggHhDT[, .(linkID, nPeopleCat)][ggLightingDT][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(qHour, nPeopleCat, season)]
    
    ggplot2::ggplot(plotDT[!is.na(nPeopleCat)], aes(x = qHour, y = meanW/1000, colour = nPeopleCat)) +
      geom_step() +
      scale_colour_discrete(name = "N people") +
      #scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = myCaption
           )
    
    t <- ggHhDT[, .('N households' = .N), keyby = .('N people' = nPeopleCat)]
    
    kableExtra::kable(t, caption = "N households (N people)") %>%
      kable_styling()

    Small area level power demand estimates

    Area Units - Hot water: Specific day

    
    message("Which day(s) have the most active household data?")
    
    daysActiveDT <- ggHwDT[, .(count = uniqueN(linkID)), 
                           keyby = .(date = lubridate::date(localDateTime), linkID)]
    
    daysDT <- daysActiveDT[, .(nHHs = sum(count)), keyby = .(date)]
    
    ggplot2::ggplot(daysDT, aes(x = date, y = nHHs)) + 
      geom_line()
    
    setkey(daysDT, nHHs, date) # auto-sorts
    
    tail(daysDT)
    
    message("So could choose 25th May 2015 which was a Monday. But we should just ")
    message ("check this is when the most survey households were active since they were used for the ipf")
    
    setkey(daysActiveDT, linkID)
    
    surveyDT <- surveyDT[, hasSurvey := "Has survey"]
    setkey(surveyDT, linkID)
    setkey(daysActiveDT, linkID)
    daysActiveDT <- surveyDT[daysActiveDT]
    daysActiveDT <- daysActiveDT[is.na(hasSurvey), hasSurvey := "No survey"]
    
    t <- table(daysActiveDT$linkID, daysActiveDT$hasSurvey, useNA = "always")
    
    kableExtra::kable(t, 
                      caption = "GREENGrid Household hot water data - missing surveys (data = number of days of power data)")%>%
      kable_styling()
    
    message("Looks like we will only lose 2 - re-run the date tests with these two taken out")
    
    daysDT <- daysActiveDT[hasSurvey == "Has survey", .(nHHs = sum(count)), keyby = .(date)]
    
    ggplot2::ggplot(daysDT, aes(x = date, y = nHHs)) + 
      geom_line()
    
    setkey(daysDT, nHHs, date) # auto-sorts
    
    tail(daysDT)
    
    message("We can still take Monday 25th May...")

    Select Monday 25th May from the power demand data and aggregate to half-hourly mean demand for each household. Note that the original data has per minute observations so this will substantially suppress variation.

    filter <- lubridate::date("2015-05-25")
    
    myCaption <- paste0("GREENGrid household power demand data, ", filter)
    
    dayHwDT <- ggHwDT[lubridate::date(localDateTime) == filter]
    
    message("Testing the start & end times of the subset")
    min(dayHwDT$localDateTime)
    max(dayHwDT$localDateTime)
    
    # aggregate to half-hour for overall summary plot
    setkey(dayHwDT, linkID)
    plotDT <- dayHwDT[ggHhDT[, .(linkID, Location)]][, .(meanW = mean(powerW),
                          sdW = sd(powerW),
                          sumW = sum(powerW),
                          nObs = .N), keyby = .(halfHour, Location)]
    
    # aggregate plot over all households
    ggplot2::ggplot(plotDT, aes(x = halfHour, y = meanW/1000, colour = Location)) +
      geom_step() +
      #scale_y_log10() +
      #scale_colour_viridis(discrete = TRUE) +
      labs(y = "Mean kW (across households)",
           x = "Time of day",
           caption = paste0(myCaption, 
                            ": Hot Water circuits, n = ", 
                            uniqueN(dayHwDT$linkID)))
    # aggregate to half-hour per hh (re-use later, gets messy)
    dayHwHhDT <- dayHwDT[, .(meanW = mean(powerW),
                             sdW = sd(powerW),
                             nObs = .N), keyby = .(linkID, halfHour)]
    
    # aggregate plot per hh
    ggplot2::ggplot(dayHwHhDT, aes(x = halfHour, y = meanW/1000, colour = linkID)) +
      geom_step() +
      scale_colour_viridis(discrete = TRUE) +
      labs(y = "Mean kW (within household)",
           caption = paste0(myCaption, 
                            ": Hot Water circuits, n = ", 
                            uniqueN(dayHwDT$linkID)))
    

    So now we need to link the household level half-hourly aggregate data to the synthetic households.

    setkey(dayHwHhDT, linkID)
    message("N households with demand data: ", uniqueN(dayHwHhDT$linkID))
    
    setkey(longFormDT, linkID)
    message("N unique households in synthetic census: ", uniqueN(longFormDT$linkID))
    
    dataDT <- dayHwHhDT[longFormDT, allow.cartesian=TRUE]
    message("N unique households in merged power data & synthetic census: ", uniqueN(dataDT$linkID))
    
    message("We will have NA values for power etc. for households who were in the survey but for whom we have no power data.")
    message("We will have 0 values for ipfWeight for households who are not needed in a particular areaCode.")
    
    summary(dataDT)
    
    message("We can drop both of these to save memory.")
    
    dataDT <- dataDT[!is.na(meanW) & ipfWeight > 0]
    nrow(dataDT)
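
    As an aside, allow.cartesian = TRUE is needed above because each GREENGrid household (linkID) appears once per area code in the synthetic census, so the keyed join multiplies rows. A minimal toy illustration with made-up IDs and values:

    # Sketch only: why the join above needs allow.cartesian = TRUE (made-up data)
    power <- data.table::data.table(linkID = "rf_xx",
                                    halfHour = c("07:00", "07:30", "08:00"),
                                    meanW = c(10, 20, 30))
    synth <- data.table::data.table(linkID = "rf_xx",
                                    areaCode = c("A", "B"),
                                    ipfWeight = c(3.2, 0.5))
    data.table::setkey(power, linkID)
    data.table::setkey(synth, linkID)
    # 3 power rows x 2 synthetic rows -> 6 rows, which exceeds nrow(x) + nrow(i),
    # so data.table requires us to confirm the cartesian expansion is intended
    power[synth, allow.cartesian = TRUE]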
    

    Now we can construct synthetic Hot Water demand profiles for each area code by taking, for each half hour, the mean of the household mean power across all synthetic households in that area. Note that the 'observation' is the half-hourly household mean value, so within-half-hour variance has already been suppressed at the household level.
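
    In symbols (the notation is ours, introduced here only for clarity), the weighted profile for area $a$ at half hour $t$ is the ipf-weight-weighted mean of the household half-hourly means:

    $$\bar{W}_{a,t} = \frac{\sum_{h} w_{a,h}\,\bar{p}_{h,t}}{\sum_{h} w_{a,h}}$$

    where $w_{a,h}$ is the ipf weight of GREENGrid household $h$ in area $a$ and $\bar{p}_{h,t}$ is household $h$'s mean power over half hour $t$. The unweighted version simply sets all $w_{a,h} = 1$.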

    # no weights
    meanProfilesUnwDT <- dataDT[, .(meanW = mean(meanW)), keyby = .(halfHour, AU2013_code)]
    
    ggplot2::ggplot(meanProfilesUnwDT, aes(x = halfHour, y = meanW/1000, colour = AU2013_code)) +
      geom_line() +
      scale_colour_viridis(discrete = TRUE) +
      theme(legend.position="none") + # hide for clarity
      labs(y = "Mean kW",
           caption = paste0(myCaption, 
                            ": Hot Water circuits, n households (unweighted) = ", 
                             uniqueN(dataDT$linkID),
                             " replicated over ", uniqueN(dataDT$AU2013_code), " areas"))
    
    
    # with weights
    # set svydesign
    message("Setting survey design using weights")
    dataDTSvyDes <- survey::svydesign(ids = ~linkID, # use synthetic hh_id
                                      weights = ~ipfWeight, 
                                      data = dataDT[!is.na(ipfWeight)] # make sure no NA weights
    )
    
    message("Calculating weighted power summaries - can take some time...")
    system.time(svyt <- svyby(~meanW, ~AU2013_code+as.character(halfHour), dataDTSvyDes, 
                              svymean,
                              keep.var = TRUE,
                              exclude=NULL, 
                              na.action=na.pass
    )
    )
    
    svytDT <- as.data.table(svyt) # this is just so cool - autoconversion to data.table :-)
    # fix the time thing
    svytDT$halfHour <- hms::parse_hms(svytDT$`as.character(halfHour)`)
    
    setkey(svytDT, AU2013_code)
    setkey(auListDT, AU2013_code)
    svytDT <- auListDT[svytDT] # add unit area labels
    
    setkey(ipfHhCountsDT, AU2013_code)
    svytDT <- ipfHhCountsDT[svytDT] # add unit area household counts
    
    p <- ggplot2::ggplot(svytDT, aes(x = halfHour, y = meanW/1000, colour = AU2013_label)) +
      geom_step() +
      scale_colour_viridis(discrete = TRUE) +
      facet_grid(REGC2013_label ~ .) +
      theme(legend.position="none") + # hide for clarity
      labs(y = "Mean kW",
           caption = paste0(myCaption, 
                            "\n Hot Water circuits, n households (base) = ", uniqueN(dataDT$linkID),
                            "\n n households (reweighted) = ", dkUtils::tidyNum(totalIpfHHs),
                            "\n Plotting ",uniqueN(svytDT$AU2013_code) , " areas"))
    
    p
    
    plotly::ggplotly(p)
    

    Area units - Hot water: Mean seasonal

    setkey(seasonHwHhDT, linkID)
    message("N households with demand data: ", uniqueN(seasonHwHhDT$linkID))
    
    setkey(longFormDT, linkID)
    message("N unique households in synthetic census: ", uniqueN(longFormDT$linkID))
    
    dataDT <- seasonHwHhDT[longFormDT, allow.cartesian=TRUE]
    message("N unique households in merged power data & synthetic census: ", uniqueN(dataDT$linkID))
    
    message("We will have NA values for power etc. for households who were in the survey but for whom we have no power data.")
    message("We will have 0 values for ipfWeight for households who are not needed in a particular areaCode.")
    
    summary(dataDT)
    
    message("We can drop both of these to save memory.")
    
    dataDT <- dataDT[!is.na(meanW) & ipfWeight > 0]
    
    summary(dataDT)
    
    # no weights
    meanProfilesUnwDT <- dataDT[, .(meanW = mean(meanW)), keyby = .(halfHour, AU2013_code, season)]
    
    ggplot2::ggplot(meanProfilesUnwDT, aes(x = halfHour, y = meanW/1000, colour = AU2013_code)) +
      geom_line() +
      scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) +
      theme(legend.position="none") + # hide for clarity
      labs(y = "Mean kW",
           caption = paste0(myCaption, 
                            "\n Hot Water circuits", 
                            "\n n households (unweighted) = ", uniqueN(dataDT$linkID),
                            "\n replicated over ",uniqueN(dataDT$AU2013_code) , " areas"))
    
    
    # with weights
    # set svydesign
    message("Setting survey design using weights")
    dataDTSvyDes <- survey::svydesign(ids = ~linkID, # use synthetic hh_id
                                      weights = ~ipfWeight, 
                                      data = dataDT[!is.na(ipfWeight)] # make sure no NA weights
    )
    
    message("Calculating weighted power summaries - can take some time...")
    system.time(svyt <- svyby(~meanW, ~AU2013_code+as.character(halfHour)+season, dataDTSvyDes, 
                              svymean,
                              keep.var = TRUE,
                              exclude=NULL, 
                              na.action=na.pass
    )
    )
    
    svytDT <- as.data.table(svyt) # this is just so cool - autoconversion to data.table :-)
    # fix the time thing
    svytDT$halfHour <- hms::parse_hms(svytDT$`as.character(halfHour)`)
    
    setkey(svytDT, AU2013_code)
    setkey(auListDT, AU2013_code)
    svytDT <- auListDT[svytDT] # add unit area labels
    
    setkey(ipfHhCountsDT, AU2013_code)
    svytDT <- ipfHhCountsDT[svytDT] # add unit area household counts
    
    p <- ggplot2::ggplot(svytDT, aes(x = halfHour, y = meanW/1000, colour = AU2013_label)) +
      geom_step() +
      facet_grid(season ~ .) +
      scale_colour_viridis(discrete = TRUE) +
      theme(legend.position="none") + # hide for clarity
      labs(y = "Mean kW",
           caption = paste0(myCaption, 
                            "\n Hot Water circuits",
                            "\n n households (base) = ", uniqueN(dataDT$linkID),
                            "\n n households (reweighted) = ", dkUtils::tidyNum(totalIpfHHs),
                            "\n Plotting ",uniqueN(svytDT$AU2013_label) , " areas"))
    
    p
    
    plotly::ggplotly(p)
    

    Area units - Lighting: Mean seasonal

    
    myCaption <- paste0("GREENGrid household power demand data from ", min(ggLightingDT$localDateTime), " to ", max(ggLightingDT$localDateTime))
    
    # aggregate to half-hour per season across all households (overall summary, not per household)
    plotDT <- ggLightingDT[, .(meanW = mean(powerW),
                               sdW = sd(powerW),
                               nObs = .N), keyby = .(halfHour, season)]
    
    # aggregate to half-hour per hh per season (re-use this later on, gets messy)
    seasonLHhDT <- ggLightingDT[, .(meanW = mean(powerW),
                             sdW = sd(powerW),
                             nObs = .N), keyby = .(linkID, halfHour, season)]
    
    # aggregate plot per hh
    p <- ggplot2::ggplot(seasonLHhDT, aes(x = halfHour, y = meanW/1000, colour = linkID)) +
      geom_step() +
      scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) +
      labs(y = "Mean kW (within household)",
           caption = paste0(myCaption, 
                            "\n Lighting circuits, n = ", uniqueN(ggLightingDT$linkID)))
    
    p
    
    plotly::ggplotly(p)
    
    setkey(seasonLHhDT, linkID)
    message("N households with demand data: ", uniqueN(seasonLHhDT$linkID))
    
    setkey(longFormDT, linkID)
    message("N unique households in synthetic census: ", uniqueN(longFormDT$linkID))
    
    dataDT <- seasonLHhDT[longFormDT, allow.cartesian=TRUE]
    message("N unique households in merged power data & synthetic census: ", uniqueN(dataDT$linkID))
    
    message("We will have NA values for power etc. for households who were in the survey but for whom we have no power data.")
    message("We will have 0 values for ipfWeight for households who are not needed in a particular areaCode.")
    
    summary(dataDT)
    
    message("We can drop both of these to save memory.")
    
    dataDT <- dataDT[!is.na(meanW) & ipfWeight > 0]
    
    summary(dataDT)
    

    Construct unweighted area profiles for comparison.

    # no weights
    meanProfilesUnwDT <- dataDT[, .(meanW = mean(meanW)), keyby = .(halfHour, AU2013_code, season)]
    
    ggplot2::ggplot(meanProfilesUnwDT, aes(x = halfHour, y = meanW/1000, colour = AU2013_code)) +
      geom_line() +
      scale_colour_viridis(discrete = TRUE) +
      facet_grid(season ~ .) +
      theme(legend.position="none") + # hide for clarity
      labs(y = "Mean kW",
           caption = paste0(myCaption, 
                            "\n Lighting circuits",
                            "\n n households (unweighted) = ", uniqueN(dataDT$linkID),
                            " replicated over ",uniqueN(dataDT$AU2013_code) , " areas"))
    

    Now construct the weighted area profiles.

    
    # with weights
    # set svydesign
    message("Setting survey design using weights")
    dataDTSvyDes <- survey::svydesign(ids = ~linkID, # use synthetic hh_id
                                      weights = ~ipfWeight, 
                                      data = dataDT[!is.na(ipfWeight)] # make sure no NA weights
    )
    
    message("Calculating weighted power summaries - can take some time...")
    system.time(svyt <- svyby(~meanW, ~AU2013_code+as.character(halfHour)+season, dataDTSvyDes, 
                              svymean,
                              keep.var = TRUE,
                              exclude=NULL, 
                              na.action=na.pass
    )
    )
    
    svytDT <- as.data.table(svyt) # this is just so cool - autoconversion to data.table :-)
    # fix the time thing
    svytDT$halfHour <- hms::parse_hms(svytDT$`as.character(halfHour)`)
    
    setkey(svytDT, AU2013_code)
    setkey(auListDT, AU2013_code)
    svytDT <- auListDT[svytDT] # add unit area labels
    
    setkey(ipfHhCountsDT, AU2013_code)
    svytDT <- ipfHhCountsDT[svytDT] # add unit area household counts
    
    p <- ggplot2::ggplot(svytDT, aes(x = halfHour, y = meanW/1000, colour = AU2013_label)) +
      geom_step() +
      facet_grid(season ~ REGC2013_label) +
      scale_colour_viridis(discrete = TRUE) +
      theme(legend.position="none") + # hide for clarity
      labs(y = "Mean kW",
           x = "Time of Day",
           caption = paste0(myCaption, 
                            "\n Lighting circuits",
                            "\n n households (base) = ", uniqueN(dataDT$linkID),
                            "\n n households (reweighted) = ", dkUtils::tidyNum(totalIpfHHs),
                            "\n Plotting ",uniqueN(svytDT$AU2013_label) , " areas"))
    
    p
    
    plotly::ggplotly(p)
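
    Because svyby() was called with keep.var = TRUE, svytDT also carries a standard-error column (named se when a single statistic is requested, as here). A minimal sketch, assuming that column name, of how approximate 95% confidence ribbons could be added to the weighted area profiles:

    # Sketch only: approximate 95% CI ribbons around the weighted lighting profiles
    # assumes svytDT (built above) has meanW plus a standard-error column named se
    svytDT[, `:=`(lowerW = meanW - 1.96 * se,
                  upperW = meanW + 1.96 * se)]
    
    ggplot2::ggplot(svytDT, aes(x = halfHour, y = meanW/1000,
                                colour = AU2013_label, fill = AU2013_label)) +
      geom_ribbon(aes(ymin = lowerW/1000, ymax = upperW/1000), alpha = 0.2, colour = NA) +
      geom_step() +
      facet_grid(season ~ REGC2013_label) +
      scale_colour_viridis(discrete = TRUE) +
      scale_fill_viridis(discrete = TRUE) +
      theme(legend.position = "none") + # hide for clarity
      labs(y = "Mean kW (with approximate 95% CI)",
           x = "Time of Day")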
    

    Small area power demand scenarios

    Lighting

    This section implements two simple energy-efficient lighting scenarios (a sketch follows the list):

    • complete replacement of all lighting by LEDs - with an
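
    A minimal sketch of how the first (full LED replacement) scenario could be applied to the weighted lighting profiles built above. The reduction factor is purely illustrative and is not taken from the analysis; svytDT is assumed to still hold the weighted seasonal lighting results:

    # Sketch only: apply an illustrative LED-replacement scenario
    ledFactor <- 0.2 # assumed: LEDs draw ~20% of the power of the lighting they replace (illustrative)
    
    scenarioDT <- data.table::copy(svytDT) # keep the baseline untouched
    scenarioDT[, meanW_led := meanW * ledFactor]
    
    ggplot2::ggplot(scenarioDT, aes(x = halfHour, colour = AU2013_label)) +
      geom_step(aes(y = meanW/1000), linetype = "dashed") + # baseline
      geom_step(aes(y = meanW_led/1000)) +                  # LED scenario
      facet_grid(season ~ REGC2013_label) +
      scale_colour_viridis(discrete = TRUE) +
      theme(legend.position = "none") + # hide for clarity
      labs(y = "Mean kW (baseline = dashed, LED scenario = solid)",
           x = "Time of Day")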

    About

    t <- proc.time() - startTime
    
    elapsed <- t[[3]]
    # generic about include

    Analysis completed in `r elapsed` seconds ( `r round(elapsed/60,2)` minutes) using knitr in RStudio with `r R.version.string` running on `r R.version$platform`.

    Additional R packages used in this report (`r myPackages`):

    • data.table - for fast data munching [@data.table]
    • dkUtils - utilities [@dkUtils]
    • ggplot2 - for slick graphics [@ggplot2]
    • GREENGridData - manipulating GREENGrid demand data [@GREENGridData]
    • hms - HH:MM:SS conversions [@hms]
    • ipfp - for IPF process [@ipfp]
    • kableExtra - for pretty tables [@kableExtra]
    • knitr - to create this document [@knitr]
    • lubridate - date and time conversions [@lubridate]
    • plotly - for interactive graphics [@plotly]
    • readr - for data loading [@readr]
    • skimr - for data summaries [@skimr]
    • survey - for weighted data summaries [@survey16]
    • viridis - for colour palettes [@viridis]

    References