diff --git a/analysis/GREENGridModel/old/_createSyntheticCensus.Rmd b/analysis/GREENGridModel/old/_createSyntheticCensus.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..847c8ef155a595b229f159855bf9b47c15e1b35f
--- /dev/null
+++ b/analysis/GREENGridModel/old/_createSyntheticCensus.Rmd
@@ -0,0 +1,533 @@
+
+## Create synthetic weighted households 
+
+### Set up IPF
+
+First, test survey variables for collinearity.
+
+```{r testSurveyCollinearity}
+cor.test(ggHhDT$nPeople, ggHhDT$nBedrooms)
+cor.test(ggHhDT$nPeople, ggHhDT$nRooms)
+cor.test(ggHhDT$nRooms, ggHhDT$nBedrooms)
+
+```
+
+So people & bedrooms are the least colinear of these. Which is odd given that we used one to predict the other. Anyway, we will use nBedrooms rather than nRooms.
+
+
+```{r setConstraints}
+
+# relies heavily on: http://robinlovelace.net/spatial-microsim-book/smsim-in-R.html
+# ipfp wants:
+# y = numeric vector of constraints for each area - the names must match the survey, data as counts, no ids or anything
+# which means we have to assume ipfp maintains order
+# A = the transposed survey data (row = var) in dummy variable form - names must match constraints, no ids etc, variables that match constraints only
+# which means we have to assume ipfp maintains order
+# x0 = numeric initial vector of length ncol
+
+# set order of variables - has to match (probably)
+
+
+
+fullConstraintsList <- c("heatSourceWood", "heatSourceElectricity", "heatSourceGas", "heatSourceCoal", "heatSourceOther", 
+                         "nKids_0", "nKids_1m",
+                     #"nBedrooms_1_2", "nBedrooms_3", "nBedrooms_4m",
+                     "nRooms1_4", "nRooms5_6", "nRooms7m",
+                     "nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m" # fits last
+) 
+
+```
+
+We have the following constraints available:
+
+`fullConstraintsList`
+
+Note that we set the order of constraints here - we set n people as the last constraint so that the ipf algoritm fits it perfectly. This is a design choice and it is driven by the size of the coefficients in the lm models above (which lm models?)
+
+Check census data distributions - we're looking for zeros.
+
+```{r censusChecks}
+# check constraint distributions - do we have 0s? ----
+# do this here so we only have the relevant regions
+# > heat source ----
+
+t <- summary(au2013DT[, .SD, .SDcols = names(au2013DT) %like% "heat"]) # https://stackoverflow.com/questions/30189979/select-columns-of-data-table-based-on-regex
+
+kableExtra::kable(t, caption = "Test distribution of fuel sources") %>%
+  kable_styling()
+
+# > n kids ----
+
+t <- summary(au2013DT[, .SD, .SDcols = names(au2013DT) %like% "nKids"])
+
+kableExtra::kable(t, caption = "Test distribution of nKids") %>%
+  kable_styling()
+
+# > n people ----
+
+t <- summary(au2013DT[, .SD, .SDcols = names(au2013DT) %like% "nPeople"])
+
+kableExtra::kable(t, caption = "Test distribution of nPeople") %>%
+  kable_styling()
+
+# > n rooms ----
+
+t <- summary(au2013DT[, .SD, .SDcols = names(au2013DT) %like% "nRooms"])
+
+kableExtra::kable(t, caption = "Test distribution of nRooms") %>%
+  kable_styling()
+
+```
+
+
+```{r censusSetup}
+# check totals are not 0
+#au2013DT <- au2013DT[, nBedrooms_Total := nBedrooms_1_2 + nBedrooms_3 + nBedrooms_4m]
+au2013DT <- au2013DT[, nPeople_Total := nPeople_1 + nPeople_2 + nPeople_3 + nPeople_4m]
+au2013DT <- au2013DT[, nRooms_Total := nRooms1_4 + nRooms5_6 + nRooms7m]
+au2013DT <- au2013DT[, nKids_Total := nKids_0 + nKids_1m]
+au2013DT <- au2013DT[, heatSource_Total := heatSourceWood + heatSourceElectricity + heatSourceGas + heatSourceCoal + heatSourceOther]
+
+t <- summary(au2013DT[, .SD, .SDcols = names(au2013DT) %like% "_Total"])
+t
+message("Removing areas which have total counts for any constraint = 0")
+
+nOrig <- nrow(au2013DT)
+zerosDT <- au2013DT[nPeople_Total == 0 | is.na(nPeople_Total)| 
+                            #nBedrooms_Total == 0 | is.na(nBedrooms_Total)| 
+                            nRooms_Total == 0 | is.na(nRooms_Total)| 
+                            nKids_Total == 0 | is.na(nKids_Total)| 
+                            heatSource_Total == 0 | is.na(heatSource_Total), 
+                          .(AU2013_code, nRooms_Total,nPeople_Total, nKids_Total, 
+                            heatSource_Total
+                            )
+                          ]
+zerosDT <- zerosDT[, AU2013_code := as.character(AU2013_code)]
+setkey(zerosDT, AU2013_code)
+zerosDT <- auListDT[zerosDT]
+zerosDT <- zerosDT[, drop := 1]
+
+test <- zerosDT[au2013DT]
+
+# remove areas where count is 0 or NA
+censusDT <- test[is.na(drop)]
+nNonZero <- nrow(censusDT)
+```
+
+We have removed `r nOrig - nNonZero` areas where the household totals were 0 or NA. These are shown in Table \@ref(tab:zeroTotals).
+
+> NB: these areas will have no ipf results...
+
+```{r zeroTotals}
+t <- zerosDT[, .(REGC2013_label, AU2013_label, nRooms_Total,nPeople_Total,nKids_Total,
+                 heatSource_Total
+                 )][order(REGC2013_label)]
+
+kableExtra::kable(t, caption = "Areas where the household totals were 0 or NA") %>%
+  kable_styling()
+```
+
+```{r checkCensusConstraints}
+# check
+skimr::skim(censusDT)
+```
+
+
+### Run IPF
+
+This section runs the IPF to generate the weights.
+
+First set the constraints we are going to use.
+
+```{r setFinalConstraints}
+# need list of area codes
+areaCodes <- censusDT[, AU2013_code]
+# need list of household ids
+hhIdsDT <- surveyDT[, .(linkID)]
+
+# select the constraints we want & set order
+
+# coal = 0 might break the IPF
+
+testConstraintsList <- c(#"heatSourceWood","heatSourceElectricity","heatSourceGas","heatSourceCoal","heatSourceOther", 
+                         "nKids_0", "nKids_1m",
+                     #"nBedrooms_1_2", "nBedrooms_3", "nBedrooms_4m",
+                     "nRooms1_4","nRooms5_6","nRooms7m",
+                     "nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m" # fits last
+)
+
+print(paste0("Using ", testConstraintsList))
+
+# select just the constraints we want to use ----
+ipfInputSurveyDT <- surveyDT[,testConstraintsList , with=FALSE]
+message("Input survey constraints:")
+names(ipfInputSurveyDT)
+
+ipfInputConstraintsDT <- censusDT[, testConstraintsList, with=FALSE]
+message("Input census constraints:")
+names(ipfInputConstraintsDT)
+
+```
+
+Notes:
+
+ * we don't use fuel/heat source as we get a lot of Nan
+
+
+```{r runIPF}
+# expects:
+#   an area level constraints dt and a survey constraints dt (vars in same order)
+#   a list of area codes & bmg_ids
+# returns a long form weights dt with oacodes labelled
+
+# transpose the survey for ipf
+surv_dtt <- t(ipfInputSurveyDT)
+
+# set up initial vector of the same length as the number of households as a load of 1s (could be the response weights...)
+x0 <- rep(1, nrow(ipfInputSurveyDT))
+
+# create 2D weighst matrix (households by areas) - this will get filled with the weights
+weights = array(NA, dim=c(nrow(ipfInputSurveyDT), nrow(ipfInputConstraintsDT)))
+
+for(i in 1:ncol(weights)){ # for each area (cols of weights)
+  print(paste0("Running ipfp on area ", i))
+  weights[,i] <- ipfp(as.numeric(ipfInputConstraintsDT[i,]), # only accepts a vector of numbers 
+                      surv_dtt, 
+                      x0, 
+                      maxit = 10, # maxit = 10 iterations (for now), 
+                      verbose = F) # careful with verbose - too much output
+}
+
+```
+
+
+```{r processIpfResults}
+# this is where we have to assume ipfp maintained the oaCode order
+wDT <- as.data.table(weights) # convert to dt
+wDT <- setnames(wDT, as.character(areaCodes)) # add col names
+
+# this is where we have to assume ipfp maintained the bmg_id order
+dt <- cbind(hhIdsDT, wDT) # add linkID to each row of weights (fills down, if you get recycling then there is a problem)
+setkey(dt, linkID)
+
+# need to reshape the weightsDT into 1 column of weights with oacode set to oacode etc
+longFormDT <- as.data.table(melt(dt, id=c("linkID")))
+# add correct area code name for future matching
+longFormDT <- setnames(longFormDT, c("variable", "value"), c("AU2013_code", "ipfWeight"))
+setkey(longFormDT,AU2013_code)
+wDT <- NULL # save mem
+
+# add region
+
+longFormDT <- auListDT[longFormDT]
+
+kableExtra::kable(caption = "First few rows/columns of ipf weights table", head(longFormDT))
+
+kableExtra::kable(caption = "Summary of ipf weights table", summary(longFormDT))
+
+message("Dimensions of ipf weights table: ", dkUtils::tidyNum(nrow(longFormDT)), " rows x ", ncol(longFormDT), " cols")
+```
+
+This produces a long form data file of nrow = n(linkID) x n(areaCode) (!) so that linkIDs are repeated and the weights are held in a single column (variable). This makes everything else a lot easier later. The dataset comprises `r dkUtils::tidyNum(uniqueN(longFormDT$linkID))` GREENGrid households replicated (with weights) across `r dkUtils::tidyNum(uniqueN(longFormDT$areaCode))` unit areas.
+
+It is possible that some of the weights are 0. This means there are households which have 0 weights in some areas. We do not need these households in the data so we will remove them (if any exist).
+
+```{r removeZeroWeights}
+message("N rows before removing zero weights: ", nrow(longFormDT))
+longFormDT <- longFormDT[ipfWeight > 0]
+message("N rows after removing zero weights: ", nrow(longFormDT))
+```
+
+```{r saveData}
+# save the results for future use since they won't change unless we change the constraints
+data.table::fwrite(longFormDT, paste0(sParams$ggPath, "/safe/ipf/nonZeroWeightsAu2013.csv"))
+```
+
+We now need to add the survey-based attributes back (from the GREENGrid survey). If we had a much larger sample of households and a lot more areas we would not do this here as it would create a very large file.
+
+```{r addAttributes}
+
+setkey(longFormDT, linkID)
+attDT <- ggHhDT[, .(linkID, `Hot water cylinder`, `Heat pump number`, `PV Inverter`, 
+                    `nPeopleCat`, presenceKids, `nBedroomsCat`, `nRoomsCat`,
+                    heatSourceWood, heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceOther, 
+                    nKids_0, nKids_1m,
+                     nBedrooms_1_2, nBedrooms_3, nBedrooms_4m,
+                     nRooms1_4, nRooms5_6, nRooms7m,
+                     nPeople_1, nPeople_2, nPeople_3, nPeople_4m,
+                    heatFuel, Q20_coded, Q49_coded )] # keep the vars we want to use
+setkey(attDT, linkID)
+longFormSvyDT <- merge(longFormDT, attDT)
+
+# set svydesign
+longFormSvyDes <- survey::svydesign(ids = ~linkID, # use synthetic hh_id
+                                    weights = ~ipfWeight, 
+                                    data = longFormSvyDT[!is.na(ipfWeight)] # make sure no NA weights
+)
+```
+
+### Check outputs
+
+#### Overall weights and household counts
+
+
+```{r testIpfOutcomes}
+message("Calculating weighted household counts - can take some time...")
+svyt <- svytable(~AU2013_code, longFormSvyDes, 
+                 exclude=NULL, 
+                 na.action=na.pass
+)
+
+ipfHhCountsDT <- data.table::as.data.table(svyt)
+ipfHhCountsDT <- ipfHhCountsDT[, simCount := N]
+setkey(ipfHhCountsDT, AU2013_code)
+setkey(censusDT, AU2013_code)
+ipfOutputTestDT <- ipfHhCountsDT[censusDT]
+
+t_dt <- longFormDT[ipfWeight != 0, .(nGGHouseholds = uniqueN(linkID),
+                       meanWeight = mean(ipfWeight),
+                       minWeight = min(ipfWeight),
+                       maxWeight = max(ipfWeight)),
+                   keyby = .(AU2013_code)]
+
+ipfOutputTestDT <- ipfOutputTestDT[t_dt]
+#ipfOutputTestDT <- ipfOutputTestDT[, AU2013_code := as.character(AU2013_code)]
+ipfOutputTestDT <- auListDT[ipfOutputTestDT]
+
+ipfOutputTestDT <- ipfOutputTestDT[, ratio := simCount/nGGHouseholds]
+
+t <- longFormDT[ipfWeight != 0, .(nGGHouseholds = uniqueN(linkID),
+                       meanWeight = mean(ipfWeight),
+                       minWeight = min(ipfWeight),
+                       maxWeight = max(ipfWeight)),
+                   keyby = .(REGC2013_label)]
+
+kableExtra::kable(t, caption = "GREENGrid - summary of weights by region")%>%
+  kable_styling()
+
+```
+
+
+Figure \@ref(fig:checkTotalCounts_nPeople) shows the actual and simulated number of households using the nPeople constraint.
+
+```{r checkTotalCounts_nPeople}
+myCaption <- "Slope = line of exact fit"
+t <- ipfOutputTestDT[, .(mean_nHHs = mean(npeople_totalHouseholds),
+                              min_nHHs = min(npeople_totalHouseholds),
+                              max_nHHs = max(npeople_totalHouseholds)), keyby = .(REGC2013_label)]
+kableExtra::kable(t, caption = "Summary of n households (n people)") %>%
+  kable_styling()
+
+ggplot2::ggplot(ipfOutputTestDT, aes(x = npeople_totalHouseholds, y = simCount, colour = REGC2013_label)) +
+  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
+  geom_point() +
+  scale_colour_discrete(name = "Region") +
+  labs(x = "Number of households (using census n people variable)",
+       y = "Number of households (ipf results)",
+       caption = myCaption)
+```
+
+Figure \@ref(fig:checkTotalCounts_nRooms) shows the actual and simulated number of households using the nRooms constraint. This was the penultimate constraint to be fitted.
+
+```{r checkTotalCounts_nRooms}
+t <- ipfOutputTestDT[, .(mean_nHHs = mean(nrooms_totalHouseholds),
+                              min_nHHs = min(nrooms_totalHouseholds),
+                              max_nHHs = max(nrooms_totalHouseholds)), keyby = .(REGC2013_label)]
+kableExtra::kable(t, caption = "Summary of n households (n rooms)") %>%
+  kable_styling()
+ggplot2::ggplot(ipfOutputTestDT, aes(x = nrooms_totalHouseholds, y = simCount, colour = REGC2013_label)) +
+  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
+  geom_point() +
+  scale_colour_discrete(name = "Region") +
+  labs(x = "Number of households (using census n rooms variable)",
+       y = "Number of households (ipf results)",
+       caption = myCaption)
+```
+
+> Can't repeat for n kids as the base doesn't match
+
+Figure \@ref(fig:checkTotalCounts_ggHHs) shows the number of GREEN Grid households who are being 'upweighted' to match the given number of census households.
+
+```{r checkTotalCounts_ggHHs}
+ggplot2::ggplot(ipfOutputTestDT, aes(x = nGGHouseholds, y = simCount, colour = REGC2013_label)) +
+  geom_jitter() +
+  scale_colour_discrete(name = "Region") +
+  labs(x = "Number of GREENGrid households",
+       y = "Number of households represented in area unit",
+       caption = paste0("Plotting ", uniqueN(ipfOutputTestDT$AU2013_code), " area units"))
+
+totalIpfHHs <- sum(ipfHhCountsDT$N)
+```
+
+
+#### Constrained variables
+
+Table \@ref(tab:testnPeople) shows the weighted household counts for the number of people. This was the last constraint so it fitted last.
+
+```{r testnPeople}
+# create summary table 
+svyt <- survey::svytable(~ AU2013_code + nPeopleCat, longFormSvyDes, 
+                 exclude=NULL, 
+                 na.action=na.pass
+)
+
+dt <- data.table::as.data.table(svyt)
+wdt <- reshape(dt, direction = "wide", timevar = "nPeopleCat", idvar = "AU2013_code")
+
+setnames(wdt, c("N.1", "N.2", "N.3", "N.4+"), c("ipf_nPeople_1",
+                                                "ipf_nPeople_2",
+                                                "ipf_nPeople_3",
+                                                "ipf_nPeople_4+"))
+
+setkey(wdt, AU2013_code)
+setkey(censusDT, AU2013_code)
+
+tmpDT <- wdt[censusDT] # merge to census data
+tmpDT <- auListDT[tmpDT] # add region labels
+
+plotDT <- tmpDT[, .(AU2013_code, REGC2013_label, AU2013_label,
+                    ipf_nPeople_1, ipf_nPeople_2, ipf_nPeople_3, `ipf_nPeople_4+`,
+                    nPeople_1, nPeople_2, nPeople_3, nPeople_4m)]
+t <- summary(plotDT)
+
+kableExtra::kable(t, caption = "Summary of n people outcomes (and census counts)") %>%
+  kable_styling()
+
+ggplot2::ggplot(plotDT) +
+  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
+  geom_point(aes(x = nPeople_1, y =  ipf_nPeople_1, colour = "1 person")) +
+  geom_point(aes(x = nPeople_2, y =  ipf_nPeople_2, colour = "2 people")) +
+  geom_point(aes(x = nPeople_3, y =  ipf_nPeople_3, colour = "3 people")) +
+  geom_point(aes(x = nPeople_4m, y =  `ipf_nPeople_4+`, colour = "4+ people")) +
+  scale_colour_discrete(name = "N people") +
+  labs(x = "Census count",
+       y = "IPF (weighted) count",
+       caption = myCaption) +
+  facet_grid(. ~ REGC2013_label)
+```
+
+Table \@ref(tab:testnRooms) shows the weighted household counts for the number of rooms This was the penultimate constraint to be fitted.
+
+```{r testnRooms}
+# create summary table 
+svyt <- survey::svytable(~ AU2013_code + nRoomsCat, longFormSvyDes, 
+                 exclude=NULL, 
+                 na.action=na.pass
+)
+
+dt <- data.table::as.data.table(svyt)
+wdt <- reshape(dt, direction = "wide", timevar = "nRoomsCat", idvar = "AU2013_code")
+
+setnames(wdt, c("N.1-4", "N.5-6", "N.7+"), c("ipf_nRooms_1_4",
+                                                "ipf_nRooms_5_6",
+                                                "ipf_nRooms_7m"))
+
+setkey(wdt, AU2013_code)
+
+tmpDT <- wdt[censusDT] # merge to census data
+tmpDT <- auListDT[tmpDT] # add region labels
+
+plotDT <- tmpDT[, .(AU2013_code, REGC2013_label, AU2013_label,
+                    ipf_nRooms_1_4, ipf_nRooms_5_6, ipf_nRooms_7m,
+                    nRooms1_4, nRooms5_6, nRooms7m)]
+
+t <- summary(plotDT)
+
+kableExtra::kable(t, caption = "Summary of n rooms outcomes (and census counts)") %>%
+  kable_styling()
+
+ggplot2::ggplot(plotDT) +
+  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
+  geom_point(aes(x = nRooms1_4, y =  ipf_nRooms_1_4, colour = "1-4 rooms")) +
+  geom_point(aes(x = nRooms5_6, y =  ipf_nRooms_5_6, colour = "5-6 rooms")) +
+  geom_point(aes(x = nRooms7m, y =  ipf_nRooms_7m, colour = "7+ rooms")) +
+  scale_colour_discrete(name = "N rooms") +
+  labs(x = "Census count",
+       y = "IPF (weighted) count",
+       caption = myCaption) +
+  facet_grid(. ~ REGC2013_label)
+```
+
+
+Table \@ref(tab:testnKids) shows the weighted household counts for the presence of children.
+
+```{r testnKids}
+# create summary table 
+svyt <- survey::svytable(~ AU2013_code + presenceKids, longFormSvyDes, 
+                 exclude=NULL, 
+                 na.action=na.pass
+)
+
+dt <- data.table::as.data.table(svyt)
+wdt <- reshape(dt, direction = "wide", timevar = "presenceKids", idvar = "AU2013_code")
+
+setnames(wdt, c("N.1+", "N.None"), c("ipf_nKids_1m","ipf_nKids_0"))
+
+setkey(wdt, AU2013_code)
+
+tmpDT <- wdt[censusDT] # merge to census data
+tmpDT <- auListDT[tmpDT] # add region labels
+
+plotDT <- tmpDT[, .(AU2013_code, REGC2013_label, AU2013_label,
+                    ipf_nKids_0, ipf_nKids_1m,
+                    nKids_0, nKids_1m)]
+t <- summary(plotDT)
+
+kableExtra::kable(t, caption = "Summary of n kids outcomes (and census counts)") %>%
+  kable_styling()
+
+ggplot2::ggplot(plotDT) +
+  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
+  geom_point(aes(x = nKids_0, y =  ipf_nKids_0, colour = "0 children")) +
+  geom_point(aes(x = nKids_1m, y =  ipf_nKids_1m, colour = "1+ child")) +
+  labs(x = "Census count",
+       y = "IPF (weighted) count",
+       caption = myCaption) +
+  scale_colour_discrete(name = "N children") +
+  facet_grid(. ~ REGC2013_label)
+
+```
+
+#### Unconstrained variables
+
+Table \@ref(tab:testHeatSource) shows the weighted household counts for main heat source. Remember that this was _not_ used in the final constraints.
+
+```{r testHeatSource}
+# create summary table 
+svyt <- survey::svytable(~ AU2013_code + heatFuel, longFormSvyDes, 
+                 exclude=NULL, 
+                 na.action=na.pass
+)
+
+dt <- data.table::as.data.table(svyt)
+wdt <- reshape(dt, direction = "wide", timevar = "heatFuel", idvar = "AU2013_code")
+
+setnames(wdt, c("N.Electricity", "N.Gas", "N.Other", "N.Wood"), 
+         c("ipf_Electricity","ipf_Gas", "ipf_Other", "ipf_Wood"))
+
+setkey(wdt, AU2013_code)
+
+tmpDT <- wdt[au2013DT] # merge to census data
+tmpDT <- auListDT[tmpDT] # add region labels
+
+plotDT <- tmpDT[, .(AU2013_code, REGC2013_label, AU2013_label,
+                    ipf_Electricity, ipf_Gas, ipf_Other, ipf_Wood,
+                    heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceWood, heatSourceOther)]
+
+ggplot2::ggplot(plotDT) +
+  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
+  geom_point(aes(x = ipf_Electricity, y =  ipf_Electricity, colour = "Electricity")) +
+  geom_point(aes(x = heatSourceGas, y =  ipf_Gas, colour = "Gas")) +
+  geom_point(aes(x = heatSourceWood, y =  ipf_Wood, colour = "Wood")) +
+  geom_point(aes(x = heatSourceOther, y =  ipf_Other, colour = "Other")) +
+  scale_colour_discrete(name = "Main heating fuel") +
+  labs(x = "Census count",
+       y = "IPF (weighted) count",
+       caption = paste0(myCaption, "\nNote: Coal not shown as no GREEN Grid households used coal")) +
+  facet_grid(. ~ REGC2013_label)
+
+```
+
+
+```{r run to here}
+```
\ No newline at end of file
diff --git a/analysis/GREENGridModel/old/_loadNonPowerData.Rmd b/analysis/GREENGridModel/old/_loadNonPowerData.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..56c0524c11ac2bc15515f91952bc992895445d70
--- /dev/null
+++ b/analysis/GREENGridModel/old/_loadNonPowerData.Rmd
@@ -0,0 +1,508 @@
+
+## Load data
+
+### NZ Area Unit data
+
+2013 NZ Census data from NZ Stats at area unit level. For simplicity we use one file per constraint:
+
+ * n people
+ * n dependent children
+ * fuel source (all counted - may cause confusion as sum to > 100% of households)
+ * n rooms
+
+NB: these files, when downloaded form the [NZStats data extractor](http://nzdotstat.stats.govt.nz/wbos/Index.aspx?DataSetCode=TABLECODE8165#) come with higher levels of aggregation in the tables. These have to be removed by extracting just area unit rows.
+
+First load area labels as we use these to select the right data rows.
+
+```{r loadAreaLabels}
+areasDT <- data.table::fread(sParams$areasTable2013)
+
+auListDT <- areasDT[, .(nMBs = .N), keyby = .(AU2013_code, AU2013_label, REGC2013_label)]
+auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier matching
+setkey(auListDT, AU2013_code)
+```
+
+
+```{r loadAU2013}
+# fuelSource ----
+fuelf <- paste0(sParams$censusPath, "raw/areaUnits/fuelSource/TABLECODE8100_Data_47b7b3fc-0e40-431f-b313-141de4fb0013.csv")
+outputMessage(paste0("Loading: ", fuelf))
+fuelDT <- data.table::fread(fuelf)
+outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(fuelDT))))
+fuelDT <- fuelDT[, AU2013_code := AREA]
+setkey(fuelDT, AU2013_code)
+fuelDT <- fuelDT[auListDT]
+message("N unique area units (fuel data): ", uniqueN(fuelDT$AU2013_code))
+
+# create categories
+# "value.heatSourceWood", "value.heatSourceElectricity", "value.heatSourceGas", "value.heatSourceCoal", "value.heatSourceOther"
+
+fuelDT <- fuelDT[, censusConstraint := "heatSourceOther"] # complex one - note this contains 'None' as well
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings, fuel type used to heat dwelling",
+                 censusConstraint := "fuel_totalHouseholds"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings stated",
+                 censusConstraint := "fuel_totalStatedHouseholds"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Wood",
+                 censusConstraint := "heatSourceWood"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "Solar" | # <- what is this?
+                   `Fuel type used to heat dwelling` == "Electricity",
+                 censusConstraint := "heatSourceElectricity"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling`  %like% "gas",
+                 censusConstraint := "heatSourceGas"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Coal",
+                 censusConstraint := "heatSourceCoal"]
+table(fuelDT$`Fuel type used to heat dwelling`, fuelDT$censusConstraint)
+
+# convert to wide
+fuelDT <- fuelDT[, count := Value]
+
+fuel2013WDT <- reshape(fuelDT[YEAR == 2013,
+                              .(AU2013_code,AU2013_label,censusConstraint,count)], 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+
+setnames(fuel2013WDT, c("count.fuel_totalHouseholds", "count.fuel_totalStatedHouseholds",
+                        "count.heatSourceElectricity", "count.heatSourceGas", 
+                        "count.heatSourceWood", "count.heatSourceCoal", "count.heatSourceOther"),
+         c("fuel_totalHouseholds", "fuel_totalStatedHouseholds",
+                        "heatSourceElectricity", "heatSourceGas", 
+                        "heatSourceWood", "heatSourceCoal", "heatSourceOther"))
+
+# nKids ----
+kidsf <- paste0(sParams$censusPath, "raw/areaUnits/nKids/TABLECODE8141_Data_e6f03066-7bbf-4ba0-94b0-1821d5a4665a.csv")
+outputMessage(paste0("Loading: ", kidsf))
+kidsDT <- data.table::fread(kidsf)
+outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(kidsDT))))
+kidsDT <- kidsDT[, AU2013_code := AREA]
+setkey(kidsDT, AU2013_code)
+kidsDT <- kidsDT[auListDT]
+
+message("N unique area units (kids data): ", uniqueN(kidsDT$AU2013_code))
+
+# > create categories
+# "nKids_0", "nKids_1m"
+kidsDT <- kidsDT[, censusConstraint := "nKids_1m"] # we selected rows with dependent children in the census extractor
+kidsDT <- kidsDT[`Family type by child dependency status` == "Total families",
+                 censusConstraint := "nkids_totalFamilies"]
+
+table(kidsDT$`Family type by child dependency status`, kidsDT$censusConstraint)
+
+# > convert to wide
+kidsDT <- kidsDT[, count := Value]
+
+kids2013WDT <- reshape(kidsDT[YEAR == 2013,
+                              .(AU2013_code,AU2013_label,censusConstraint,count)], 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+# > calculate n households with 0 kids
+# do this again later when we have the number of _households_ to use as the base
+kids2013WDT <- kids2013WDT[, nKids_0_families := count.nkids_totalFamilies - count.nKids_1m]
+setnames(kids2013WDT, c("count.nkids_totalFamilies", "count.nKids_1m"), 
+         c("nkids_totalFamilies", "nKids_1m"))
+
+
+# nPeople ----
+npeoplef <- paste0(sParams$censusPath, "raw/areaUnits/nPeople/TABLECODE8169_Data_bfad6f1a-c9af-4adb-a141-e13a83e175d0.csv")
+outputMessage(paste0("Loading: ", npeoplef))
+npeopleDT <- data.table::fread(npeoplef)
+outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(npeopleDT))))
+setkey(npeopleDT, Area) # forgot to get code attached to this one
+npeopleDT <- npeopleDT[, AU2013_label := Area]
+setkey(auListDT,AU2013_label) # set to label instead of code - should still work
+npeopleDT <- npeopleDT[auListDT]
+message("N unique area units (people data): ", uniqueN(npeopleDT$Area))
+
+# > create categories ----
+# "value.nPeople_1", "value.nPeople_2", "value.nPeople_3", "value.nPeople_4m",
+npeopleDT <- npeopleDT[, censusConstraint := "nPeople_4m"] # default (most complex to code)
+npeopleDT <- npeopleDT[`Number of usual residents in household` == "Total households",
+                       censusConstraint := "npeople_totalHouseholds"] 
+npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "One",
+                       censusConstraint := "nPeople_1"] 
+npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Two",
+                       censusConstraint := "nPeople_2"] 
+npeopleDT <- npeopleDT[`Number of usual residents in household`  %like% "Three",
+                       censusConstraint := "nPeople_3"] 
+table(npeopleDT$`Number of usual residents in household`, npeopleDT$censusConstraint)
+
+# convert to wide
+npeopleDT <- npeopleDT[, count := Value]
+
+npeople2013WDT <- reshape(npeopleDT[Year == 2013, # helpful consistency of var names across NZ stats tables
+                              .(AU2013_code,AU2013_label,censusConstraint,count)], 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+
+setnames(npeople2013WDT, c("count.nPeople_1", "count.nPeople_2", "count.nPeople_3", "count.nPeople_4m", "count.npeople_totalHouseholds"), 
+         c("nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m", "npeople_totalHouseholds"))
+
+# nRooms ----
+nroomsf <- paste0(sParams$censusPath, "raw/areaUnits/nRooms/TABLECODE8098_Data_62c5ce5c-23cf-44a2-b25e-b287fe9645e7.csv")
+outputMessage(paste0("Loading: ", nroomsf))
+nroomsDT <- data.table::fread(nroomsf)
+outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(nroomsDT))))
+nroomsDT <- nroomsDT[, AU2013_code := AREA]
+setkey(nroomsDT, AU2013_code)
+setkey(auListDT,AU2013_code) # set to back to code
+nroomsDT <- nroomsDT[auListDT]
+message("N unique area units (rooms data): ", uniqueN(nroomsDT$AU2013_code))
+
+# > create categories ----
+# "value.nRooms1_4", "value.nRooms5_6", "value.nRooms7m",
+nroomsDT <- nroomsDT[`Number of rooms` != "Not elsewhere included", censusConstraint := "nRooms7m"] # default (most complex to code)
+nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings stated",
+                       censusConstraint := "nrooms_statedtotalHouseholds"] 
+nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings, number of rooms",
+                       censusConstraint := "nrooms_totalHouseholds"] 
+nroomsDT <- nroomsDT[`Number of rooms` %like% "One" |
+                       `Number of rooms` %like% "Two" |
+                       `Number of rooms` %like% "Three" |
+                       `Number of rooms` %like% "Four",
+                       censusConstraint := "nRooms1_4"] 
+nroomsDT <- nroomsDT[`Number of rooms` %like% "Five" | 
+                       `Number of rooms` %like% "Six" ,
+                       censusConstraint := "nRooms5_6"] 
+table(nroomsDT$`Number of rooms`, nroomsDT$censusConstraint)
+
+# convert to wide
+nroomsDT <- nroomsDT[, count := Value]
+
+nrooms2013WDT <- reshape(nroomsDT[Year == 2013, # helpful consistency of var names across NZ stats tables
+                              .(AU2013_code,AU2013_label,censusConstraint,count)], 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+
+setnames(nrooms2013WDT, c("count.nRooms1_4", "count.nRooms5_6", "count.nRooms7m",
+                          "count.nrooms_totalHouseholds", "count.nrooms_statedtotalHouseholds"), 
+         c("nRooms1_4", "nRooms5_6", "nRooms7m",
+           "nrooms_totalHouseholds", "nrooms_statedtotalHouseholds"))
+
+# combine them ----
+setkey(auListDT, AU2013_code, AU2013_label)
+setkey(fuel2013WDT, AU2013_code, AU2013_label)
+setkey(kids2013WDT, AU2013_code, AU2013_label)
+setkey(npeople2013WDT, AU2013_code, AU2013_label)
+setkey(nrooms2013WDT, AU2013_code, AU2013_label)
+
+au2013DT <- fuel2013WDT[auListDT]
+au2013DT <- kids2013WDT[au2013DT]
+au2013DT <- npeople2013WDT[au2013DT]
+au2013DT <- nrooms2013WDT[au2013DT]
+
+# recalulate n households without children
+au2013DT <- au2013DT[, nKids_0 :=  nrooms_totalHouseholds- nKids_1m]
+
+```
+
+Figure \@ref(fig:checkTotals) shows the match between the total household/family counts derived from each NZ stats table. As we would expect there are very minor differences with the exception of the totals derived from the families table (presence of children).
+
+```{r checkTotals, fig.cap="Plots of household/family totals"}
+pairsDT <- au2013DT[, .(fuel_totalHouseholds, nrooms_statedtotalHouseholds, nrooms_totalHouseholds,
+                        npeople_totalHouseholds,fuel_totalStatedHouseholds, nkids_totalFamilies)]
+
+pairs(pairsDT)
+```
+
+We focus on households/families/dwellings not individuals as the spatial microsimulation will operate at the household level.
+
+### GREENGrid Survey data
+
+This comprises the household attributes (survey) data:
+
+```{r loadGGHousehold}
+f <- paste0(sParams$ggPath, "/safe/survey/ggHouseholdAttributesSafe_2019-04-09.csv") # latest - includes heat source & 
+outputMessage(paste0("Loading: ", f))
+ggHhDT <- data.table::as.data.table(readr::read_csv(f))
+outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(ggHhDT))))
+outputMessage(paste0("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHhDT$linkID))))
+
+```
+
+
+We need household & Census files with just the variables (constraints) we're going to use. This will take a bit of creative re-coding.
+
+#### Families: Presence of children
+
+Survey: 
+
+ * nChildren0_12: 0,1,2+
+ * nTeenagers13_18: 0,1,2+
+
+Census:
+
+ * 0,1+ (simplest)
+ 
+```{r presenceChildren}
+# survey: number of kids: categorised as: 1,2,3,4+ ----
+ggHhDT <- ggHhDT[, nKids :=  nChildren0_12 + nTeenagers13_18]
+table(ggHhDT$nKids, useNA = "always")
+message("There are NA - we will need to remove them later")
+ggHhDT <- ggHhDT[, nKids_0 := ifelse(nKids == 0, 1, 0)]
+ggHhDT <- ggHhDT[, nKids_1m := ifelse(nKids > 0 , 1, 0)]
+
+message("Household data: n kids")
+table(ggHhDT$nKids_0)
+table(ggHhDT$nKids_1m)
+
+# create category
+ggHhDT <- ggHhDT[, presenceKids := ifelse(nKids == 0, "None", "1+")]
+
+table(ggHhDT$nKids, ggHhDT$presenceKids, useNA = "always")
+
+```
+
+
+
+#### Households: Number of people
+
+```{r nPeople}
+# survey: number of people: categorised as: 1,2,3,4+ ----
+ggHhDT <- ggHhDT[, nPeople := nAdults + nChildren0_12 + nTeenagers13_18]
+table(ggHhDT$nPeople, useNA = "always")
+message("There are NA - we will need to remove them later")
+ggHhDT <- ggHhDT[, nPeople_1 := ifelse(nPeople == 1, 1, 0)]
+ggHhDT <- ggHhDT[, nPeople_2 := ifelse(nPeople == 2, 1, 0)]
+ggHhDT <- ggHhDT[, nPeople_3 := ifelse(nPeople == 3, 1, 0)]
+ggHhDT <- ggHhDT[, nPeople_4m := ifelse(nPeople > 3, 1, 0)]
+
+message("Household data: n people")
+table(ggHhDT$nPeople_1)
+table(ggHhDT$nPeople_2)
+table(ggHhDT$nPeople_3)
+table(ggHhDT$nPeople_4m)
+
+# create category
+ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_1 == 1, "1", NA)]
+ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_2 == 1, "2", nPeopleCat)]
+ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_3 == 1, "3", nPeopleCat)]
+ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_4m == 1, "4+", nPeopleCat)]
+
+table(ggHhDT$nPeopleCat, ggHhDT$nPeople, useNA = "always")
+
+```
+
+#### Dwellings: Number of rooms
+
+We have to do some imputation here
+
+```{r nRooms}
+# survey: n bedrooms via Q10 ----
+ggHhDT <- ggHhDT[, nRooms := `Q10#1_1_1_TEXT` + # bedrooms
+                 `Q10#1_2_1_TEXT` + # living rooms
+                 `Q10#1_3_1_TEXT` + # dining rooms-kitchens
+                 `Q10#1_4_1_TEXT` + # kitchens
+                 `Q10#1_5_1_TEXT`  # studies
+                 # Q10#1_6_1_TEXT + # bathrooms - excluded from census http://archive.stats.govt.nz/Census/2013-census/info-about-2013-census-data/2013-census-definitions-forms/definitions/dwelling.aspx
+                 # Q10#1_7_1_TEXT + # toilets - excluded from census
+                 # Q10#1_8_1_TEXT + # laundries - excluded from census
+                 ]
+table(ggHhDT$nRooms, useNA = "always")
+message("We're quite short of responses to nRooms - we will try to impute them from the number of adults and children using a regression model...")
+message("Test the model")
+r <- lm(ggHhDT$nRooms ~ ggHhDT$nAdults + ggHhDT$nKids)
+
+summary(r)
+
+# > impute where missing ----
+message("that looks reasonable...impute where missing")
+ggHhDT <- ggHhDT[, nRoomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
+
+# check what we got
+ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRooms)) + 
+  geom_point() # original shape
+
+ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRoomsImputed)) + 
+  geom_point()  + # imputed shape
+  labs(caption = "Well, it is a linear model...")
+
+message("So we use the imputed values only where we had NA")
+ggHhDT <- ggHhDT[, nRoomsCorrected := ifelse(is.na(nRooms), 
+                                                round(nRoomsImputed),
+                                                nRooms
+)
+]
+
+message("How bad does that look?")
+ggHhDT <- ggHhDT[, roomsImputed := ifelse(is.na(nRooms), "Imputed",
+                                             "observed")]
+ggplot2::ggplot(ggHhDT, aes(x = nRoomsImputed, y = nRoomsCorrected, colour = roomsImputed)) + 
+  geom_jitter() # 
+
+message("Well, we can live with that for now...")
+
+#
+message("There are NA - we will need to remove them later")
+ggHhDT <- ggHhDT[, nRooms1_4 := ifelse(nRoomsCorrected < 5, 1, 0)]
+ggHhDT <- ggHhDT[, nRooms5_6 := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, 1, 0)]
+ggHhDT <- ggHhDT[, nRooms7m := ifelse(nRoomsCorrected > 6, 1, 0)]
+table(ggHhDT$nRooms1_4)
+table(ggHhDT$nRooms5_6)
+table(ggHhDT$nRooms7m)
+
+# create category
+ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected < 5, "1-4", NA)]
+ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, "5-6", nRoomsCat)]
+ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 6, "7+", nRoomsCat)]
+
+message("Check coding")
+table(ggHhDT$nRoomsCat, ggHhDT$nRoomsCorrected, useNA = "always")
+
+```
+
+#### Dwellings: Number of bedrooms
+
+Probably collinear with n people/n rooms. Also have to impute.
+
+```{r nBedrooms}
+
+# survey: Q10#1_1_1_TEXT ----
+
+ggHhDT <- ggHhDT[, nBedrooms := `Q10#1_1_1_TEXT`]
+table(ggHhDT$nBedrooms,  useNA = "always")
+# best to do 1-2, 3, 4m
+
+message("We're quite short of responses to nBedrooms - we will try to impute them from the number of people using a regression model...")
+message("Test the model")
+r <- lm(ggHhDT$nBedrooms ~ ggHhDT$nAdults + ggHhDT$nKids)
+
+summary(r)
+
+# > impute where missing ----
+message("that looks reasonable...impute where missing")
+ggHhDT <- ggHhDT[, nBedroomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
+
+# check what we got
+ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedrooms)) + 
+  geom_point() # original shape
+
+ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedroomsImputed)) + 
+  geom_point()  + # imputed shape
+  labs(caption = "Well, it is a linear model...")
+
+message("So we use the imputed values only where we had NA and we round them to whole numbers")
+ggHhDT <- ggHhDT[, nBedroomsCorrected := ifelse(is.na(nBedrooms), 
+                                                round(nBedroomsImputed),
+                                                nBedrooms
+)
+]
+
+message("How bad does that look?")
+ggHhDT <- ggHhDT[, bedroomsImputed := ifelse(is.na(nBedrooms), "Imputed",
+                                             "observed")]
+ggplot2::ggplot(ggHhDT, aes(x = nBedroomsImputed, y = nBedroomsCorrected, colour = bedroomsImputed)) + 
+  geom_jitter() # 
+
+message("Well, we can live with that for now...")
+
+ggHhDT <- ggHhDT[, nBedrooms_1_2 := ifelse(nBedroomsCorrected < 3, 1, 0)]
+ggHhDT <- ggHhDT[, nBedrooms_3 := ifelse(nBedroomsCorrected == 3, 1, 0)]
+ggHhDT <- ggHhDT[, nBedrooms_4m := ifelse(nBedroomsCorrected > 3, 1, 0)]
+message("Household data: n bedrooms")
+table(ggHhDT$nBedrooms_1_2)
+table(ggHhDT$nBedrooms_3)
+table(ggHhDT$nBedrooms_4m)
+
+# create category
+ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_1_2 == 1, "1-4", NA)]
+ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_3 == 1, "5-6", nBedroomsCat)]
+ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_4m == 1, "7+", nBedroomsCat)]
+
+table(ggHhDT$nBedroomsCat, ggHhDT$nBedroomsCorrected, useNA = "always")
+
+```
+
+Notice that the total counts for bedrooms are lower because they are counts of _dwellings_ instead of households.
+
+#### Dwellings: Main heat source for hot water
+
+Survey - known
+Census - unknown
+
+#### Dwellings: Main fuel used for heat
+
+```{r surveyHeatSource}
+
+# survey: Q20 ----
+
+table(ggHhDT$Q20_coded,  useNA = "always")
+
+ggHhDT <- ggHhDT[, heatFuel := dplyr::recode(Q20_coded,
+                                             "Enclosed wood burner" = "Wood",
+                                             "Open fire" = "Wood", # could be coal
+                                             "Heat pump"= "Electricity",
+                                             "HRV or other ventilation system" = "Electricity",
+                                             "Portable electric heaters" = "Electricity",
+                                             "Portable gas heater" = "Gas", 
+                                             "Underfloor gas heating" = "Gas",
+                                             "Other" = "Other")]
+
+table(ggHhDT$Q20_coded,  ggHhDT$heatFuel, useNA = "always")
+ggHhDT <- ggHhDT[, heatSourceWood := ifelse(heatFuel == "Wood", 1,0)]
+ggHhDT <- ggHhDT[, heatSourceElectricity := ifelse(heatFuel == "Electricity", 1,0)]
+ggHhDT <- ggHhDT[, heatSourceGas := ifelse(heatFuel == "Gas", 1,0)]
+ggHhDT <- ggHhDT[, heatSourceCoal := ifelse(heatFuel == "Coal", 1,0)]
+ggHhDT <- ggHhDT[, heatSourceOther := ifelse(heatFuel == "Other", 1,0)]
+
+table(ggHhDT$heatFuel,  ggHhDT$Location, useNA = "always")
+```
+
+
+#### Dwelling: Dwelling type
+
+Survey - not known
+Census - known
+
+#### Fix final survey data
+
+Filter survey data to include just the variables we want and fix any NAs.
+
+```{r filterSurveyData}
+surveyDT <- ggHhDT[, .(linkID, 
+                       nKids_0, nKids_1m,
+                       nPeople_1, nPeople_2, nPeople_3, nPeople_4m,
+                       nRooms1_4, nRooms5_6, nRooms7m,
+                       heatSourceWood, heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceOther,
+                       nBedrooms_1_2, nBedrooms_3, nBedrooms_4m)]
+
+skimr::skim(surveyDT)
+
+message("There are still a few NAs so we need to remove these households")
+surveyDT <- na.omit(surveyDT)
+
+skimr::skim(surveyDT)
+message("Fixed it?")
+```
+
+
+### Select areas of interest
+
+Now select just the census data for:
+
+* Hawke's Bay Region
+* Taranaki Region
+
+```{r filterCensusRegion}
+censusAuWideDT <- au2013DT[REGC2013_label == "Hawke's Bay Region" |
+                         REGC2013_label == "Taranaki Region"] 
+
+message("Total households (rooms table) = ", sum(censusAuWideDT$nrooms_totalHouseholds))
+
+message("Total households (fuel table) = ", sum(censusAuWideDT$fuel_totalHouseholds))
+
+message("Total households (npeople table) = ", sum(censusAuWideDT$npeople_totalHouseholds))
+
+message("Total families (kids table) = ", sum(censusAuWideDT$nkids_totalFamilies))
+
+t <- censusAuWideDT[, .(nHouseholdsRooms = sum(nrooms_totalHouseholds),
+                        nHouseholdsFuel = sum(fuel_totalHouseholds),
+                        nHouseholdsPeople = sum(npeople_totalHouseholds)), keyby = REGC2013_label]
+
+kableExtra::kable(t, caption = "Household counts by region (by table source)")
+```
+
+```{r run to here}
+```
diff --git a/analysis/GREENGridModel/old/_loadPowerData.Rmd b/analysis/GREENGridModel/old/_loadPowerData.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..fd5e857d0aec4c346088333fa5d8c6f4cf477c6a
--- /dev/null
+++ b/analysis/GREENGridModel/old/_loadPowerData.Rmd
@@ -0,0 +1,658 @@
+
+Do not do this earlier as it takes a lot of memory. For now we're going to use:
+
+* Heat Pump demand
+* Hot Water demand
+* Lighting demand
+
+These have been extracted from the GREENGrid power demand data [@anderson_new_2018].
+
+## Hot Water
+
+```{r hw.load, eval=TRUE}
+
+# f <- paste0(p_Params$ggPath, "/safe/gridSpy/1min/dataExtracts/Hot Water_2015-04-01_2016-03-31_observations.csv.gz")
+# message("Loading: ", f)
+# ggHwOrigDT <- data.table::as.data.table(readr::read_csv(f))
+ggHwOrigDT <- drake::readd(hhHWLoad)
+message("N rows loaded: ", dkUtils::tidyNum(nrow(ggHwOrigDT)))
+message("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHwOrigDT$linkID)))
+
+#ggHwOrigDT$hhID <- NULL # not needed
+
+h <- head(ggHwOrigDT)
+kableExtra::kable(h, caption = "GREENGrid Household hot water demand data: head")%>%
+  kable_styling()
+
+message("Summary of variables:")
+skimr::skim(ggHwOrigDT)
+```
+
+Note that there are -ve power values in the above summary tables. According to [previous work](https://cfsotago.github.io/GREENGridData/gridSpy1mOutliersReport_v1.0.html) these are predominantly found in one dwelling (rf_46) due to possible incorrect instrument installation and very briefly in others for the same reason (but quickly corrected). We therefore fix the rf_46 data (see https://cfsotago.github.io/GREENGridData/gridSpy1mOutliersReport_v1.0.html#rf46) and any -ve readings before proceeding...
+
+You will notice this also reduces the number of circuits so that they match the number of households. We also:
+
+ * convert the UTC date to NZ time
+ * set some useful time/date variables
+
+```{r hw.clean}
+# remove negative ----
+ggHwDT <- ggHwOrigDT[circuit != "Hot Water - Controlled$4231" &  meanPowerW >= 0]
+
+
+```
+
+```{r hw.summary.setup}
+myCaption <- paste0("GREENGrid household power demand data from ",
+                    min(lubridate::date(ggHwDT$r_dateTimeNZ)), " to ", 
+                    max(lubridate::date(ggHwDT$r_dateTimeNZ)),
+                    "\n Aggregated to: 15 minute periods",
+                    "\n Hot water: n households = ", uniqueN(ggHwDT$linkID)
+                    )
+
+
+# set keys for linkage ----
+setkey(ggHhDT, linkID)
+setkey(ggHwDT, linkID)
+```
+
+```{r hw.summary.overall}
+
+rectAlpha <- 0.1
+vLineAlpha <- 0.4
+vLineCol <- "#0072B2" # http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/#a-colorblind-friendly-palette
+myTextSize <- 3
+
+addPeaks <- function(p){
+  # takes a plot, assumes hms is hms, adds peak period annotations
+  # assumes you've set yMin & yMax already
+  # breaks with facet_grid, scales = "free" so you have to build seperate plots if you want to do that
+  # there is a complex solution (https://stackoverflow.com/questions/27898651/get-scales-range-of-facets-scales-free) but...
+  p <- p + annotate("rect", xmin = amPeakStart,
+             xmax = amPeakEnd, 
+             ymin = yMin, ymax = yMax, 
+             alpha = rectAlpha, fill = vLineCol)
+  p <- p + annotate("rect", xmin = pmPeakStart,
+             xmax = pmPeakEnd, 
+             ymin = yMin, ymax = yMax, 
+             alpha = rectAlpha, fill = vLineCol)
+  return(p)
+}
+
+plotDT <- ggHhDT[, .(linkID)][ggHwDT][, .(meanW = mean(meanPowerW), # merge on linkID here as we summarise (data.table trick)
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, season)]
+
+p <- ggplot2::ggplot(plotDT, aes(x = hms, y = meanW/1000, colour = season)) +
+  geom_step() +
+  scale_colour_discrete(name = "Season") +
+  #scale_colour_viridis(discrete = TRUE) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption, "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW, na.rm = TRUE)
+yMax <- max(plotDT$meankW, na.rm = TRUE)
+addPeaks(p)
+```
+
+```{r hw.summary.presenceKids}
+plotDT <- ggHhDT[, .(linkID, presenceKids)][ggHwDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, presenceKids, season)]
+
+p <- ggplot2::ggplot(plotDT[!is.na(presenceKids)], aes(x = hms, y = meanW/1000, colour = presenceKids)) +
+  geom_step() +
+  scale_colour_discrete(name = "Presence of children") +
+  #scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) + 
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption, "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+# need to count households per category for the actual data plotted
+ggHwIDsDT <- ggHwDT[, .(nObs = .N), keyby = linkID]
+
+# it will change for each type of circuit
+t <- ggHhDT[ggHwIDsDT][, .('N households' = uniqueN(linkID)),
+                       keyby = .('Presence of children' = presenceKids)]
+ 
+kableExtra::kable(t, caption = "N households (Presence of children)") %>%
+  kable_styling()
+```
+
+```{r hw.summary.nBedroomsCat}
+# aggregate plot over all households: nBedroomsCat ----
+plotDT <- ggHhDT[, .(linkID, nBedroomsCat)][ggHwDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, nBedroomsCat, season)]
+
+p <- ggplot2::ggplot(plotDT[!is.na(nBedroomsCat)], aes(x = hms, y = meanW/1000, colour = nBedroomsCat)) +
+  geom_step() +
+  facet_grid(season ~ .) +
+  scale_colour_discrete(name = "N bedrooms") +
+  #scale_colour_viridis(discrete = TRUE) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption, "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+
+t <- ggHhDT[ggHwIDsDT][, .('N households' = uniqueN(linkID)),
+                       keyby = .('N bedrooms' = nBedroomsCat)]
+
+kableExtra::kable(t, caption = "N households (N bedrooms)") %>%
+  kable_styling()
+```
+
+```{r hw.summary.nPeopleCat}
+# aggregate plot over all households: nPeopleCat ----
+plotDT <- ggHhDT[, .(linkID, nPeopleCat)][ggHwDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, nPeopleCat, season)]
+
+p <- ggplot2::ggplot(plotDT[!is.na(nPeopleCat)], aes(x = hms, y = meanW/1000, colour = nPeopleCat)) +
+  geom_step() +
+  scale_colour_discrete(name = "N people") +
+  #scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",caption = paste0(myCaption, "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+
+t <- ggHhDT[ggHwIDsDT][, .('N households' = uniqueN(linkID)),
+                       keyby = .('N people' = nPeopleCat)]
+
+kableExtra::kable(t, caption = "N households (N people)") %>%
+  kable_styling()
+```
+
+```{r hw.summary.season.byID}
+# aggregate to half-hour per hh per season
+plotDT <- ggHwDT[, .(meanW = mean(meanPowerW),
+                         sdW = sd(meanPowerW),
+                         nObs = .N), keyby = .(linkID, hms, season)]
+
+# aggregate plot per hh
+p <- ggplot2::ggplot(plotDT, aes(x = hms, y = meanW/1000, colour = linkID)) +
+  geom_step() +
+  scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (within household)",
+       x = "Time of day",
+       caption = paste0(myCaption, 
+                        "\n Hot Water circuits, n = ", 
+                        uniqueN(ggHwDT$circuit),
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+```
+
+```{r hw.season.byLocation}
+
+# aggregate to half-hour for overall summary plot
+setkey(ggHwDT, linkID)
+plotDT <- ggHhDT[, .(linkID, Location)][ggHwDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, Location)]
+
+# aggregate plot over all households
+p <- ggplot2::ggplot(plotDT[!is.na(Location)], aes(x = hms, y = meanW/1000, colour = Location)) +
+  geom_step() +
+  #scale_colour_viridis(discrete = TRUE) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption, 
+                        "\n Hot Water circuits: n = ", 
+                        uniqueN(ggHwDT$linkID),
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+```
+
+## Heat Pump
+
+```{r hp.load}
+
+file.info(hhHPLoadF)
+
+ggHpOrigDT <- drake::readd(hhHPLoad)
+
+message(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(ggHpOrigDT))))
+message(paste0("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHpOrigDT$linkID))))
+
+h <- head(ggHpOrigDT)
+kableExtra::kable(h, caption = "GREENGrid Household heat pump demand data: head")%>%
+  kable_styling()
+
+message("Summary of variables:")
+skimr::skim(ggHpOrigDT)
+
+
+```
+
+Note that there are -ve power values in the above summary tables. According to [previous work](https://cfsotago.github.io/GREENGridData/gridSpy1mOutliersReport_v1.0.html) these are predominantly found in one dwelling (rf_46) due to possible incorrect instrument installation and very briefly in others for the same reason (but quickly corrected). We therefore fix the rf_46 data (see https://cfsotago.github.io/GREENGridData/gridSpy1mOutliersReport_v1.0.html#rf46) and any -ve readings before proceeding...
+
+You will notice this also reduces the number of circuits so that they match the number of households. We also:
+
+ * convert the UTC date to NZ time
+ * set some useful time/date variables
+
+```{r hp.clean}
+# remove negative ----
+# rf_27    2015-08-22 10:33:00     Heat Pump$2826  27759
+ggHpDT <- ggHpOrigDT[circuit != "Heat Pumps (2x) & Power$4399" & meanPowerW >= 0 & meanPowerW != 27759]
+
+h <- head(ggHpOrigDT)
+kableExtra::kable(h, caption = "GREENGrid Household heat pump demand data: head")%>%
+  kable_styling()
+
+message("Summary of variables:")
+skimr::skim(ggHpOrigDT)
+```
+
+```{r hp.summary.setup}
+myCaption <- paste0("GREENGrid household power demand data from ",
+                    min(lubridate::date(ggHpDT$r_dateTimeNZ)), " to ", 
+                    max(lubridate::date(ggHpDT$r_dateTimeNZ)),
+                    "\n Aggregated to: 15 minute periods",
+                    "\n Heat pumps: n households = ", uniqueN(ggHpDT$linkID)
+                    )
+
+
+# set keys for linkage ----
+setkey(ggHpDT, linkID)
+```
+
+```{r hp.summary.charts.overall}
+# aggregate plot over all households ----
+plotDT <- ggHhDT[, .(linkID)][ggHpDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, season)]
+
+p <- ggplot2::ggplot(plotDT, aes(x = hms, y = meanW/1000, colour = season)) +
+  geom_step() +
+  scale_colour_discrete(name = "Season") +
+  #scale_colour_viridis(discrete = TRUE) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption,
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+```
+
+```{r hp.summary.charts.presenceKids}
+# aggregate plot over all households: nKids ----
+plotDT <- ggHhDT[, .(linkID, presenceKids)][ggHpDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, presenceKids, season)]
+
+p <- ggplot2::ggplot(plotDT[!is.na(presenceKids)], aes(x = hms, y = meanW/1000, colour = presenceKids)) +
+  geom_step() +
+  scale_colour_discrete(name = "Presence of children") +
+  #scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption, 
+                        "\n Heat pumps, n = ", 
+                        uniqueN(ggHpDT$linkID),
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+  
+# need to count households per category for the actual data plotted
+ggHpIDsDT <- ggHpDT[, .(nObs = .N), keyby = linkID]
+
+# it will change for each type of circuit
+t <- ggHhDT[ggHpIDsDT][, .('N households' = uniqueN(linkID)),
+                       keyby = .('Presence of children' = presenceKids)]
+
+kableExtra::kable(t, caption = "N households (Presence of children)") %>%
+  kable_styling()
+```
+
+```{r hp.summary.charts.nBedroomsCat}
+# aggregate plot over all households: nBedroomsCat ----
+plotDT <- ggHhDT[, .(linkID, nBedroomsCat)][ggHpDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, nBedroomsCat, season)]
+
+p <- ggplot2::ggplot(plotDT[!is.na(nBedroomsCat)], aes(x = hms, y = meanW/1000, colour = nBedroomsCat)) +
+  geom_step() +
+  scale_colour_discrete(name = "N bedrooms") +
+  #scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption,
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+
+t <- ggHhDT[ggHpIDsDT][, .('N households' = uniqueN(linkID)),
+                       keyby = .('N bedrooms' = nBedroomsCat)]
+
+kableExtra::kable(t, caption = "N households (N bedrooms)") %>%
+  kable_styling()
+```
+
+```{r hw.summary.charts.nPeopleCat}
+# aggregate plot over all households: nPeopleCat ----
+plotDT <- ggHhDT[, .(linkID, nPeopleCat)][ggHpDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, nPeopleCat, season)]
+
+p <- ggplot2::ggplot(plotDT[!is.na(nPeopleCat)], aes(x = hms, y = meanW/1000, colour = nPeopleCat)) +
+  geom_step() +
+  scale_colour_discrete(name = "N people") +
+  #scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption,
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+
+t <- ggHhDT[ggHpIDsDT][, .('N households' = uniqueN(linkID)),
+                       keyby = .('N people' = nPeopleCat)]
+
+kableExtra::kable(t, caption = "N households (N people)") %>%
+  kable_styling()
+```
+
+```{r hp.summary.season.byID}
+# aggregate to half-hour per hh per season
+plotDT <- ggHpDT[, .(meanW = mean(meanPowerW),
+                         sdW = sd(meanPowerW),
+                         nObs = .N), keyby = .(linkID, hms, season)]
+
+# aggregate plot per hh
+p <- ggplot2::ggplot(plotDT, aes(x = hms, y = meanW/1000, colour = linkID)) +
+  geom_step() +
+  scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (within household)",
+       x = "Time of day",
+       caption = paste0(myCaption, 
+                        "\n Heat Pump circuits, n = ", 
+                        uniqueN(ggHpDT$circuit),
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+```
+
+```{r hp.season.byLocation}
+
+# aggregate to half-hour for overall summary plot
+setkey(ggHpDT, linkID)
+plotDT <- ggHhDT[, .(linkID, Location)][ggHpDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, Location)]
+
+# aggregate plot over all households
+p <- ggplot2::ggplot(plotDT[!is.na(Location)], aes(x = hms, y = meanW/1000, colour = Location)) +
+  geom_step() +
+  #scale_colour_viridis(discrete = TRUE) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption, 
+                        "\n Heat Pump circuits: n = ", 
+                        uniqueN(ggHpDT$linkID),
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+```
+
+## Lighting
+
+```{r l.load, eval=TRUE}
+
+file.info(hhLightingLoadF)
+
+ggLightingOrigDT <- drake::readd(hhLightingLoad)
+message(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(ggLightingOrigDT))))
+message(paste0("N households loaded: ", dkUtils::tidyNum(uniqueN(ggLightingOrigDT$linkID))))
+
+h <- head(ggLightingOrigDT)
+kableExtra::kable(h, caption = "GREENGrid Household lighting demand data: head")%>%
+  kable_styling()
+
+message("Summary of variables:")
+skimr::skim(ggLightingOrigDT)
+
+```
+
+Note that there are -ve power values in the above summary tables. According to [previous work](https://cfsotago.github.io/GREENGridData/gridSpy1mOutliersReport_v1.0.html) these are predominantly found in one dwelling (rf_46) due to possible incorrect instrument installation and very briefly in others for the same reason (but quickly corrected). We therefore fix the rf_46 data (see https://cfsotago.github.io/GREENGridData/gridSpy1mOutliersReport_v1.0.html#rf46) and any -ve readings before proceeding.
+
+As above you will notice this also reduces the number of circuits so that they match the number of households. We also:
+
+ * convert the UTC date to NZ time
+ * set some useful time/date variables
+
+```{r l.clean}
+# remove negative ----
+ggLightingDT <- ggLightingOrigDT[circuit != "Lighting$4404" & meanPowerW >= 0]
+
+# set seasons
+ggLightingDT <- GREENGridData::addNZSeason(ggLightingDT)
+
+# set useful dates and times
+# when loaded the r_dateTime variable is UTC, we need to change this to NZT
+ggLightingDT <- ggLightingDT[, r_dateTimeNZ := lubridate::with_tz(r_dateTime, tzone = "Pacific/Auckland")]
+ggLightingDT <- ggLightingDT[, time := hms::as.hms(r_dateTimeNZ)]
+ggLightingDT <- ggLightingDT[, halfHour := hms::trunc_hms(time, 30 * 60)] # trunc takes us back to the start of the period
+ggLightingDT <- ggLightingDT[, hms := hms::trunc_hms(time, 15 * 60)] # trunc takes us back to the start of the period
+
+
+```
+
+```{r l.summary.charts.setup}
+myCaption <- paste0("GREENGrid household power demand data from ",
+                    min(lubridate::date(ggLightingDT$r_dateTimeNZ)), " to ", 
+                    max(lubridate::date(ggLightingDT$r_dateTimeNZ)),
+                    "\n Aggregated to: 15 minute periods",
+                    "\n Lighting: n households = ", uniqueN(ggLightingDT$linkID)
+                    )
+
+# set keys for linkage ----
+setkey(ggHhDT, linkID)
+setkey(ggLightingDT, linkID)
+```
+
+```{r l.summary.charts.overall}
+# aggregate plot over all households ----
+plotDT <- ggHhDT[, .(linkID)][ggLightingDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, season)]
+
+p <- ggplot2::ggplot(plotDT, aes(x = hms, y = meanW/1000, colour = season)) +
+  geom_step() +
+  scale_colour_discrete(name = "Season") +
+  #scale_colour_viridis(discrete = TRUE) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption,
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+```
+
+```{r l.summary.charts.presenceKids}
+# aggregate plot over all households: nKids ----
+plotDT <- ggHhDT[, .(linkID, presenceKids)][ggLightingDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, presenceKids, season)]
+
+p <- ggplot2::ggplot(plotDT[!is.na(presenceKids)], aes(x = hms, y = meanW/1000, colour = presenceKids)) +
+  geom_step() +
+  scale_colour_discrete(name = "Presence of children") +
+  #scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption,
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+  
+# need to count households per category for the actual data plotted
+ggLightingIDsDT <- ggLightingDT[, .(nObs = .N), keyby = linkID]
+
+# it will change for each type of circuit
+t <- ggHhDT[ggLightingIDsDT][, .('N households' = uniqueN(linkID)),
+                       keyby = .('Presence of children' = presenceKids)]
+
+kableExtra::kable(t, caption = "N households (Presence of children)") %>%
+  kable_styling()
+```
+
+```{r l.summary.charts.nBedroomsCat}
+# aggregate plot over all households: nBedroomsCat ----
+plotDT <- ggHhDT[, .(linkID, nBedroomsCat)][ggLightingDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, nBedroomsCat, season)]
+
+p <- ggplot2::ggplot(plotDT[!is.na(nBedroomsCat)], aes(x = hms, y = meanW/1000, colour = nBedroomsCat)) +
+  geom_step() +
+  scale_colour_discrete(name = "N bedrooms") +
+  #scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption,
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+
+t <- ggHhDT[ggLightingIDsDT][, .('N households' = uniqueN(linkID)),
+                       keyby = .('N bedrooms' = nBedroomsCat)]
+
+kableExtra::kable(t, caption = "N households (N bedrooms)") %>%
+  kable_styling()
+```
+
+```{r l.summary.charts.nPeopleCat}
+# aggregate plot over all households: nPeopleCat ----
+plotDT <- ggHhDT[, .(linkID, nPeopleCat)][ggLightingDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, nPeopleCat, season)]
+
+p <- ggplot2::ggplot(plotDT[!is.na(nPeopleCat)], aes(x = hms, y = meanW/1000, colour = nPeopleCat)) +
+  geom_step() +
+  scale_colour_discrete(name = "N people") +
+  #scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption,
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+
+t <- ggHhDT[ggLightingIDsDT][, .('N households' = uniqueN(linkID)),
+                       keyby = .('N people' = nPeopleCat)]
+
+kableExtra::kable(t, caption = "N households (N people)") %>%
+  kable_styling()
+```
+
+
+```{r l.summary.season.byID}
+# aggregate to half-hour per hh per season
+plotDT <- ggLightingDT[, .(meanW = mean(meanPowerW),
+                         sdW = sd(meanPowerW),
+                         nObs = .N), keyby = .(linkID, hms, season)]
+
+# aggregate plot per hh
+p <- ggplot2::ggplot(plotDT, aes(x = hms, y = meanW/1000, colour = linkID)) +
+  geom_step() +
+  scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (within household)",
+       x = "Time of day",
+       caption = paste0(myCaption, 
+                        "\n Lighting circuits, n = ", 
+                        uniqueN(ggLightingDT$circuit),
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+```
+
+```{r l.season.byLocation}
+
+# aggregate to half-hour for overall summary plot
+setkey(ggHpDT, linkID)
+plotDT <- ggHhDT[, .(linkID, Location)][ggLightingDT][, .(meanW = mean(meanPowerW),
+                      sdW = sd(meanPowerW),
+                      sumW = sum(meanPowerW),
+                      nObs = .N), keyby = .(hms, Location)]
+
+# aggregate plot over all households
+p <- ggplot2::ggplot(plotDT[!is.na(Location)], aes(x = hms, y = meanW/1000, colour = Location)) +
+  geom_step() +
+  #scale_colour_viridis(discrete = TRUE) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption, 
+                        "\n Lighting circuits: n = ", 
+                        uniqueN(ggLightingDT$linkID),
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+```
+
+```{r runToHere}
+```
\ No newline at end of file
diff --git a/analysis/GREENGridModel/old/_reportAreaModel.Rmd b/analysis/GREENGridModel/old/_reportAreaModel.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..98c19185addc65ebe1d399211b3c389b5fc8719f
--- /dev/null
+++ b/analysis/GREENGridModel/old/_reportAreaModel.Rmd
@@ -0,0 +1,755 @@
+
+We may have filtered the areas and regions we used as input to speed things up:
+
+```{r whichRegions}
+message("N synthetic households (NB - this is _not_ the actual number of households as it is unweighted):")
+ipfWeights2013DT <- drake::readd(ipfWeights2013)
+
+ipfWeights2013DT[, .(nHouseholds = .N,
+                     meanWeight = mean(ipfWeight)), keyby = .(REGC2013_label)]
+
+message("N synthetic households:")
+ipfWeights2013DT[, .(nHouseholds = .N ), keyby = .(REGC2013_label)]
+message("N AUs:")
+ipfWeights2013DT[, .(nAreas = uniqueN(AU2013_label),
+                     meanWeight = mean(ipfWeight)), by = .(REGC2013_label)]
+
+auLabelsDT <- ipfWeights2013DT[, .(nObs = .N), keyby = .(AU2013_label, REGC2013_label)] 
+
+t <- table(ipfWeights2013DT$AU2013_label,ipfWeights2013DT$REGC2013_label)
+
+kableExtra::kable(t, caption = "Areas used") %>%
+  kable_styling()
+```
+
+## Area Units - Hot water: Specific day
+
+```{r hw.date.select}
+
+message("Which day(s) have the most active household data?")
+
+daysActiveDT <- ggHwDT[, .(count = uniqueN(linkID)), 
+                       keyby = .(date = lubridate::date(r_dateTimeNZ), linkID)]
+
+daysDT <- daysActiveDT[, .(nHHs = sum(count)), keyby = .(date)]
+
+ggplot2::ggplot(daysDT, aes(x = date, y = nHHs)) + 
+  geom_line()
+
+setkey(daysDT, nHHs, date) # auto-sorts
+
+tail(daysDT)
+
+message("So could choose 25th May 2015 which was a Monday. But we should just ")
+message ("check this is when the most survey households were active since they were used for the ipf")
+
+setkey(daysActiveDT, linkID)
+
+ggHhDT <- ggHhDT[, hasSurvey := "Has survey"]
+setkey(ggHhDT, linkID)
+setkey(daysActiveDT, linkID)
+daysActiveDT <- ggHhDT[daysActiveDT]
+daysActiveDT <- daysActiveDT[is.na(hasSurvey), hasSurvey := "No survey"]
+
+t <- table(daysActiveDT$linkID, daysActiveDT$hasSurvey, useNA = "always")
+
+kableExtra::kable(t, 
+                  caption = "GREENGrid Household hot water data - missing surveys (data = number of days of power data)")%>%
+  kable_styling()
+
+message("Looks like we will only lose 2 - re-run the date tests with these two taken out")
+
+daysDT <- daysActiveDT[hasSurvey == "Has survey", .(nHHs = sum(count)), keyby = .(date)]
+
+ggplot2::ggplot(daysDT, aes(x = date, y = nHHs)) + 
+  geom_line()
+
+setkey(daysDT, nHHs, date) # auto-sorts
+
+tail(daysDT)
+
+message("We can still take Monday 25th May...")
+```
+
+Select Monday 25th May from the power demand data and aggregate to half-hourly mean demand for each household. Note that the original data has per minute observations so this will substantially suppress variation.
+
+```{r hw.date.byLocation}
+filter <- lubridate::date("2015-05-25")
+
+myCaption <- paste0("GREENGrid household power demand data, ", filter)
+
+dayHwDT <- ggHwDT[lubridate::date(r_dateTimeNZ) == filter]
+
+message("Testing the start & end times of the subset")
+min(dayHwDT$r_dateTimeNZ)
+max(dayHwDT$r_dateTimeNZ)
+
+# aggregate to half-hour for overall summary plot
+setkey(dayHwDT, linkID)
+plotDT <- dayHwDT[ggHhDT[, .(linkID, Location)]][, .(meankW = mean(meanPowerW/1000),
+                      sdW = sd(meanPowerW/1000),
+                      sumW = sum(meanPowerW/1000),
+                      nObs = .N), keyby = .(hms, Location)]
+
+# aggregate plot over all households
+p <- ggplot2::ggplot(plotDT, aes(x = hms, y = meankW, colour = Location)) +
+  geom_step() +
+  #scale_y_log10() +
+  #scale_colour_viridis(discrete = TRUE) +
+  labs(y = "Mean kW (across households)",
+       x = "Time of day",
+       caption = paste0(myCaption, 
+                        ": Hot Water circuits, n = ", 
+                        uniqueN(dayHwDT$linkID),
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(plotDT$meankW)
+yMax <- max(plotDT$meankW)
+addPeaks(p)
+```
+
+```{r hw.date.byID, fig.height=6}
+# aggregate to half-hour per hh (re-use later, gets messy)
+dayHwHhDT <- dayHwDT[, .(meanW = mean(meanPowerW),
+                         sdW = sd(meanPowerW),
+                         nObs = .N), keyby = .(linkID, hms)]
+
+# aggregate plot per hh
+p <- ggplot2::ggplot(dayHwHhDT, aes(x = hms, y = meanW/1000, colour = linkID)) +
+  geom_step() +
+  scale_colour_viridis(discrete = TRUE) +
+  labs(x = "Time of Day", y = "Mean kW (within household)",
+       caption = paste0(myCaption, 
+                        "\n Hot Water circuits, n = ", 
+                        uniqueN(dayHwDT$linkID),
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(dayHwHhDT$meanW)/1000
+yMax <- max(dayHwHhDT$meanW)/1000
+addPeaks(p)
+
+```
+
+So now we need to link the household level half-hourly aggregate data to the synthetic households.
+
+```{r hw.date.linkAggToSynth}
+setkey(dayHwHhDT, linkID)
+message("N households with demand data: ", uniqueN(dayHwHhDT$linkID))
+
+setkey(ipfWeights2013DT, linkID)
+message("N unique households in synthetic census: ", uniqueN(ipfWeights2013DT$linkID))
+
+dataDT <- dayHwHhDT[ipfWeights2013DT, allow.cartesian=TRUE]
+message("N unique households in merged power data & synthetic census: ", uniqueN(ipfWeights2013DT$linkID))
+
+message("We will have NA values for power etc for households who were in the survey but for whom we have no power data.")
+message("We will have 0 values for ipfWeight for for households who are not needed in a particular areaCode.")
+
+summary(dataDT)
+
+message("We can drop both of these to save memory.")
+
+dataDT <- dataDT[!is.na(meanW) & ipfWeight > 0]
+nrow(dataDT)
+
+table(dataDT$REGC2013_label)
+
+```
+
+Now we can construct synthetic Hot Water demand profiles for each area code. We do this by taking the half-hourly mean of the mean power across all synthetic households in each area code. Note that the 'observation' is the half-hourly household mean value - so variance has already been suppressed at the household level by taking the mean across the half hour.
+
+Figure \@ref(fig:hw.date.AreaProfiles.Unweighted) shows the unweighted profiles. They should be identical as they are based on the same households repeatedly allocated to each region and area unit.
+
+```{r hw.date.AreaProfiles.Unweighted,fig.height=6, fig.cap="Uweighted hot water profile"}
+# no weights
+meanProfilesUnwDT <- dataDT[, .(meankW = mean(meanW/1000)), keyby = .(hms, AU2013_label, REGC2013_label)]
+
+p <- ggplot2::ggplot(meanProfilesUnwDT, aes(x = hms, y = meankW, colour = AU2013_label)) +
+  geom_line() +
+  scale_colour_viridis(discrete = TRUE) +
+  theme(legend.position="none") + # hide for clarity
+  facet_grid(. ~ REGC2013_label) +
+  labs(x = "Time of Day", y = "Mean kW",
+       caption = paste0(myCaption, 
+                        "\n Hot Water circuits, n households (unweighted) = ", 
+                        uniqueN(dataDT$linkID),
+                        "\n Replicated over ",uniqueN(dataDT$AU2013_label) , " areas",
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(meanProfilesUnwDT$meankW)
+yMax <- max(meanProfilesUnwDT$meankW)
+addPeaks(p)
+
+```
+
+```{r hw.date.AreaProfiles.Weighted,fig.height=6}
+
+# with weights
+
+# retrieve drake object
+
+svytDT <- drake::readd(auHWLoad2015_05_25)
+
+svytDT[, AU2013_label := area]
+
+# add regional labels back
+setkey(svytDT, AU2013_label)
+setkey(auLabelsDT, AU2013_label)
+svytDT <- auLabelsDT[svytDT]
+
+svytDT <- svytDT[, meankW := meanW/1000] # do this here so addplot can use it
+
+totalIpfHHs <- sum(ipfWeights2013DT$ipfWeight) # sum of weights = n hhs
+
+p <- ggplot2::ggplot(svytDT, aes(x = hms, y = meankW, colour = area)) +
+  geom_step() +
+  scale_colour_viridis(discrete = TRUE) +
+  facet_grid(REGC2013_label ~ .) +
+  theme(legend.position="none") + # hide for clarity
+  labs(x = "Time of Day", y = "Mean kW",
+       caption = paste0(myCaption, 
+                        "\n Hot Water circuits, n households (base) = ", uniqueN(dataDT$linkID),
+                        "\n n households (reweighted) = ", dkUtils::tidyNum(totalIpfHHs),
+                        "\n Plotting ",uniqueN(svytDT$AU2013_label) , " areas",
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(svytDT$meankW)
+yMax <- max(svytDT$meankW)
+addPeaks(p)
+
+plotly::ggplotly(p)
+
+```
+
+## Area units - Hot water: Mean seasonal
+
+```{r hw.season.agg}
+seasonHwHhDT <- ggHwDT[, .(meanW = mean(meanPowerW)), 
+                       keyby = .(season, linkID, hms)]
+```
+
+```{r hw.season.linkAggToSynth}
+setkey(seasonHwHhDT, linkID)
+message("N households with demand data: ", uniqueN(seasonHwHhDT$linkID))
+
+setkey(ipfWeights2013DT, linkID)
+message("N unique households in synthetic census: ", uniqueN(ipfWeights2013DT$linkID))
+
+dataDT <- seasonHwHhDT[ipfWeights2013DT, allow.cartesian=TRUE]
+message("N unique households in merged power data & synthetic census: ", uniqueN(ipfWeights2013DT$linkID))
+
+message("We will have NA values for power etc for households who were in the survey but for whom we have no power data.")
+message("We will have 0 values for ipfWeight for for households who are not needed in a particular areaCode.")
+
+summary(dataDT)
+
+message("We can drop both of these to save memory and also drop some variables.")
+
+dataDT <- dataDT[!is.na(meanW) | ipfWeight > 0, 
+                 .(season, linkID, AU2013_label, REGC2013_label, meanW, hms, ipfWeight)]
+
+summary(dataDT)
+
+table(dataDT$REGC2013_label)
+
+```
+
+Unweighted area profiles - they will all line up unless a few households got dropped through 0 weights.
+
+```{r hw.season.AreaProfiles.Unweighted,fig.height=6}
+# no weights
+meanProfilesUnwDT <- dataDT[!is.na(meanW), .(meankW = mean(meanW/1000)), keyby = .(hms, AU2013_label, season, REGC2013_label)]
+
+p <- ggplot2::ggplot(meanProfilesUnwDT, aes(x = hms, y = meankW, colour = AU2013_label)) +
+  geom_line() +
+  scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ REGC2013_label) +
+  theme(legend.position="none") + # hide for clarity
+  labs(x = "Time of Day", y = "Mean kW",
+       caption = paste0(myCaption, 
+                        "\n Hot Water circuits", 
+                        "\n n households (unweighted) = ", uniqueN(meanProfilesUnwDT$linkID),
+                        "\n replicated over ",uniqueN(meanProfilesUnwDT$AU2013_label) , " areas",
+                        "\n Peak demand periods shaded")
+       )
+yMin <- min(meanProfilesUnwDT$meankW)
+yMax <- max(meanProfilesUnwDT$meankW)
+addPeaks(p)
+
+```
+
+Now look at the weighted area profiles.
+
+```{r hw.season.AreaProfiles.Weighted, fig.height=6}
+
+# with weights
+# set svydesign
+dataDT <- dataDT[, c_hms := as.character(hms)] # svy breaks on hms for some reason
+message("Setting survey design using weights")
+dataDTSvyDes <- survey::svydesign(ids = ~linkID, # use synthetic hh_id
+                                  weights = ~ipfWeight, 
+                                  data = dataDT[!is.na(ipfWeight)] # make sure no NA weights
+                                  )
+
+message("Calculating weighted power summaries - can take some time...")
+system.time(svyt <- svyby(~meanW, ~c_hms+season+AU2013_label, dataDTSvyDes, 
+                          svymean,
+                          keep.var = TRUE,
+                          exclude=NULL, 
+                          na.action=na.pass
+                          )
+            )
+
+svytDT <- as.data.table(svyt) # this is just so cool - autoconversion to data.table :-)
+# fix the time thing
+svytDT$hms <- hms::parse_hms(svytDT$c_hms)
+
+# add regional labels back
+setkey(svytDT, AU2013_label)
+setkey(auLabelsDT, AU2013_label)
+svytDT <- svytDT[auLabelsDT]
+
+svytDT <- svytDT[, meankW := meanW/1000]
+
+p <- ggplot2::ggplot(svytDT, aes(x = hms, y = meankW, colour = AU2013_label)) +
+  geom_step() +
+  facet_grid(season ~ REGC2013_label) +
+  scale_colour_viridis(discrete = TRUE) +
+  theme(legend.position="none") + # hide for clarity
+  labs(x = "Time of Day", y = "Mean kW",
+       caption = paste0(myCaption, 
+                        "\n Hot Water circuits",
+                        "\n n households (base) = ", uniqueN(dataDT$linkID),
+                        "\n n households (reweighted) = ", dkUtils::tidyNum(totalIpfHHs),
+                        "\n Plotting ",uniqueN(svytDT$AU2013_label) , " areas"),
+                        "\n Peak demand periods shaded")
+
+yMin <- min(svytDT$meankW)
+yMax <- max(svytDT$meankW)
+addPeaks(p)
+
+plotly::ggplotly(p)
+
+```
+
+
+## Area units - Lighting: Mean seasonal
+
+```{r l.season.byID}
+
+myCaption <- paste0("GREENGrid household power demand data from ", min(ggLightingDT$r_dateTimeNZ), " to ", max(ggLightingDT$r_dateTimeNZ))
+
+# aggregate to half-hour per hh per season (re-use this later on, gets messy)
+plotDT <- ggLightingDT[, .(meanW = mean(meanPowerW),
+                         sdW = sd(meanPowerW),
+                         nObs = .N), keyby = .(hms, season)]
+
+# aggregate to half-hour per hh per season (re-use this later on, gets messy)
+seasonLHhDT <- ggLightingDT[, .(meankW = mean(meanPowerW/1000),
+                         sdW = sd(meanPowerW),
+                         nObs = .N), keyby = .(linkID, hms, season)]
+
+# aggregate plot per hh
+p <- ggplot2::ggplot(seasonLHhDT, aes(x = hms, y = meankW, colour = linkID)) +
+  geom_step() +
+  scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (within household)",
+       caption = paste0(myCaption, 
+                        "\n Lighting circuits, n = ", uniqueN(seasonLHhDT$linkID),
+                        "\n Peak demand periods shaded"
+                        )
+       )
+
+yMin <- min(seasonLHhDT$meankW)
+yMax <- max(seasonLHhDT$meankW)
+addPeaks(p)
+
+plotly::ggplotly(p)
+
+```
+
+```{r l.season.linkAggToSynth}
+setkey(seasonLHhDT, linkID)
+message("N households with demand data: ", uniqueN(seasonLHhDT$linkID))
+
+setkey(ipfWeights2013DT, linkID)
+message("N unique households in synthetic census: ", uniqueN(ipfWeights2013DT$linkID))
+
+#dataDT <- seasonLHhDT[ipfWeights2013DT, allow.cartesian=TRUE]
+dataDT <- ipfWeights2013DT[seasonLHhDT, allow.cartesian=TRUE]
+message("N unique households in merged power data & synthetic census: ", uniqueN(ipfWeights2013DT$linkID))
+
+message("We will have NA values for power etc for households who were in the survey but for whom we have no power data.")
+message("We will have 0 values for ipfWeight for for households who are not needed in a particular areaCode.")
+
+summary(dataDT)
+
+message("We can drop both of these to save memory.")
+
+dataDT <- dataDT[!is.na(meankW) | ipfWeight > 0]
+
+summary(dataDT)
+
+table(dataDT$REGC2013_label)
+
+```
+
+Construct unweighted area profiles for comparison.
+
+```{r l.season.AreaProfiles.Unweighted}
+# no weights
+meanProfilesUnwDT <- dataDT[!is.na(meankW), .(meankW = mean(meankW)), keyby = .(hms, AU2013_label, season)]
+
+p <- ggplot2::ggplot(meanProfilesUnwDT, aes(x = hms, y = meankW, colour = AU2013_label)) +
+  geom_line() +
+  scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  theme(legend.position="none") + # hide for clarity
+  labs(x = "Time of Day", y = "Mean kW",
+       caption = paste0(myCaption, 
+                        "\n Lighting circuits",
+                        "\n n households (unweighted) = ", uniqueN(dataDT$linkID),
+                        " replicated over ",uniqueN(meanProfilesUnwDT$AU2013_label) , " areas",
+                        "\n Peak demand periods shaded"
+                        )
+       )
+yMin <- min(meanProfilesUnwDT$meankW)
+yMax <- max(meanProfilesUnwDT$meankW)
+addPeaks(p)
+
+```
+
+Now construct the weighted area profiles.
+
+```{r l.season.AreaProfiles.Weighted}
+
+# with weights
+# set svydesign
+dataDT <- dataDT[, c_hms := as.character(hms)] # svy breaks on hms for some reason
+message("Setting survey design using weights")
+dataDTSvyDes <- survey::svydesign(ids = ~linkID, # use synthetic hh_id
+                                  weights = ~ipfWeight, 
+                                  data = dataDT[!is.na(ipfWeight)] # make sure no NA weights
+)
+
+message("Calculating weighted power summaries - can take some time...")
+system.time(svyt <- svyby(~meankW, ~AU2013_label+c_hms+season, dataDTSvyDes, 
+                          svymean,
+                          keep.var = TRUE,
+                          exclude=NULL, 
+                          na.action=na.pass
+                          )
+            )
+
+svytDT <- as.data.table(svyt) # this is just so cool - autoconversion to data.table :-)
+# fix the time thing
+svytDT$hms <- hms::parse_hm(svytDT$c_hms)
+
+# add regional labels back
+setkey(svytDT, AU2013_label)
+setkey(auLabelsDT, AU2013_label)
+svytDT <- svytDT[auLabelsDT]
+
+p <- ggplot2::ggplot(svytDT, aes(x = hms, y = meankW, colour = AU2013_label)) +
+  geom_step() +
+  facet_grid(season ~ REGC2013_label) +
+  scale_colour_viridis(discrete = TRUE) +
+  theme(legend.position="none") + # hide for clarity
+  labs(y = "Mean kW",
+       x = "Time of Day",
+       caption = paste0(myCaption, 
+                        "\n Lighting circuits",
+                        "\n n households (base) = ", uniqueN(dataDT$linkID),
+                        "\n n households (reweighted) = ", dkUtils::tidyNum(totalIpfHHs),
+                        "\n Plotting ",uniqueN(svytDT$AU2013_label) , " areas",
+                        "\n Peak demand periods shaded"
+                        )
+       )
+yMin <- min(svytDT$meankW)
+yMax <- max(svytDT$meankW)
+addPeaks(p)
+
+plotly::ggplotly(p)
+
+```
+
+
+## Area units - Lighting: Efficiency Model
+
+```{r set efficiency}
+# update using Carsten's figures
+# - if we switched an incandescent bulb to LED what is the % W saving?
+# -> 94%
+# 
+# - if we switched a cfl bulb to LED what is the % W saving?
+# -> 67%
+# 
+# - if we switched a ELV bulb to LED what is the % W saving?
+# -> 91%
+# EnergyConsult PTY LTD. (2015). Residential Energy Baseline Study: New Zealand. Retrieved from http://energyrating.gov.au/sites/new.energyrating/files/documents/Report_Residential_Baseline_Study_for_New_Zealand_2000_-_2030_0_0.pdf
+
+iToLED <- 1-0.94 # see above
+cflToLED <- 1-0.67 # see above
+hToLED <- 1-0.91 # see above
+```
+
+This simple model adjusts each lighting power observation for each household as follows:
+
+ * for households with 'mostly incandescent bulbs' - we assume a switch to LEDs will reduce power demand for lighting by `r 100*(1-iToLED)` %;
+ * for households with 'mostly energy saving - cfl bulbs' - we assume a switch to LEDs will reduce power demand for lighting by `r 100*(1-cflToLED)` %;
+ * for households with 'mostly Halogen' - we assume a switch to LEDs will reduce power demand for lighting by `r 100*(1-hToLED)` %;
+ * for households with 'mostly LEDs' and where we don't know the bulb type we assume no change;
+ 
+This is based on calculations derived from the New Zealand Residential Energy Baseline Study [@nzbaseline2015]
+
+```{r l.eff.model}
+setkey(ggLightingDT, linkID)
+setkey(ggHhDT, linkID)
+ggLightingDT <- ggHhDT[, .(linkID, Q49_coded, Location)][ggLightingDT]
+
+t <- table(ggHhDT$Q49_coded)
+prop.table(t)*100
+
+t <- table(ggHhDT$Q49_coded, ggHhDT$Location, useNA = "always")
+
+kableExtra::kable(t, caption = "Bulb type distribution (survey data)") %>%
+  kable_styling()
+
+t <- table(ggLightingDT$Q49_coded, ggLightingDT$Location, useNA = "always")
+kableExtra::kable(t, caption = "nObs in lighting data by location and bulb type - note the lighting circuits we have for Taranaki are all 'mostly incandescents'") %>%
+  kable_styling()
+
+# efficiency model
+ggLightingDT <- ggLightingDT[, meanPowerWefficient := meanPowerW] # if we don't know or anything else, leave as-is
+ggLightingDT <- ggLightingDT[Q49_coded == "Incandescent", meanPowerWefficient := meanPowerW*iToLED]  
+ggLightingDT <- ggLightingDT[Q49_coded == "Energy saving - cfl", meanPowerWefficient := meanPowerW*cflToLED] 
+ggLightingDT <- ggLightingDT[Q49_coded == "Halogen", meanPowerWefficient := meanPowerW*hToLED] 
+
+t <- ggLightingDT[, .(nObs = .N,
+                      meanW = mean(meanPowerW),
+                      meanWefficient = mean(meanPowerWefficient),
+                      meanWSaving = mean(meanPowerWefficient - meanPowerW),
+                      #meanWSaving_pc = 100*((meanPowerWefficient - meanPowerW)/meanPowerW),
+                      nHouseholds = uniqueN(linkID)), keyby = .(Location, Q49_coded)]
+
+kableExtra::kable(t, caption = "Overall effect of efficiency model (note we only have 2 lighting circuits in Taranaki)") %>%
+  kable_styling()
+
+```
+
+```{r l.eff.season.byID}
+
+myCaption <- paste0("GREENGrid household power demand data from ", min(ggLightingDT$r_dateTimeNZ), " to ", max(ggLightingDT$r_dateTimeNZ))
+
+# aggregate to half-hour per hh per season (re-use this later on, gets messy)
+plotDT <- ggLightingDT[, .(meanWeff = mean(meanPowerWefficient),
+                         sdW = sd(meanPowerWefficient),
+                         nObs = .N), keyby = .(hms, season)]
+
+# aggregate to half-hour per hh per season (re-use this later on, gets messy)
+seasonLHhDT <- ggLightingDT[, .(meankWeff = mean(meanPowerWefficient/1000),
+                         sdkWeff = sd(meanPowerWefficient/1000),
+                         meankW = mean(meanPowerW/1000),
+                         sdkW = sd(meanPowerW/1000),
+                         nObs = .N), keyby = .(linkID, hms, season)]
+
+# aggregate plot per hh
+p1 <- ggplot2::ggplot(seasonLHhDT, aes(x = hms, y = meankW, colour = linkID)) +
+  geom_step() +
+  scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (within household)",
+       caption = paste0(myCaption, 
+                        "\n Lighting circuits, n = ", uniqueN(seasonLHhDT$linkID),
+                        "\n Peak demand periods shaded"
+                        )
+       )
+
+yMin <- min(seasonLHhDT$meankW)
+yMax <- max(seasonLHhDT$meankW)
+addPeaks(p1)
+
+p2 <- ggplot2::ggplot(seasonLHhDT, aes(x = hms, y = meankWeff, colour = linkID)) +
+  geom_step() +
+  scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  labs(y = "Mean kW (within household)",
+       caption = paste0(myCaption, 
+                        "\n Lighting circuits, n = ", uniqueN(seasonLHhDT$linkID),
+                        "\n Peak demand periods shaded"
+                        )
+       )
+
+yMin <- min(seasonLHhDT$meankWeff)
+yMax <- max(seasonLHhDT$meankWeff)
+addPeaks(p2)
+
+
+#plotly::ggplotly(p)
+
+```
+
+```{r l.eff.season.linkAggToSynth}
+setkey(seasonLHhDT, linkID)
+message("N households with demand data: ", uniqueN(seasonLHhDT$linkID))
+
+setkey(ipfWeights2013DT, linkID)
+message("N unique households in synthetic census: ", uniqueN(ipfWeights2013DT$linkID))
+
+#dataDT <- seasonLHhDT[ipfWeights2013DT, allow.cartesian=TRUE]
+dataDT <- ipfWeights2013DT[seasonLHhDT, allow.cartesian=TRUE]
+message("N unique households in merged power data & synthetic census: ", uniqueN(ipfWeights2013DT$linkID))
+
+message("We will have NA values for power etc for households who were in the survey but for whom we have no power data.")
+message("We will have 0 values for ipfWeight for for households who are not needed in a particular areaCode.")
+
+summary(dataDT)
+
+message("We can drop both of these to save memory.")
+
+dataEffDT <- dataDT[!is.na(meankWeff) | ipfWeight > 0]
+
+summary(dataEffDT)
+
+```
+
+Construct unweighted overall mean profiles for comparison.
+
+```{r l.eff.season.AreaProfiles.Unweighted}
+# no weights
+meanProfilesUnwDT <- dataEffDT[!is.na(meankW), .( meankW= mean(meankW),
+                                                  meankWeff = mean(meankWeff)
+                                                  ), keyby = .(hms, season)]
+
+p <- ggplot2::ggplot(meanProfilesUnwDT, aes(x = hms)) +
+  geom_line(aes(y = meankW, colour = "Mean")) +
+  geom_line(aes(y = meankWeff, colour = "Mean efficient")) +
+  scale_colour_viridis(discrete = TRUE) +
+  facet_grid(season ~ .) +
+  theme(legend.position="none") + # hide for clarity
+  labs(x = "Time of Day", y = "Mean kW",
+       caption = paste0(myCaption, 
+                        "\n Lighting circuits",
+                        "\n n households (unweighted) = ", uniqueN(dataDT$linkID),
+                        " replicated over ",uniqueN(meanProfilesUnwDT$AU2013_label) , " areas",
+                        "\n Peak demand periods shaded"
+                        )
+       )
+yMin <- min(meanProfilesUnwDT$meankW)
+yMax <- max(meanProfilesUnwDT$meankW)
+addPeaks(p)
+
+```
+
+Now construct the weighted area profiles.
+
+```{r l.eff.season.AreaProfiles.Weighted}
+
+# with weights
+# set svydesign
+dataDT <- dataDT[, c_hms := as.character(hms)] # svy breaks on hms for some reason
+dataDT <- dataDT[, powerkWdiff := meankWeff - meankW] # power saving
+dataDT <- dataDT[, powerkWdiff_pr := (meankWeff - meankW)/meankW] # power saving proportion
+
+message("Setting survey design using weights")
+dataDTSvyDes <- survey::svydesign(ids = ~linkID, # use synthetic hh_id
+                                  weights = ~ipfWeight, 
+                                  data = dataDT[!is.na(ipfWeight)] # make sure no NA weights
+)
+
+```
+
+```{r plot_meanSaving}
+message("Calculating weighted power summaries - can take some time...")
+system.time(svyt <- svyby(~powerkWdiff, ~AU2013_label+c_hms+season, dataDTSvyDes, 
+                          svymean,
+                          keep.var = TRUE,
+                          exclude=NULL, 
+                          na.action=na.pass
+                          )
+            )
+
+svytDT <- as.data.table(svyt) # this is just so cool - autoconversion to data.table :-)
+# fix the time thing
+svytDT$hms <- hms::parse_hm(svytDT$c_hms)
+
+# add regional labels back
+setkey(svytDT, AU2013_label)
+setkey(auLabelsDT, AU2013_label)
+svytDT <- svytDT[auLabelsDT]
+
+p <- ggplot2::ggplot(svytDT, aes(x = hms, y = powerkWdiff, colour = AU2013_label)) +
+  geom_step() +
+  facet_grid(season ~ REGC2013_label) +
+  scale_colour_viridis(discrete = TRUE) +
+  theme(legend.position="none") + # hide for clarity
+  labs(y = "Mean kW saved",
+       x = "Time of Day",
+       caption = paste0(myCaption, 
+                        "\n Lighting circuits",
+                        "\n n households (base) = ", uniqueN(dataDT$linkID),
+                        "\n n households (reweighted) = ", dkUtils::tidyNum(totalIpfHHs),
+                        "\n Plotting ",uniqueN(svytDT$AU2013_label) , " areas",
+                        "\n Peak demand periods shaded"
+                        )
+       )
+yMin <- min(svytDT$powerkWdiff)
+yMax <- max(svytDT$powerkWdiff)
+addPeaks(p)
+
+plotly::ggplotly(p)
+
+```
+
+```{r plot_meanPcSaving}
+message("Calculating weighted power summaries - can take some time...")
+system.time(svyt <- svyby(~powerkWdiff_pr, ~AU2013_label+c_hms+season, dataDTSvyDes, 
+                          svymean,
+                          keep.var = TRUE,
+                          exclude=NULL, 
+                          na.action=na.pass
+                          )
+            )
+
+svytDT <- as.data.table(svyt) # this is just so cool - autoconversion to data.table :-)
+# fix the time thing
+svytDT$hms <- hms::parse_hm(svytDT$c_hms)
+
+setkey(svytDT, AU2013_label)
+
+# add regional labels back
+setkey(svytDT, AU2013_label)
+setkey(auLabelsDT, AU2013_label)
+svytDT <- svytDT[auLabelsDT]
+
+svytDT <- svytDT[, powerkWdiff_pc := powerkWdiff_pr * 100]
+
+p <- ggplot2::ggplot(svytDT, aes(x = hms, y = powerkWdiff_pc, colour = AU2013_label)) +
+  geom_step() +
+  facet_grid(season ~ REGC2013_label) +
+  scale_colour_viridis(discrete = TRUE) +
+  theme(legend.position="none") + # hide for clarity
+  labs(y = "Mean % kW saved",
+       x = "Time of Day",
+       caption = paste0(myCaption, 
+                        "\n Lighting circuits",
+                        "\n n households (base) = ", uniqueN(dataDT$linkID),
+                        "\n n households (reweighted) = ", dkUtils::tidyNum(totalIpfHHs),
+                        "\n Plotting ",uniqueN(svytDT$AU2013_label) , " areas",
+                        "\n Peak demand periods shaded"
+                        )
+       )
+yMin <- min(svytDT$powerkWdiff_pc)
+yMax <- max(svytDT$powerkWdiff_pc)
+addPeaks(p)
+
+plotly::ggplotly(p)
+
+```
+
+
+
+```{r runToHere}
+```
+