moved old code

e670ec2d · Ben Anderson · 9c2e9c6b · 9c2e9c6b · 9c2e9c6b · 9c2e9c6b
Commit e670ec2d authored 5 years ago by Ben Anderson
--- a/analysis/GREENGridModel/_createSyntheticCensus.Rmd
+++ b/analysis/GREENGridModel/_createSyntheticCensus.Rmd
-## Create synthetic weighted households 
-### Set up IPF
-First, test survey variables for collinearity.
-```{r testSurveyCollinearity}
-cor.test(ggHhDT$nPeople, ggHhDT$nBedrooms)
-cor.test(ggHhDT$nPeople, ggHhDT$nRooms)
-cor.test(ggHhDT$nRooms, ggHhDT$nBedrooms)
-```
-So people & bedrooms are the least colinear of these. Which is odd given that we used one to predict the other. Anyway, we will use nBedrooms rather than nRooms.
-```{r setConstraints}
-# relies heavily on: http://robinlovelace.net/spatial-microsim-book/smsim-in-R.html
-# ipfp wants:
-# y = numeric vector of constraints for each area - the names must match the survey, data as counts, no ids or anything
-# which means we have to assume ipfp maintains order
-# A = the transposed survey data (row = var) in dummy variable form - names must match constraints, no ids etc, variables that match constraints only
-# which means we have to assume ipfp maintains order
-# x0 = numeric initial vector of length ncol
-# set order of variables - has to match (probably)
-fullConstraintsList <- c("heatSourceWood", "heatSourceElectricity", "heatSourceGas", "heatSourceCoal", "heatSourceOther", 
-                         "nKids_0", "nKids_1m",
-                     #"nBedrooms_1_2", "nBedrooms_3", "nBedrooms_4m",
-                     "nRooms1_4", "nRooms5_6", "nRooms7m",
-                     "nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m" # fits last
-) 
-```
-We have the following constraints available:
-`fullConstraintsList`
-Note that we set the order of constraints here - we set n people as the last constraint so that the ipf algoritm fits it perfectly. This is a design choice and it is driven by the size of the coefficients in the lm models above (which lm models?)
-Check census data distributions - we're looking for zeros.
-```{r censusChecks}
-# check constraint distributions - do we have 0s? ----
-# do this here so we only have the relevant regions
-# > heat source ----
-t <- summary(au2013DT[, .SD, .SDcols = names(au2013DT) %like% "heat"]) # https://stackoverflow.com/questions/30189979/select-columns-of-data-table-based-on-regex
-kableExtra::kable(t, caption = "Test distribution of fuel sources") %>%
-  kable_styling()
-# > n kids ----
-t <- summary(au2013DT[, .SD, .SDcols = names(au2013DT) %like% "nKids"])
-kableExtra::kable(t, caption = "Test distribution of nKids") %>%
-  kable_styling()
-# > n people ----
-t <- summary(au2013DT[, .SD, .SDcols = names(au2013DT) %like% "nPeople"])
-kableExtra::kable(t, caption = "Test distribution of nPeople") %>%
-  kable_styling()
-# > n rooms ----
-t <- summary(au2013DT[, .SD, .SDcols = names(au2013DT) %like% "nRooms"])
-kableExtra::kable(t, caption = "Test distribution of nRooms") %>%
-  kable_styling()
-```
-```{r censusSetup}
-# check totals are not 0
-#au2013DT <- au2013DT[, nBedrooms_Total := nBedrooms_1_2 + nBedrooms_3 + nBedrooms_4m]
-au2013DT <- au2013DT[, nPeople_Total := nPeople_1 + nPeople_2 + nPeople_3 + nPeople_4m]
-au2013DT <- au2013DT[, nRooms_Total := nRooms1_4 + nRooms5_6 + nRooms7m]
-au2013DT <- au2013DT[, nKids_Total := nKids_0 + nKids_1m]
-au2013DT <- au2013DT[, heatSource_Total := heatSourceWood + heatSourceElectricity + heatSourceGas + heatSourceCoal + heatSourceOther]
-t <- summary(au2013DT[, .SD, .SDcols = names(au2013DT) %like% "_Total"])
-t
-message("Removing areas which have total counts for any constraint = 0")
-nOrig <- nrow(au2013DT)
-zerosDT <- au2013DT[nPeople_Total == 0 | is.na(nPeople_Total)| 
-                            #nBedrooms_Total == 0 | is.na(nBedrooms_Total)| 
-                            nRooms_Total == 0 | is.na(nRooms_Total)| 
-                            nKids_Total == 0 | is.na(nKids_Total)| 
-                            heatSource_Total == 0 | is.na(heatSource_Total), 
-                          .(AU2013_code, nRooms_Total,nPeople_Total, nKids_Total, 
-                            heatSource_Total
-                            )
-                          ]
-zerosDT <- zerosDT[, AU2013_code := as.character(AU2013_code)]
-setkey(zerosDT, AU2013_code)
-zerosDT <- auListDT[zerosDT]
-zerosDT <- zerosDT[, drop := 1]
-test <- zerosDT[au2013DT]
-# remove areas where count is 0 or NA
-censusDT <- test[is.na(drop)]
-nNonZero <- nrow(censusDT)
-```
-We have removed `r nOrig - nNonZero` areas where the household totals were 0 or NA. These are shown in Table \@ref(tab:zeroTotals).
-> NB: these areas will have no ipf results...
-```{r zeroTotals}
-t <- zerosDT[, .(REGC2013_label, AU2013_label, nRooms_Total,nPeople_Total,nKids_Total,
-                 heatSource_Total
-                 )][order(REGC2013_label)]
-kableExtra::kable(t, caption = "Areas where the household totals were 0 or NA") %>%
-  kable_styling()
-```
-```{r checkCensusConstraints}
-# check
-skimr::skim(censusDT)
-```
-### Run IPF
-This section runs the IPF to generate the weights.
-First set the constraints we are going to use.
-```{r setFinalConstraints}
-# need list of area codes
-areaCodes <- censusDT[, AU2013_code]
-# need list of household ids
-hhIdsDT <- surveyDT[, .(linkID)]
-# select the constraints we want & set order
-# coal = 0 might break the IPF
-testConstraintsList <- c(#"heatSourceWood","heatSourceElectricity","heatSourceGas","heatSourceCoal","heatSourceOther", 
-                         "nKids_0", "nKids_1m",
-                     #"nBedrooms_1_2", "nBedrooms_3", "nBedrooms_4m",
-                     "nRooms1_4","nRooms5_6","nRooms7m",
-                     "nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m" # fits last
-)
-print(paste0("Using ", testConstraintsList))
-# select just the constraints we want to use ----
-ipfInputSurveyDT <- surveyDT[,testConstraintsList , with=FALSE]
-message("Input survey constraints:")
-names(ipfInputSurveyDT)
-ipfInputConstraintsDT <- censusDT[, testConstraintsList, with=FALSE]
-message("Input census constraints:")
-names(ipfInputConstraintsDT)
-```
-Notes:
- * we don't use fuel/heat source as we get a lot of Nan
-```{r runIPF}
-# expects:
-#   an area level constraints dt and a survey constraints dt (vars in same order)
-#   a list of area codes & bmg_ids
-# returns a long form weights dt with oacodes labelled
-# transpose the survey for ipf
-surv_dtt <- t(ipfInputSurveyDT)
-# set up initial vector of the same length as the number of households as a load of 1s (could be the response weights...)
-x0 <- rep(1, nrow(ipfInputSurveyDT))
-# create 2D weighst matrix (households by areas) - this will get filled with the weights
-weights = array(NA, dim=c(nrow(ipfInputSurveyDT), nrow(ipfInputConstraintsDT)))
-for(i in 1:ncol(weights)){ # for each area (cols of weights)
-  print(paste0("Running ipfp on area ", i))
-  weights[,i] <- ipfp(as.numeric(ipfInputConstraintsDT[i,]), # only accepts a vector of numbers 
-                      surv_dtt, 
-                      x0, 
-                      maxit = 10, # maxit = 10 iterations (for now), 
-                      verbose = F) # careful with verbose - too much output
-}
-```
-```{r processIpfResults}
-# this is where we have to assume ipfp maintained the oaCode order
-wDT <- as.data.table(weights) # convert to dt
-wDT <- setnames(wDT, as.character(areaCodes)) # add col names
-# this is where we have to assume ipfp maintained the bmg_id order
-dt <- cbind(hhIdsDT, wDT) # add linkID to each row of weights (fills down, if you get recycling then there is a problem)
-setkey(dt, linkID)
-# need to reshape the weightsDT into 1 column of weights with oacode set to oacode etc
-longFormDT <- as.data.table(melt(dt, id=c("linkID")))
-# add correct area code name for future matching
-longFormDT <- setnames(longFormDT, c("variable", "value"), c("AU2013_code", "ipfWeight"))
-setkey(longFormDT,AU2013_code)
-wDT <- NULL # save mem
-# add region
-longFormDT <- auListDT[longFormDT]
-kableExtra::kable(caption = "First few rows/columns of ipf weights table", head(longFormDT))
-kableExtra::kable(caption = "Summary of ipf weights table", summary(longFormDT))
-message("Dimensions of ipf weights table: ", dkUtils::tidyNum(nrow(longFormDT)), " rows x ", ncol(longFormDT), " cols")
-```
-This produces a long form data file of nrow = n(linkID) x n(areaCode) (!) so that linkIDs are repeated and the weights are held in a single column (variable). This makes everything else a lot easier later. The dataset comprises `r dkUtils::tidyNum(uniqueN(longFormDT$linkID))` GREENGrid households replicated (with weights) across `r dkUtils::tidyNum(uniqueN(longFormDT$areaCode))` unit areas.
-It is possible that some of the weights are 0. This means there are households which have 0 weights in some areas. We do not need these households in the data so we will remove them (if any exist).
-```{r removeZeroWeights}
-message("N rows before removing zero weights: ", nrow(longFormDT))
-longFormDT <- longFormDT[ipfWeight > 0]
-message("N rows after removing zero weights: ", nrow(longFormDT))
-```
-```{r saveData}
-# save the results for future use since they won't change unless we change the constraints
-data.table::fwrite(longFormDT, paste0(sParams$ggPath, "/safe/ipf/nonZeroWeightsAu2013.csv"))
-```
-We now need to add the survey-based attributes back (from the GREENGrid survey). If we had a much larger sample of households and a lot more areas we would not do this here as it would create a very large file.
-```{r addAttributes}
-setkey(longFormDT, linkID)
-attDT <- ggHhDT[, .(linkID, `Hot water cylinder`, `Heat pump number`, `PV Inverter`, 
-                    `nPeopleCat`, presenceKids, `nBedroomsCat`, `nRoomsCat`,
-                    heatSourceWood, heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceOther, 
-                    nKids_0, nKids_1m,
-                     nBedrooms_1_2, nBedrooms_3, nBedrooms_4m,
-                     nRooms1_4, nRooms5_6, nRooms7m,
-                     nPeople_1, nPeople_2, nPeople_3, nPeople_4m,
-                    heatFuel, Q20_coded, Q49_coded )] # keep the vars we want to use
-setkey(attDT, linkID)
-longFormSvyDT <- merge(longFormDT, attDT)
-# set svydesign
-longFormSvyDes <- survey::svydesign(ids = ~linkID, # use synthetic hh_id
-                                    weights = ~ipfWeight, 
-                                    data = longFormSvyDT[!is.na(ipfWeight)] # make sure no NA weights
-)
-```
-### Check outputs
-#### Overall weights and household counts
-```{r testIpfOutcomes}
-message("Calculating weighted household counts - can take some time...")
-svyt <- svytable(~AU2013_code, longFormSvyDes, 
-                 exclude=NULL, 
-                 na.action=na.pass
-)
-ipfHhCountsDT <- data.table::as.data.table(svyt)
-ipfHhCountsDT <- ipfHhCountsDT[, simCount := N]
-setkey(ipfHhCountsDT, AU2013_code)
-setkey(censusDT, AU2013_code)
-ipfOutputTestDT <- ipfHhCountsDT[censusDT]
-t_dt <- longFormDT[ipfWeight != 0, .(nGGHouseholds = uniqueN(linkID),
-                       meanWeight = mean(ipfWeight),
-                       minWeight = min(ipfWeight),
-                       maxWeight = max(ipfWeight)),
-                   keyby = .(AU2013_code)]
-ipfOutputTestDT <- ipfOutputTestDT[t_dt]
-#ipfOutputTestDT <- ipfOutputTestDT[, AU2013_code := as.character(AU2013_code)]
-ipfOutputTestDT <- auListDT[ipfOutputTestDT]
-ipfOutputTestDT <- ipfOutputTestDT[, ratio := simCount/nGGHouseholds]
-t <- longFormDT[ipfWeight != 0, .(nGGHouseholds = uniqueN(linkID),
-                       meanWeight = mean(ipfWeight),
-                       minWeight = min(ipfWeight),
-                       maxWeight = max(ipfWeight)),
-                   keyby = .(REGC2013_label)]
-kableExtra::kable(t, caption = "GREENGrid - summary of weights by region")%>%
-  kable_styling()
-```
-Figure \@ref(fig:checkTotalCounts_nPeople) shows the actual and simulated number of households using the nPeople constraint.
-```{r checkTotalCounts_nPeople}
-myCaption <- "Slope = line of exact fit"
-t <- ipfOutputTestDT[, .(mean_nHHs = mean(npeople_totalHouseholds),
-                              min_nHHs = min(npeople_totalHouseholds),
-                              max_nHHs = max(npeople_totalHouseholds)), keyby = .(REGC2013_label)]
-kableExtra::kable(t, caption = "Summary of n households (n people)") %>%
-  kable_styling()
-ggplot2::ggplot(ipfOutputTestDT, aes(x = npeople_totalHouseholds, y = simCount, colour = REGC2013_label)) +
-  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
-  geom_point() +
-  scale_colour_discrete(name = "Region") +
-  labs(x = "Number of households (using census n people variable)",
-       y = "Number of households (ipf results)",
-       caption = myCaption)
-```
-Figure \@ref(fig:checkTotalCounts_nRooms) shows the actual and simulated number of households using the nRooms constraint. This was the penultimate constraint to be fitted.
-```{r checkTotalCounts_nRooms}
-t <- ipfOutputTestDT[, .(mean_nHHs = mean(nrooms_totalHouseholds),
-                              min_nHHs = min(nrooms_totalHouseholds),
-                              max_nHHs = max(nrooms_totalHouseholds)), keyby = .(REGC2013_label)]
-kableExtra::kable(t, caption = "Summary of n households (n rooms)") %>%
-  kable_styling()
-ggplot2::ggplot(ipfOutputTestDT, aes(x = nrooms_totalHouseholds, y = simCount, colour = REGC2013_label)) +
-  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
-  geom_point() +
-  scale_colour_discrete(name = "Region") +
-  labs(x = "Number of households (using census n rooms variable)",
-       y = "Number of households (ipf results)",
-       caption = myCaption)
-```
-> Can't repeat for n kids as the base doesn't match
-Figure \@ref(fig:checkTotalCounts_ggHHs) shows the number of GREEN Grid households who are being 'upweighted' to match the given number of census households.
-```{r checkTotalCounts_ggHHs}
-ggplot2::ggplot(ipfOutputTestDT, aes(x = nGGHouseholds, y = simCount, colour = REGC2013_label)) +
-  geom_jitter() +
-  scale_colour_discrete(name = "Region") +
-  labs(x = "Number of GREENGrid households",
-       y = "Number of households represented in area unit",
-       caption = paste0("Plotting ", uniqueN(ipfOutputTestDT$AU2013_code), " area units"))
-totalIpfHHs <- sum(ipfHhCountsDT$N)
-```
-#### Constrained variables
-Table \@ref(tab:testnPeople) shows the weighted household counts for the number of people. This was the last constraint so it fitted last.
-```{r testnPeople}
-# create summary table 
-svyt <- survey::svytable(~ AU2013_code + nPeopleCat, longFormSvyDes, 
-                 exclude=NULL, 
-                 na.action=na.pass
-)
-dt <- data.table::as.data.table(svyt)
-wdt <- reshape(dt, direction = "wide", timevar = "nPeopleCat", idvar = "AU2013_code")
-setnames(wdt, c("N.1", "N.2", "N.3", "N.4+"), c("ipf_nPeople_1",
-                                                "ipf_nPeople_2",
-                                                "ipf_nPeople_3",
-                                                "ipf_nPeople_4+"))
-setkey(wdt, AU2013_code)
-setkey(censusDT, AU2013_code)
-tmpDT <- wdt[censusDT] # merge to census data
-tmpDT <- auListDT[tmpDT] # add region labels
-plotDT <- tmpDT[, .(AU2013_code, REGC2013_label, AU2013_label,
-                    ipf_nPeople_1, ipf_nPeople_2, ipf_nPeople_3, `ipf_nPeople_4+`,
-                    nPeople_1, nPeople_2, nPeople_3, nPeople_4m)]
-t <- summary(plotDT)
-kableExtra::kable(t, caption = "Summary of n people outcomes (and census counts)") %>%
-  kable_styling()
-ggplot2::ggplot(plotDT) +
-  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
-  geom_point(aes(x = nPeople_1, y =  ipf_nPeople_1, colour = "1 person")) +
-  geom_point(aes(x = nPeople_2, y =  ipf_nPeople_2, colour = "2 people")) +
-  geom_point(aes(x = nPeople_3, y =  ipf_nPeople_3, colour = "3 people")) +
-  geom_point(aes(x = nPeople_4m, y =  `ipf_nPeople_4+`, colour = "4+ people")) +
-  scale_colour_discrete(name = "N people") +
-  labs(x = "Census count",
-       y = "IPF (weighted) count",
-       caption = myCaption) +
-  facet_grid(. ~ REGC2013_label)
-```
-Table \@ref(tab:testnRooms) shows the weighted household counts for the number of rooms This was the penultimate constraint to be fitted.
-```{r testnRooms}
-# create summary table 
-svyt <- survey::svytable(~ AU2013_code + nRoomsCat, longFormSvyDes, 
-                 exclude=NULL, 
-                 na.action=na.pass
-)
-dt <- data.table::as.data.table(svyt)
-wdt <- reshape(dt, direction = "wide", timevar = "nRoomsCat", idvar = "AU2013_code")
-setnames(wdt, c("N.1-4", "N.5-6", "N.7+"), c("ipf_nRooms_1_4",
-                                                "ipf_nRooms_5_6",
-                                                "ipf_nRooms_7m"))
-setkey(wdt, AU2013_code)
-tmpDT <- wdt[censusDT] # merge to census data
-tmpDT <- auListDT[tmpDT] # add region labels
-plotDT <- tmpDT[, .(AU2013_code, REGC2013_label, AU2013_label,
-                    ipf_nRooms_1_4, ipf_nRooms_5_6, ipf_nRooms_7m,
-                    nRooms1_4, nRooms5_6, nRooms7m)]
-t <- summary(plotDT)
-kableExtra::kable(t, caption = "Summary of n rooms outcomes (and census counts)") %>%
-  kable_styling()
-ggplot2::ggplot(plotDT) +
-  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
-  geom_point(aes(x = nRooms1_4, y =  ipf_nRooms_1_4, colour = "1-4 rooms")) +
-  geom_point(aes(x = nRooms5_6, y =  ipf_nRooms_5_6, colour = "5-6 rooms")) +
-  geom_point(aes(x = nRooms7m, y =  ipf_nRooms_7m, colour = "7+ rooms")) +
-  scale_colour_discrete(name = "N rooms") +
-  labs(x = "Census count",
-       y = "IPF (weighted) count",
-       caption = myCaption) +
-  facet_grid(. ~ REGC2013_label)
-```
-Table \@ref(tab:testnKids) shows the weighted household counts for the presence of children.
-```{r testnKids}
-# create summary table 
-svyt <- survey::svytable(~ AU2013_code + presenceKids, longFormSvyDes, 
-                 exclude=NULL, 
-                 na.action=na.pass
-)
-dt <- data.table::as.data.table(svyt)
-wdt <- reshape(dt, direction = "wide", timevar = "presenceKids", idvar = "AU2013_code")
-setnames(wdt, c("N.1+", "N.None"), c("ipf_nKids_1m","ipf_nKids_0"))
-setkey(wdt, AU2013_code)
-tmpDT <- wdt[censusDT] # merge to census data
-tmpDT <- auListDT[tmpDT] # add region labels
-plotDT <- tmpDT[, .(AU2013_code, REGC2013_label, AU2013_label,
-                    ipf_nKids_0, ipf_nKids_1m,
-                    nKids_0, nKids_1m)]
-t <- summary(plotDT)
-kableExtra::kable(t, caption = "Summary of n kids outcomes (and census counts)") %>%
-  kable_styling()
-ggplot2::ggplot(plotDT) +
-  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
-  geom_point(aes(x = nKids_0, y =  ipf_nKids_0, colour = "0 children")) +
-  geom_point(aes(x = nKids_1m, y =  ipf_nKids_1m, colour = "1+ child")) +
-  labs(x = "Census count",
-       y = "IPF (weighted) count",
-       caption = myCaption) +
-  scale_colour_discrete(name = "N children") +
-  facet_grid(. ~ REGC2013_label)
-```
-#### Unconstrained variables
-Table \@ref(tab:testHeatSource) shows the weighted household counts for main heat source. Remember that this was _not_ used in the final constraints.
-```{r testHeatSource}
-# create summary table 
-svyt <- survey::svytable(~ AU2013_code + heatFuel, longFormSvyDes, 
-                 exclude=NULL, 
-                 na.action=na.pass
-)
-dt <- data.table::as.data.table(svyt)
-wdt <- reshape(dt, direction = "wide", timevar = "heatFuel", idvar = "AU2013_code")
-setnames(wdt, c("N.Electricity", "N.Gas", "N.Other", "N.Wood"), 
-         c("ipf_Electricity","ipf_Gas", "ipf_Other", "ipf_Wood"))
-setkey(wdt, AU2013_code)
-tmpDT <- wdt[au2013DT] # merge to census data
-tmpDT <- auListDT[tmpDT] # add region labels
-plotDT <- tmpDT[, .(AU2013_code, REGC2013_label, AU2013_label,
-                    ipf_Electricity, ipf_Gas, ipf_Other, ipf_Wood,
-                    heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceWood, heatSourceOther)]
-ggplot2::ggplot(plotDT) +
-  geom_abline(slope = 1, intercept = 0, alpha = 0.5) +
-  geom_point(aes(x = ipf_Electricity, y =  ipf_Electricity, colour = "Electricity")) +
-  geom_point(aes(x = heatSourceGas, y =  ipf_Gas, colour = "Gas")) +
-  geom_point(aes(x = heatSourceWood, y =  ipf_Wood, colour = "Wood")) +
-  geom_point(aes(x = heatSourceOther, y =  ipf_Other, colour = "Other")) +
-  scale_colour_discrete(name = "Main heating fuel") +
-  labs(x = "Census count",
-       y = "IPF (weighted) count",
-       caption = paste0(myCaption, "\nNote: Coal not shown as no GREEN Grid households used coal")) +
-  facet_grid(. ~ REGC2013_label)
-```
-```{r run to here}
-```
\ No newline at end of file
--- a/analysis/GREENGridModel/_loadNonPowerData.Rmd
+++ b/analysis/GREENGridModel/_loadNonPowerData.Rmd
--- a/analysis/GREENGridModel/_loadPowerData.Rmd
+++ b/analysis/GREENGridModel/_loadPowerData.Rmd
--- a/analysis/GREENGridModel/_reportAreaModel.Rmd
+++ b/analysis/GREENGridModel/_reportAreaModel.Rmd