updated constraint processing & checking; switched census from aggregated...

updated constraint processing & checking; switched census from aggregated meshblock to raw area unit data from NZStats; added heat source/fuel as potential constraint (not used)

updated constraint processing & checking; switched census from aggregated...
9defb49d · Ben Anderson · b180d90e · 9defb49d
Commit 9defb49d authored 6 years ago by Ben Anderson
--- a/analysis/GREENGridModel/_loadNonPowerData.Rmd
+++ b/analysis/GREENGridModel/_loadNonPowerData.Rmd
@@ -3,18 +3,16 @@
 ### NZ Area Unit data
-2013 NZ Census data from NZ Stats at area unit level (pre-processed long form file).
+2013 NZ Census data from NZ Stats at area unit level. For simplicity we use one file per constraint:
-```{r loadAU2013}
+ * n people
-f <- paste0(sParams$dataPath, "processed/areaunits2013.csv.gz")
+ * n dependent children
-outputMessage(paste0("Loading: ", f))
+ * fuel source (every fuel a household uses is counted, so the categories can sum to > 100% of households)
-au2013DT <- data.table::fread(f)
+ * n rooms
-outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(au2013DT))))
-outputMessage(paste0("N area units loaded: ", dkUtils::tidyNum(uniqueN(au2013DT$AU2013_code))))
-```
+NB: these files, when downloaded from the [NZStats data extractor](http://nzdotstat.stats.govt.nz/wbos/Index.aspx?DataSetCode=TABLECODE8165#), come with higher levels of aggregation in the tables. These have to be removed by extracting just the area unit rows.
-Load area labels
+First load area labels as we use these to select the right data rows.
 ```{r loadAreaLabels}
 areasDT <- data.table::fread(sParams$areasTable2013)
@@ -24,6 +22,191 @@ auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier ma
 setkey(auListDT, AU2013_code)
 ```
+```{r loadAU2013}
+# fuelSource: load the fuel table, recode fuel types to censusConstraint, sum & reshape 2013 counts to wide ----
+fuelf <- paste0(sParams$dataPath, "raw/areaUnits/fuelSource/TABLECODE8100_Data_47b7b3fc-0e40-431f-b313-141de4fb0013.csv")
+outputMessage(paste0("Loading: ", fuelf))
+fuelDT <- data.table::fread(fuelf)
+outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(fuelDT))))
+fuelDT <- fuelDT[, AU2013_code := AREA] # copy AREA into the column used as the join key
+setkey(fuelDT, AU2013_code)
+fuelDT <- fuelDT[auListDT] # keyed join: fuel rows whose code is not in auListDT are dropped
+message("N unique area units (fuel data): ", uniqueN(fuelDT$AU2013_code))
+# create categories: every row starts as "heatSourceOther"; the filters below overwrite it for specific fuel types
+# "value.heatSourceWood", "value.heatSourceElectricity", "value.heatSourceGas", "value.heatSourceCoal", "value.heatSourceOther"
+fuelDT <- fuelDT[, censusConstraint := "heatSourceOther"] # complex one - note this contains 'None' as well
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings, fuel type used to heat dwelling",
+                 censusConstraint := "fuel_totalHouseholds"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings stated",
+                 censusConstraint := "fuel_totalStatedHouseholds"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Wood",
+                 censusConstraint := "heatSourceWood"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "Solar" | # <- what is this? NOTE(review): "Solar" rows are grouped with electricity - confirm
+                   `Fuel type used to heat dwelling` == "Electricity",
+                 censusConstraint := "heatSourceElectricity"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling`  %like% "gas", # pattern match: any label containing "gas"
+                 censusConstraint := "heatSourceGas"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Coal",
+                 censusConstraint := "heatSourceCoal"]
+table(fuelDT$`Fuel type used to heat dwelling`, fuelDT$censusConstraint) # cross-tab to check the recoding
+# convert to wide: sum counts within each category first - reshape() keeps only the FIRST row when several rows share an id/timevar pair (e.g. the fuels merged into "heatSourceOther"). NOTE(review): assumes the merged rows are not themselves sub-totals - confirm against the extract
+fuelDT <- fuelDT[, count := Value]
+fuel2013WDT <- reshape(fuelDT[YEAR == 2013, .(count = sum(count)),
+                              by = .(AU2013_code, AU2013_label, censusConstraint)],
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+setnames(fuel2013WDT, c("count.fuel_totalHouseholds", "count.fuel_totalStatedHouseholds",
+                        "count.heatSourceElectricity", "count.heatSourceGas", 
+                        "count.heatSourceWood", "count.heatSourceCoal", "count.heatSourceOther"),
+         c("fuel_totalHouseholds", "fuel_totalStatedHouseholds",
+                        "heatSourceElectricity", "heatSourceGas", 
+                        "heatSourceWood", "heatSourceCoal", "heatSourceOther")) # drop the "count." prefix added by reshape
+# nKids: load the families table, recode to censusConstraint, sum & reshape 2013 counts to wide ----
+kidsf <- paste0(sParams$dataPath, "raw/areaUnits/nKids/TABLECODE8141_Data_e6f03066-7bbf-4ba0-94b0-1821d5a4665a.csv")
+outputMessage(paste0("Loading: ", kidsf))
+kidsDT <- data.table::fread(kidsf)
+outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(kidsDT))))
+kidsDT <- kidsDT[, AU2013_code := AREA] # copy AREA into the column used as the join key
+setkey(kidsDT, AU2013_code)
+kidsDT <- kidsDT[auListDT] # keyed join: rows whose code is not in auListDT are dropped
+message("N unique area units (kids data): ", uniqueN(kidsDT$AU2013_code))
+# > create categories: every row starts as "nKids_1m"; only the "Total families" row is relabelled
+# "nKids_0", "nKids_1m"
+kidsDT <- kidsDT[, censusConstraint := "nKids_1m"] # we selected rows with dependent children in the census extractor
+kidsDT <- kidsDT[`Family type by child dependency status` == "Total families",
+                 censusConstraint := "nkids_totalFamilies"]
+table(kidsDT$`Family type by child dependency status`, kidsDT$censusConstraint) # cross-tab to check the recoding
+# > convert to wide: sum counts within each category first - reshape() keeps only the FIRST row when several rows share an id/timevar pair. NOTE(review): assumes the non-total rows are mutually exclusive family types - confirm against the extract
+kidsDT <- kidsDT[, count := Value]
+kids2013WDT <- reshape(kidsDT[YEAR == 2013, .(count = sum(count)),
+                              by = .(AU2013_code, AU2013_label, censusConstraint)],
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+# > calculate n families with 0 kids (families, not households, are the base of this table)
+# do this again later when we have the number of _households_ to use as the base
+kids2013WDT <- kids2013WDT[, nKids_0_families := count.nkids_totalFamilies - count.nKids_1m]
+setnames(kids2013WDT, c("count.nkids_totalFamilies", "count.nKids_1m"), 
+         c("nkids_totalFamilies", "nKids_1m")) # drop the "count." prefix added by reshape
+# nPeople: load the household-size table, recode to censusConstraint, sum & reshape 2013 counts to wide ----
+npeoplef <- paste0(sParams$dataPath, "raw/areaUnits/nPeople/TABLECODE8169_Data_bfad6f1a-c9af-4adb-a141-e13a83e175d0.csv")
+outputMessage(paste0("Loading: ", npeoplef))
+npeopleDT <- data.table::fread(npeoplef)
+outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(npeopleDT))))
+setkey(npeopleDT, Area) # forgot to get code attached to this one, so match on the area label instead
+npeopleDT <- npeopleDT[, AU2013_label := Area]
+setkey(auListDT,AU2013_label) # set to label instead of code - should still work; re-keys auListDT by reference (set back to code in the nRooms section)
+npeopleDT <- npeopleDT[auListDT] # keyed join on label: rows whose label is not in auListDT are dropped
+message("N unique area units (people data): ", uniqueN(npeopleDT$Area))
+# > create categories ----
+# "value.nPeople_1", "value.nPeople_2", "value.nPeople_3", "value.nPeople_4m",
+npeopleDT <- npeopleDT[, censusConstraint := "nPeople_4m"] # default (most complex to code): anything not relabelled below stays here
+npeopleDT <- npeopleDT[`Number of usual residents in household` == "Total households",
+                       censusConstraint := "npeople_totalHouseholds"]
+npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "One",
+                       censusConstraint := "nPeople_1"]
+npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Two",
+                       censusConstraint := "nPeople_2"]
+npeopleDT <- npeopleDT[`Number of usual residents in household`  %like% "Three",
+                       censusConstraint := "nPeople_3"]
+table(npeopleDT$`Number of usual residents in household`, npeopleDT$censusConstraint) # cross-tab to check the recoding
+# convert to wide: sum counts within each category first - reshape() keeps only the FIRST row when several rows share an id/timevar pair (all the sizes left in "nPeople_4m"). NOTE(review): assumes no sub-total rows fall into the default - confirm against the extract
+npeopleDT <- npeopleDT[, count := Value]
+npeople2013WDT <- reshape(npeopleDT[Year == 2013, .(count = sum(count)), # NB: "Year" here but "YEAR" in the fuel & kids tables - var names are not consistent across NZ stats tables
+                              by = .(AU2013_code, AU2013_label, censusConstraint)],
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+setnames(npeople2013WDT, c("count.nPeople_1", "count.nPeople_2", "count.nPeople_3", "count.nPeople_4m", "count.npeople_totalHouseholds"), 
+         c("nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m", "npeople_totalHouseholds")) # drop the "count." prefix added by reshape
+# nRooms: load the rooms table, recode to censusConstraint, sum & reshape 2013 counts to wide ----
+nroomsf <- paste0(sParams$dataPath, "raw/areaUnits/nRooms/TABLECODE8098_Data_62c5ce5c-23cf-44a2-b25e-b287fe9645e7.csv")
+outputMessage(paste0("Loading: ", nroomsf))
+nroomsDT <- data.table::fread(nroomsf)
+outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(nroomsDT))))
+nroomsDT <- nroomsDT[, AU2013_code := AREA] # copy AREA into the column used as the join key
+setkey(nroomsDT, AU2013_code)
+setkey(auListDT,AU2013_code) # set back to code (the nPeople section keyed auListDT on label)
+nroomsDT <- nroomsDT[auListDT] # keyed join: rows whose code is not in auListDT are dropped
+message("N unique area units (rooms data): ", uniqueN(nroomsDT$AU2013_code))
+# > create categories ----
+# "value.nRooms1_4", "value.nRooms5_6", "value.nRooms7m",
+nroomsDT <- nroomsDT[`Number of rooms` != "Not elsewhere included", censusConstraint := "nRooms7m"] # default (most complex to code); "Not elsewhere included" rows get no category (presumably NA)
+nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings stated",
+                       censusConstraint := "nrooms_statedtotalHouseholds"]
+nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings, number of rooms",
+                       censusConstraint := "nrooms_totalHouseholds"]
+nroomsDT <- nroomsDT[`Number of rooms` %like% "One" |
+                       `Number of rooms` %like% "Two" |
+                       `Number of rooms` %like% "Three" |
+                       `Number of rooms` %like% "Four",
+                       censusConstraint := "nRooms1_4"]
+nroomsDT <- nroomsDT[`Number of rooms` %like% "Five" |
+                       `Number of rooms` %like% "Six" ,
+                       censusConstraint := "nRooms5_6"]
+table(nroomsDT$`Number of rooms`, nroomsDT$censusConstraint) # cross-tab to check the recoding
+# convert to wide: sum counts within each category first - reshape() keeps only the FIRST row when several rows share an id/timevar pair (e.g. One..Four rooms all map to "nRooms1_4")
+nroomsDT <- nroomsDT[, count := Value]
+nrooms2013WDT <- reshape(nroomsDT[Year == 2013, .(count = sum(count)), # NB: "Year" here but "YEAR" in the fuel & kids tables - var names are not consistent across NZ stats tables
+                              by = .(AU2013_code, AU2013_label, censusConstraint)],
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+setnames(nrooms2013WDT, c("count.nRooms1_4", "count.nRooms5_6", "count.nRooms7m",
+                          "count.nrooms_totalHouseholds", "count.nrooms_statedtotalHouseholds"), 
+         c("nRooms1_4", "nRooms5_6", "nRooms7m",
+           "nrooms_totalHouseholds", "nrooms_statedtotalHouseholds")) # drop the "count." prefix added by reshape
+# combine them: key every table on code + label, then chain joins starting from the area list ----
+setkey(auListDT, AU2013_code, AU2013_label)
+setkey(fuel2013WDT, AU2013_code, AU2013_label)
+setkey(kids2013WDT, AU2013_code, AU2013_label)
+setkey(npeople2013WDT, AU2013_code, AU2013_label)
+setkey(nrooms2013WDT, AU2013_code, AU2013_label)
+au2013DT <- fuel2013WDT[auListDT] # X[Y] keeps every row of Y, so all listed area units are retained
+au2013DT <- kids2013WDT[au2013DT] # each further join attaches one more table's columns
+au2013DT <- npeople2013WDT[au2013DT]
+au2013DT <- nrooms2013WDT[au2013DT]
+# recalculate n households without children, using the rooms-table household total as the base
+au2013DT <- au2013DT[, nKids_0 :=  nrooms_totalHouseholds- nKids_1m] # NOTE(review): nKids_1m comes from the families table while the base is households - confirm this mix is intended
+```
+Figure \@ref(fig:checkTotals) shows the match between the total household/family counts derived from each NZ stats table. As we would expect there are very minor differences with the exception of the totals derived from the families table (presence of children).
+```{r checkTotals, fig.cap="Plots of household/family totals"}
+pairsDT <- au2013DT[, .(fuel_totalHouseholds, nrooms_statedtotalHouseholds, nrooms_totalHouseholds,
+                        npeople_totalHouseholds,fuel_totalStatedHouseholds, nkids_totalFamilies)] # keep only the total columns from each source table
+pairs(pairsDT) # scatterplot matrix: every total plotted against every other
+```
 We focus on households/families/dwellings not individuals as the spatial microsimulation will operate at the household level.
 ### GREENGrid Survey data
@@ -31,7 +214,7 @@ We focus on households/families/dwellings not individuals as the spatial microsi
 This comprises the household attributes (survey) data:
 ```{r loadGGHousehold}
-f <- paste0(sParams$ggPath, "/reshare/v1.0/data/ggHouseholdAttributesSafe.csv.zip")
+f <- paste0(sParams$ggPath, "/safe/survey/ggHouseholdAttributesSafe_2019-04-09.csv") # latest - includes heat source & 
 outputMessage(paste0("Loading: ", f))
 ggHhDT <- data.table::as.data.table(readr::read_csv(f))
 outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(ggHhDT))))
@@ -39,11 +222,7 @@ outputMessage(paste0("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHhDT$li
 ```
-## Create synthetic weighted census
-This step uses the household attributes file and the Census file to create a synthetic (weighted) household dataset for the areas of interest.
-### Harmonise categories
 We need household & Census files with just the variables (constraints) we're going to use. This will take a bit of creative re-coding.
 #### Families: Presence of children
@@ -55,12 +234,10 @@ Survey:
 Census:
- * family_type_for_families_in_occupied_private_dwellings_Couple_with_child(ren)	
+ * 0,1+ (simplest)
- * family_type_for_families_in_occupied_private_dwellings_One_parent_with_child(ren)
- * family_type_for_families_in_occupied_private_dwellings_Total_families_in_occupied_private_dwellings (subtract those with children to get those without?)
 ```{r presenceChildren}
-# number of kids: categorised as: 1,2,3,4+ ----
+# survey: number of kids: categorised as: 1,2,3,4+ ----
 ggHhDT <- ggHhDT[, nKids :=  nChildren0_12 + nTeenagers13_18]
 table(ggHhDT$nKids, useNA = "always")
 message("There are NA - we will need to remove them later")
@@ -76,14 +253,6 @@ ggHhDT <- ggHhDT[, presenceKids := ifelse(nKids == 0, "None", "1+")]
 table(ggHhDT$nKids, ggHhDT$presenceKids, useNA = "always")
-# census: ----
-au2013DT <- au2013DT[variable == "family_type_for_families_in_occupied_private_dwellings_Couple_with_child(ren)" | 
-                       variable == "family_type_for_families_in_occupied_private_dwellings_One_parent_with_child(ren)", censusConstraint  := "nKids_1m"]
-au2013DT <- au2013DT[variable == "family_type_for_families_in_occupied_private_dwellings_Total_families_in_occupied_private_dwellings", censusConstraint  := "TotalFamilies"]
-# we can't calculate the residual until we have the wide form
 ```
@@ -91,7 +260,7 @@ au2013DT <- au2013DT[variable == "family_type_for_families_in_occupied_private_d
 #### Households: Number of people
 ```{r nPeople}
-# number of people: categorised as: 1,2,3,4+ ----
+# survey: number of people: categorised as: 1,2,3,4+ ----
 ggHhDT <- ggHhDT[, nPeople := nAdults + nChildren0_12 + nTeenagers13_18]
 table(ggHhDT$nPeople, useNA = "always")
 message("There are NA - we will need to remove them later")
@@ -114,38 +283,14 @@ ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_4m == 1, "4+", nPeopleCat)]
 table(ggHhDT$nPeopleCat, ggHhDT$nPeople, useNA = "always")
-# census: ----
-# number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_One_Usual_Resident up to number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Eight_or_More_Usual_Residents
-au2013DT <- au2013DT[variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_One_Usual_Resident", censusConstraint  := "nPeople_1"]
-au2013DT <- au2013DT[variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Two_Usual_Residents", censusConstraint  := "nPeople_2"]
-au2013DT <- au2013DT[variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Three_Usual_Residents", censusConstraint  := "nPeople_3"]
-au2013DT <- au2013DT[variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Four_Usual_Residents" |
-                       variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Five_Usual_Residents" |
-                       variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Six_Usual_Residents" |
-                       variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Seven_Usual_Residents" |
-                       variable == "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Eight_or_More_Usual_Residents", censusConstraint  := "nPeople_4m"]
-origT <- au2013DT[variable %like% "Usual_Resident", .(nHouseholds = sum(value)), keyby = .(variable)]
-kableExtra::kable(origT, caption = "Census area units: n people (original)")%>%
-  kable_styling()
-recodeT <- au2013DT[censusConstraint %like% "nPeople", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
-kableExtra::kable(recodeT, caption = "Census area units: n people (recoded)")%>%
-  kable_styling()
-message("Total households (original) = ", sum(origT$nHouseholds))
-message("Total households (recoded) = ", sum(recodeT$nHouseholds))
-message("Total unit areas (recoded) = ", uniqueN(au2013DT[censusConstraint %like% "nPeople"]$AU2013_code))
 ```
 #### Dwellings: Number of rooms
+We have to do some imputation here
 ```{r nRooms}
-# survey ----
+# survey: n bedrooms via Q10 ----
 ggHhDT <- ggHhDT[, nRooms := `Q10#1_1_1_TEXT` + # bedrooms
                 `Q10#1_2_1_TEXT` + # living rooms
                 `Q10#1_3_1_TEXT` + # dining rooms-kitchens
@@ -162,6 +307,7 @@ r <- lm(ggHhDT$nRooms ~ ggHhDT$nAdults + ggHhDT$nKids)
 summary(r)
+# > impute where missing ----
 message("that looks reasonable...impute where missing")
 ggHhDT <- ggHhDT[, nRoomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
@@ -205,38 +351,11 @@ ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 6, "7+", nRoomsCat)]
 message("Check coding")
 table(ggHhDT$nRoomsCat, ggHhDT$nRoomsCorrected, useNA = "always")
-# census ----
-au2013DT <- au2013DT[variable == "number_of_rooms_for_occupied_private_dwellings_One_Room" |
-                       variable == "number_of_rooms_for_occupied_private_dwellings_Two_Rooms" |
-                       variable == "number_of_rooms_for_occupied_private_dwellings_Three_Rooms" |
-                       variable == "number_of_rooms_for_occupied_private_dwellings_Four_Rooms",
-                     censusConstraint  := "nRooms1_4"]
-au2013DT <- au2013DT[variable == "number_of_rooms_for_occupied_private_dwellings_Five_Rooms" |
-                     variable == "number_of_rooms_for_occupied_private_dwellings_Six_Rooms" ,
-                     censusConstraint  := "nRooms5_6"]
-au2013DT <- au2013DT[variable == "number_of_rooms_for_occupied_private_dwellings_Seven_Rooms" |
-                       variable == "number_of_rooms_for_occupied_private_dwellings_Eight_or_More_Rooms",
-                     censusConstraint  := "nRooms7m"]
-origT <- au2013DT[variable %like% "_Rooms" |
-                    variable %like% "_Room", .(nHouseholds = sum(value)), keyby = .(variable)]
-kableExtra::kable(origT, caption = "Census area units: n people (original)")%>%
-  kable_styling()
-recodeT <- au2013DT[censusConstraint %like% "nRooms", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
-kableExtra::kable(recodeT, caption = "Census area units: n people (recoded)")%>%
-  kable_styling()
-message("Total households (original) = ", sum(origT$nHouseholds))
-message("Total households (recoded) = ", sum(recodeT$nHouseholds))
-message("Total unit areas (recoded) = ", uniqueN(au2013DT[censusConstraint %like% "nRooms"]$AU2013_code))
 ```
 #### Dwellings: Number of bedrooms
-Probably collinear with n people/n rooms
+Probably collinear with n people/n rooms. Also have to impute.
 ```{r nBedrooms}
@@ -252,6 +371,7 @@ r <- lm(ggHhDT$nBedrooms ~ ggHhDT$nAdults + ggHhDT$nKids)
 summary(r)
+# > impute where missing ----
 message("that looks reasonable...impute where missing")
 ggHhDT <- ggHhDT[, nBedroomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
@@ -293,35 +413,6 @@ ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_4m == 1, "7+", nBedroomsCat)
 table(ggHhDT$nBedroomsCat, ggHhDT$nBedroomsCorrected, useNA = "always")
-# census ----
-# census: number_of_bedrooms_for_occupied_private_dwellings_One_Bedroom  to number_of_bedrooms_for_occupied_private_dwellings_Eight_or_More_Bedrooms 
-au2013DT <- au2013DT[variable == "number_of_bedrooms_for_occupied_private_dwellings_One_Bedroom" |
-                       variable == "number_of_bedrooms_for_occupied_private_dwellings_Two_Bedrooms", censusConstraint  := "nBedrooms_1_2"]
-au2013DT <- au2013DT[variable == "number_of_bedrooms_for_occupied_private_dwellings_Three_Bedrooms", censusConstraint  := "nBedrooms_3"]
-au2013DT <- au2013DT[variable == "number_of_bedrooms_for_occupied_private_dwellings_Four_Bedrooms" |
-                       variable == "number_of_bedrooms_for_occupied_private_dwellings_Five_Bedrooms" |
-                       variable == "number_of_bedrooms_for_occupied_private_dwellings_Six_Bedrooms" |
-                       variable == "number_of_bedrooms_for_occupied_private_dwellings_Seven_Bedrooms" |
-                       variable == "number_of_bedrooms_for_occupied_private_dwellings_Eight_or_More_Bedrooms", censusConstraint  := "nBedrooms_4m"]
-origT <- au2013DT[variable %like% "_Bedroom" & 
-                    !(variable == "number_of_bedrooms_for_occupied_private_dwellings_Mean_Number_of_Bedrooms"), 
-                  .(nHouseholds = sum(value)), keyby = .(variable)]
-kableExtra::kable(origT, caption = "Census area units: n bedrooms (original)")%>%
-  kable_styling()
-recodeT <- au2013DT[censusConstraint %like% "nBedrooms", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
-kableExtra::kable(recodeT, caption = "Census area units: n bedrooms (recoded)")%>%
-  kable_styling()
-message("Total households (original) = ", sum(origT$nHouseholds))
-message("Total households (recoded) = ", sum(recodeT$nHouseholds))
-message("Total unit areas (recoded) = ", uniqueN(au2013DT[censusConstraint %like% "nBedrooms"]$AU2013_code))
-#au2013DT[!is.na(censusConstraint), .(nHHs = sum(value)), keyby = .(variable, censusConstraint)]
 ```
 Notice that the total counts for bedrooms are lower because they are counts of _dwellings_ instead of households.
@@ -333,27 +424,39 @@ Census - unknown
 #### Dwellings: Main fuel used for heat
-Survey - known but but we do not have the response code list to determine what the response was :-(
+```{r surveyHeatSource}
-Census - known
+# survey: Q20 ----
-#### Dwelling: Dwelling type
+table(ggHhDT$Q20_coded,  useNA = "always")
-Survey - not known
+ggHhDT <- ggHhDT[, heatFuel := dplyr::recode(Q20_coded,
-Census - known
+                                             "Enclosed wood burner" = "Wood",
+                                             "Open fire" = "Wood", # could be coal
+                                             "Heat pump"= "Electricity",
+                                             "HRV or other ventilation system" = "Electricity",
+                                             "Portable electric heaters" = "Electricity",
+                                             "Portable gas heater" = "Gas", 
+                                             "Underfloor gas heating" = "Gas",
+                                             "Other" = "Other")]
-#### Fix final survey data
+table(ggHhDT$Q20_coded,  ggHhDT$heatFuel, useNA = "always")
+ggHhDT <- ggHhDT[, heatSourceWood := ifelse(heatFuel == "Wood", 1,0)] # 1/0 indicator per household; NA where heatFuel is NA
+ggHhDT <- ggHhDT[, heatSourceElectricity := ifelse(heatFuel == "Electricity", 1,0)]
+ggHhDT <- ggHhDT[, heatSourceGas := ifelse(heatFuel == "Gas", 1,0)]
+ggHhDT <- ggHhDT[, heatSourceCoal := ifelse(heatFuel == "Coal", 1,0)] # NOTE(review): no Q20 answer is recoded to "Coal" above, so this is 1 only if Q20_coded itself contains "Coal" - confirm
+ggHhDT <- ggHhDT[, heatSourceOther := ifelse(heatFuel == "Other", 1,0)]
-Test colinearity
+table(ggHhDT$heatFuel,  ggHhDT$Location, useNA = "always")
+```
-```{r testCor}
-cor.test(ggHhDT$nPeople, ggHhDT$nBedrooms)
-cor.test(ggHhDT$nPeople, ggHhDT$nRooms)
-cor.test(ggHhDT$nRooms, ggHhDT$nBedrooms)
-```
+#### Dwelling: Dwelling type
+Survey - not known
+Census - known
-So people & bedrooms are the least colinear of these. Which is odd given that we used one to predict the other. Anyway, we will use nBedrooms.
+#### Fix final survey data
 Filter survey data to include just the variables we want and fix any NAs.
@@ -361,7 +464,8 @@ Filter survey data to include just the variables we want and fix any NAs.
 surveyDT <- ggHhDT[, .(linkID, 
                       nKids_0, nKids_1m,
                       nPeople_1, nPeople_2, nPeople_3, nPeople_4m,
-                       #nRooms1_5, nRooms6_7, nRooms8m,
+                       nRooms1_4, nRooms5_6, nRooms7m,
+                       heatSourceWood, heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceOther,
                       nBedrooms_1_2, nBedrooms_3, nBedrooms_4m)]
 skimr::skim(surveyDT)
@@ -382,38 +486,23 @@ Now select just the census data for:
 * Taranaki Region
 ```{r filterCensusRegion}
-censusAuDT <- au2013DT[REGC2013_label == "Hawke's Bay Region" |
+censusAuWideDT <- au2013DT[REGC2013_label == "Hawke's Bay Region" |
                         REGC2013_label == "Taranaki Region"] 
-# remove variables and rows we don't want
+message("Total households (rooms table) = ", sum(censusAuWideDT$nrooms_totalHouseholds))
-censusAuDT <- censusAuDT[, variable := NULL]
-censusAuDT <- censusAuDT[!is.na(censusConstraint),]
-t <- censusAuDT[censusConstraint %like% "nPeople" , .(nHouseholds = sum(value)), keyby = .(REGC2013_label, AU2013_label, AU2013_code)]
+message("Total households (fuel table) = ", sum(censusAuWideDT$fuel_totalHouseholds))
-kableExtra::kable(t, caption = "Selected area units by region")%>%
+message("Total households (npeople table) = ", sum(censusAuWideDT$npeople_totalHouseholds))
-  kable_styling()
-message("Total households = ", sum(t$nHouseholds))
-nPeopleT <- censusAuDT[censusConstraint %like% "nPeople", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
+message("Total families (kids table) = ", sum(censusAuWideDT$nkids_totalFamilies))
-kableExtra::kable(nPeopleT, caption = "Census area units: n nPeople (recoded)")%>%
-  kable_styling()
-message("Total households = ", sum(nPeopleT$nHouseholds))
+t <- censusAuWideDT[, .(nHouseholdsRooms = sum(nrooms_totalHouseholds),
+                        nHouseholdsFuel = sum(fuel_totalHouseholds),
-nBedroomsT <- censusAuDT[censusConstraint %like% "nBedrooms", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
+                        nHouseholdsPeople = sum(npeople_totalHouseholds)), keyby = REGC2013_label]
-kableExtra::kable(nBedroomsT, caption = "Census area units: n bedrooms (recoded)")%>%
-  kable_styling()
-message("Total households = ", sum(nBedroomsT$nHouseholds))
-nKidsT <- censusAuDT[censusConstraint %like% "nKids", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
-kableExtra::kable(nKidsT, caption = "Census area units: n kids (recoded) - residual not yet calculated")%>%
-  kable_styling()
-message("Total households = ", sum(nKidsT$nHouseholds))
+kableExtra::kable(t, caption = "Household counts by region (by table source)")
 ```
+```{r run to here}
+```