added census 2013 processing code

eb6b93b3 · Ben Anderson · 9992a12f · eb6b93b3 · eb6b93b3 · eb6b93b3
Commit eb6b93b3 authored Oct 17, 2019 by Ben Anderson
--- a/dataProcessing/processCensusAu2013.R
+++ b/dataProcessing/processCensusAu2013.R
+# Loads NZ Census area unit 2013 data 
+# processes to ipf form and matching coding to Green Grid survey
+require(spatialec) # for parameters etc
+myPackages <- c(
+  "data.table", # data munching
+  "ggplot2", # plot linear imputation model results
+  "here" # easy path management - https://speakerdeck.com/jennybc/zen-and-the-art-of-workflow-maintenance?slide=49
+)
+spatialec::loadLibraries(myPackages) # this will try to install packages it can't load
+spatialec::setup()
+# Load data ----
+# NZ Area Unit data
+# Loaded as separate files
+# 2013 NZ Census data from NZ Stats at area unit level. For simplicity we use one file per constraint:
+# 
+#  * n people
+#  * n dependent children
+#  * fuel source (all counted - may cause confusion as sum to > 100% of households)
+#  * n rooms
+# 
+# NB: these files, when downloaded form the [NZStats data extractor](http://nzdotstat.stats.govt.nz/wbos/Index.aspx?DataSetCode=TABLECODE8165#) come with higher levels of aggregation in the tables. These have to be removed by extracting just area unit rows.
+# 
+# Load areas labels ----
+# First load area labels as we use these to select the right data rows.
+areasDT <- data.table::fread(p_Params$areasTable2013)
+auListDT <- areasDT[, .(nMBs = .N), keyby = .(AU2013_code, AU2013_label, REGC2013_label)]
+auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier matching
+setkey(auListDT, AU2013_code)
+# fuelSource ----
+fuelf <- paste0(p_Params$dataPath, "raw/areaUnits/fuelSource/TABLECODE8100_Data_47b7b3fc-0e40-431f-b313-141de4fb0013.csv")
+message("Loading: ", fuelf)
+fuelDT <- data.table::fread(fuelf)
+message("N rows loaded: ", dkUtils::tidyNum(nrow(fuelDT)))
+fuelDT <- fuelDT[, AU2013_code := AREA]
+setkey(fuelDT, AU2013_code)
+fuelDT <- fuelDT[auListDT]
+message("N unique area units (fuel data): ", uniqueN(fuelDT$AU2013_code))
+# create categories
+# "value.heatSourceWood", "value.heatSourceElectricity", "value.heatSourceGas", "value.heatSourceCoal", "value.heatSourceOther"
+fuelDT <- fuelDT[, censusConstraint := "heatSourceOther"] # complex one - note this contains 'None' as well
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings, fuel type used to heat dwelling",
+                 censusConstraint := "fuel_totalHouseholds"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings stated",
+                 censusConstraint := "fuel_totalStatedHouseholds"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Wood",
+                 censusConstraint := "heatSourceWood"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "Solar" | # <- what is this?
+                   `Fuel type used to heat dwelling` == "Electricity",
+                 censusConstraint := "heatSourceElectricity"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling`  %like% "gas",
+                 censusConstraint := "heatSourceGas"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Coal",
+                 censusConstraint := "heatSourceCoal"]
+table(fuelDT$`Fuel type used to heat dwelling`, fuelDT$censusConstraint)
+# convert to wide so variables in columns (for IPF)
+fuelDT <- fuelDT[, count := Value]
+fuel2013WDT <- reshape(fuelDT[YEAR == 2013,
+                              .(AU2013_code,AU2013_label,censusConstraint,count)], 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+setnames(fuel2013WDT, c("count.fuel_totalHouseholds", "count.fuel_totalStatedHouseholds",
+                        "count.heatSourceElectricity", "count.heatSourceGas", 
+                        "count.heatSourceWood", "count.heatSourceCoal", "count.heatSourceOther"),
+         c("fuel_totalHouseholds", "fuel_totalStatedHouseholds",
+                        "heatSourceElectricity", "heatSourceGas", 
+                        "heatSourceWood", "heatSourceCoal", "heatSourceOther"))
+# nKids ----
+kidsf <- paste0(p_Params$dataPath, "raw/areaUnits/nKids/TABLECODE8141_Data_e6f03066-7bbf-4ba0-94b0-1821d5a4665a.csv")
+message("Loading: ", kidsf)
+kidsDT <- data.table::fread(kidsf)
+message("N rows loaded: ", dkUtils::tidyNum(nrow(kidsDT)))
+kidsDT <- kidsDT[, AU2013_code := AREA]
+setkey(kidsDT, AU2013_code)
+kidsDT <- kidsDT[auListDT]
+message("N unique area units (kids data): ", uniqueN(kidsDT$AU2013_code))
+# > create categories
+# "nKids_0", "nKids_1m"
+kidsDT <- kidsDT[, censusConstraint := "nKids_1m"] # we selected rows with dependent children in the census extractor
+kidsDT <- kidsDT[`Family type by child dependency status` == "Total families",
+                 censusConstraint := "nkids_totalFamilies"]
+table(kidsDT$`Family type by child dependency status`, kidsDT$censusConstraint)
+# > convert to wide
+kidsDT <- kidsDT[, count := Value]
+kids2013WDT <- reshape(kidsDT[YEAR == 2013,
+                              .(AU2013_code,AU2013_label,censusConstraint,count)], 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+# > calculate n households with 0 kids
+# do this again later when we have the number of _households_ to use as the base
+kids2013WDT <- kids2013WDT[, nKids_0_families := count.nkids_totalFamilies - count.nKids_1m]
+setnames(kids2013WDT, c("count.nkids_totalFamilies", "count.nKids_1m"), 
+         c("nkids_totalFamilies", "nKids_1m"))
+# nPeople ----
+npeoplef <- paste0(p_Params$dataPath, "raw/areaUnits/nPeople/TABLECODE8169_Data_bfad6f1a-c9af-4adb-a141-e13a83e175d0.csv")
+message("Loading: ", npeoplef)
+npeopleDT <- data.table::fread(npeoplef)
+message("N rows loaded: ", dkUtils::tidyNum(nrow(npeopleDT))
+        )
+setkey(npeopleDT, Area) # forgot to get code attached to this one
+npeopleDT <- npeopleDT[, AU2013_label := Area]
+setkey(auListDT,AU2013_label) # set to label instead of code - should still work
+npeopleDT <- npeopleDT[auListDT]
+message("N unique area units (people data): ", uniqueN(npeopleDT$Area))
+# > create categories ----
+# "value.nPeople_1", "value.nPeople_2", "value.nPeople_3", "value.nPeople_4m",
+npeopleDT <- npeopleDT[, censusConstraint := "nPeople_4m"] # default (most complex to code)
+npeopleDT <- npeopleDT[`Number of usual residents in household` == "Total households",
+                       censusConstraint := "npeople_totalHouseholds"] 
+npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "One",
+                       censusConstraint := "nPeople_1"] 
+npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Two",
+                       censusConstraint := "nPeople_2"] 
+npeopleDT <- npeopleDT[`Number of usual residents in household`  %like% "Three",
+                       censusConstraint := "nPeople_3"] 
+table(npeopleDT$`Number of usual residents in household`, npeopleDT$censusConstraint)
+# convert to wide
+npeopleDT <- npeopleDT[, count := Value]
+npeople2013WDT <- reshape(npeopleDT[Year == 2013, # helpful consistency of var names across NZ stats tables
+                              .(AU2013_code,AU2013_label,censusConstraint,count)], 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+setnames(npeople2013WDT, c("count.nPeople_1", "count.nPeople_2", "count.nPeople_3", "count.nPeople_4m", "count.npeople_totalHouseholds"), 
+         c("nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m", "npeople_totalHouseholds"))
+# nRooms ----
+nroomsf <- paste0(p_Params$dataPath, "raw/areaUnits/nRooms/TABLECODE8098_Data_62c5ce5c-23cf-44a2-b25e-b287fe9645e7.csv")
+message("Loading: ", nroomsf)
+nroomsDT <- data.table::fread(nroomsf)
+message("N rows loaded: ", dkUtils::tidyNum(nrow(nroomsDT))
+        )
+nroomsDT <- nroomsDT[, AU2013_code := AREA]
+setkey(nroomsDT, AU2013_code)
+setkey(auListDT,AU2013_code) # set to back to code
+nroomsDT <- nroomsDT[auListDT]
+message("N unique area units (rooms data): ", uniqueN(nroomsDT$AU2013_code))
+# > create categories ----
+# "value.nRooms1_4", "value.nRooms5_6", "value.nRooms7m",
+nroomsDT <- nroomsDT[`Number of rooms` != "Not elsewhere included", censusConstraint := "nRooms7m"] # default (most complex to code)
+nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings stated",
+                       censusConstraint := "nrooms_statedtotalHouseholds"] 
+nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings, number of rooms",
+                       censusConstraint := "nrooms_totalHouseholds"] 
+nroomsDT <- nroomsDT[`Number of rooms` %like% "One" |
+                       `Number of rooms` %like% "Two" |
+                       `Number of rooms` %like% "Three" |
+                       `Number of rooms` %like% "Four",
+                       censusConstraint := "nRooms1_4"] 
+nroomsDT <- nroomsDT[`Number of rooms` %like% "Five" | 
+                       `Number of rooms` %like% "Six" ,
+                       censusConstraint := "nRooms5_6"] 
+table(nroomsDT$`Number of rooms`, nroomsDT$censusConstraint)
+# convert to wide
+nroomsDT <- nroomsDT[, count := Value]
+nrooms2013WDT <- reshape(nroomsDT[Year == 2013, # helpful consistency of var names across NZ stats tables
+                              .(AU2013_code,AU2013_label,censusConstraint,count)], 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+setnames(nrooms2013WDT, c("count.nRooms1_4", "count.nRooms5_6", "count.nRooms7m",
+                          "count.nrooms_totalHouseholds", "count.nrooms_statedtotalHouseholds"), 
+         c("nRooms1_4", "nRooms5_6", "nRooms7m",
+           "nrooms_totalHouseholds", "nrooms_statedtotalHouseholds"))
+# combine them into 1 filw ----
+setkey(auListDT, AU2013_code, AU2013_label)
+setkey(fuel2013WDT, AU2013_code, AU2013_label)
+setkey(kids2013WDT, AU2013_code, AU2013_label)
+setkey(npeople2013WDT, AU2013_code, AU2013_label)
+setkey(nrooms2013WDT, AU2013_code, AU2013_label)
+au2013DT <- fuel2013WDT[auListDT]
+au2013DT <- kids2013WDT[au2013DT]
+au2013DT <- npeople2013WDT[au2013DT]
+au2013DT <- nrooms2013WDT[au2013DT]
+# recalulate n households without children
+au2013DT <- au2013DT[, nKids_0 :=  nrooms_totalHouseholds- nKids_1m]
+# Check household totals ----
+# These will vary depending on the source table (families vs households vs dwellings etc)
+pairsDT <- au2013DT[, .(fuel_totalHouseholds, nrooms_statedtotalHouseholds, nrooms_totalHouseholds,
+                        npeople_totalHouseholds,fuel_totalStatedHouseholds, nkids_totalFamilies)]
+pairs(pairsDT)
+# We focus on households/families/dwellings not individuals as the spatial microsimulation will operate at the household level.
+# save data for re-use ----
+of <- paste0(p_Params$dataPath, "processed/2013IpfInput.csv")
+data.table::fwrite(au2013DT, of)
+message("Data saved as: ", of)
--- a/dataProcessing/processCensusAu2013.Rmd
+++ b/dataProcessing/processCensusAu2013.Rmd
+---
+title: "Process NZ Census 2013 data for use in ipf"
+subtitle: "v1.0"
+author: "Ben Anderson (University of Otago/University of Southampton)"
+date: 'Last run at: `r Sys.time()`'
+output:
+  bookdown::html_document2:
+    code_folding: hide
+    fig_caption: yes
+    number_sections: yes
+    self_contained: no
+    toc: yes
+    toc_depth: 3
+    toc_float: yes
+bibliography: '`r path.expand("~/bibliography.bib")`'
+---
+# Setup
+```{r setup}
+# Log compile time:
+startTime <- proc.time()
+require(spatialec) # for parameters etc
+myPackages <- c(
+  "data.table", # data munching
+  "ggplot2", # plot linear imputation model results
+  "here", # easy path management - https://speakerdeck.com/jennybc/zen-and-the-art-of-workflow-maintenance?slide=49
+  "kableExtra", # fancy tables
+  "skimr" # descripive stats
+)
+spatialec::loadLibraries(myPackages) # Beware: this will try to install packages that may not be installed
+spatialec::setup()
+```
+Additional R packages used in this report (`r myPackages`):
+ * data.table - for fast data munching [@data.table]
+ * ggplot2 - for slick graphics [@ggplot2]
+ * here - for path management
+ * kableExtra - for pretty tables [@kableExtra]
+ * knitr - to create this document [@knitr]
+ * skimr - for data summaries [@skimr]
+# Report Purpose
+To process NZ Census 2013 data (from `r p_Params$censusDataSource`) into a form that can be used in an ipf model of national/regional/local area demand.
+It:
+ * Loads NZ Census area unit 2013 data 
+ * processes to ipf form and matching coding to Green Grid survey
+# Load data
+NZ Area Unit data is loaded as separate files.
+2013 NZ Census data from NZ Stats at area unit level. For simplicity we use one file per constraint:
+  * n people
+  * n dependent children
+  * fuel source (all counted - may cause confusion as sum to > 100% of households)
+  * n rooms
+We could also use other tables with other constraints if they are available - e.g. N bedrooms.
+NB: these files, when downloaded form the [NZStats data extractor](http://nzdotstat.stats.govt.nz/wbos/Index.aspx?DataSetCode=TABLECODE8165#) come with higher levels of aggregation in the tables. These have to be removed by extracting just area unit rows.
+## Load areas labels
+First load area labels as we use these to select the right data rows.
+```{r loadLabels}
+areasDT <- data.table::fread(p_Params$areasTable2013)
+auListDT <- areasDT[, .(nMBs = .N), keyby = .(AU2013_code, AU2013_label, REGC2013_label)]
+auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier matching
+setkey(auListDT, AU2013_code)
+kableExtra::kable(caption = "Example label data", head(auListDT))
+```
+## Fuel source
+```{r fuelSource}
+fuelf <- paste0(p_Params$dataPath, "raw/areaUnits/fuelSource/TABLECODE8100_Data_47b7b3fc-0e40-431f-b313-141de4fb0013.csv")
+message("Loading: ", fuelf)
+fuelDT <- data.table::fread(fuelf)
+message("N rows loaded: ", dkUtils::tidyNum(nrow(fuelDT)))
+fuelDT <- fuelDT[, AU2013_code := AREA]
+setkey(fuelDT, AU2013_code)
+fuelDT <- fuelDT[auListDT]
+message("N unique area units (fuel data): ", uniqueN(fuelDT$AU2013_code))
+# create categories
+# "value.heatSourceWood", "value.heatSourceElectricity", "value.heatSourceGas", "value.heatSourceCoal", "value.heatSourceOther"
+fuelDT <- fuelDT[, censusConstraint := "heatSourceOther"] # complex one - note this contains 'None' as well
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings, fuel type used to heat dwelling",
+                 censusConstraint := "fuel_totalHouseholds"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings stated",
+                 censusConstraint := "fuel_totalStatedHouseholds"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Wood",
+                 censusConstraint := "heatSourceWood"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "Solar" | # <- what is this?
+                   `Fuel type used to heat dwelling` == "Electricity",
+                 censusConstraint := "heatSourceElectricity"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling`  %like% "gas",
+                 censusConstraint := "heatSourceGas"]
+fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Coal",
+                 censusConstraint := "heatSourceCoal"]
+#table(fuelDT$`Fuel type used to heat dwelling`, fuelDT$censusConstraint)
+# convert to wide so variables in columns (for IPF)
+fuelDT <- fuelDT[, count := Value]
+dt <- fuelDT[Year == 2013, .(count = sum(count, na.rm = TRUE)), # sum the counts by fuel type to ensure uniqe fuel type obs per area
+             keyby = .(AU2013_code,AU2013_label,censusConstraint)]
+fuel2013WDT <- reshape(dt, 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+setnames(fuel2013WDT, c("count.fuel_totalHouseholds", "count.fuel_totalStatedHouseholds",
+                        "count.heatSourceElectricity", "count.heatSourceGas", 
+                        "count.heatSourceWood", "count.heatSourceCoal", "count.heatSourceOther"),
+         c("fuel_totalHouseholds", "fuel_totalStatedHouseholds",
+                        "heatSourceElectricity", "heatSourceGas", 
+                        "heatSourceWood", "heatSourceCoal", "heatSourceOther"))
+```
+```{r fuelDist, fig.cap="Distribution of check sum % "}
+t <- table(fuelDT$`Fuel type used to heat dwelling`, fuelDT$censusConstraint)
+kableExtra::kable(t, caption = "Cross check (all years)") %>%
+  kable_styling()
+t <- summary(fuel2013WDT)
+kableExtra::kable(t, caption = "Summary of 2013 data") %>%
+  kable_styling()
+# check counts add up
+fuel2013WDT[, calcSum := heatSourceElectricity + heatSourceGas + heatSourceWood + heatSourceCoal + heatSourceOther]
+fuel2013WDT[, calcSumPc := 100 * (calcSum/fuel_totalHouseholds)]
+# check
+summary(fuel2013WDT$calcSumPc)
+kableExtra::kable(head(fuel2013WDT[calcSumPc > 100]), digits = 2, 
+                  caption = "Example areas where > 100% households coded (multiple answers allowed so this table may be quite large)") %>%
+  kable_styling()
+ggplot2::ggplot(fuel2013WDT, aes(x = calcSumPc)) +
+  geom_histogram(binwidth = 10) +
+  labs(x = "% of total households",
+       caption = "0 values indicate area units with small cell counts and hence data redaction\nNote there can be multiple responses")
+```
+## n Kids
+```{r nKids}
+kidsf <- paste0(p_Params$dataPath, "raw/areaUnits/nKids/TABLECODE8141_Data_e6f03066-7bbf-4ba0-94b0-1821d5a4665a.csv")
+message("Loading: ", kidsf)
+kidsDT <- data.table::fread(kidsf)
+message("N rows loaded: ", dkUtils::tidyNum(nrow(kidsDT)))
+kidsDT <- kidsDT[, AU2013_code := AREA]
+setkey(kidsDT, AU2013_code)
+kidsDT <- kidsDT[auListDT]
+message("N unique area units (kids data): ", uniqueN(kidsDT$AU2013_code))
+# > create categories
+# "nKids_0", "nKids_1m"
+kidsDT <- kidsDT[, censusConstraint := "nKids_1m"] # we selected rows with dependent children in the census extractor
+kidsDT <- kidsDT[`Family type by child dependency status` == "Total families",
+                 censusConstraint := "nkids_totalFamilies"]
+#table(kidsDT$`Family type by child dependency status`, kidsDT$censusConstraint)
+# > convert to wide
+kidsDT <- kidsDT[, count := Value]
+dt <- kidsDT[Year == 2013, .(count = sum(count, na.rm = TRUE)), 
+             keyby = .(AU2013_code,AU2013_label,censusConstraint)]
+kids2013WDT <- reshape(dt, 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+# > calculate n households with 0 kids
+# do this again later when we have the number of _households_ to use as the base
+kids2013WDT <- kids2013WDT[, nKids_0_families := count.nkids_totalFamilies - count.nKids_1m]
+setnames(kids2013WDT, c("count.nkids_totalFamilies", "count.nKids_1m"), 
+         c("nkids_totalFamilies", "nKids_1m"))
+```
+```{r nkidsDist, fig.cap="Distribution of check sum %"}
+t <- table(kidsDT$`Family type by child dependency status`, kidsDT$censusConstraint)
+kableExtra::kable(t, caption = "Cross check (all years)") %>%
+  kable_styling()
+t <- summary(kids2013WDT)
+kableExtra::kable(t, caption = "Summary of 2013 data") %>%
+  kable_styling()
+# check counts add up
+kids2013WDT[, calcSum := nKids_0_families + nKids_1m ]
+kids2013WDT[, calcSumPc := 100 * (calcSum/nkids_totalFamilies)]
+# check
+summary(kids2013WDT$calcSumPc)
+kableExtra::kable(head(kids2013WDT[calcSumPc > 100]), digits = 2, 
+                  caption = "Example areas where > 100% families coded") %>%
+  kable_styling()
+ggplot2::ggplot(kids2013WDT, aes(x = calcSumPc)) +
+  geom_histogram(binwidth = 10) +
+  labs(x = "% of total families",
+       caption = "0 values indicate area units with small cell counts and hence data redaction")
+```
+## n People
+```{r nPeople}
+npeoplef <- paste0(p_Params$dataPath, "raw/areaUnits/nPeople/TABLECODE8169_Data_bfad6f1a-c9af-4adb-a141-e13a83e175d0.csv")
+message("Loading: ", npeoplef)
+npeopleDT <- data.table::fread(npeoplef)
+message("N rows loaded: ", dkUtils::tidyNum(nrow(npeopleDT))
+        )
+setkey(npeopleDT, Area) # forgot to get code attached to this one
+npeopleDT <- npeopleDT[, AU2013_label := Area]
+setkey(auListDT,AU2013_label) # set to label instead of code - should still work
+npeopleDT <- npeopleDT[auListDT]
+message("N unique area units (people data): ", uniqueN(npeopleDT$Area))
+# > create categories ----
+# "value.nPeople_1", "value.nPeople_2", "value.nPeople_3", "value.nPeople_4m",
+npeopleDT <- npeopleDT[, censusConstraint := "nPeople_4m"] # default (most complex to code)
+npeopleDT <- npeopleDT[`Number of usual residents in household` == "Total households",
+                       censusConstraint := "npeople_totalHouseholds"] 
+npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "One",
+                       censusConstraint := "nPeople_1"] 
+npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Two",
+                       censusConstraint := "nPeople_2"] 
+npeopleDT <- npeopleDT[`Number of usual residents in household`  %like% "Three",
+                       censusConstraint := "nPeople_3"] 
+#table(npeopleDT$`Number of usual residents in household`, npeopleDT$censusConstraint)
+# convert to wide
+npeopleDT <- npeopleDT[, count := Value]
+dt <- npeopleDT[Year == 2013, .(count = sum(count, na.rm = TRUE)), 
+             keyby = .(AU2013_code,AU2013_label,censusConstraint)]
+npeople2013WDT <- reshape(dt, 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+setnames(npeople2013WDT, c("count.nPeople_1", "count.nPeople_2", "count.nPeople_3", "count.nPeople_4m", "count.npeople_totalHouseholds"), 
+         c("nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m", "npeople_totalHouseholds"))
+```
+```{r npeopleDist, fig.cap="Test check sum %"}
+t <- table(npeopleDT$`Number of usual residents in household`, npeopleDT$censusConstraint)
+kableExtra::kable(t, caption = "Cross check (all years)") %>%
+  kable_styling()
+t <- summary(npeople2013WDT)
+kableExtra::kable(t, caption = "Summary of 2013 data") %>%
+  kable_styling()
+# check counts add up
+npeople2013WDT[, calcSum := nPeople_1 + nPeople_2 + nPeople_3 +  nPeople_4m]
+npeople2013WDT[, calcSumPc := 100*(calcSum/npeople_totalHouseholds)]
+# check
+summary(npeople2013WDT$calcSumPc)
+kableExtra::kable(head(npeople2013WDT[calcSumPc > 100]), digits = 2, 
+                  caption = "Example areas where > 100% households coded - check how close to 100% they are") %>%
+  kable_styling()
+ggplot2::ggplot(npeople2013WDT, aes(x = calcSumPc)) +
+  geom_histogram(binwidth = 10) +
+  labs(x = "% of total households",
+       caption = "0 values indicate area units with small cell counts and hence data redaction")
+```
+## n Rooms
+```{r nRooms}
+nroomsf <- paste0(p_Params$dataPath, "raw/areaUnits/nRooms/TABLECODE8098_Data_62c5ce5c-23cf-44a2-b25e-b287fe9645e7.csv")
+message("Loading: ", nroomsf)
+nroomsDT <- data.table::fread(nroomsf)
+message("N rows loaded: ", dkUtils::tidyNum(nrow(nroomsDT))
+        )
+nroomsDT <- nroomsDT[, AU2013_code := AREA]
+setkey(nroomsDT, AU2013_code)
+setkey(auListDT,AU2013_code) # set to back to code
+nroomsDT <- nroomsDT[auListDT]
+message("N unique area units (rooms data): ", uniqueN(nroomsDT$AU2013_code))
+# > create categories ----
+# "value.nRooms1_4", "value.nRooms5_6", "value.nRooms7m",
+nroomsDT <- nroomsDT[ROOMS > 0 & ROOMS < 5,
+                       censusConstraint := "nRooms1_4"] 
+nroomsDT <- nroomsDT[ROOMS > 4 & ROOMS < 7,
+                       censusConstraint := "nRooms5_6"] 
+nroomsDT <- nroomsDT[ROOMS > 6 ,
+                       censusConstraint := "nRooms7m"] 
+# nroomsDT <- nroomsDT[ROOMS == 99,
+#                        censusConstraint := "Other"] # exclude
+nroomsDT <- nroomsDT[ROOMS == 999,
+                       censusConstraint := "nrooms_statedtotalHouseholds"]
+nroomsDT <- nroomsDT[ROOMS == 9999,
+                       censusConstraint := "nrooms_totalHouseholds"] 
+setkey(nroomsDT, Year, Area)
+nroomsDT <- nroomsDT[!is.na(censusConstraint) | !is.na(Year)]
+# convert to wide
+nroomsDT <- nroomsDT[, count := Value]
+dt <- nroomsDT[Year == 2013, .(count = sum(count, na.rm = TRUE)), keyby = .(AU2013_code,AU2013_label,censusConstraint)]
+nrooms2013WDT <- reshape(dt, 
+                          idvar = c("AU2013_code", "AU2013_label"),
+                          timevar = "censusConstraint",
+                          direction = "wide")
+setnames(nrooms2013WDT, c("count.nRooms1_4", "count.nRooms5_6", "count.nRooms7m",
+                          "count.nrooms_totalHouseholds", "count.nrooms_statedtotalHouseholds"), 
+         c("nRooms1_4", "nRooms5_6", "nRooms7m",
+           "nrooms_totalHouseholds", "nrooms_statedtotalHouseholds"))
+```
+```{r nRoomsDist, fig.cap = "Test check sum %"}
+t <- table(nroomsDT$`Number of rooms`, nroomsDT$censusConstraint)
+kableExtra::kable(t, caption = "Cross check (all years)") %>%
+  kable_styling()
+t <- summary(nrooms2013WDT)
+kableExtra::kable(t, caption = "Summary of 2013 data") %>%
+  kable_styling()
+# check counts add up
+nrooms2013WDT[, calcSum := nRooms1_4 + nRooms5_6 + nRooms7m ]
+nrooms2013WDT[, calcSumPc := 100*(calcSum/nrooms_totalHouseholds)]
+# check
+summary(nrooms2013WDT$calcSumPc)
+kableExtra::kable(head(nrooms2013WDT[calcSumPc > 100]), digits = 2, 
+                  caption = "Example areas where > 100% households coded - check how close to 100% they are") %>%
+  kable_styling()
+ggplot2::ggplot(nrooms2013WDT, aes(x = calcSumPc)) +
+  geom_histogram(binwidth = 10) +
+  labs(x = "% of total households",
+       caption = "0 values indicate area units with small cell counts and hence data redaction")
+```
+# Combine and save constraint files
+```{r combinaData}
+setkey(auListDT, AU2013_code, AU2013_label)
+setkey(fuel2013WDT, AU2013_code, AU2013_label)
+setkey(kids2013WDT, AU2013_code, AU2013_label)
+setkey(npeople2013WDT, AU2013_code, AU2013_label)
+setkey(nrooms2013WDT, AU2013_code, AU2013_label)
+message("Merging on AU2013_code * AU2013_label")
+au2013DT <- fuel2013WDT[auListDT]
+au2013DT <- kids2013WDT[au2013DT]
+au2013DT <- npeople2013WDT[au2013DT]
+au2013DT <- nrooms2013WDT[au2013DT]
+# recalculate n households without children here
+au2013DT <- au2013DT[, nKids_0 :=  nrooms_totalHouseholds - nKids_1m]
+message("N rows of data: ", nrow(au2013DT))
+message("N areas: ", uniqueN(au2013DT$AU2013_label))
+```
+Save data...
+```{r saveData}
+of <- paste0(p_Params$dataPath, "processed/2013IpfInput.csv")
+au2013DT$count.NA <- NULL
+data.table::fwrite(au2013DT, of)
+message("Data saved as: ", of)
+message("Variables saved:")
+names(au2013DT)
+```
+# Check household totals
+These will vary depending on the source table (families vs households vs dwellings etc)
+```{r checkTotals}
+pairsDT <- au2013DT[, .(fuel_totalHouseholds, nrooms_statedtotalHouseholds, nrooms_totalHouseholds,
+                        npeople_totalHouseholds,fuel_totalStatedHouseholds, nkids_totalFamilies)]
+pairs(pairsDT)
+```
+We focus on households/families/dwellings not individuals as the spatial microsimulation will operate at the household level.
+# Acknowledgements
+```{r generic.Ack, child=p_Params$ackLong}
+# generic acks
+```
+# About
+```{r check.runtime}
+t <- proc.time() - startTime
+elapsed <- t[[3]]
+```
+```{r generic.About, child=p_Params$spatialecAbout}
+# generic about include
+```
+Analysis completed in `r elapsed` seconds ( `r round(elapsed/60,2)` minutes) using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com) with `r R.version.string` running on `r R.version$platform`.
+# Annexes
+## Census data
+```{r skimGgSurveyData}
+if(exists("au2013DT")){
+  skimr::skim(au2013DT)
+}
+```
+```{r runToHere}
+```
+# References
--- a/dataProcessing/processCensusAu2013.html
+++ b/dataProcessing/processCensusAu2013.html
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap-theme.css
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap-theme.css
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap-theme.css.map
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap-theme.css.map
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap-theme.min.css
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap-theme.min.css
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap.css
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap.css
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap.css.map
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap.css.map
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap.min.css
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/bootstrap.min.css
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/cerulean.min.css
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/cerulean.min.css
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/cosmo.min.css
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/cosmo.min.css
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/darkly.min.css
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/darkly.min.css
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/flatly.min.css
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/flatly.min.css
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/Lato.ttf
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/Lato.ttf
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/LatoBold.ttf
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/LatoBold.ttf
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/LatoItalic.ttf
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/LatoItalic.ttf
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/NewsCycle.ttf
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/NewsCycle.ttf
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/NewsCycleBold.ttf
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/NewsCycleBold.ttf
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/OpenSans.ttf
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/OpenSans.ttf
--- a/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/OpenSansBold.ttf
+++ b/dataProcessing/processCensusAu2013_files/bootstrap-3.3.5/css/fonts/OpenSansBold.ttf