Skip to content
Snippets Groups Projects
Commit 9defb49d authored by Ben Anderson's avatar Ben Anderson
Browse files

updated constraint processing & checking; switched census from aggregated...

updated constraint processing & checking; switched census from aggregated meshblock to raw area unit data from NZStats; added heat source/fuel as potential constraint (not used)
parent b180d90e
Branches
No related tags found
No related merge requests found
...@@ -3,18 +3,16 @@ ...@@ -3,18 +3,16 @@
### NZ Area Unit data ### NZ Area Unit data
2013 NZ Census data from NZ Stats at area unit level (pre-processed long form file). 2013 NZ Census data from NZ Stats at area unit level. For simplicity we use one file per constraint:
```{r loadAU2013} * n people
f <- paste0(sParams$dataPath, "processed/areaunits2013.csv.gz") * n dependent children
outputMessage(paste0("Loading: ", f)) * fuel source (all counted - may cause confusion as sum to > 100% of households)
au2013DT <- data.table::fread(f) * n rooms
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(au2013DT))))
outputMessage(paste0("N area units loaded: ", dkUtils::tidyNum(uniqueN(au2013DT$AU2013_code))))
``` NB: these files, when downloaded from the [NZStats data extractor](http://nzdotstat.stats.govt.nz/wbos/Index.aspx?DataSetCode=TABLECODE8165#) come with higher levels of aggregation in the tables. These have to be removed by extracting just area unit rows.
Load area labels First load area labels as we use these to select the right data rows.
```{r loadAreaLabels} ```{r loadAreaLabels}
areasDT <- data.table::fread(sParams$areasTable2013) areasDT <- data.table::fread(sParams$areasTable2013)
...@@ -24,6 +22,191 @@ auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier ma ...@@ -24,6 +22,191 @@ auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier ma
setkey(auListDT, AU2013_code) setkey(auListDT, AU2013_code)
``` ```
```{r loadAU2013}
# fuelSource ----
# Load the raw NZ Stats heating fuel table, keep the area unit rows, recode the
# fuel types into census constraints and build a wide table of counts with one
# row per area unit (fuel2013WDT).
fuelf <- paste0(sParams$dataPath, "raw/areaUnits/fuelSource/TABLECODE8100_Data_47b7b3fc-0e40-431f-b313-141de4fb0013.csv")
outputMessage(paste0("Loading: ", fuelf))
fuelDT <- data.table::fread(fuelf)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(fuelDT))))
fuelDT <- fuelDT[, AU2013_code := AREA]
setkey(fuelDT, AU2013_code)
# keyed join on the area unit list: this is how the area unit rows are selected
fuelDT <- fuelDT[auListDT]
message("N unique area units (fuel data): ", uniqueN(fuelDT$AU2013_code))
# create categories
# "value.heatSourceWood", "value.heatSourceElectricity", "value.heatSourceGas", "value.heatSourceCoal", "value.heatSourceOther"
# anything not recoded below stays in heatSourceOther
fuelDT <- fuelDT[, censusConstraint := "heatSourceOther"] # complex one - note this contains 'None' as well
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings, fuel type used to heat dwelling",
                 censusConstraint := "fuel_totalHouseholds"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings stated",
                 censusConstraint := "fuel_totalStatedHouseholds"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Wood",
                 censusConstraint := "heatSourceWood"]
# NOTE(review): Solar is folded into electricity here - confirm this is intended
fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "Solar" | # <- what is this?
                   `Fuel type used to heat dwelling` == "Electricity",
                 censusConstraint := "heatSourceElectricity"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "gas",
                 censusConstraint := "heatSourceGas"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Coal",
                 censusConstraint := "heatSourceCoal"]
# check the recoding
table(fuelDT$`Fuel type used to heat dwelling`, fuelDT$censusConstraint)
# convert to wide
fuelDT <- fuelDT[, count := Value]
# Several fuel types share one constraint (Solar + Electricity, anything matching
# "gas", everything left in heatSourceOther). reshape() keeps only the FIRST row
# it finds per area unit & constraint, so the counts have to be summed per
# constraint before going wide. NA counts propagate into the sum.
# NOTE(review): households can report more than one fuel, so summed categories
# may count a household more than once - confirm before using as a constraint.
fuel2013WDT <- reshape(fuelDT[YEAR == 2013,
                              .(count = sum(count)),
                              by = .(AU2013_code, AU2013_label, censusConstraint)],
                       idvar = c("AU2013_code", "AU2013_label"),
                       timevar = "censusConstraint",
                       direction = "wide")
# drop the "count." prefix that reshape() adds
setnames(fuel2013WDT, c("count.fuel_totalHouseholds", "count.fuel_totalStatedHouseholds",
                        "count.heatSourceElectricity", "count.heatSourceGas",
                        "count.heatSourceWood", "count.heatSourceCoal", "count.heatSourceOther"),
         c("fuel_totalHouseholds", "fuel_totalStatedHouseholds",
           "heatSourceElectricity", "heatSourceGas",
           "heatSourceWood", "heatSourceCoal", "heatSourceOther"))
# nKids ----
# Load the raw NZ Stats family type table, keep the area unit rows and build a
# wide table (kids2013WDT) of families with dependent children and total families.
kidsf <- paste0(sParams$dataPath, "raw/areaUnits/nKids/TABLECODE8141_Data_e6f03066-7bbf-4ba0-94b0-1821d5a4665a.csv")
outputMessage(paste0("Loading: ", kidsf))
kidsDT <- data.table::fread(kidsf)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(kidsDT))))
kidsDT <- kidsDT[, AU2013_code := AREA]
setkey(kidsDT, AU2013_code)
# keyed join on the area unit list: this is how the area unit rows are selected
kidsDT <- kidsDT[auListDT]
message("N unique area units (kids data): ", uniqueN(kidsDT$AU2013_code))
# > create categories
# "nKids_0", "nKids_1m"
kidsDT <- kidsDT[, censusConstraint := "nKids_1m"] # we selected rows with dependent children in the census extractor
kidsDT <- kidsDT[`Family type by child dependency status` == "Total families",
                 censusConstraint := "nkids_totalFamilies"]
# check the recoding
table(kidsDT$`Family type by child dependency status`, kidsDT$censusConstraint)
# > convert to wide
kidsDT <- kidsDT[, count := Value]
# Every non-total family type is coded nKids_1m, so an area unit can have several
# rows for that constraint. reshape() keeps only the FIRST row it finds per area
# unit & constraint, so sum the counts per constraint before going wide.
# NA counts propagate into the sum.
kids2013WDT <- reshape(kidsDT[YEAR == 2013,
                              .(count = sum(count)),
                              by = .(AU2013_code, AU2013_label, censusConstraint)],
                       idvar = c("AU2013_code", "AU2013_label"),
                       timevar = "censusConstraint",
                       direction = "wide")
# > calculate n households with 0 kids
# do this again later when we have the number of _households_ to use as the base
kids2013WDT <- kids2013WDT[, nKids_0_families := count.nkids_totalFamilies - count.nKids_1m]
# drop the "count." prefix that reshape() adds
setnames(kids2013WDT, c("count.nkids_totalFamilies", "count.nKids_1m"),
         c("nkids_totalFamilies", "nKids_1m"))
# nPeople ----
# Load the raw NZ Stats household size table, keep the area unit rows and build a
# wide table (npeople2013WDT) of household counts by number of usual residents.
npeoplef <- paste0(sParams$dataPath, "raw/areaUnits/nPeople/TABLECODE8169_Data_bfad6f1a-c9af-4adb-a141-e13a83e175d0.csv")
outputMessage(paste0("Loading: ", npeoplef))
npeopleDT <- data.table::fread(npeoplef)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(npeopleDT))))
setkey(npeopleDT, Area) # forgot to get code attached to this one
npeopleDT <- npeopleDT[, AU2013_label := Area]
# this re-keys auListDT by reference; the nRooms section sets it back to the code
setkey(auListDT,AU2013_label) # set to label instead of code - should still work
npeopleDT <- npeopleDT[auListDT]
message("N unique area units (people data): ", uniqueN(npeopleDT$Area))
# > create categories ----
# "value.nPeople_1", "value.nPeople_2", "value.nPeople_3", "value.nPeople_4m",
# anything not recoded below stays in nPeople_4m
npeopleDT <- npeopleDT[, censusConstraint := "nPeople_4m"] # default (most complex to code)
npeopleDT <- npeopleDT[`Number of usual residents in household` == "Total households",
                       censusConstraint := "npeople_totalHouseholds"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "One",
                       censusConstraint := "nPeople_1"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Two",
                       censusConstraint := "nPeople_2"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Three",
                       censusConstraint := "nPeople_3"]
# check the recoding
table(npeopleDT$`Number of usual residents in household`, npeopleDT$censusConstraint)
# convert to wide
npeopleDT <- npeopleDT[, count := Value]
# nPeople_4m collects every household size that was not recoded above, so an area
# unit has several rows for it. reshape() keeps only the FIRST row it finds per
# area unit & constraint, so sum the counts per constraint before going wide.
# NA counts propagate into the sum.
npeople2013WDT <- reshape(npeopleDT[Year == 2013, # helpful consistency of var names across NZ stats tables
                                    .(count = sum(count)),
                                    by = .(AU2013_code, AU2013_label, censusConstraint)],
                          idvar = c("AU2013_code", "AU2013_label"),
                          timevar = "censusConstraint",
                          direction = "wide")
# drop the "count." prefix that reshape() adds
setnames(npeople2013WDT, c("count.nPeople_1", "count.nPeople_2", "count.nPeople_3", "count.nPeople_4m", "count.npeople_totalHouseholds"),
         c("nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m", "npeople_totalHouseholds"))
# nRooms ----
# Load the raw NZ Stats number of rooms table, keep the area unit rows and build a
# wide table (nrooms2013WDT) of dwelling counts by banded number of rooms.
nroomsf <- paste0(sParams$dataPath, "raw/areaUnits/nRooms/TABLECODE8098_Data_62c5ce5c-23cf-44a2-b25e-b287fe9645e7.csv")
outputMessage(paste0("Loading: ", nroomsf))
nroomsDT <- data.table::fread(nroomsf)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(nroomsDT))))
nroomsDT <- nroomsDT[, AU2013_code := AREA]
setkey(nroomsDT, AU2013_code)
setkey(auListDT,AU2013_code) # set to back to code
# keyed join on the area unit list: this is how the area unit rows are selected
nroomsDT <- nroomsDT[auListDT]
message("N unique area units (rooms data): ", uniqueN(nroomsDT$AU2013_code))
# > create categories ----
# "value.nRooms1_4", "value.nRooms5_6", "value.nRooms7m",
# "Not elsewhere included" rows are left without a constraint (NA)
nroomsDT <- nroomsDT[`Number of rooms` != "Not elsewhere included", censusConstraint := "nRooms7m"] # default (most complex to code)
nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings stated",
                     censusConstraint := "nrooms_statedtotalHouseholds"]
nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings, number of rooms",
                     censusConstraint := "nrooms_totalHouseholds"]
nroomsDT <- nroomsDT[`Number of rooms` %like% "One" |
                       `Number of rooms` %like% "Two" |
                       `Number of rooms` %like% "Three" |
                       `Number of rooms` %like% "Four",
                     censusConstraint := "nRooms1_4"]
nroomsDT <- nroomsDT[`Number of rooms` %like% "Five" |
                       `Number of rooms` %like% "Six" ,
                     censusConstraint := "nRooms5_6"]
# check the recoding
table(nroomsDT$`Number of rooms`, nroomsDT$censusConstraint)
# convert to wide
nroomsDT <- nroomsDT[, count := Value]
# Each band (1-4, 5-6, 7+) collects several room counts, so an area unit has
# several rows per constraint. reshape() keeps only the FIRST row it finds per
# area unit & constraint, so sum the counts per constraint before going wide.
# NA counts propagate into the sum.
nrooms2013WDT <- reshape(nroomsDT[Year == 2013, # helpful consistency of var names across NZ stats tables
                                  .(count = sum(count)),
                                  by = .(AU2013_code, AU2013_label, censusConstraint)],
                         idvar = c("AU2013_code", "AU2013_label"),
                         timevar = "censusConstraint",
                         direction = "wide")
# drop the "count." prefix that reshape() adds
setnames(nrooms2013WDT, c("count.nRooms1_4", "count.nRooms5_6", "count.nRooms7m",
                          "count.nrooms_totalHouseholds", "count.nrooms_statedtotalHouseholds"),
         c("nRooms1_4", "nRooms5_6", "nRooms7m",
           "nrooms_totalHouseholds", "nrooms_statedtotalHouseholds"))
# combine them ----
# Key every table on the same (code, label) pair so the keyed joins line up.
setkeyv(auListDT, c("AU2013_code", "AU2013_label"))
setkeyv(fuel2013WDT, c("AU2013_code", "AU2013_label"))
setkeyv(kids2013WDT, c("AU2013_code", "AU2013_label"))
setkeyv(npeople2013WDT, c("AU2013_code", "AU2013_label"))
setkeyv(nrooms2013WDT, c("AU2013_code", "AU2013_label"))
# Start from the area unit list and attach the fuel, kids, people and rooms
# tables in that order (innermost join runs first).
au2013DT <- nrooms2013WDT[
  npeople2013WDT[
    kids2013WDT[
      fuel2013WDT[auListDT]
    ]
  ]
]
# recalculate n households without children, using the rooms table total as the base
au2013DT <- au2013DT[, nKids_0 := nrooms_totalHouseholds - nKids_1m]
```
Figure \@ref(fig:checkTotals) shows the match between the total household/family counts derived from each NZ stats table. As we would expect there are very minor differences with the exception of the totals derived from the families table (presence of children).
```{r checkTotals, fig.cap="Plots of household/family totals"}
# Pull out the household/family totals from each source table and compare them
# pairwise in a scatterplot matrix.
pairsDT <- au2013DT[, .SD,
                    .SDcols = c("fuel_totalHouseholds",
                                "nrooms_statedtotalHouseholds",
                                "nrooms_totalHouseholds",
                                "npeople_totalHouseholds",
                                "fuel_totalStatedHouseholds",
                                "nkids_totalFamilies")]
pairs(pairsDT)
```
We focus on households/families/dwellings not individuals as the spatial microsimulation will operate at the household level. We focus on households/families/dwellings not individuals as the spatial microsimulation will operate at the household level.
### GREENGrid Survey data ### GREENGrid Survey data
...@@ -31,7 +214,7 @@ We focus on households/families/dwellings not individuals as the spatial microsi ...@@ -31,7 +214,7 @@ We focus on households/families/dwellings not individuals as the spatial microsi
This comprises the household attributes (survey) data: This comprises the household attributes (survey) data:
```{r loadGGHousehold} ```{r loadGGHousehold}
f <- paste0(sParams$ggPath, "/reshare/v1.0/data/ggHouseholdAttributesSafe.csv.zip") f <- paste0(sParams$ggPath, "/safe/survey/ggHouseholdAttributesSafe_2019-04-09.csv") # latest - includes heat source &
outputMessage(paste0("Loading: ", f)) outputMessage(paste0("Loading: ", f))
ggHhDT <- data.table::as.data.table(readr::read_csv(f)) ggHhDT <- data.table::as.data.table(readr::read_csv(f))
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(ggHhDT)))) outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(ggHhDT))))
...@@ -39,11 +222,7 @@ outputMessage(paste0("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHhDT$li ...@@ -39,11 +222,7 @@ outputMessage(paste0("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHhDT$li
``` ```
## Create synthetic weighted census
This step uses the household attributes file and the Census file to create a synthetic (weighted) household dataset for the areas of interest.
### Harmonise categories
We need household & Census files with just the variables (constraints) we're going to use. This will take a bit of creative re-coding. We need household & Census files with just the variables (constraints) we're going to use. This will take a bit of creative re-coding.
#### Families: Presence of children #### Families: Presence of children
...@@ -55,12 +234,10 @@ Survey: ...@@ -55,12 +234,10 @@ Survey:
Census: Census:
* family_type_for_families_in_occupied_private_dwellings_Couple_with_child(ren) * 0,1+ (simplest)
* family_type_for_families_in_occupied_private_dwellings_One_parent_with_child(ren)
* family_type_for_families_in_occupied_private_dwellings_Total_families_in_occupied_private_dwellings (subtract those with children to get those without?)
```{r presenceChildren} ```{r presenceChildren}
# number of kids: categorised as: 1,2,3,4+ ---- # survey: number of kids: categorised as: 1,2,3,4+ ----
ggHhDT <- ggHhDT[, nKids := nChildren0_12 + nTeenagers13_18] ggHhDT <- ggHhDT[, nKids := nChildren0_12 + nTeenagers13_18]
table(ggHhDT$nKids, useNA = "always") table(ggHhDT$nKids, useNA = "always")
message("There are NA - we will need to remove them later") message("There are NA - we will need to remove them later")
...@@ -76,14 +253,6 @@ ggHhDT <- ggHhDT[, presenceKids := ifelse(nKids == 0, "None", "1+")] ...@@ -76,14 +253,6 @@ ggHhDT <- ggHhDT[, presenceKids := ifelse(nKids == 0, "None", "1+")]
table(ggHhDT$nKids, ggHhDT$presenceKids, useNA = "always") table(ggHhDT$nKids, ggHhDT$presenceKids, useNA = "always")
# census: ----
# Couples and single parents with children both count towards nKids_1m.
au2013DT <- au2013DT[variable %in% c("family_type_for_families_in_occupied_private_dwellings_Couple_with_child(ren)",
                                     "family_type_for_families_in_occupied_private_dwellings_One_parent_with_child(ren)"),
                     censusConstraint := "nKids_1m"]
# Keep the all-families total so the no-children residual can be derived later.
au2013DT <- au2013DT[variable %in% "family_type_for_families_in_occupied_private_dwellings_Total_families_in_occupied_private_dwellings",
                     censusConstraint := "TotalFamilies"]
# we can't calculate the residual until we have the wide form
``` ```
...@@ -91,7 +260,7 @@ au2013DT <- au2013DT[variable == "family_type_for_families_in_occupied_private_d ...@@ -91,7 +260,7 @@ au2013DT <- au2013DT[variable == "family_type_for_families_in_occupied_private_d
#### Households: Number of people #### Households: Number of people
```{r nPeople} ```{r nPeople}
# number of people: categorised as: 1,2,3,4+ ---- # survey: number of people: categorised as: 1,2,3,4+ ----
ggHhDT <- ggHhDT[, nPeople := nAdults + nChildren0_12 + nTeenagers13_18] ggHhDT <- ggHhDT[, nPeople := nAdults + nChildren0_12 + nTeenagers13_18]
table(ggHhDT$nPeople, useNA = "always") table(ggHhDT$nPeople, useNA = "always")
message("There are NA - we will need to remove them later") message("There are NA - we will need to remove them later")
...@@ -114,38 +283,14 @@ ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_4m == 1, "4+", nPeopleCat)] ...@@ -114,38 +283,14 @@ ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_4m == 1, "4+", nPeopleCat)]
table(ggHhDT$nPeopleCat, ggHhDT$nPeople, useNA = "always") table(ggHhDT$nPeopleCat, ggHhDT$nPeople, useNA = "always")
# census: ----
# number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_One_Usual_Resident up to number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Eight_or_More_Usual_Residents
# One, two and three residents each get their own constraint.
au2013DT <- au2013DT[variable %in% "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_One_Usual_Resident",
                     censusConstraint := "nPeople_1"]
au2013DT <- au2013DT[variable %in% "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Two_Usual_Residents",
                     censusConstraint := "nPeople_2"]
au2013DT <- au2013DT[variable %in% "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Three_Usual_Residents",
                     censusConstraint := "nPeople_3"]
# Four or more residents are pooled into a single constraint.
au2013DT <- au2013DT[variable %in% c("number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Four_Usual_Residents",
                                     "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Five_Usual_Residents",
                                     "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Six_Usual_Residents",
                                     "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Seven_Usual_Residents",
                                     "number_of_usual_residents_in_household(1)_for_households_in_occupied_private_dwellings_Eight_or_More_Usual_Residents"),
                     censusConstraint := "nPeople_4m"]
# Household totals by original census variable.
origT <- au2013DT[variable %like% "Usual_Resident",
                  .(nHouseholds = sum(value)),
                  keyby = .(variable)]
kable_styling(kableExtra::kable(origT, caption = "Census area units: n people (original)"))
# Household totals by recoded constraint; the two grand totals should agree.
recodeT <- au2013DT[censusConstraint %like% "nPeople",
                    .(nHouseholds = sum(value)),
                    keyby = .(censusConstraint)]
kable_styling(kableExtra::kable(recodeT, caption = "Census area units: n people (recoded)"))
message("Total households (original) = ", sum(origT$nHouseholds))
message("Total households (recoded) = ", sum(recodeT$nHouseholds))
message("Total unit areas (recoded) = ",
        au2013DT[censusConstraint %like% "nPeople", uniqueN(AU2013_code)])
``` ```
#### Dwellings: Number of rooms #### Dwellings: Number of rooms
We have to do some imputation here
```{r nRooms} ```{r nRooms}
# survey ---- # survey: n bedrooms via Q10 ----
ggHhDT <- ggHhDT[, nRooms := `Q10#1_1_1_TEXT` + # bedrooms ggHhDT <- ggHhDT[, nRooms := `Q10#1_1_1_TEXT` + # bedrooms
`Q10#1_2_1_TEXT` + # living rooms `Q10#1_2_1_TEXT` + # living rooms
`Q10#1_3_1_TEXT` + # dining rooms-kitchens `Q10#1_3_1_TEXT` + # dining rooms-kitchens
...@@ -162,6 +307,7 @@ r <- lm(ggHhDT$nRooms ~ ggHhDT$nAdults + ggHhDT$nKids) ...@@ -162,6 +307,7 @@ r <- lm(ggHhDT$nRooms ~ ggHhDT$nAdults + ggHhDT$nKids)
summary(r) summary(r)
# > impute where missing ----
message("that looks reasonable...impute where missing") message("that looks reasonable...impute where missing")
ggHhDT <- ggHhDT[, nRoomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate ggHhDT <- ggHhDT[, nRoomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
...@@ -205,38 +351,11 @@ ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 6, "7+", nRoomsCat)] ...@@ -205,38 +351,11 @@ ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 6, "7+", nRoomsCat)]
message("Check coding") message("Check coding")
table(ggHhDT$nRoomsCat, ggHhDT$nRoomsCorrected, useNA = "always") table(ggHhDT$nRoomsCat, ggHhDT$nRoomsCorrected, useNA = "always")
# census ----
# Band the census number-of-rooms variables into 1-4, 5-6 and 7+ rooms.
au2013DT <- au2013DT[variable == "number_of_rooms_for_occupied_private_dwellings_One_Room" |
                       variable == "number_of_rooms_for_occupied_private_dwellings_Two_Rooms" |
                       variable == "number_of_rooms_for_occupied_private_dwellings_Three_Rooms" |
                       variable == "number_of_rooms_for_occupied_private_dwellings_Four_Rooms",
                     censusConstraint := "nRooms1_4"]
au2013DT <- au2013DT[variable == "number_of_rooms_for_occupied_private_dwellings_Five_Rooms" |
                       variable == "number_of_rooms_for_occupied_private_dwellings_Six_Rooms" ,
                     censusConstraint := "nRooms5_6"]
au2013DT <- au2013DT[variable == "number_of_rooms_for_occupied_private_dwellings_Seven_Rooms" |
                       variable == "number_of_rooms_for_occupied_private_dwellings_Eight_or_More_Rooms",
                     censusConstraint := "nRooms7m"]
# Totals by original census variable. "_Room" also matches "_Rooms", so a single
# pattern selects the same rows as the original pair of patterns.
origT <- au2013DT[variable %like% "_Room",
                  .(nHouseholds = sum(value)), keyby = .(variable)]
# captions previously said "n people" (copied from the nPeople section)
kableExtra::kable(origT, caption = "Census area units: n rooms (original)")%>%
  kable_styling()
# Totals by recoded constraint; the two grand totals should agree.
recodeT <- au2013DT[censusConstraint %like% "nRooms", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
kableExtra::kable(recodeT, caption = "Census area units: n rooms (recoded)")%>%
  kable_styling()
message("Total households (original) = ", sum(origT$nHouseholds))
message("Total households (recoded) = ", sum(recodeT$nHouseholds))
message("Total unit areas (recoded) = ", uniqueN(au2013DT[censusConstraint %like% "nRooms"]$AU2013_code))
``` ```
#### Dwellings: Number of bedrooms #### Dwellings: Number of bedrooms
Probably collinear with n people/n rooms Probably collinear with n people/n rooms. Also have to impute.
```{r nBedrooms} ```{r nBedrooms}
...@@ -252,6 +371,7 @@ r <- lm(ggHhDT$nBedrooms ~ ggHhDT$nAdults + ggHhDT$nKids) ...@@ -252,6 +371,7 @@ r <- lm(ggHhDT$nBedrooms ~ ggHhDT$nAdults + ggHhDT$nKids)
summary(r) summary(r)
# > impute where missing ----
message("that looks reasonable...impute where missing") message("that looks reasonable...impute where missing")
ggHhDT <- ggHhDT[, nBedroomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate ggHhDT <- ggHhDT[, nBedroomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
...@@ -293,35 +413,6 @@ ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_4m == 1, "7+", nBedroomsCat) ...@@ -293,35 +413,6 @@ ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_4m == 1, "7+", nBedroomsCat)
table(ggHhDT$nBedroomsCat, ggHhDT$nBedroomsCorrected, useNA = "always") table(ggHhDT$nBedroomsCat, ggHhDT$nBedroomsCorrected, useNA = "always")
# census ----
# census: number_of_bedrooms_for_occupied_private_dwellings_One_Bedroom to number_of_bedrooms_for_occupied_private_dwellings_Eight_or_More_Bedrooms
# One and two bedrooms share a constraint; three bedrooms gets its own.
au2013DT <- au2013DT[variable %in% c("number_of_bedrooms_for_occupied_private_dwellings_One_Bedroom",
                                     "number_of_bedrooms_for_occupied_private_dwellings_Two_Bedrooms"),
                     censusConstraint := "nBedrooms_1_2"]
au2013DT <- au2013DT[variable %in% "number_of_bedrooms_for_occupied_private_dwellings_Three_Bedrooms",
                     censusConstraint := "nBedrooms_3"]
# Four or more bedrooms are pooled into a single constraint.
au2013DT <- au2013DT[variable %in% c("number_of_bedrooms_for_occupied_private_dwellings_Four_Bedrooms",
                                     "number_of_bedrooms_for_occupied_private_dwellings_Five_Bedrooms",
                                     "number_of_bedrooms_for_occupied_private_dwellings_Six_Bedrooms",
                                     "number_of_bedrooms_for_occupied_private_dwellings_Seven_Bedrooms",
                                     "number_of_bedrooms_for_occupied_private_dwellings_Eight_or_More_Bedrooms"),
                     censusConstraint := "nBedrooms_4m"]
# Totals by original census variable; the mean-bedrooms row is not a count, so skip it.
origT <- au2013DT[variable %like% "_Bedroom" &
                    variable != "number_of_bedrooms_for_occupied_private_dwellings_Mean_Number_of_Bedrooms",
                  .(nHouseholds = sum(value)),
                  keyby = .(variable)]
kable_styling(kableExtra::kable(origT, caption = "Census area units: n bedrooms (original)"))
# Totals by recoded constraint; the two grand totals should agree.
recodeT <- au2013DT[censusConstraint %like% "nBedrooms",
                    .(nHouseholds = sum(value)),
                    keyby = .(censusConstraint)]
kable_styling(kableExtra::kable(recodeT, caption = "Census area units: n bedrooms (recoded)"))
message("Total households (original) = ", sum(origT$nHouseholds))
message("Total households (recoded) = ", sum(recodeT$nHouseholds))
message("Total unit areas (recoded) = ",
        au2013DT[censusConstraint %like% "nBedrooms", uniqueN(AU2013_code)])
#au2013DT[!is.na(censusConstraint), .(nHHs = sum(value)), keyby = .(variable, censusConstraint)]
``` ```
Notice that the total counts for bedrooms are lower because they are counts of _dwellings_ instead of households. Notice that the total counts for bedrooms are lower because they are counts of _dwellings_ instead of households.
...@@ -333,27 +424,39 @@ Census - unknown ...@@ -333,27 +424,39 @@ Census - unknown
#### Dwellings: Main fuel used for heat #### Dwellings: Main fuel used for heat
Survey - known but we do not have the response code list to determine what the response was :-( ```{r surveyHeatSource}
Census - known # survey: Q20 ----
#### Dwelling: Dwelling type table(ggHhDT$Q20_coded, useNA = "always")
Survey - not known ggHhDT <- ggHhDT[, heatFuel := dplyr::recode(Q20_coded,
Census - known "Enclosed wood burner" = "Wood",
"Open fire" = "Wood", # could be coal
"Heat pump"= "Electricity",
"HRV or other ventilation system" = "Electricity",
"Portable electric heaters" = "Electricity",
"Portable gas heater" = "Gas",
"Underfloor gas heating" = "Gas",
"Other" = "Other")]
#### Fix final survey data table(ggHhDT$Q20_coded, ggHhDT$heatFuel, useNA = "always")
ggHhDT <- ggHhDT[, heatSourceWood := ifelse(heatFuel == "Wood", 1,0)]
ggHhDT <- ggHhDT[, heatSourceElectricity := ifelse(heatFuel == "Electricity", 1,0)]
ggHhDT <- ggHhDT[, heatSourceGas := ifelse(heatFuel == "Gas", 1,0)]
ggHhDT <- ggHhDT[, heatSourceCoal := ifelse(heatFuel == "Coal", 1,0)]
ggHhDT <- ggHhDT[, heatSourceOther := ifelse(heatFuel == "Other", 1,0)]
Test colinearity table(ggHhDT$heatFuel, ggHhDT$Location, useNA = "always")
```
```{r testCor}
cor.test(ggHhDT$nPeople, ggHhDT$nBedrooms)
cor.test(ggHhDT$nPeople, ggHhDT$nRooms)
cor.test(ggHhDT$nRooms, ggHhDT$nBedrooms)
``` #### Dwelling: Dwelling type
Survey - not known
Census - known
So people & bedrooms are the least colinear of these. Which is odd given that we used one to predict the other. Anyway, we will use nBedrooms. #### Fix final survey data
Filter survey data to include just the variables we want and fix any NAs. Filter survey data to include just the variables we want and fix any NAs.
...@@ -361,7 +464,8 @@ Filter survey data to include just the variables we want and fix any NAs. ...@@ -361,7 +464,8 @@ Filter survey data to include just the variables we want and fix any NAs.
surveyDT <- ggHhDT[, .(linkID, surveyDT <- ggHhDT[, .(linkID,
nKids_0, nKids_1m, nKids_0, nKids_1m,
nPeople_1, nPeople_2, nPeople_3, nPeople_4m, nPeople_1, nPeople_2, nPeople_3, nPeople_4m,
#nRooms1_5, nRooms6_7, nRooms8m, nRooms1_4, nRooms5_6, nRooms7m,
heatSourceWood, heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceOther,
nBedrooms_1_2, nBedrooms_3, nBedrooms_4m)] nBedrooms_1_2, nBedrooms_3, nBedrooms_4m)]
skimr::skim(surveyDT) skimr::skim(surveyDT)
...@@ -382,38 +486,23 @@ Now select just the census data for: ...@@ -382,38 +486,23 @@ Now select just the census data for:
* Taranaki Region * Taranaki Region
```{r filterCensusRegion} ```{r filterCensusRegion}
censusAuDT <- au2013DT[REGC2013_label == "Hawke's Bay Region" | censusAuWideDT <- au2013DT[REGC2013_label == "Hawke's Bay Region" |
REGC2013_label == "Taranaki Region"] REGC2013_label == "Taranaki Region"]
# remove variables and rows we don't want message("Total households (rooms table) = ", sum(censusAuWideDT$nrooms_totalHouseholds))
censusAuDT <- censusAuDT[, variable := NULL]
censusAuDT <- censusAuDT[!is.na(censusConstraint),]
t <- censusAuDT[censusConstraint %like% "nPeople" , .(nHouseholds = sum(value)), keyby = .(REGC2013_label, AU2013_label, AU2013_code)] message("Total households (fuel table) = ", sum(censusAuWideDT$fuel_totalHouseholds))
kableExtra::kable(t, caption = "Selected area units by region")%>% message("Total households (npeople table) = ", sum(censusAuWideDT$npeople_totalHouseholds))
kable_styling()
message("Total households = ", sum(t$nHouseholds))
nPeopleT <- censusAuDT[censusConstraint %like% "nPeople", .(nHouseholds = sum(value)), keyby = .(censusConstraint)] message("Total families (kids table) = ", sum(censusAuWideDT$nkids_totalFamilies))
kableExtra::kable(nPeopleT, caption = "Census area units: n nPeople (recoded)")%>%
kable_styling()
message("Total households = ", sum(nPeopleT$nHouseholds)) t <- censusAuWideDT[, .(nHouseholdsRooms = sum(nrooms_totalHouseholds),
nHouseholdsFuel = sum(fuel_totalHouseholds),
nBedroomsT <- censusAuDT[censusConstraint %like% "nBedrooms", .(nHouseholds = sum(value)), keyby = .(censusConstraint)] nHouseholdsPeople = sum(npeople_totalHouseholds)), keyby = REGC2013_label]
kableExtra::kable(nBedroomsT, caption = "Census area units: n bedrooms (recoded)")%>%
kable_styling()
message("Total households = ", sum(nBedroomsT$nHouseholds))
nKidsT <- censusAuDT[censusConstraint %like% "nKids", .(nHouseholds = sum(value)), keyby = .(censusConstraint)]
kableExtra::kable(nKidsT, caption = "Census area units: n kids (recoded) - residual not yet calculated")%>%
kable_styling()
message("Total households = ", sum(nKidsT$nHouseholds))
kableExtra::kable(t, caption = "Household counts by region (by table source)")
``` ```
```{r run to here}
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment