_loadNonPowerData.Rmd
-
Ben Anderson authored
updated constraint processing & checking; switched census from aggregated meshblock to raw area unit data from NZStats; added heat source/fuel as potential constraint (not used)
Ben Anderson authoredupdated constraint processing & checking; switched census from aggregated meshblock to raw area unit data from NZStats; added heat source/fuel as potential constraint (not used)
- Load data
- NZ Area Unit data
- GREENGrid Survey data
- Families: Presence of children
- Households: Number of people
- Dwellings: Number of rooms
- Dwellings: Number of bedrooms
- Dwellings: Main heat source for hot water
- Dwellings: Main fuel used for heat
- Dwelling: Dwelling type
- Fix final survey data
- Select areas of interest
Load data
NZ Area Unit data
2013 NZ Census data from NZ Stats at area unit level. For simplicity we use one file per constraint:
- n people
- n dependent children
- fuel source (all counted - may cause confusion as sum to > 100% of households)
- n rooms
NB: these files, when downloaded form the NZStats data extractor come with higher levels of aggregation in the tables. These have to be removed by extracting just area unit rows.
First load area labels as we use these to select the right data rows.
areasDT <- data.table::fread(sParams$areasTable2013)
auListDT <- areasDT[, .(nMBs = .N), keyby = .(AU2013_code, AU2013_label, REGC2013_label)]
auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier matching
setkey(auListDT, AU2013_code)
# fuelSource ----
fuelf <- paste0(sParams$dataPath, "raw/areaUnits/fuelSource/TABLECODE8100_Data_47b7b3fc-0e40-431f-b313-141de4fb0013.csv")
outputMessage(paste0("Loading: ", fuelf))
fuelDT <- data.table::fread(fuelf)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(fuelDT))))
fuelDT <- fuelDT[, AU2013_code := AREA]
setkey(fuelDT, AU2013_code)
fuelDT <- fuelDT[auListDT]
message("N unique area units (fuel data): ", uniqueN(fuelDT$AU2013_code))
# create categories
# "value.heatSourceWood", "value.heatSourceElectricity", "value.heatSourceGas", "value.heatSourceCoal", "value.heatSourceOther"
fuelDT <- fuelDT[, censusConstraint := "heatSourceOther"] # complex one - note this contains 'None' as well
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings, fuel type used to heat dwelling",
censusConstraint := "fuel_totalHouseholds"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings stated",
censusConstraint := "fuel_totalStatedHouseholds"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Wood",
censusConstraint := "heatSourceWood"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "Solar" | # <- what is this?
`Fuel type used to heat dwelling` == "Electricity",
censusConstraint := "heatSourceElectricity"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "gas",
censusConstraint := "heatSourceGas"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Coal",
censusConstraint := "heatSourceCoal"]
table(fuelDT$`Fuel type used to heat dwelling`, fuelDT$censusConstraint)
# convert to wide
fuelDT <- fuelDT[, count := Value]
fuel2013WDT <- reshape(fuelDT[YEAR == 2013,
.(AU2013_code,AU2013_label,censusConstraint,count)],
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
setnames(fuel2013WDT, c("count.fuel_totalHouseholds", "count.fuel_totalStatedHouseholds",
"count.heatSourceElectricity", "count.heatSourceGas",
"count.heatSourceWood", "count.heatSourceCoal", "count.heatSourceOther"),
c("fuel_totalHouseholds", "fuel_totalStatedHouseholds",
"heatSourceElectricity", "heatSourceGas",
"heatSourceWood", "heatSourceCoal", "heatSourceOther"))
# nKids ----
kidsf <- paste0(sParams$dataPath, "raw/areaUnits/nKids/TABLECODE8141_Data_e6f03066-7bbf-4ba0-94b0-1821d5a4665a.csv")
outputMessage(paste0("Loading: ", kidsf))
kidsDT <- data.table::fread(kidsf)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(kidsDT))))
kidsDT <- kidsDT[, AU2013_code := AREA]
setkey(kidsDT, AU2013_code)
kidsDT <- kidsDT[auListDT]
message("N unique area units (kids data): ", uniqueN(kidsDT$AU2013_code))
# > create categories
# "nKids_0", "nKids_1m"
kidsDT <- kidsDT[, censusConstraint := "nKids_1m"] # we selected rows with dependent children in the census extractor
kidsDT <- kidsDT[`Family type by child dependency status` == "Total families",
censusConstraint := "nkids_totalFamilies"]
table(kidsDT$`Family type by child dependency status`, kidsDT$censusConstraint)
# > convert to wide
kidsDT <- kidsDT[, count := Value]
kids2013WDT <- reshape(kidsDT[YEAR == 2013,
.(AU2013_code,AU2013_label,censusConstraint,count)],
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
# > calculate n households with 0 kids
# do this again later when we have the number of _households_ to use as the base
kids2013WDT <- kids2013WDT[, nKids_0_families := count.nkids_totalFamilies - count.nKids_1m]
setnames(kids2013WDT, c("count.nkids_totalFamilies", "count.nKids_1m"),
c("nkids_totalFamilies", "nKids_1m"))
# nPeople ----
npeoplef <- paste0(sParams$dataPath, "raw/areaUnits/nPeople/TABLECODE8169_Data_bfad6f1a-c9af-4adb-a141-e13a83e175d0.csv")
outputMessage(paste0("Loading: ", npeoplef))
npeopleDT <- data.table::fread(npeoplef)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(npeopleDT))))
setkey(npeopleDT, Area) # forgot to get code attached to this one
npeopleDT <- npeopleDT[, AU2013_label := Area]
setkey(auListDT,AU2013_label) # set to label instead of code - should still work
npeopleDT <- npeopleDT[auListDT]
message("N unique area units (people data): ", uniqueN(npeopleDT$Area))
# > create categories ----
# "value.nPeople_1", "value.nPeople_2", "value.nPeople_3", "value.nPeople_4m",
npeopleDT <- npeopleDT[, censusConstraint := "nPeople_4m"] # default (most complex to code)
npeopleDT <- npeopleDT[`Number of usual residents in household` == "Total households",
censusConstraint := "npeople_totalHouseholds"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "One",
censusConstraint := "nPeople_1"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Two",
censusConstraint := "nPeople_2"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Three",
censusConstraint := "nPeople_3"]
table(npeopleDT$`Number of usual residents in household`, npeopleDT$censusConstraint)
# convert to wide
npeopleDT <- npeopleDT[, count := Value]
npeople2013WDT <- reshape(npeopleDT[Year == 2013, # helpful consistency of var names across NZ stats tables
.(AU2013_code,AU2013_label,censusConstraint,count)],
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
setnames(npeople2013WDT, c("count.nPeople_1", "count.nPeople_2", "count.nPeople_3", "count.nPeople_4m", "count.npeople_totalHouseholds"),
c("nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m", "npeople_totalHouseholds"))
# nRooms ----
nroomsf <- paste0(sParams$dataPath, "raw/areaUnits/nRooms/TABLECODE8098_Data_62c5ce5c-23cf-44a2-b25e-b287fe9645e7.csv")
outputMessage(paste0("Loading: ", nroomsf))
nroomsDT <- data.table::fread(nroomsf)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(nroomsDT))))
nroomsDT <- nroomsDT[, AU2013_code := AREA]
setkey(nroomsDT, AU2013_code)
setkey(auListDT,AU2013_code) # set to back to code
nroomsDT <- nroomsDT[auListDT]
message("N unique area units (rooms data): ", uniqueN(nroomsDT$AU2013_code))
# > create categories ----
# "value.nRooms1_4", "value.nRooms5_6", "value.nRooms7m",
nroomsDT <- nroomsDT[`Number of rooms` != "Not elsewhere included", censusConstraint := "nRooms7m"] # default (most complex to code)
nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings stated",
censusConstraint := "nrooms_statedtotalHouseholds"]
nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings, number of rooms",
censusConstraint := "nrooms_totalHouseholds"]
nroomsDT <- nroomsDT[`Number of rooms` %like% "One" |
`Number of rooms` %like% "Two" |
`Number of rooms` %like% "Three" |
`Number of rooms` %like% "Four",
censusConstraint := "nRooms1_4"]
nroomsDT <- nroomsDT[`Number of rooms` %like% "Five" |
`Number of rooms` %like% "Six" ,
censusConstraint := "nRooms5_6"]
table(nroomsDT$`Number of rooms`, nroomsDT$censusConstraint)
# convert to wide
nroomsDT <- nroomsDT[, count := Value]
nrooms2013WDT <- reshape(nroomsDT[Year == 2013, # helpful consistency of var names across NZ stats tables
.(AU2013_code,AU2013_label,censusConstraint,count)],
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
setnames(nrooms2013WDT, c("count.nRooms1_4", "count.nRooms5_6", "count.nRooms7m",
"count.nrooms_totalHouseholds", "count.nrooms_statedtotalHouseholds"),
c("nRooms1_4", "nRooms5_6", "nRooms7m",
"nrooms_totalHouseholds", "nrooms_statedtotalHouseholds"))
# combine them ----
setkey(auListDT, AU2013_code, AU2013_label)
setkey(fuel2013WDT, AU2013_code, AU2013_label)
setkey(kids2013WDT, AU2013_code, AU2013_label)
setkey(npeople2013WDT, AU2013_code, AU2013_label)
setkey(nrooms2013WDT, AU2013_code, AU2013_label)
au2013DT <- fuel2013WDT[auListDT]
au2013DT <- kids2013WDT[au2013DT]
au2013DT <- npeople2013WDT[au2013DT]
au2013DT <- nrooms2013WDT[au2013DT]
# recalulate n households without children
au2013DT <- au2013DT[, nKids_0 := nrooms_totalHouseholds- nKids_1m]
Figure @ref(fig:checkTotals) shows the match between the total household/family counts derived from each NZ stats table. As we would expect there are very minor differences with the exception of the totals derived from the families table (presence of children).
pairsDT <- au2013DT[, .(fuel_totalHouseholds, nrooms_statedtotalHouseholds, nrooms_totalHouseholds,
npeople_totalHouseholds,fuel_totalStatedHouseholds, nkids_totalFamilies)]
pairs(pairsDT)
We focus on households/families/dwellings not individuals as the spatial microsimulation will operate at the household level.
GREENGrid Survey data
This comprises the household attributes (survey) data:
f <- paste0(sParams$ggPath, "/safe/survey/ggHouseholdAttributesSafe_2019-04-09.csv") # latest - includes heat source &
outputMessage(paste0("Loading: ", f))
ggHhDT <- data.table::as.data.table(readr::read_csv(f))
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(ggHhDT))))
outputMessage(paste0("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHhDT$linkID))))
We need household & Census files with just the variables (constraints) we're going to use. This will take a bit of creative re-coding.
Families: Presence of children
Survey:
- nChildren0_12: 0,1,2+
- nTeenagers13_18: 0,1,2+
Census:
- 0,1+ (simplest)
# survey: number of kids: categorised as: 1,2,3,4+ ----
ggHhDT <- ggHhDT[, nKids := nChildren0_12 + nTeenagers13_18]
table(ggHhDT$nKids, useNA = "always")
message("There are NA - we will need to remove them later")
ggHhDT <- ggHhDT[, nKids_0 := ifelse(nKids == 0, 1, 0)]
ggHhDT <- ggHhDT[, nKids_1m := ifelse(nKids > 0 , 1, 0)]
message("Household data: n kids")
table(ggHhDT$nKids_0)
table(ggHhDT$nKids_1m)
# create category
ggHhDT <- ggHhDT[, presenceKids := ifelse(nKids == 0, "None", "1+")]
table(ggHhDT$nKids, ggHhDT$presenceKids, useNA = "always")
Households: Number of people
# survey: number of people: categorised as: 1,2,3,4+ ----
ggHhDT <- ggHhDT[, nPeople := nAdults + nChildren0_12 + nTeenagers13_18]
table(ggHhDT$nPeople, useNA = "always")
message("There are NA - we will need to remove them later")
ggHhDT <- ggHhDT[, nPeople_1 := ifelse(nPeople == 1, 1, 0)]
ggHhDT <- ggHhDT[, nPeople_2 := ifelse(nPeople == 2, 1, 0)]
ggHhDT <- ggHhDT[, nPeople_3 := ifelse(nPeople == 3, 1, 0)]
ggHhDT <- ggHhDT[, nPeople_4m := ifelse(nPeople > 3, 1, 0)]
message("Household data: n people")
table(ggHhDT$nPeople_1)
table(ggHhDT$nPeople_2)
table(ggHhDT$nPeople_3)
table(ggHhDT$nPeople_4m)
# create category
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_1 == 1, "1", NA)]
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_2 == 1, "2", nPeopleCat)]
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_3 == 1, "3", nPeopleCat)]
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_4m == 1, "4+", nPeopleCat)]
table(ggHhDT$nPeopleCat, ggHhDT$nPeople, useNA = "always")
Dwellings: Number of rooms
We have to do some imputation here
# survey: n bedrooms via Q10 ----
ggHhDT <- ggHhDT[, nRooms := `Q10#1_1_1_TEXT` + # bedrooms
`Q10#1_2_1_TEXT` + # living rooms
`Q10#1_3_1_TEXT` + # dining rooms-kitchens
`Q10#1_4_1_TEXT` + # kitchens
`Q10#1_5_1_TEXT` # studies
# Q10#1_6_1_TEXT + # bathrooms - excluded from census http://archive.stats.govt.nz/Census/2013-census/info-about-2013-census-data/2013-census-definitions-forms/definitions/dwelling.aspx
# Q10#1_7_1_TEXT + # toilets - excluded from census
# Q10#1_8_1_TEXT + # laundries - excluded from census
]
table(ggHhDT$nRooms, useNA = "always")
message("We're quite short of responses to nRooms - we will try to impute them from the number of adults and children using a regression model...")
message("Test the model")
r <- lm(ggHhDT$nRooms ~ ggHhDT$nAdults + ggHhDT$nKids)
summary(r)
# > impute where missing ----
message("that looks reasonable...impute where missing")
ggHhDT <- ggHhDT[, nRoomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
# check what we got
ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRooms)) +
geom_point() # original shape
ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRoomsImputed)) +
geom_point() + # imputed shape
labs(caption = "Well, it is a linear model...")
message("So we use the imputed values only where we had NA")
ggHhDT <- ggHhDT[, nRoomsCorrected := ifelse(is.na(nRooms),
round(nRoomsImputed),
nRooms
)
]
message("How bad does that look?")
ggHhDT <- ggHhDT[, roomsImputed := ifelse(is.na(nRooms), "Imputed",
"observed")]
ggplot2::ggplot(ggHhDT, aes(x = nRoomsImputed, y = nRoomsCorrected, colour = roomsImputed)) +
geom_jitter() #
message("Well, we can live with that for now...")
#
message("There are NA - we will need to remove them later")
ggHhDT <- ggHhDT[, nRooms1_4 := ifelse(nRoomsCorrected < 5, 1, 0)]
ggHhDT <- ggHhDT[, nRooms5_6 := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, 1, 0)]
ggHhDT <- ggHhDT[, nRooms7m := ifelse(nRoomsCorrected > 6, 1, 0)]
table(ggHhDT$nRooms1_4)
table(ggHhDT$nRooms5_6)
table(ggHhDT$nRooms7m)
# create category
ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected < 5, "1-4", NA)]
ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, "5-6", nRoomsCat)]
ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 6, "7+", nRoomsCat)]
message("Check coding")
table(ggHhDT$nRoomsCat, ggHhDT$nRoomsCorrected, useNA = "always")
Dwellings: Number of bedrooms
Probably collinear with n people/n rooms. Also have to impute.
# survey: Q10#1_1_1_TEXT ----
ggHhDT <- ggHhDT[, nBedrooms := `Q10#1_1_1_TEXT`]
table(ggHhDT$nBedrooms, useNA = "always")
# best to do 1-2, 3, 4m
message("We're quite short of responses to nBedrooms - we will try to impute them from the number of people using a regression model...")
message("Test the model")
r <- lm(ggHhDT$nBedrooms ~ ggHhDT$nAdults + ggHhDT$nKids)
summary(r)
# > impute where missing ----
message("that looks reasonable...impute where missing")
ggHhDT <- ggHhDT[, nBedroomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
# check what we got
ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedrooms)) +
geom_point() # original shape
ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedroomsImputed)) +
geom_point() + # imputed shape
labs(caption = "Well, it is a linear model...")
message("So we use the imputed values only where we had NA and we round them to whole numbers")
ggHhDT <- ggHhDT[, nBedroomsCorrected := ifelse(is.na(nBedrooms),
round(nBedroomsImputed),
nBedrooms
)
]
message("How bad does that look?")
ggHhDT <- ggHhDT[, bedroomsImputed := ifelse(is.na(nBedrooms), "Imputed",
"observed")]
ggplot2::ggplot(ggHhDT, aes(x = nBedroomsImputed, y = nBedroomsCorrected, colour = bedroomsImputed)) +
geom_jitter() #
message("Well, we can live with that for now...")
ggHhDT <- ggHhDT[, nBedrooms_1_2 := ifelse(nBedroomsCorrected < 3, 1, 0)]
ggHhDT <- ggHhDT[, nBedrooms_3 := ifelse(nBedroomsCorrected == 3, 1, 0)]
ggHhDT <- ggHhDT[, nBedrooms_4m := ifelse(nBedroomsCorrected > 3, 1, 0)]
message("Household data: n bedrooms")
table(ggHhDT$nBedrooms_1_2)
table(ggHhDT$nBedrooms_3)
table(ggHhDT$nBedrooms_4m)
# create category
ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_1_2 == 1, "1-4", NA)]
ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_3 == 1, "5-6", nBedroomsCat)]
ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_4m == 1, "7+", nBedroomsCat)]
table(ggHhDT$nBedroomsCat, ggHhDT$nBedroomsCorrected, useNA = "always")
Notice that the total counts for bedrooms are lower because they are counts of dwellings instead of households.
Dwellings: Main heat source for hot water
Survey - known Census - unknown
Dwellings: Main fuel used for heat
# survey: Q20 ----
table(ggHhDT$Q20_coded, useNA = "always")
ggHhDT <- ggHhDT[, heatFuel := dplyr::recode(Q20_coded,
"Enclosed wood burner" = "Wood",
"Open fire" = "Wood", # could be coal
"Heat pump"= "Electricity",
"HRV or other ventilation system" = "Electricity",
"Portable electric heaters" = "Electricity",
"Portable gas heater" = "Gas",
"Underfloor gas heating" = "Gas",
"Other" = "Other")]
table(ggHhDT$Q20_coded, ggHhDT$heatFuel, useNA = "always")
ggHhDT <- ggHhDT[, heatSourceWood := ifelse(heatFuel == "Wood", 1,0)]
ggHhDT <- ggHhDT[, heatSourceElectricity := ifelse(heatFuel == "Electricity", 1,0)]
ggHhDT <- ggHhDT[, heatSourceGas := ifelse(heatFuel == "Gas", 1,0)]
ggHhDT <- ggHhDT[, heatSourceCoal := ifelse(heatFuel == "Coal", 1,0)]
ggHhDT <- ggHhDT[, heatSourceOther := ifelse(heatFuel == "Other", 1,0)]
table(ggHhDT$heatFuel, ggHhDT$Location, useNA = "always")
Dwelling: Dwelling type
Survey - not known Census - known
Fix final survey data
Filter survey data to include just the variables we want and fix any NAs.
surveyDT <- ggHhDT[, .(linkID,
nKids_0, nKids_1m,
nPeople_1, nPeople_2, nPeople_3, nPeople_4m,
nRooms1_4, nRooms5_6, nRooms7m,
heatSourceWood, heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceOther,
nBedrooms_1_2, nBedrooms_3, nBedrooms_4m)]
skimr::skim(surveyDT)
message("There are still a few NAs so we need to remove these households")
surveyDT <- na.omit(surveyDT)
skimr::skim(surveyDT)
message("Fixed it?")
Select areas of interest
Now select just the census data for:
- Hawke's Bay Region
- Taranaki Region
censusAuWideDT <- au2013DT[REGC2013_label == "Hawke's Bay Region" |
REGC2013_label == "Taranaki Region"]
message("Total households (rooms table) = ", sum(censusAuWideDT$nrooms_totalHouseholds))
message("Total households (fuel table) = ", sum(censusAuWideDT$fuel_totalHouseholds))
message("Total households (npeople table) = ", sum(censusAuWideDT$npeople_totalHouseholds))
message("Total families (kids table) = ", sum(censusAuWideDT$nkids_totalFamilies))
t <- censusAuWideDT[, .(nHouseholdsRooms = sum(nrooms_totalHouseholds),
nHouseholdsFuel = sum(fuel_totalHouseholds),
nHouseholdsPeople = sum(npeople_totalHouseholds)), keyby = REGC2013_label]
kableExtra::kable(t, caption = "Household counts by region (by table source)")