Skip to content
Snippets Groups Projects
Commit ae8a05c4 authored by Ben Anderson's avatar Ben Anderson
Browse files

added code to process GREEN grid survey data for ipf input

parent 403dca21
No related branches found
No related tags found
No related merge requests found
# Loads GREEN Grid Household survey data
# processes to match census 2013 as far as possible
# saves processed file for future use
require(spatialec)
myPackages <- c(
"data.table", # data munching
"dkUtils", # utils - from https://github.com/dataknut/dkUtils
"ggplot2", # plot linear imputation model results
"here", # easy path management - https://speakerdeck.com/jennybc/zen-and-the-art-of-workflow-maintenance?slide=49
"skimr" # for skim
)
spatialec::loadLibraries(myPackages) # this will try to install packages it can't load
spatialec::setup()
# --- Project Settings ----
source(paste0(p_Params$projRoot, "/_scripts/spatialecSettings.R")) # loads knitr options & assigns file paths and loads system info
# ---- Local Settings ----
p_Params$censusDataSource <- "http://archive.stats.govt.nz/Census/2013-census/data-tables/meshblock-dataset.aspx"
p_Params$ggPath <- "~/Data/NZ_GREENGrid" # over-rides default for speed
# Print system information
outputMessage("Running on", p_Params$sysName,
"from projLoc =", projRoot) # sysName is set in spatialecSettings.R
## Load data ----
f <- paste0(p_Params$ggPath, "/safe/survey/ggHouseholdAttributesSafe_2019-04-09.csv") # latest - includes heat source &
message("Loading: ", f)
ggHhDT <- data.table::fread(f)
message("N rows loaded: ", dkUtils::tidyNum(nrow(ggHhDT)))
message("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHhDT$linkID)))
# We need household & Census files with just the variables (constraints) we're going to use. This will take a bit of creative re-coding.
## Families: Presence of children ----
# Survey:
#
# * nChildren0_12: 0,1,2+
# * nTeenagers13_18: 0,1,2+
#
# Census:
#
# * 0,1+ (simplest)
#>> survey: number of kids: categorised as: 1,2,3,4+ ----
ggHhDT <- ggHhDT[, nKids := nChildren0_12 + nTeenagers13_18]
table(ggHhDT$nKids, useNA = "always")
message("There are NA - we will need to remove them later")
ggHhDT <- ggHhDT[, nKids_0 := ifelse(nKids == 0, 1, 0)]
ggHhDT <- ggHhDT[, nKids_1m := ifelse(nKids > 0 , 1, 0)]
message("Household data: n kids")
table(ggHhDT$nKids_0)
table(ggHhDT$nKids_1m)
# create category
ggHhDT <- ggHhDT[, presenceKids := ifelse(nKids == 0, "None", "1+")]
message("Testing nKids")
table(ggHhDT$nKids, ggHhDT$presenceKids, useNA = "always")
# Households: Number of people ----
#>> survey: number of people: categorised as: 1,2,3,4+ ----
ggHhDT <- ggHhDT[, nPeople := nAdults + nChildren0_12 + nTeenagers13_18]
table(ggHhDT$nPeople, useNA = "always")
message("There are NA - we will need to remove them later")
ggHhDT <- ggHhDT[, nPeople_1 := ifelse(nPeople == 1, 1, 0)]
ggHhDT <- ggHhDT[, nPeople_2 := ifelse(nPeople == 2, 1, 0)]
ggHhDT <- ggHhDT[, nPeople_3 := ifelse(nPeople == 3, 1, 0)]
ggHhDT <- ggHhDT[, nPeople_4m := ifelse(nPeople > 3, 1, 0)]
message("Household data: n people")
table(ggHhDT$nPeople_1)
table(ggHhDT$nPeople_2)
table(ggHhDT$nPeople_3)
table(ggHhDT$nPeople_4m)
# create category
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_1 == 1, "1", NA)]
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_2 == 1, "2", nPeopleCat)]
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_3 == 1, "3", nPeopleCat)]
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_4m == 1, "4+", nPeopleCat)]
message("Testing n People")
table(ggHhDT$nPeopleCat, ggHhDT$nPeople, useNA = "always")
# Dwellings: Number of rooms ----
# We have to do some imputation here
#>> survey: n bedrooms via Q10 ----
ggHhDT <- ggHhDT[, nRooms := `Q10#1_1_1_TEXT` + # bedrooms
`Q10#1_2_1_TEXT` + # living rooms
`Q10#1_3_1_TEXT` + # dining rooms-kitchens
`Q10#1_4_1_TEXT` + # kitchens
`Q10#1_5_1_TEXT` # studies
# Q10#1_6_1_TEXT + # bathrooms - excluded from census http://archive.stats.govt.nz/Census/2013-census/info-about-2013-census-data/2013-census-definitions-forms/definitions/dwelling.aspx
# Q10#1_7_1_TEXT + # toilets - excluded from census
# Q10#1_8_1_TEXT + # laundries - excluded from census
]
message("Testing original n rooms (Q10)")
table(ggHhDT$nRooms, useNA = "always")
message("We're quite short of responses to nRooms - we will try to impute them from the number of adults and children using a regression model...")
message("Test the model")
r <- lm(ggHhDT$nRooms ~ ggHhDT$nAdults + ggHhDT$nKids)
summary(r)
# >> impute where missing ----
message("that looks reasonable...impute where missing")
ggHhDT <- ggHhDT[, nRoomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
# check what we got
ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRooms)) +
geom_point() # original shape
ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRoomsImputed)) +
geom_point() + # imputed shape
labs(caption = "Well, it is a linear model...")
message("So we use the imputed values only where we had NA")
ggHhDT <- ggHhDT[, nRoomsCorrected := ifelse(is.na(nRooms),
round(nRoomsImputed),
nRooms
)
]
message("How bad does that look?")
ggHhDT <- ggHhDT[, roomsImputed := ifelse(is.na(nRooms), "Imputed",
"observed")]
ggplot2::ggplot(ggHhDT, aes(x = nRoomsImputed, y = nRoomsCorrected, colour = roomsImputed)) +
geom_jitter() #
message("Well, we can live with that for now...")
message("There are NA - we will need to remove them later")
ggHhDT <- ggHhDT[, nRooms1_4 := ifelse(nRoomsCorrected < 5, 1, 0)]
ggHhDT <- ggHhDT[, nRooms5_6 := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, 1, 0)]
ggHhDT <- ggHhDT[, nRooms7m := ifelse(nRoomsCorrected > 6, 1, 0)]
table(ggHhDT$nRooms1_4)
table(ggHhDT$nRooms5_6)
table(ggHhDT$nRooms7m)
# create category
ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected < 5, "1-4", NA)]
ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, "5-6", nRoomsCat)]
ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 6, "7+", nRoomsCat)]
message("Check n Rooms coding")
table(ggHhDT$nRoomsCat, ggHhDT$nRoomsCorrected, useNA = "always")
# Dwellings: Number of bedrooms ----
# Probably collinear with n people/n rooms. Also have to impute.
#> survey: Q10#1_1_1_TEXT ----
ggHhDT <- ggHhDT[, nBedrooms := `Q10#1_1_1_TEXT`]
table(ggHhDT$nBedrooms, useNA = "always")
# best to do 1-2, 3, 4m
message("We're quite short of responses to nBedrooms - we will try to impute them from the number of people using a regression model...")
message("Test the model")
r <- lm(ggHhDT$nBedrooms ~ ggHhDT$nAdults + ggHhDT$nKids)
summary(r)
# > impute where missing ----
message("that looks reasonable...impute where missing")
ggHhDT <- ggHhDT[, nBedroomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
# check what we got
ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedrooms)) +
geom_point() # original shape
ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedroomsImputed)) +
geom_point() + # imputed shape
labs(caption = "Well, it is a linear model...")
message("So we use the imputed values only where we had NA and we round them to whole numbers")
ggHhDT <- ggHhDT[, nBedroomsCorrected := ifelse(is.na(nBedrooms),
round(nBedroomsImputed),
nBedrooms
)
]
message("How bad does that look?")
ggHhDT <- ggHhDT[, bedroomsImputed := ifelse(is.na(nBedrooms), "Imputed",
"observed")]
ggplot2::ggplot(ggHhDT, aes(x = nBedroomsImputed, y = nBedroomsCorrected, colour = bedroomsImputed)) +
geom_jitter() #
message("Well, we can live with that for now...")
ggHhDT <- ggHhDT[, nBedrooms_1_2 := ifelse(nBedroomsCorrected < 3, 1, 0)]
ggHhDT <- ggHhDT[, nBedrooms_3 := ifelse(nBedroomsCorrected == 3, 1, 0)]
ggHhDT <- ggHhDT[, nBedrooms_4m := ifelse(nBedroomsCorrected > 3, 1, 0)]
message("Household data: n bedrooms")
table(ggHhDT$nBedrooms_1_2)
table(ggHhDT$nBedrooms_3)
table(ggHhDT$nBedrooms_4m)
# create category
ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_1_2 == 1, "1-4", NA)]
ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_3 == 1, "5-6", nBedroomsCat)]
ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_4m == 1, "7+", nBedroomsCat)]
message("Testing n bedrooms")
table(ggHhDT$nBedroomsCat, ggHhDT$nBedroomsCorrected, useNA = "always")
# Notice that the total counts for bedrooms are lower because they are counts of _dwellings_ instead of households.
# Dwellings: Main heat source for hot water ----
# > Survey - known ----
# > Census - unknown ----
# Dwellings: Main fuel used for heat ----
#> survey: Q20 ----
table(ggHhDT$Q20_coded, useNA = "always")
ggHhDT <- ggHhDT[, heatFuel := dplyr::recode(Q20_coded,
"Enclosed wood burner" = "Wood",
"Open fire" = "Wood", # could be coal
"Heat pump"= "Electricity",
"HRV or other ventilation system" = "Electricity",
"Portable electric heaters" = "Electricity",
"Portable gas heater" = "Gas",
"Underfloor gas heating" = "Gas",
"Other" = "Other")]
table(ggHhDT$Q20_coded, ggHhDT$heatFuel, useNA = "always")
ggHhDT <- ggHhDT[, heatSourceWood := ifelse(heatFuel == "Wood", 1,0)]
ggHhDT <- ggHhDT[, heatSourceElectricity := ifelse(heatFuel == "Electricity", 1,0)]
ggHhDT <- ggHhDT[, heatSourceGas := ifelse(heatFuel == "Gas", 1,0)]
ggHhDT <- ggHhDT[, heatSourceCoal := ifelse(heatFuel == "Coal", 1,0)]
ggHhDT <- ggHhDT[, heatSourceOther := ifelse(heatFuel == "Other", 1,0)]
message("Testing main heat source")
table(ggHhDT$heatFuel, ggHhDT$Location, useNA = "always")
# Dwelling: Dwelling type ----
#> Survey - not known ----
#> Census - known ----
# Fix final survey data ----
# Filter survey data to include just the variables we want and fix any NAs.
surveyDT <- ggHhDT[, .(linkID,
nKids_0, nKids_1m,
nPeople_1, nPeople_2, nPeople_3, nPeople_4m,
nRooms1_4, nRooms5_6, nRooms7m,
heatSourceWood, heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceOther,
nBedrooms_1_2, nBedrooms_3, nBedrooms_4m)]
skimr::skim(surveyDT)
message("There are still a few NAs so we need to remove these households")
surveyDT <- na.omit(surveyDT)
skimr::skim(surveyDT)
message("Fixed it?")
# save data for re-use ----
data.table::fwrite(surveyDT, paste0(p_Params$ggPath, "/safe/survey/ggIpfInput.csv"))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment