diff --git a/dataProcessing/processGreenGridSurvey.R b/dataProcessing/processGreenGridSurvey.R new file mode 100644 index 0000000000000000000000000000000000000000..bc234e22cd6a4058436c4f34c9e88255a1e89c93 --- /dev/null +++ b/dataProcessing/processGreenGridSurvey.R @@ -0,0 +1,277 @@ +# Loads GREEN Grid Household survey data +# processes to match census 2013 as far as possible +# saves processed file for future use +require(spatialec) +myPackages <- c( + "data.table", # data munching + "dkUtils", # utils - from https://github.com/dataknut/dkUtils + "ggplot2", # plot linear imputation model results + "here", # easy path management - https://speakerdeck.com/jennybc/zen-and-the-art-of-workflow-maintenance?slide=49 + "skimr" # for skim +) + +spatialec::loadLibraries(myPackages) # this will try to install packages it can't load +spatialec::setup() + +# --- Project Settings ---- +source(paste0(p_Params$projRoot, "/_scripts/spatialecSettings.R")) # loads knitr options & assigns file paths and loads system info + +# ---- Local Settings ---- +p_Params$censusDataSource <- "http://archive.stats.govt.nz/Census/2013-census/data-tables/meshblock-dataset.aspx" + +p_Params$ggPath <- "~/Data/NZ_GREENGrid" # over-rides default for speed + +# Print system information +outputMessage("Running on", p_Params$sysName, + "from projLoc =", projRoot) # sysName is set in spatialecSettings.R + +## Load data ---- +f <- paste0(p_Params$ggPath, "/safe/survey/ggHouseholdAttributesSafe_2019-04-09.csv") # latest - includes heat source & +message("Loading: ", f) +ggHhDT <- data.table::fread(f) +message("N rows loaded: ", dkUtils::tidyNum(nrow(ggHhDT))) +message("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHhDT$linkID))) + + +# We need household & Census files with just the variables (constraints) we're going to use. This will take a bit of creative re-coding. + +## Families: Presence of children ---- + +# Survey: +# +# * nChildren0_12: 0,1,2+ +# * nTeenagers13_18: 0,1,2+ +# +# Census: +# +# * 0,1+ (simplest) + +#>> survey: number of kids: categorised as: 1,2,3,4+ ---- +ggHhDT <- ggHhDT[, nKids := nChildren0_12 + nTeenagers13_18] +table(ggHhDT$nKids, useNA = "always") +message("There are NA - we will need to remove them later") +ggHhDT <- ggHhDT[, nKids_0 := ifelse(nKids == 0, 1, 0)] +ggHhDT <- ggHhDT[, nKids_1m := ifelse(nKids > 0 , 1, 0)] + +message("Household data: n kids") +table(ggHhDT$nKids_0) +table(ggHhDT$nKids_1m) + +# create category +ggHhDT <- ggHhDT[, presenceKids := ifelse(nKids == 0, "None", "1+")] + +message("Testing nKids") +table(ggHhDT$nKids, ggHhDT$presenceKids, useNA = "always") + + +# Households: Number of people ---- + +#>> survey: number of people: categorised as: 1,2,3,4+ ---- +ggHhDT <- ggHhDT[, nPeople := nAdults + nChildren0_12 + nTeenagers13_18] +table(ggHhDT$nPeople, useNA = "always") +message("There are NA - we will need to remove them later") +ggHhDT <- ggHhDT[, nPeople_1 := ifelse(nPeople == 1, 1, 0)] +ggHhDT <- ggHhDT[, nPeople_2 := ifelse(nPeople == 2, 1, 0)] +ggHhDT <- ggHhDT[, nPeople_3 := ifelse(nPeople == 3, 1, 0)] +ggHhDT <- ggHhDT[, nPeople_4m := ifelse(nPeople > 3, 1, 0)] + +message("Household data: n people") +table(ggHhDT$nPeople_1) +table(ggHhDT$nPeople_2) +table(ggHhDT$nPeople_3) +table(ggHhDT$nPeople_4m) + +# create category +ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_1 == 1, "1", NA)] +ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_2 == 1, "2", nPeopleCat)] +ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_3 == 1, "3", nPeopleCat)] +ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_4m == 1, "4+", nPeopleCat)] + +message("Testing n People") +table(ggHhDT$nPeopleCat, ggHhDT$nPeople, useNA = "always") + +# Dwellings: Number of rooms ---- + +# We have to do some imputation here + +#>> survey: n bedrooms via Q10 ---- +ggHhDT <- ggHhDT[, nRooms := `Q10#1_1_1_TEXT` + # bedrooms + `Q10#1_2_1_TEXT` + # living rooms + `Q10#1_3_1_TEXT` + # dining rooms-kitchens + `Q10#1_4_1_TEXT` + # kitchens + `Q10#1_5_1_TEXT` # studies + # Q10#1_6_1_TEXT + # bathrooms - excluded from census http://archive.stats.govt.nz/Census/2013-census/info-about-2013-census-data/2013-census-definitions-forms/definitions/dwelling.aspx + # Q10#1_7_1_TEXT + # toilets - excluded from census + # Q10#1_8_1_TEXT + # laundries - excluded from census + ] +message("Testing original n rooms (Q10)") +table(ggHhDT$nRooms, useNA = "always") +message("We're quite short of responses to nRooms - we will try to impute them from the number of adults and children using a regression model...") +message("Test the model") +r <- lm(ggHhDT$nRooms ~ ggHhDT$nAdults + ggHhDT$nKids) + +summary(r) + +# >> impute where missing ---- +message("that looks reasonable...impute where missing") +ggHhDT <- ggHhDT[, nRoomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate + +# check what we got +ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRooms)) + + geom_point() # original shape + +ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRoomsImputed)) + + geom_point() + # imputed shape + labs(caption = "Well, it is a linear model...") + +message("So we use the imputed values only where we had NA") +ggHhDT <- ggHhDT[, nRoomsCorrected := ifelse(is.na(nRooms), + round(nRoomsImputed), + nRooms +) +] + +message("How bad does that look?") +ggHhDT <- ggHhDT[, roomsImputed := ifelse(is.na(nRooms), "Imputed", + "observed")] +ggplot2::ggplot(ggHhDT, aes(x = nRoomsImputed, y = nRoomsCorrected, colour = roomsImputed)) + + geom_jitter() # + +message("Well, we can live with that for now...") + +message("There are NA - we will need to remove them later") +ggHhDT <- ggHhDT[, nRooms1_4 := ifelse(nRoomsCorrected < 5, 1, 0)] +ggHhDT <- ggHhDT[, nRooms5_6 := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, 1, 0)] +ggHhDT <- ggHhDT[, nRooms7m := ifelse(nRoomsCorrected > 6, 1, 0)] +table(ggHhDT$nRooms1_4) +table(ggHhDT$nRooms5_6) +table(ggHhDT$nRooms7m) + +# create category +ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected < 5, "1-4", NA)] +ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, "5-6", nRoomsCat)] +ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 6, "7+", nRoomsCat)] + +message("Check n Rooms coding") +table(ggHhDT$nRoomsCat, ggHhDT$nRoomsCorrected, useNA = "always") + +# Dwellings: Number of bedrooms ---- + +# Probably collinear with n people/n rooms. Also have to impute. + +#> survey: Q10#1_1_1_TEXT ---- + +ggHhDT <- ggHhDT[, nBedrooms := `Q10#1_1_1_TEXT`] +table(ggHhDT$nBedrooms, useNA = "always") +# best to do 1-2, 3, 4m + +message("We're quite short of responses to nBedrooms - we will try to impute them from the number of people using a regression model...") +message("Test the model") +r <- lm(ggHhDT$nBedrooms ~ ggHhDT$nAdults + ggHhDT$nKids) + +summary(r) + +# > impute where missing ---- +message("that looks reasonable...impute where missing") +ggHhDT <- ggHhDT[, nBedroomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate + +# check what we got +ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedrooms)) + + geom_point() # original shape + +ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedroomsImputed)) + + geom_point() + # imputed shape + labs(caption = "Well, it is a linear model...") + +message("So we use the imputed values only where we had NA and we round them to whole numbers") +ggHhDT <- ggHhDT[, nBedroomsCorrected := ifelse(is.na(nBedrooms), + round(nBedroomsImputed), + nBedrooms +) +] + +message("How bad does that look?") +ggHhDT <- ggHhDT[, bedroomsImputed := ifelse(is.na(nBedrooms), "Imputed", + "observed")] +ggplot2::ggplot(ggHhDT, aes(x = nBedroomsImputed, y = nBedroomsCorrected, colour = bedroomsImputed)) + + geom_jitter() # + +message("Well, we can live with that for now...") + +ggHhDT <- ggHhDT[, nBedrooms_1_2 := ifelse(nBedroomsCorrected < 3, 1, 0)] +ggHhDT <- ggHhDT[, nBedrooms_3 := ifelse(nBedroomsCorrected == 3, 1, 0)] +ggHhDT <- ggHhDT[, nBedrooms_4m := ifelse(nBedroomsCorrected > 3, 1, 0)] +message("Household data: n bedrooms") +table(ggHhDT$nBedrooms_1_2) +table(ggHhDT$nBedrooms_3) +table(ggHhDT$nBedrooms_4m) + +# create category +ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_1_2 == 1, "1-4", NA)] +ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_3 == 1, "5-6", nBedroomsCat)] +ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_4m == 1, "7+", nBedroomsCat)] + +message("Testing n bedrooms") +table(ggHhDT$nBedroomsCat, ggHhDT$nBedroomsCorrected, useNA = "always") + + +# Notice that the total counts for bedrooms are lower because they are counts of _dwellings_ instead of households. + +# Dwellings: Main heat source for hot water ---- + +# > Survey - known ---- +# > Census - unknown ---- + +# Dwellings: Main fuel used for heat ---- + +#> survey: Q20 ---- + +table(ggHhDT$Q20_coded, useNA = "always") + +ggHhDT <- ggHhDT[, heatFuel := dplyr::recode(Q20_coded, + "Enclosed wood burner" = "Wood", + "Open fire" = "Wood", # could be coal + "Heat pump"= "Electricity", + "HRV or other ventilation system" = "Electricity", + "Portable electric heaters" = "Electricity", + "Portable gas heater" = "Gas", + "Underfloor gas heating" = "Gas", + "Other" = "Other")] + +table(ggHhDT$Q20_coded, ggHhDT$heatFuel, useNA = "always") +ggHhDT <- ggHhDT[, heatSourceWood := ifelse(heatFuel == "Wood", 1,0)] +ggHhDT <- ggHhDT[, heatSourceElectricity := ifelse(heatFuel == "Electricity", 1,0)] +ggHhDT <- ggHhDT[, heatSourceGas := ifelse(heatFuel == "Gas", 1,0)] +ggHhDT <- ggHhDT[, heatSourceCoal := ifelse(heatFuel == "Coal", 1,0)] +ggHhDT <- ggHhDT[, heatSourceOther := ifelse(heatFuel == "Other", 1,0)] + +message("Testing main heat source") +table(ggHhDT$heatFuel, ggHhDT$Location, useNA = "always") + + +# Dwelling: Dwelling type ---- + +#> Survey - not known ---- +#> Census - known ---- + +# Fix final survey data ---- + +# Filter survey data to include just the variables we want and fix any NAs. + +surveyDT <- ggHhDT[, .(linkID, + nKids_0, nKids_1m, + nPeople_1, nPeople_2, nPeople_3, nPeople_4m, + nRooms1_4, nRooms5_6, nRooms7m, + heatSourceWood, heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceOther, + nBedrooms_1_2, nBedrooms_3, nBedrooms_4m)] + +skimr::skim(surveyDT) + +message("There are still a few NAs so we need to remove these households") +surveyDT <- na.omit(surveyDT) + +skimr::skim(surveyDT) +message("Fixed it?") + +# save data for re-use ---- +data.table::fwrite(surveyDT, paste0(p_Params$ggPath, "/safe/survey/ggIpfInput.csv"))