_loadNonPowerData.Rmd

areasDT <- data.table::fread(sParams$areasTable2013)

auListDT <- areasDT[, .(nMBs = .N), keyby = .(AU2013_code, AU2013_label, REGC2013_label)]
auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier matching
setkey(auListDT, AU2013_code)
# fuelSource ----
fuelf <- paste0(sParams$dataPath, "raw/areaUnits/fuelSource/TABLECODE8100_Data_47b7b3fc-0e40-431f-b313-141de4fb0013.csv")
outputMessage(paste0("Loading: ", fuelf))
fuelDT <- data.table::fread(fuelf)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(fuelDT))))
fuelDT <- fuelDT[, AU2013_code := AREA]
setkey(fuelDT, AU2013_code)
fuelDT <- fuelDT[auListDT]
message("N unique area units (fuel data): ", uniqueN(fuelDT$AU2013_code))

# create categories
# "value.heatSourceWood", "value.heatSourceElectricity", "value.heatSourceGas", "value.heatSourceCoal", "value.heatSourceOther"

fuelDT <- fuelDT[, censusConstraint := "heatSourceOther"] # complex one - note this contains 'None' as well
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings, fuel type used to heat dwelling",
                 censusConstraint := "fuel_totalHouseholds"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings stated",
                 censusConstraint := "fuel_totalStatedHouseholds"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Wood",
                 censusConstraint := "heatSourceWood"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "Solar" | # <- what is this?
                   `Fuel type used to heat dwelling` == "Electricity",
                 censusConstraint := "heatSourceElectricity"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling`  %like% "gas",
                 censusConstraint := "heatSourceGas"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Coal",
                 censusConstraint := "heatSourceCoal"]
table(fuelDT$`Fuel type used to heat dwelling`, fuelDT$censusConstraint)

# convert to wide
fuelDT <- fuelDT[, count := Value]

fuel2013WDT <- reshape(fuelDT[YEAR == 2013,
                              .(AU2013_code,AU2013_label,censusConstraint,count)],
                          idvar = c("AU2013_code", "AU2013_label"),
                          timevar = "censusConstraint",
                          direction = "wide")

setnames(fuel2013WDT, c("count.fuel_totalHouseholds", "count.fuel_totalStatedHouseholds",
                        "count.heatSourceElectricity", "count.heatSourceGas",
                        "count.heatSourceWood", "count.heatSourceCoal", "count.heatSourceOther"),
         c("fuel_totalHouseholds", "fuel_totalStatedHouseholds",
                        "heatSourceElectricity", "heatSourceGas",
                        "heatSourceWood", "heatSourceCoal", "heatSourceOther"))

# nKids ----
kidsf <- paste0(sParams$dataPath, "raw/areaUnits/nKids/TABLECODE8141_Data_e6f03066-7bbf-4ba0-94b0-1821d5a4665a.csv")
outputMessage(paste0("Loading: ", kidsf))
kidsDT <- data.table::fread(kidsf)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(kidsDT))))
kidsDT <- kidsDT[, AU2013_code := AREA]
setkey(kidsDT, AU2013_code)
kidsDT <- kidsDT[auListDT]

message("N unique area units (kids data): ", uniqueN(kidsDT$AU2013_code))

# > create categories
# "nKids_0", "nKids_1m"
kidsDT <- kidsDT[, censusConstraint := "nKids_1m"] # we selected rows with dependent children in the census extractor
kidsDT <- kidsDT[`Family type by child dependency status` == "Total families",
                 censusConstraint := "nkids_totalFamilies"]

table(kidsDT$`Family type by child dependency status`, kidsDT$censusConstraint)

# > convert to wide
kidsDT <- kidsDT[, count := Value]

kids2013WDT <- reshape(kidsDT[YEAR == 2013,
                              .(AU2013_code,AU2013_label,censusConstraint,count)],
                          idvar = c("AU2013_code", "AU2013_label"),
                          timevar = "censusConstraint",
                          direction = "wide")
# > calculate n households with 0 kids
# do this again later when we have the number of _households_ to use as the base
kids2013WDT <- kids2013WDT[, nKids_0_families := count.nkids_totalFamilies - count.nKids_1m]
setnames(kids2013WDT, c("count.nkids_totalFamilies", "count.nKids_1m"),
         c("nkids_totalFamilies", "nKids_1m"))


# nPeople ----
npeoplef <- paste0(sParams$dataPath, "raw/areaUnits/nPeople/TABLECODE8169_Data_bfad6f1a-c9af-4adb-a141-e13a83e175d0.csv")
outputMessage(paste0("Loading: ", npeoplef))
npeopleDT <- data.table::fread(npeoplef)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(npeopleDT))))
setkey(npeopleDT, Area) # forgot to get code attached to this one
npeopleDT <- npeopleDT[, AU2013_label := Area]
setkey(auListDT,AU2013_label) # set to label instead of code - should still work
npeopleDT <- npeopleDT[auListDT]
message("N unique area units (people data): ", uniqueN(npeopleDT$Area))

# > create categories ----
# "value.nPeople_1", "value.nPeople_2", "value.nPeople_3", "value.nPeople_4m",
npeopleDT <- npeopleDT[, censusConstraint := "nPeople_4m"] # default (most complex to code)
npeopleDT <- npeopleDT[`Number of usual residents in household` == "Total households",
                       censusConstraint := "npeople_totalHouseholds"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "One",
                       censusConstraint := "nPeople_1"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Two",
                       censusConstraint := "nPeople_2"]
npeopleDT <- npeopleDT[`Number of usual residents in household`  %like% "Three",
                       censusConstraint := "nPeople_3"]
table(npeopleDT$`Number of usual residents in household`, npeopleDT$censusConstraint)

# convert to wide
npeopleDT <- npeopleDT[, count := Value]

npeople2013WDT <- reshape(npeopleDT[Year == 2013, # helpful consistency of var names across NZ stats tables
                              .(AU2013_code,AU2013_label,censusConstraint,count)],
                          idvar = c("AU2013_code", "AU2013_label"),
                          timevar = "censusConstraint",
                          direction = "wide")

setnames(npeople2013WDT, c("count.nPeople_1", "count.nPeople_2", "count.nPeople_3", "count.nPeople_4m", "count.npeople_totalHouseholds"),
         c("nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m", "npeople_totalHouseholds"))

# nRooms ----
nroomsf <- paste0(sParams$dataPath, "raw/areaUnits/nRooms/TABLECODE8098_Data_62c5ce5c-23cf-44a2-b25e-b287fe9645e7.csv")
outputMessage(paste0("Loading: ", nroomsf))
nroomsDT <- data.table::fread(nroomsf)
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(nroomsDT))))
nroomsDT <- nroomsDT[, AU2013_code := AREA]
setkey(nroomsDT, AU2013_code)
setkey(auListDT,AU2013_code) # set to back to code
nroomsDT <- nroomsDT[auListDT]
message("N unique area units (rooms data): ", uniqueN(nroomsDT$AU2013_code))

# > create categories ----
# "value.nRooms1_4", "value.nRooms5_6", "value.nRooms7m",
nroomsDT <- nroomsDT[`Number of rooms` != "Not elsewhere included", censusConstraint := "nRooms7m"] # default (most complex to code)
nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings stated",
                       censusConstraint := "nrooms_statedtotalHouseholds"]
nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings, number of rooms",
                       censusConstraint := "nrooms_totalHouseholds"]
nroomsDT <- nroomsDT[`Number of rooms` %like% "One" |
                       `Number of rooms` %like% "Two" |
                       `Number of rooms` %like% "Three" |
                       `Number of rooms` %like% "Four",
                       censusConstraint := "nRooms1_4"]
nroomsDT <- nroomsDT[`Number of rooms` %like% "Five" |
                       `Number of rooms` %like% "Six" ,
                       censusConstraint := "nRooms5_6"]
table(nroomsDT$`Number of rooms`, nroomsDT$censusConstraint)

# convert to wide
nroomsDT <- nroomsDT[, count := Value]

nrooms2013WDT <- reshape(nroomsDT[Year == 2013, # helpful consistency of var names across NZ stats tables
                              .(AU2013_code,AU2013_label,censusConstraint,count)],
                          idvar = c("AU2013_code", "AU2013_label"),
                          timevar = "censusConstraint",
                          direction = "wide")

setnames(nrooms2013WDT, c("count.nRooms1_4", "count.nRooms5_6", "count.nRooms7m",
                          "count.nrooms_totalHouseholds", "count.nrooms_statedtotalHouseholds"),
         c("nRooms1_4", "nRooms5_6", "nRooms7m",
           "nrooms_totalHouseholds", "nrooms_statedtotalHouseholds"))

# combine them ----
setkey(auListDT, AU2013_code, AU2013_label)
setkey(fuel2013WDT, AU2013_code, AU2013_label)
setkey(kids2013WDT, AU2013_code, AU2013_label)
setkey(npeople2013WDT, AU2013_code, AU2013_label)
setkey(nrooms2013WDT, AU2013_code, AU2013_label)

au2013DT <- fuel2013WDT[auListDT]
au2013DT <- kids2013WDT[au2013DT]
au2013DT <- npeople2013WDT[au2013DT]
au2013DT <- nrooms2013WDT[au2013DT]

# recalulate n households without children
au2013DT <- au2013DT[, nKids_0 :=  nrooms_totalHouseholds- nKids_1m]

pairsDT <- au2013DT[, .(fuel_totalHouseholds, nrooms_statedtotalHouseholds, nrooms_totalHouseholds,
                        npeople_totalHouseholds,fuel_totalStatedHouseholds, nkids_totalFamilies)]

pairs(pairsDT)
f <- paste0(sParams$ggPath, "/safe/survey/ggHouseholdAttributesSafe_2019-04-09.csv") # latest - includes heat source &
outputMessage(paste0("Loading: ", f))
ggHhDT <- data.table::as.data.table(readr::read_csv(f))
outputMessage(paste0("N rows loaded: ", dkUtils::tidyNum(nrow(ggHhDT))))
outputMessage(paste0("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHhDT$linkID))))

# survey: number of kids: categorised as: 1,2,3,4+ ----
ggHhDT <- ggHhDT[, nKids :=  nChildren0_12 + nTeenagers13_18]
table(ggHhDT$nKids, useNA = "always")
message("There are NA - we will need to remove them later")
ggHhDT <- ggHhDT[, nKids_0 := ifelse(nKids == 0, 1, 0)]
ggHhDT <- ggHhDT[, nKids_1m := ifelse(nKids > 0 , 1, 0)]

message("Household data: n kids")
table(ggHhDT$nKids_0)
table(ggHhDT$nKids_1m)

# create category
ggHhDT <- ggHhDT[, presenceKids := ifelse(nKids == 0, "None", "1+")]

table(ggHhDT$nKids, ggHhDT$presenceKids, useNA = "always")

# survey: number of people: categorised as: 1,2,3,4+ ----
ggHhDT <- ggHhDT[, nPeople := nAdults + nChildren0_12 + nTeenagers13_18]
table(ggHhDT$nPeople, useNA = "always")
message("There are NA - we will need to remove them later")
ggHhDT <- ggHhDT[, nPeople_1 := ifelse(nPeople == 1, 1, 0)]
ggHhDT <- ggHhDT[, nPeople_2 := ifelse(nPeople == 2, 1, 0)]
ggHhDT <- ggHhDT[, nPeople_3 := ifelse(nPeople == 3, 1, 0)]
ggHhDT <- ggHhDT[, nPeople_4m := ifelse(nPeople > 3, 1, 0)]

message("Household data: n people")
table(ggHhDT$nPeople_1)
table(ggHhDT$nPeople_2)
table(ggHhDT$nPeople_3)
table(ggHhDT$nPeople_4m)

# create category
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_1 == 1, "1", NA)]
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_2 == 1, "2", nPeopleCat)]
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_3 == 1, "3", nPeopleCat)]
ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_4m == 1, "4+", nPeopleCat)]

table(ggHhDT$nPeopleCat, ggHhDT$nPeople, useNA = "always")

# survey: n bedrooms via Q10 ----
ggHhDT <- ggHhDT[, nRooms := `Q10#1_1_1_TEXT` + # bedrooms
                 `Q10#1_2_1_TEXT` + # living rooms
                 `Q10#1_3_1_TEXT` + # dining rooms-kitchens
                 `Q10#1_4_1_TEXT` + # kitchens
                 `Q10#1_5_1_TEXT`  # studies
                 # Q10#1_6_1_TEXT + # bathrooms - excluded from census http://archive.stats.govt.nz/Census/2013-census/info-about-2013-census-data/2013-census-definitions-forms/definitions/dwelling.aspx
                 # Q10#1_7_1_TEXT + # toilets - excluded from census
                 # Q10#1_8_1_TEXT + # laundries - excluded from census
                 ]
table(ggHhDT$nRooms, useNA = "always")
message("We're quite short of responses to nRooms - we will try to impute them from the number of adults and children using a regression model...")
message("Test the model")
r <- lm(ggHhDT$nRooms ~ ggHhDT$nAdults + ggHhDT$nKids)

summary(r)

# > impute where missing ----
message("that looks reasonable...impute where missing")
ggHhDT <- ggHhDT[, nRoomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate

# check what we got
ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRooms)) +
  geom_point() # original shape

ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRoomsImputed)) +
  geom_point()  + # imputed shape
  labs(caption = "Well, it is a linear model...")

message("So we use the imputed values only where we had NA")
ggHhDT <- ggHhDT[, nRoomsCorrected := ifelse(is.na(nRooms),
                                                round(nRoomsImputed),
                                                nRooms
)
]

message("How bad does that look?")
ggHhDT <- ggHhDT[, roomsImputed := ifelse(is.na(nRooms), "Imputed",
                                             "observed")]
ggplot2::ggplot(ggHhDT, aes(x = nRoomsImputed, y = nRoomsCorrected, colour = roomsImputed)) +
  geom_jitter() #

message("Well, we can live with that for now...")

#
message("There are NA - we will need to remove them later")
ggHhDT <- ggHhDT[, nRooms1_4 := ifelse(nRoomsCorrected < 5, 1, 0)]
ggHhDT <- ggHhDT[, nRooms5_6 := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, 1, 0)]
ggHhDT <- ggHhDT[, nRooms7m := ifelse(nRoomsCorrected > 6, 1, 0)]
table(ggHhDT$nRooms1_4)
table(ggHhDT$nRooms5_6)
table(ggHhDT$nRooms7m)

# create category
ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected < 5, "1-4", NA)]
ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, "5-6", nRoomsCat)]
ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 6, "7+", nRoomsCat)]

message("Check coding")
table(ggHhDT$nRoomsCat, ggHhDT$nRoomsCorrected, useNA = "always")


# survey: Q10#1_1_1_TEXT ----

ggHhDT <- ggHhDT[, nBedrooms := `Q10#1_1_1_TEXT`]
table(ggHhDT$nBedrooms,  useNA = "always")
# best to do 1-2, 3, 4m

message("We're quite short of responses to nBedrooms - we will try to impute them from the number of people using a regression model...")
message("Test the model")
r <- lm(ggHhDT$nBedrooms ~ ggHhDT$nAdults + ggHhDT$nKids)

summary(r)

# > impute where missing ----
message("that looks reasonable...impute where missing")
ggHhDT <- ggHhDT[, nBedroomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate

# check what we got
ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedrooms)) +
  geom_point() # original shape

ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedroomsImputed)) +
  geom_point()  + # imputed shape
  labs(caption = "Well, it is a linear model...")

message("So we use the imputed values only where we had NA and we round them to whole numbers")
ggHhDT <- ggHhDT[, nBedroomsCorrected := ifelse(is.na(nBedrooms),
                                                round(nBedroomsImputed),
                                                nBedrooms
)
]

message("How bad does that look?")
ggHhDT <- ggHhDT[, bedroomsImputed := ifelse(is.na(nBedrooms), "Imputed",
                                             "observed")]
ggplot2::ggplot(ggHhDT, aes(x = nBedroomsImputed, y = nBedroomsCorrected, colour = bedroomsImputed)) +
  geom_jitter() #

message("Well, we can live with that for now...")

ggHhDT <- ggHhDT[, nBedrooms_1_2 := ifelse(nBedroomsCorrected < 3, 1, 0)]
ggHhDT <- ggHhDT[, nBedrooms_3 := ifelse(nBedroomsCorrected == 3, 1, 0)]
ggHhDT <- ggHhDT[, nBedrooms_4m := ifelse(nBedroomsCorrected > 3, 1, 0)]
message("Household data: n bedrooms")
table(ggHhDT$nBedrooms_1_2)
table(ggHhDT$nBedrooms_3)
table(ggHhDT$nBedrooms_4m)

# create category
ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_1_2 == 1, "1-4", NA)]
ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_3 == 1, "5-6", nBedroomsCat)]
ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_4m == 1, "7+", nBedroomsCat)]

table(ggHhDT$nBedroomsCat, ggHhDT$nBedroomsCorrected, useNA = "always")


# survey: Q20 ----

table(ggHhDT$Q20_coded,  useNA = "always")

ggHhDT <- ggHhDT[, heatFuel := dplyr::recode(Q20_coded,
                                             "Enclosed wood burner" = "Wood",
                                             "Open fire" = "Wood", # could be coal
                                             "Heat pump"= "Electricity",
                                             "HRV or other ventilation system" = "Electricity",
                                             "Portable electric heaters" = "Electricity",
                                             "Portable gas heater" = "Gas",
                                             "Underfloor gas heating" = "Gas",
                                             "Other" = "Other")]

table(ggHhDT$Q20_coded,  ggHhDT$heatFuel, useNA = "always")
ggHhDT <- ggHhDT[, heatSourceWood := ifelse(heatFuel == "Wood", 1,0)]
ggHhDT <- ggHhDT[, heatSourceElectricity := ifelse(heatFuel == "Electricity", 1,0)]
ggHhDT <- ggHhDT[, heatSourceGas := ifelse(heatFuel == "Gas", 1,0)]
ggHhDT <- ggHhDT[, heatSourceCoal := ifelse(heatFuel == "Coal", 1,0)]
ggHhDT <- ggHhDT[, heatSourceOther := ifelse(heatFuel == "Other", 1,0)]

table(ggHhDT$heatFuel,  ggHhDT$Location, useNA = "always")
surveyDT <- ggHhDT[, .(linkID,
                       nKids_0, nKids_1m,
                       nPeople_1, nPeople_2, nPeople_3, nPeople_4m,
                       nRooms1_4, nRooms5_6, nRooms7m,
                       heatSourceWood, heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceOther,
                       nBedrooms_1_2, nBedrooms_3, nBedrooms_4m)]

skimr::skim(surveyDT)

message("There are still a few NAs so we need to remove these households")
surveyDT <- na.omit(surveyDT)

skimr::skim(surveyDT)
message("Fixed it?")
censusAuWideDT <- au2013DT[REGC2013_label == "Hawke's Bay Region" |
                         REGC2013_label == "Taranaki Region"]

message("Total households (rooms table) = ", sum(censusAuWideDT$nrooms_totalHouseholds))

message("Total households (fuel table) = ", sum(censusAuWideDT$fuel_totalHouseholds))

message("Total households (npeople table) = ", sum(censusAuWideDT$npeople_totalHouseholds))

message("Total families (kids table) = ", sum(censusAuWideDT$nkids_totalFamilies))

t <- censusAuWideDT[, .(nHouseholdsRooms = sum(nrooms_totalHouseholds),
                        nHouseholdsFuel = sum(fuel_totalHouseholds),
                        nHouseholdsPeople = sum(npeople_totalHouseholds)), keyby = REGC2013_label]

kableExtra::kable(t, caption = "Household counts by region (by table source)")