Skip to content
Snippets Groups Projects
Commit eb6b93b3 authored by Ben Anderson's avatar Ben Anderson
Browse files

added census 2013 processing code

parent 9992a12f
No related branches found
No related tags found
No related merge requests found
Showing
with 11253 additions and 0 deletions
# Loads NZ Census area unit 2013 data
# processes to ipf form and matching coding to Green Grid survey
require(spatialec) # for parameters etc
myPackages <- c(
"data.table", # data munching
"ggplot2", # plot linear imputation model results
"here" # easy path management - https://speakerdeck.com/jennybc/zen-and-the-art-of-workflow-maintenance?slide=49
)
spatialec::loadLibraries(myPackages) # this will try to install packages it can't load
spatialec::setup()
# Load data ----
# NZ Area Unit data
# Loaded as separate files
# 2013 NZ Census data from NZ Stats at area unit level. For simplicity we use one file per constraint:
#
# * n people
# * n dependent children
# * fuel source (all counted - may cause confusion as sum to > 100% of households)
# * n rooms
#
# NB: these files, when downloaded form the [NZStats data extractor](http://nzdotstat.stats.govt.nz/wbos/Index.aspx?DataSetCode=TABLECODE8165#) come with higher levels of aggregation in the tables. These have to be removed by extracting just area unit rows.
#
# Load areas labels ----
# First load area labels as we use these to select the right data rows.
areasDT <- data.table::fread(p_Params$areasTable2013)
auListDT <- areasDT[, .(nMBs = .N), keyby = .(AU2013_code, AU2013_label, REGC2013_label)]
auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier matching
setkey(auListDT, AU2013_code)
# fuelSource ----
fuelf <- paste0(p_Params$dataPath, "raw/areaUnits/fuelSource/TABLECODE8100_Data_47b7b3fc-0e40-431f-b313-141de4fb0013.csv")
message("Loading: ", fuelf)
fuelDT <- data.table::fread(fuelf)
message("N rows loaded: ", dkUtils::tidyNum(nrow(fuelDT)))
fuelDT <- fuelDT[, AU2013_code := AREA]
setkey(fuelDT, AU2013_code)
fuelDT <- fuelDT[auListDT]
message("N unique area units (fuel data): ", uniqueN(fuelDT$AU2013_code))
# create categories
# "value.heatSourceWood", "value.heatSourceElectricity", "value.heatSourceGas", "value.heatSourceCoal", "value.heatSourceOther"
fuelDT <- fuelDT[, censusConstraint := "heatSourceOther"] # complex one - note this contains 'None' as well
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings, fuel type used to heat dwelling",
censusConstraint := "fuel_totalHouseholds"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings stated",
censusConstraint := "fuel_totalStatedHouseholds"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Wood",
censusConstraint := "heatSourceWood"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "Solar" | # <- what is this?
`Fuel type used to heat dwelling` == "Electricity",
censusConstraint := "heatSourceElectricity"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "gas",
censusConstraint := "heatSourceGas"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Coal",
censusConstraint := "heatSourceCoal"]
table(fuelDT$`Fuel type used to heat dwelling`, fuelDT$censusConstraint)
# convert to wide so variables in columns (for IPF)
fuelDT <- fuelDT[, count := Value]
fuel2013WDT <- reshape(fuelDT[YEAR == 2013,
.(AU2013_code,AU2013_label,censusConstraint,count)],
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
setnames(fuel2013WDT, c("count.fuel_totalHouseholds", "count.fuel_totalStatedHouseholds",
"count.heatSourceElectricity", "count.heatSourceGas",
"count.heatSourceWood", "count.heatSourceCoal", "count.heatSourceOther"),
c("fuel_totalHouseholds", "fuel_totalStatedHouseholds",
"heatSourceElectricity", "heatSourceGas",
"heatSourceWood", "heatSourceCoal", "heatSourceOther"))
# nKids ----
kidsf <- paste0(p_Params$dataPath, "raw/areaUnits/nKids/TABLECODE8141_Data_e6f03066-7bbf-4ba0-94b0-1821d5a4665a.csv")
message("Loading: ", kidsf)
kidsDT <- data.table::fread(kidsf)
message("N rows loaded: ", dkUtils::tidyNum(nrow(kidsDT)))
kidsDT <- kidsDT[, AU2013_code := AREA]
setkey(kidsDT, AU2013_code)
kidsDT <- kidsDT[auListDT]
message("N unique area units (kids data): ", uniqueN(kidsDT$AU2013_code))
# > create categories
# "nKids_0", "nKids_1m"
kidsDT <- kidsDT[, censusConstraint := "nKids_1m"] # we selected rows with dependent children in the census extractor
kidsDT <- kidsDT[`Family type by child dependency status` == "Total families",
censusConstraint := "nkids_totalFamilies"]
table(kidsDT$`Family type by child dependency status`, kidsDT$censusConstraint)
# > convert to wide
kidsDT <- kidsDT[, count := Value]
kids2013WDT <- reshape(kidsDT[YEAR == 2013,
.(AU2013_code,AU2013_label,censusConstraint,count)],
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
# > calculate n households with 0 kids
# do this again later when we have the number of _households_ to use as the base
kids2013WDT <- kids2013WDT[, nKids_0_families := count.nkids_totalFamilies - count.nKids_1m]
setnames(kids2013WDT, c("count.nkids_totalFamilies", "count.nKids_1m"),
c("nkids_totalFamilies", "nKids_1m"))
# nPeople ----
npeoplef <- paste0(p_Params$dataPath, "raw/areaUnits/nPeople/TABLECODE8169_Data_bfad6f1a-c9af-4adb-a141-e13a83e175d0.csv")
message("Loading: ", npeoplef)
npeopleDT <- data.table::fread(npeoplef)
message("N rows loaded: ", dkUtils::tidyNum(nrow(npeopleDT))
)
setkey(npeopleDT, Area) # forgot to get code attached to this one
npeopleDT <- npeopleDT[, AU2013_label := Area]
setkey(auListDT,AU2013_label) # set to label instead of code - should still work
npeopleDT <- npeopleDT[auListDT]
message("N unique area units (people data): ", uniqueN(npeopleDT$Area))
# > create categories ----
# "value.nPeople_1", "value.nPeople_2", "value.nPeople_3", "value.nPeople_4m",
npeopleDT <- npeopleDT[, censusConstraint := "nPeople_4m"] # default (most complex to code)
npeopleDT <- npeopleDT[`Number of usual residents in household` == "Total households",
censusConstraint := "npeople_totalHouseholds"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "One",
censusConstraint := "nPeople_1"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Two",
censusConstraint := "nPeople_2"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Three",
censusConstraint := "nPeople_3"]
table(npeopleDT$`Number of usual residents in household`, npeopleDT$censusConstraint)
# convert to wide
npeopleDT <- npeopleDT[, count := Value]
npeople2013WDT <- reshape(npeopleDT[Year == 2013, # helpful consistency of var names across NZ stats tables
.(AU2013_code,AU2013_label,censusConstraint,count)],
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
setnames(npeople2013WDT, c("count.nPeople_1", "count.nPeople_2", "count.nPeople_3", "count.nPeople_4m", "count.npeople_totalHouseholds"),
c("nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m", "npeople_totalHouseholds"))
# nRooms ----
nroomsf <- paste0(p_Params$dataPath, "raw/areaUnits/nRooms/TABLECODE8098_Data_62c5ce5c-23cf-44a2-b25e-b287fe9645e7.csv")
message("Loading: ", nroomsf)
nroomsDT <- data.table::fread(nroomsf)
message("N rows loaded: ", dkUtils::tidyNum(nrow(nroomsDT))
)
nroomsDT <- nroomsDT[, AU2013_code := AREA]
setkey(nroomsDT, AU2013_code)
setkey(auListDT,AU2013_code) # set to back to code
nroomsDT <- nroomsDT[auListDT]
message("N unique area units (rooms data): ", uniqueN(nroomsDT$AU2013_code))
# > create categories ----
# "value.nRooms1_4", "value.nRooms5_6", "value.nRooms7m",
nroomsDT <- nroomsDT[`Number of rooms` != "Not elsewhere included", censusConstraint := "nRooms7m"] # default (most complex to code)
nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings stated",
censusConstraint := "nrooms_statedtotalHouseholds"]
nroomsDT <- nroomsDT[`Number of rooms` == "Total dwellings, number of rooms",
censusConstraint := "nrooms_totalHouseholds"]
nroomsDT <- nroomsDT[`Number of rooms` %like% "One" |
`Number of rooms` %like% "Two" |
`Number of rooms` %like% "Three" |
`Number of rooms` %like% "Four",
censusConstraint := "nRooms1_4"]
nroomsDT <- nroomsDT[`Number of rooms` %like% "Five" |
`Number of rooms` %like% "Six" ,
censusConstraint := "nRooms5_6"]
table(nroomsDT$`Number of rooms`, nroomsDT$censusConstraint)
# convert to wide
nroomsDT <- nroomsDT[, count := Value]
nrooms2013WDT <- reshape(nroomsDT[Year == 2013, # helpful consistency of var names across NZ stats tables
.(AU2013_code,AU2013_label,censusConstraint,count)],
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
setnames(nrooms2013WDT, c("count.nRooms1_4", "count.nRooms5_6", "count.nRooms7m",
"count.nrooms_totalHouseholds", "count.nrooms_statedtotalHouseholds"),
c("nRooms1_4", "nRooms5_6", "nRooms7m",
"nrooms_totalHouseholds", "nrooms_statedtotalHouseholds"))
# combine them into 1 filw ----
setkey(auListDT, AU2013_code, AU2013_label)
setkey(fuel2013WDT, AU2013_code, AU2013_label)
setkey(kids2013WDT, AU2013_code, AU2013_label)
setkey(npeople2013WDT, AU2013_code, AU2013_label)
setkey(nrooms2013WDT, AU2013_code, AU2013_label)
au2013DT <- fuel2013WDT[auListDT]
au2013DT <- kids2013WDT[au2013DT]
au2013DT <- npeople2013WDT[au2013DT]
au2013DT <- nrooms2013WDT[au2013DT]
# recalulate n households without children
au2013DT <- au2013DT[, nKids_0 := nrooms_totalHouseholds- nKids_1m]
# Check household totals ----
# These will vary depending on the source table (families vs households vs dwellings etc)
pairsDT <- au2013DT[, .(fuel_totalHouseholds, nrooms_statedtotalHouseholds, nrooms_totalHouseholds,
npeople_totalHouseholds,fuel_totalStatedHouseholds, nkids_totalFamilies)]
pairs(pairsDT)
# We focus on households/families/dwellings not individuals as the spatial microsimulation will operate at the household level.
# save data for re-use ----
of <- paste0(p_Params$dataPath, "processed/2013IpfInput.csv")
data.table::fwrite(au2013DT, of)
message("Data saved as: ", of)
---
title: "Process NZ Census 2013 data for use in ipf"
subtitle: "v1.0"
author: "Ben Anderson (University of Otago/University of Southampton)"
date: 'Last run at: `r Sys.time()`'
output:
bookdown::html_document2:
code_folding: hide
fig_caption: yes
number_sections: yes
self_contained: no
toc: yes
toc_depth: 3
toc_float: yes
bibliography: '`r path.expand("~/bibliography.bib")`'
---
# Setup
```{r setup}
# Log compile time:
startTime <- proc.time()
require(spatialec) # for parameters etc
myPackages <- c(
"data.table", # data munching
"ggplot2", # plot linear imputation model results
"here", # easy path management - https://speakerdeck.com/jennybc/zen-and-the-art-of-workflow-maintenance?slide=49
"kableExtra", # fancy tables
"skimr" # descripive stats
)
spatialec::loadLibraries(myPackages) # Beware: this will try to install packages that may not be installed
spatialec::setup()
```
Additional R packages used in this report (`r myPackages`):
* data.table - for fast data munching [@data.table]
* ggplot2 - for slick graphics [@ggplot2]
* here - for path management
* kableExtra - for pretty tables [@kableExtra]
* knitr - to create this document [@knitr]
* skimr - for data summaries [@skimr]
# Report Purpose
To process NZ Census 2013 data (from `r p_Params$censusDataSource`) into a form that can be used in an ipf model of national/regional/local area demand.
It:
* Loads NZ Census area unit 2013 data
* processes to ipf form and matching coding to Green Grid survey
# Load data
NZ Area Unit data is loaded as separate files.
2013 NZ Census data from NZ Stats at area unit level. For simplicity we use one file per constraint:
* n people
* n dependent children
* fuel source (all counted - may cause confusion as sum to > 100% of households)
* n rooms
We could also use other tables with other constraints if they are available - e.g. N bedrooms.
NB: these files, when downloaded form the [NZStats data extractor](http://nzdotstat.stats.govt.nz/wbos/Index.aspx?DataSetCode=TABLECODE8165#) come with higher levels of aggregation in the tables. These have to be removed by extracting just area unit rows.
## Load areas labels
First load area labels as we use these to select the right data rows.
```{r loadLabels}
areasDT <- data.table::fread(p_Params$areasTable2013)
auListDT <- areasDT[, .(nMBs = .N), keyby = .(AU2013_code, AU2013_label, REGC2013_label)]
auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier matching
setkey(auListDT, AU2013_code)
kableExtra::kable(caption = "Example label data", head(auListDT))
```
## Fuel source
```{r fuelSource}
fuelf <- paste0(p_Params$dataPath, "raw/areaUnits/fuelSource/TABLECODE8100_Data_47b7b3fc-0e40-431f-b313-141de4fb0013.csv")
message("Loading: ", fuelf)
fuelDT <- data.table::fread(fuelf)
message("N rows loaded: ", dkUtils::tidyNum(nrow(fuelDT)))
fuelDT <- fuelDT[, AU2013_code := AREA]
setkey(fuelDT, AU2013_code)
fuelDT <- fuelDT[auListDT]
message("N unique area units (fuel data): ", uniqueN(fuelDT$AU2013_code))
# create categories
# "value.heatSourceWood", "value.heatSourceElectricity", "value.heatSourceGas", "value.heatSourceCoal", "value.heatSourceOther"
fuelDT <- fuelDT[, censusConstraint := "heatSourceOther"] # complex one - note this contains 'None' as well
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings, fuel type used to heat dwelling",
censusConstraint := "fuel_totalHouseholds"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Total dwellings stated",
censusConstraint := "fuel_totalStatedHouseholds"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Wood",
censusConstraint := "heatSourceWood"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "Solar" | # <- what is this?
`Fuel type used to heat dwelling` == "Electricity",
censusConstraint := "heatSourceElectricity"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` %like% "gas",
censusConstraint := "heatSourceGas"]
fuelDT <- fuelDT[`Fuel type used to heat dwelling` == "Coal",
censusConstraint := "heatSourceCoal"]
#table(fuelDT$`Fuel type used to heat dwelling`, fuelDT$censusConstraint)
# convert to wide so variables in columns (for IPF)
fuelDT <- fuelDT[, count := Value]
dt <- fuelDT[Year == 2013, .(count = sum(count, na.rm = TRUE)), # sum the counts by fuel type to ensure uniqe fuel type obs per area
keyby = .(AU2013_code,AU2013_label,censusConstraint)]
fuel2013WDT <- reshape(dt,
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
setnames(fuel2013WDT, c("count.fuel_totalHouseholds", "count.fuel_totalStatedHouseholds",
"count.heatSourceElectricity", "count.heatSourceGas",
"count.heatSourceWood", "count.heatSourceCoal", "count.heatSourceOther"),
c("fuel_totalHouseholds", "fuel_totalStatedHouseholds",
"heatSourceElectricity", "heatSourceGas",
"heatSourceWood", "heatSourceCoal", "heatSourceOther"))
```
```{r fuelDist, fig.cap="Distribution of check sum % "}
t <- table(fuelDT$`Fuel type used to heat dwelling`, fuelDT$censusConstraint)
kableExtra::kable(t, caption = "Cross check (all years)") %>%
kable_styling()
t <- summary(fuel2013WDT)
kableExtra::kable(t, caption = "Summary of 2013 data") %>%
kable_styling()
# check counts add up
fuel2013WDT[, calcSum := heatSourceElectricity + heatSourceGas + heatSourceWood + heatSourceCoal + heatSourceOther]
fuel2013WDT[, calcSumPc := 100 * (calcSum/fuel_totalHouseholds)]
# check
summary(fuel2013WDT$calcSumPc)
kableExtra::kable(head(fuel2013WDT[calcSumPc > 100]), digits = 2,
caption = "Example areas where > 100% households coded (multiple answers allowed so this table may be quite large)") %>%
kable_styling()
ggplot2::ggplot(fuel2013WDT, aes(x = calcSumPc)) +
geom_histogram(binwidth = 10) +
labs(x = "% of total households",
caption = "0 values indicate area units with small cell counts and hence data redaction\nNote there can be multiple responses")
```
## n Kids
```{r nKids}
kidsf <- paste0(p_Params$dataPath, "raw/areaUnits/nKids/TABLECODE8141_Data_e6f03066-7bbf-4ba0-94b0-1821d5a4665a.csv")
message("Loading: ", kidsf)
kidsDT <- data.table::fread(kidsf)
message("N rows loaded: ", dkUtils::tidyNum(nrow(kidsDT)))
kidsDT <- kidsDT[, AU2013_code := AREA]
setkey(kidsDT, AU2013_code)
kidsDT <- kidsDT[auListDT]
message("N unique area units (kids data): ", uniqueN(kidsDT$AU2013_code))
# > create categories
# "nKids_0", "nKids_1m"
kidsDT <- kidsDT[, censusConstraint := "nKids_1m"] # we selected rows with dependent children in the census extractor
kidsDT <- kidsDT[`Family type by child dependency status` == "Total families",
censusConstraint := "nkids_totalFamilies"]
#table(kidsDT$`Family type by child dependency status`, kidsDT$censusConstraint)
# > convert to wide
kidsDT <- kidsDT[, count := Value]
dt <- kidsDT[Year == 2013, .(count = sum(count, na.rm = TRUE)),
keyby = .(AU2013_code,AU2013_label,censusConstraint)]
kids2013WDT <- reshape(dt,
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
# > calculate n households with 0 kids
# do this again later when we have the number of _households_ to use as the base
kids2013WDT <- kids2013WDT[, nKids_0_families := count.nkids_totalFamilies - count.nKids_1m]
setnames(kids2013WDT, c("count.nkids_totalFamilies", "count.nKids_1m"),
c("nkids_totalFamilies", "nKids_1m"))
```
```{r nkidsDist, fig.cap="Distribution of check sum %"}
t <- table(kidsDT$`Family type by child dependency status`, kidsDT$censusConstraint)
kableExtra::kable(t, caption = "Cross check (all years)") %>%
kable_styling()
t <- summary(kids2013WDT)
kableExtra::kable(t, caption = "Summary of 2013 data") %>%
kable_styling()
# check counts add up
kids2013WDT[, calcSum := nKids_0_families + nKids_1m ]
kids2013WDT[, calcSumPc := 100 * (calcSum/nkids_totalFamilies)]
# check
summary(kids2013WDT$calcSumPc)
kableExtra::kable(head(kids2013WDT[calcSumPc > 100]), digits = 2,
caption = "Example areas where > 100% families coded") %>%
kable_styling()
ggplot2::ggplot(kids2013WDT, aes(x = calcSumPc)) +
geom_histogram(binwidth = 10) +
labs(x = "% of total families",
caption = "0 values indicate area units with small cell counts and hence data redaction")
```
## n People
```{r nPeople}
npeoplef <- paste0(p_Params$dataPath, "raw/areaUnits/nPeople/TABLECODE8169_Data_bfad6f1a-c9af-4adb-a141-e13a83e175d0.csv")
message("Loading: ", npeoplef)
npeopleDT <- data.table::fread(npeoplef)
message("N rows loaded: ", dkUtils::tidyNum(nrow(npeopleDT))
)
setkey(npeopleDT, Area) # forgot to get code attached to this one
npeopleDT <- npeopleDT[, AU2013_label := Area]
setkey(auListDT,AU2013_label) # set to label instead of code - should still work
npeopleDT <- npeopleDT[auListDT]
message("N unique area units (people data): ", uniqueN(npeopleDT$Area))
# > create categories ----
# "value.nPeople_1", "value.nPeople_2", "value.nPeople_3", "value.nPeople_4m",
npeopleDT <- npeopleDT[, censusConstraint := "nPeople_4m"] # default (most complex to code)
npeopleDT <- npeopleDT[`Number of usual residents in household` == "Total households",
censusConstraint := "npeople_totalHouseholds"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "One",
censusConstraint := "nPeople_1"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Two",
censusConstraint := "nPeople_2"]
npeopleDT <- npeopleDT[`Number of usual residents in household` %like% "Three",
censusConstraint := "nPeople_3"]
#table(npeopleDT$`Number of usual residents in household`, npeopleDT$censusConstraint)
# convert to wide
npeopleDT <- npeopleDT[, count := Value]
dt <- npeopleDT[Year == 2013, .(count = sum(count, na.rm = TRUE)),
keyby = .(AU2013_code,AU2013_label,censusConstraint)]
npeople2013WDT <- reshape(dt,
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
setnames(npeople2013WDT, c("count.nPeople_1", "count.nPeople_2", "count.nPeople_3", "count.nPeople_4m", "count.npeople_totalHouseholds"),
c("nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m", "npeople_totalHouseholds"))
```
```{r npeopleDist, fig.cap="Test check sum %"}
t <- table(npeopleDT$`Number of usual residents in household`, npeopleDT$censusConstraint)
kableExtra::kable(t, caption = "Cross check (all years)") %>%
kable_styling()
t <- summary(npeople2013WDT)
kableExtra::kable(t, caption = "Summary of 2013 data") %>%
kable_styling()
# check counts add up
npeople2013WDT[, calcSum := nPeople_1 + nPeople_2 + nPeople_3 + nPeople_4m]
npeople2013WDT[, calcSumPc := 100*(calcSum/npeople_totalHouseholds)]
# check
summary(npeople2013WDT$calcSumPc)
kableExtra::kable(head(npeople2013WDT[calcSumPc > 100]), digits = 2,
caption = "Example areas where > 100% households coded - check how close to 100% they are") %>%
kable_styling()
ggplot2::ggplot(npeople2013WDT, aes(x = calcSumPc)) +
geom_histogram(binwidth = 10) +
labs(x = "% of total households",
caption = "0 values indicate area units with small cell counts and hence data redaction")
```
## n Rooms
```{r nRooms}
nroomsf <- paste0(p_Params$dataPath, "raw/areaUnits/nRooms/TABLECODE8098_Data_62c5ce5c-23cf-44a2-b25e-b287fe9645e7.csv")
message("Loading: ", nroomsf)
nroomsDT <- data.table::fread(nroomsf)
message("N rows loaded: ", dkUtils::tidyNum(nrow(nroomsDT))
)
nroomsDT <- nroomsDT[, AU2013_code := AREA]
setkey(nroomsDT, AU2013_code)
setkey(auListDT,AU2013_code) # set to back to code
nroomsDT <- nroomsDT[auListDT]
message("N unique area units (rooms data): ", uniqueN(nroomsDT$AU2013_code))
# > create categories ----
# "value.nRooms1_4", "value.nRooms5_6", "value.nRooms7m",
nroomsDT <- nroomsDT[ROOMS > 0 & ROOMS < 5,
censusConstraint := "nRooms1_4"]
nroomsDT <- nroomsDT[ROOMS > 4 & ROOMS < 7,
censusConstraint := "nRooms5_6"]
nroomsDT <- nroomsDT[ROOMS > 6 ,
censusConstraint := "nRooms7m"]
# nroomsDT <- nroomsDT[ROOMS == 99,
# censusConstraint := "Other"] # exclude
nroomsDT <- nroomsDT[ROOMS == 999,
censusConstraint := "nrooms_statedtotalHouseholds"]
nroomsDT <- nroomsDT[ROOMS == 9999,
censusConstraint := "nrooms_totalHouseholds"]
setkey(nroomsDT, Year, Area)
nroomsDT <- nroomsDT[!is.na(censusConstraint) | !is.na(Year)]
# convert to wide
nroomsDT <- nroomsDT[, count := Value]
dt <- nroomsDT[Year == 2013, .(count = sum(count, na.rm = TRUE)), keyby = .(AU2013_code,AU2013_label,censusConstraint)]
nrooms2013WDT <- reshape(dt,
idvar = c("AU2013_code", "AU2013_label"),
timevar = "censusConstraint",
direction = "wide")
setnames(nrooms2013WDT, c("count.nRooms1_4", "count.nRooms5_6", "count.nRooms7m",
"count.nrooms_totalHouseholds", "count.nrooms_statedtotalHouseholds"),
c("nRooms1_4", "nRooms5_6", "nRooms7m",
"nrooms_totalHouseholds", "nrooms_statedtotalHouseholds"))
```
```{r nRoomsDist, fig.cap = "Test check sum %"}
t <- table(nroomsDT$`Number of rooms`, nroomsDT$censusConstraint)
kableExtra::kable(t, caption = "Cross check (all years)") %>%
kable_styling()
t <- summary(nrooms2013WDT)
kableExtra::kable(t, caption = "Summary of 2013 data") %>%
kable_styling()
# check counts add up
nrooms2013WDT[, calcSum := nRooms1_4 + nRooms5_6 + nRooms7m ]
nrooms2013WDT[, calcSumPc := 100*(calcSum/nrooms_totalHouseholds)]
# check
summary(nrooms2013WDT$calcSumPc)
kableExtra::kable(head(nrooms2013WDT[calcSumPc > 100]), digits = 2,
caption = "Example areas where > 100% households coded - check how close to 100% they are") %>%
kable_styling()
ggplot2::ggplot(nrooms2013WDT, aes(x = calcSumPc)) +
geom_histogram(binwidth = 10) +
labs(x = "% of total households",
caption = "0 values indicate area units with small cell counts and hence data redaction")
```
# Combine and save constraint files
```{r combinaData}
setkey(auListDT, AU2013_code, AU2013_label)
setkey(fuel2013WDT, AU2013_code, AU2013_label)
setkey(kids2013WDT, AU2013_code, AU2013_label)
setkey(npeople2013WDT, AU2013_code, AU2013_label)
setkey(nrooms2013WDT, AU2013_code, AU2013_label)
message("Merging on AU2013_code * AU2013_label")
au2013DT <- fuel2013WDT[auListDT]
au2013DT <- kids2013WDT[au2013DT]
au2013DT <- npeople2013WDT[au2013DT]
au2013DT <- nrooms2013WDT[au2013DT]
# recalculate n households without children here
au2013DT <- au2013DT[, nKids_0 := nrooms_totalHouseholds - nKids_1m]
message("N rows of data: ", nrow(au2013DT))
message("N areas: ", uniqueN(au2013DT$AU2013_label))
```
Save data...
```{r saveData}
of <- paste0(p_Params$dataPath, "processed/2013IpfInput.csv")
au2013DT$count.NA <- NULL
data.table::fwrite(au2013DT, of)
message("Data saved as: ", of)
message("Variables saved:")
names(au2013DT)
```
# Check household totals
These will vary depending on the source table (families vs households vs dwellings etc)
```{r checkTotals}
pairsDT <- au2013DT[, .(fuel_totalHouseholds, nrooms_statedtotalHouseholds, nrooms_totalHouseholds,
npeople_totalHouseholds,fuel_totalStatedHouseholds, nkids_totalFamilies)]
pairs(pairsDT)
```
We focus on households/families/dwellings not individuals as the spatial microsimulation will operate at the household level.
# Acknowledgements
```{r generic.Ack, child=p_Params$ackLong}
# generic acks
```
# About
```{r check.runtime}
t <- proc.time() - startTime
elapsed <- t[[3]]
```
```{r generic.About, child=p_Params$spatialecAbout}
# generic about include
```
Analysis completed in `r elapsed` seconds ( `r round(elapsed/60,2)` minutes) using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com) with `r R.version.string` running on `r R.version$platform`.
# Annexes
## Census data
```{r skimGgSurveyData}
if(exists("au2013DT")){
skimr::skim(au2013DT)
}
```
```{r runToHere}
```
# References
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
File added
File added
File added
File added
File added
File added
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment