diff --git a/dataProcessing/processCensusAu2013.Rmd b/dataProcessing/processCensusAu2013.Rmd index f98aca10b7cacbb1da1974473c4aa90f1b7423b3..fe6ba9c9f86249023741361f9570b7c03bc951fe 100644 --- a/dataProcessing/processCensusAu2013.Rmd +++ b/dataProcessing/processCensusAu2013.Rmd @@ -227,6 +227,7 @@ ggplot2::ggplot(kids2013WDT, aes(x = calcSumPc)) + caption = "0 values indicate area units with small cell counts and hence data redaction") ``` + ## n People ```{r nPeople} @@ -386,14 +387,29 @@ au2013DT <- kids2013WDT[au2013DT] au2013DT <- npeople2013WDT[au2013DT] au2013DT <- nrooms2013WDT[au2013DT] -# recalculate n households without children here -au2013DT <- au2013DT[, nKids_0 := nrooms_totalHouseholds - nKids_1m] - message("N rows of data: ", nrow(au2013DT)) message("N areas: ", uniqueN(au2013DT$AU2013_label)) ``` -Save data... +## Impute household level kids counts +As the number of kids constraint is derived from a families base we need to impute the same values for the larger base of all households so that the totals of kids + no kids households is consistent with the other constraints. 
In the absence of household level data from the Census on the number of children in households we assume that: + + * households which are not families do not include children; + * => the number of households with children = the number of families with children; + * => we add the difference between the number of households and the number of families to the `0 kids` constraint + +```{r imputeHouseholdKids} +au2013DT[, nKids_0 := nKids_0_families + (npeople_totalHouseholds - nkids_totalFamilies)] +au2013DT[, nkids_totalHouseholds := nKids_0 + nKids_1m] +t <- head(au2013DT[,.(AU2013_code, nKids_0_families, nKids_0, nKids_1m, nkids_totalFamilies,nkids_totalHouseholds, npeople_totalHouseholds)]) + +kableExtra::kable(t, caption = "Test nKids_0 imputation") %>% + kable_styling() +``` + +## Save data + +Save data for re-use ```{r saveData} of <- paste0(p_Params$dataPath, "processed/2013IpfInput.csv") @@ -406,11 +422,29 @@ names(au2013DT) # Check household totals +```{r nNzHH} +# 2013 +# 1,549,890 +# http://archive.stats.govt.nz/Census/2013-census/profile-and-summary-reports/qstats-families-households/households.aspx +# basically matches sum(ipfCensusDT$npeople_totalHouseholds, na.rm = TRUE) +nzHhPop2013 <- 1549890 + +``` + +Totals: + + * Census 2013 household total `r tidyNum(nzHhPop2013)` - http://archive.stats.govt.nz/Census/2013-census/profile-and-summary-reports/qstats-families-households/households.aspx + * IPF constraints: fuel: `r tidyNum(sum(au2013DT$fuel_totalHouseholds, na.rm = TRUE))` + * IPF constraints: nrooms: `r tidyNum(sum(au2013DT$nrooms_totalHouseholds, na.rm = TRUE))` + * IPF constraints: npeople: `r tidyNum(sum(au2013DT$npeople_totalHouseholds, na.rm = TRUE))` + * IPF constraints: nkids (families): `r tidyNum(sum(au2013DT$nkids_totalFamilies, na.rm = TRUE))` + * IPF constraints: nkids (households): `r tidyNum(sum(au2013DT$nkids_totalHouseholds, na.rm = TRUE))` <- imputed + These will vary depending on the source table (families vs households vs 
dwellings etc) ```{r checkTotals} -pairsDT <- au2013DT[, .(fuel_totalHouseholds, nrooms_statedtotalHouseholds, nrooms_totalHouseholds, - npeople_totalHouseholds,fuel_totalStatedHouseholds, nkids_totalFamilies)] +pairsDT <- au2013DT[, .(fuel_totalHouseholds, nrooms_totalHouseholds, + npeople_totalHouseholds, nkids_totalFamilies, nkids_totalHouseholds)] pairs(pairsDT) ``` diff --git a/dataProcessing/processCensusAu2013.html b/dataProcessing/processCensusAu2013.html index 6fb75b995ed628a24744202e9b85d86fde89fa15..48266352a2d52c0f8e81b8934b9bf809e8b72b62 100644 --- a/dataProcessing/processCensusAu2013.html +++ b/dataProcessing/processCensusAu2013.html @@ -263,7 +263,7 @@ div.tocify { <h1 class="title toc-ignore">Process NZ Census 2013 data for use in ipf</h1> <h3 class="subtitle">v1.0</h3> <h4 class="author">Ben Anderson (University of Otago/University of Southampton)</h4> -<h4 class="date">Last run at: 2019-10-17 16:29:56</h4> +<h4 class="date">Last run at: 2019-10-21 15:21:52</h4> </div> @@ -2850,7 +2850,188 @@ Figure 3.4: Test check sum % <pre><code>## Merging on AU2013_code * AU2013_label</code></pre> <pre><code>## N rows of data: 2020</code></pre> <pre><code>## N areas: 2020</code></pre> -<p>Save data…</p> +<div id="impute-household-level-kids-counts" class="section level2"> +<h2><span class="header-section-number">4.1</span> Impute household level kids counts</h2> +<p>As the number of kids constraint is derived from a families base we need to impute the same values for the larger base of all households so that the totals of kids + no kids households is consistent with the other constraints. 
In the absence of household level data from the Census on the number of children in households we assume that:</p> +<ul> +<li>households which are not families do not include children;</li> +<li>=> the number of households with children = the number of families with children;</li> +<li>=> we add the difference between the number of households and the number of families to the <code>0 kids</code> constraint</li> +</ul> +<table class="table" style="margin-left: auto; margin-right: auto;"> +<caption> +<span id="tab:imputeHouseholdKids">Table 4.1: </span>Test nKids_0 imputation +</caption> +<thead> +<tr> +<th style="text-align:left;"> +AU2013_code +</th> +<th style="text-align:right;"> +nKids_0_families +</th> +<th style="text-align:right;"> +nKids_0 +</th> +<th style="text-align:right;"> +nKids_1m +</th> +<th style="text-align:right;"> +nkids_totalFamilies +</th> +<th style="text-align:right;"> +nkids_totalHouseholds +</th> +<th style="text-align:right;"> +npeople_totalHouseholds +</th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:left;"> +500100 +</td> +<td style="text-align:right;"> +33 +</td> +<td style="text-align:right;"> +81 +</td> +<td style="text-align:right;"> +42 +</td> +<td style="text-align:right;"> +75 +</td> +<td style="text-align:right;"> +123 +</td> +<td style="text-align:right;"> +123 +</td> +</tr> +<tr> +<td style="text-align:left;"> +500202 +</td> +<td style="text-align:right;"> +690 +</td> +<td style="text-align:right;"> +1200 +</td> +<td style="text-align:right;"> +522 +</td> +<td style="text-align:right;"> +1212 +</td> +<td style="text-align:right;"> +1722 +</td> +<td style="text-align:right;"> +1722 +</td> +</tr> +<tr> +<td style="text-align:left;"> +500203 +</td> +<td style="text-align:right;"> +300 +</td> +<td style="text-align:right;"> +606 +</td> +<td style="text-align:right;"> +165 +</td> +<td style="text-align:right;"> +465 +</td> +<td style="text-align:right;"> +771 +</td> +<td style="text-align:right;"> +771 +</td> +</tr> 
+<tr> +<td style="text-align:left;"> +500204 +</td> +<td style="text-align:right;"> +273 +</td> +<td style="text-align:right;"> +456 +</td> +<td style="text-align:right;"> +267 +</td> +<td style="text-align:right;"> +540 +</td> +<td style="text-align:right;"> +723 +</td> +<td style="text-align:right;"> +723 +</td> +</tr> +<tr> +<td style="text-align:left;"> +500205 +</td> +<td style="text-align:right;"> +144 +</td> +<td style="text-align:right;"> +261 +</td> +<td style="text-align:right;"> +129 +</td> +<td style="text-align:right;"> +273 +</td> +<td style="text-align:right;"> +390 +</td> +<td style="text-align:right;"> +390 +</td> +</tr> +<tr> +<td style="text-align:left;"> +500206 +</td> +<td style="text-align:right;"> +51 +</td> +<td style="text-align:right;"> +108 +</td> +<td style="text-align:right;"> +60 +</td> +<td style="text-align:right;"> +111 +</td> +<td style="text-align:right;"> +168 +</td> +<td style="text-align:right;"> +168 +</td> +</tr> +</tbody> +</table> +</div> +<div id="save-data" class="section level2"> +<h2><span class="header-section-number">4.2</span> Save data</h2> +<p>Save data for re-use</p> <pre><code>## Warning in `[<-.data.table`(x, j = name, value = value): Column 'count.NA' ## does not exist to remove</code></pre> <pre><code>## Data saved as: ~/Data/NZ_Census/data/processed/2013IpfInput.csv</code></pre> @@ -2871,10 +3052,20 @@ Figure 3.4: Test check sum % ## [27] "heatSourceOther" "heatSourceWood" ## [29] "i.calcSum.1" "i.calcSumPc.1" ## [31] "REGC2013_label" "nMBs" -## [33] "nKids_0"</code></pre> +## [33] "nKids_0" "nkids_totalHouseholds"</code></pre> +</div> </div> <div id="check-household-totals" class="section level1"> <h1><span class="header-section-number">5</span> Check household totals</h1> +<p>Totals:</p> +<ul> +<li>Census 2013 household total 1,549,890 - <a href="http://archive.stats.govt.nz/Census/2013-census/profile-and-summary-reports/qstats-families-households/households.aspx" 
class="uri">http://archive.stats.govt.nz/Census/2013-census/profile-and-summary-reports/qstats-families-households/households.aspx</a></li> +<li>IPF constraints: fuel: 1,561,905</li> +<li>IPF constraints: nrooms: 1,561,905</li> +<li>IPF constraints: npeople: 1,549,875</li> +<li>IPF constraints: nkids (families): 1,136,394</li> +<li>IPF constraints: nkids (households): 1,549,875 <- imputed</li> +</ul> <p>These will vary depending on the source table (families vs households vs dwellings etc)</p> <p><img src="processCensusAu2013_files/figure-html/checkTotals-1.png" width="672" /></p> <p>We focus on households/families/dwellings not individuals as the spatial microsimulation will operate at the household level.</p> @@ -2886,7 +3077,7 @@ Figure 3.4: Test check sum % <h1><span class="header-section-number">7</span> About</h1> <p>This Action has received funding from the European Union’s Horizon 2020 research and innovation programme under the <a href="http://ec.europa.eu/research/mariecurieactions/index_en.htm">Marie Skłodowska-Curie</a> grant agreement No 700386 (<a href="http://www.energy.soton.ac.uk/tag/spatialec/">SPATIALEC</a>).</p> <p>This work is (c) 2019 the University of Southampton.</p> -<p>Analysis completed in 35.159 seconds ( 0.59 minutes) using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a> with R version 3.5.2 (2018-12-20) running on x86_64-apple-darwin15.6.0.</p> +<p>Analysis completed in 45.187 seconds ( 0.75 minutes) using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a> with R version 3.5.2 (2018-12-20) running on x86_64-apple-darwin15.6.0.</p> </div> <div id="annexes" class="section level1"> <h1><span class="header-section-number">8</span> Annexes</h1> @@ -2894,15 +3085,15 @@ Figure 3.4: Test check sum % <h2><span class="header-section-number">8.1</span> Census data</h2> <pre><code>## Skim summary statistics ## n obs: 2020 -## n 
variables: 33 +## n variables: 34 ## -## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────────────────── +## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## variable missing complete n min max empty n_unique ## AU2013_code 0 2020 2020 6 6 0 2020 ## AU2013_label 0 2020 2020 4 34 0 2020 ## REGC2013_label 0 2020 2020 12 24 0 17 ## -## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── +## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## variable missing complete n mean sd p0 ## calcSum 8 2012 2020 776.19 624.19 0 ## fuel_totalHouseholds 8 2012 2020 776.29 623.96 0 @@ -2915,10 +3106,11 @@ Figure 3.4: Test check sum % ## i.calcSum 8 2012 2020 770.13 618.81 0 ## i.calcSum.1 8 2012 2020 1170.03 898.84 0 ## i.calcSum.2 8 2012 2020 564.81 452.99 0 -## nKids_0 8 2012 2020 514.73 439.55 0 +## nKids_0 8 2012 2020 508.75 433.1 0 ## nKids_0_families 8 2012 2020 303.24 247 0 ## nKids_1m 8 2012 2020 261.57 220.29 0 ## nkids_totalFamilies 8 2012 2020 564.81 452.99 0 +## nkids_totalHouseholds 8 2012 2020 770.32 618.78 0 ## nMBs 0 2020 2020 23.09 16.9 1 ## nPeople_1 8 2012 2020 176.58 168.71 0 ## nPeople_2 8 2012 2020 262.21 217.39 0 @@ -2942,10 +3134,11 @@ Figure 3.4: Test check sum % ## 231 684 1191 5364 ▇▅▂▁▁▁▁▁ ## 393 1071 1773.75 5922 ▇▆▅▂▁▁▁▁ ## 168 502.5 861 2460 ▇▅▅▃▁▁▁▁ -## 153 438 777 5130 ▇▃▁▁▁▁▁▁ +## 152.25 430.5 771 4929 ▇▃▁▁▁▁▁▁ ## 96 262.5 453 1728 ▇▆▃▁▁▁▁▁ ## 69 222 402 1335 ▇▅▃▂▁▁▁▁ ## 168 502.5 861 2460 ▇▅▅▃▁▁▁▁ +## 231 681 1188.75 5367 ▇▅▂▁▁▁▁▁ ## 9 20 34 124 ▇▆▃▁▁▁▁▁ ## 48 132 261 1797 ▇▃▁▁▁▁▁▁ ## 84 228 390.75 2199 ▇▅▁▁▁▁▁▁ @@ -2958,7 +3151,7 @@ Figure 3.4: Test check sum % ## 84 261 489 
1758 ▇▅▃▂▁▁▁▁ ## 108 297 504.75 1992 ▇▆▃▁▁▁▁▁ ## -## ── Variable type:numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── +## ── Variable type:numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## variable missing complete n mean sd p0 p25 p50 p75 ## calcSumPc 131 1889 2020 98.5 12.46 0 99.69 100 100.28 ## i.calcSumPc 124 1896 2020 98.39 12.52 0 99.69 100 100.24 diff --git a/dataProcessing/processCensusAu2013_files/figure-html/checkTotals-1.png b/dataProcessing/processCensusAu2013_files/figure-html/checkTotals-1.png index cb3acd2731af4a0f2b84c56c3f5c9e33fa40d789..c71f5f355c79611c4fb11e0edc03bd949220e920 100644 Binary files a/dataProcessing/processCensusAu2013_files/figure-html/checkTotals-1.png and b/dataProcessing/processCensusAu2013_files/figure-html/checkTotals-1.png differ