From f92a7bd557799e0be8310f976a16ca8fcfb263c3 Mon Sep 17 00:00:00 2001 From: Ben Anderson <dataknut@icloud.com> Date: Tue, 23 May 2017 13:38:58 +0100 Subject: [PATCH] fixed to make use of new hashed pid & also an episode start time as a fake POSIXct & a string; latest run --- MTUS-W6-adult-episodes-data-processing.Rmd | 45 ++--- MTUS-W6-adult-episodes-data-processing.html | 206 +++++++++++++++----- MTUS-W6-adult-survey-data-processing.Rmd | 22 +-- MTUS-W6-adult-survey-data-processing.html | 25 +-- MTUS-W6-adult-survey-data-processing.md | 23 +-- 5 files changed, 185 insertions(+), 136 deletions(-) diff --git a/MTUS-W6-adult-episodes-data-processing.Rmd b/MTUS-W6-adult-episodes-data-processing.Rmd index 99d13b3..746e21b 100644 --- a/MTUS-W6-adult-episodes-data-processing.Rmd +++ b/MTUS-W6-adult-episodes-data-processing.Rmd @@ -127,29 +127,11 @@ We now delete the non-UK data leaving us with `r format(nrow(mtusUKEpsDT), big.m ```{r setKeys} # This works but we'll create a concatenated id to make life easier -setkey(mtusUKEpsDT, countrya, survey, swave, msamp, hldid, persid, id) +setkey(mtusUKEpsDT, countrya, survey, swave, msamp, hldid, persid, id) # sorts into order print("-> Create uniq id for diaries (for matching) and persons") # Create unique ids ---- - -# diarypid -mtusUKEpsDT$ba_diarypid <- - group_indices(mtusUKEpsDT, survey, - swave, - msamp, - hldid, - persid, - id - ) - -# pid -mtusUKEpsDT$ba_pid <- - group_indices(mtusUKEpsDT, survey, - swave, - msamp, - hldid, - persid - ) +ba_MTUScreateIds(mtusUKEpsDT) ``` ## Episode dates @@ -603,13 +585,13 @@ t <- mtusUKEpsDT[, .( N = length(epnum) ), - by = str_epStartTime - ][order(str_epStartTime)] + by = r_epStartTimeStr + ][order(r_epStartTimeStr)] kable(caption = "Check 1/2 hour coding (as string)",t) -print("# Check NAs in str_epStartTime (should be none)") -print(paste0("# N rows with NAs in str_epStartTime:", - nrow(mtusUKEpsDT[is.na(str_epStartTime)])) +print("# Check NAs in r_epStartTimeStr (should be none)") +print(paste0("# N rows with NAs in r_epStartTimeStr:", + nrow(mtusUKEpsDT[is.na(r_epStartTimeStr)])) ) ``` @@ -705,7 +687,7 @@ kable(caption = "Distribution of missing location", inout, eloc, mtrav, r_date, r_dow, time, - str_epStartTime, r_epStartDateTime, r_epEndDateTime)] + r_epStartTimeStr, r_epStartDateTime, r_epEndDateTime)] mtusUKEpsDT <- NULL # Save out the working file for later use (saves re-running) @@ -725,6 +707,15 @@ Variables retained in processed episodes file: `r kable(caption = "Variables retained in episodes file", names(MTUSW6UKdiaryEps_DT))` +```{r firstEps} +# print out first few rows +ht <- head(MTUSW6UKdiaryEps_DT) + +kable(caption = "First few rows of saved episode data", + ht + ) +``` + # Descriptive Statistics ## Number of episodes recorded @@ -771,7 +762,7 @@ Remember that the number of episodes relates to the number of respondents, the l ```{r episodesByActualYearMonth} kable( - table(as.POSIXlt(MTUSW6UKdiaryEps_DT$r_date)$mon, + table(month(MTUSW6UKdiaryEps_DT$r_date,label = TRUE, abbr = TRUE), # requires lubridate MTUSW6UKdiaryEps_DT$r_year, useNA = "always" ) diff --git a/MTUS-W6-adult-episodes-data-processing.html b/MTUS-W6-adult-episodes-data-processing.html index 0b1f17d..bfc97a8 100644 --- a/MTUS-W6-adult-episodes-data-processing.html +++ b/MTUS-W6-adult-episodes-data-processing.html @@ -118,7 +118,7 @@ $(document).ready(function () { <h1 class="title toc-ignore">MTUS World 6 Data Processing</h1> <h4 class="author"><em>Ben Anderson (<a 
href="mailto:b.anderson@soton.ac.uk">b.anderson@soton.ac.uk</a>, <a href="mailto:/@dataknut">/@dataknut</a>)</em></h4> -<h4 class="date"><em>Last run at: 2017-05-22 12:42:23</em></h4> +<h4 class="date"><em>Last run at: 2017-05-23 13:28:18</em></h4> </div> @@ -188,7 +188,7 @@ $(document).ready(function () { ## Unrecognized record type 7, subtype 18 encountered in system file</code></pre> <pre><code>## re-encoding from CP1252</code></pre> <pre><code>## user system elapsed -## 225.091 12.120 260.905</code></pre> +## 225.749 10.703 260.696</code></pre> <p>We have loaded 11,501,221 rows of data for 11 countries.</p> <pre class="r"><code>kable(caption = "Number of diaries per year", ba_tidyNum(table(mtusEpsW6DT$survey,droplevels(mtusEpsW6DT$countrya)) # removes unused countries @@ -512,30 +512,12 @@ $(document).ready(function () { <pre class="r"><code>mtusUKEpsDT <- subset(mtusEpsW6DT, countrya == "United Kingdom")</code></pre> <p>We now delete the non-UK data leaving us with 1,392,007 rows of survey data.</p> <pre class="r"><code># This works but we'll create a concatenated id to make life easier -setkey(mtusUKEpsDT, countrya, survey, swave, msamp, hldid, persid, id) +setkey(mtusUKEpsDT, countrya, survey, swave, msamp, hldid, persid, id) # sorts into order print("-> Create uniq id for diaries (for matching) and persons")</code></pre> <pre><code>## [1] "-> Create uniq id for diaries (for matching) and persons"</code></pre> <pre class="r"><code># Create unique ids ---- - -# diarypid -mtusUKEpsDT$ba_diarypid <- - group_indices(mtusUKEpsDT, survey, - swave, - msamp, - hldid, - persid, - id - ) - -# pid -mtusUKEpsDT$ba_pid <- - group_indices(mtusUKEpsDT, survey, - swave, - msamp, - hldid, - persid - )</code></pre> +ba_MTUScreateIds(mtusUKEpsDT)</code></pre> <div id="episode-dates" class="section level2"> <h2><span class="header-section-number">3.1</span> Episode dates</h2> <pre class="r"><code># Check cday - the day the episode starts. 
It could span midnight (2 days) @@ -1531,14 +1513,14 @@ print("# Check distributions of episode starts (all surveys pooled)")< .( N = length(epnum) ), - by = str_epStartTime - ][order(str_epStartTime)] + by = r_epStartTimeStr + ][order(r_epStartTimeStr)] kable(caption = "Check 1/2 hour coding (as string)",t)</code></pre> <table> <caption>Check 1/2 hour coding (as string)</caption> <thead> <tr class="header"> -<th align="left">str_epStartTime</th> +<th align="left">r_epStartTimeStr</th> <th align="right">N</th> </tr> </thead> @@ -2689,12 +2671,12 @@ kable(caption = "Check 1/2 hour coding (as string)",t)</code></pre> </tr> </tbody> </table> -<pre class="r"><code>print("# Check NAs in str_epStartTime (should be none)")</code></pre> -<pre><code>## [1] "# Check NAs in str_epStartTime (should be none)"</code></pre> -<pre class="r"><code>print(paste0("# N rows with NAs in str_epStartTime:", - nrow(mtusUKEpsDT[is.na(str_epStartTime)])) +<pre class="r"><code>print("# Check NAs in r_epStartTimeStr (should be none)")</code></pre> +<pre><code>## [1] "# Check NAs in r_epStartTimeStr (should be none)"</code></pre> +<pre class="r"><code>print(paste0("# N rows with NAs in r_epStartTimeStr:", + nrow(mtusUKEpsDT[is.na(r_epStartTimeStr)])) )</code></pre> -<pre><code>## [1] "# N rows with NAs in str_epStartTime:0"</code></pre> +<pre><code>## [1] "# N rows with NAs in r_epStartTimeStr:0"</code></pre> </div> <div id="create-pooled-survey-variable" class="section level2"> <h2><span class="header-section-number">3.4</span> Create pooled survey variable</h2> @@ -3308,7 +3290,7 @@ kable(caption = "Check 1/2 hour coding (as string)",t)</code></pre> inout, eloc, mtrav, r_date, r_dow, time, - str_epStartTime, r_epStartDateTime, r_epEndDateTime)] + r_epStartTimeStr, r_epStartDateTime, r_epEndDateTime)] mtusUKEpsDT <- NULL # Save out the working file for later use (saves re-running) @@ -3366,7 +3348,7 @@ kable(caption = "Check 1/2 hour coding (as string)",t)</code></pre> <td align="left">time</td> </tr> <tr class="odd"> -<td align="left">str_epStartTime</td> +<td align="left">r_epStartTimeStr</td> </tr> <tr class="even"> <td align="left">r_epStartDateTime</td> @@ -3376,6 +3358,138 @@ kable(caption = "Check 1/2 hour coding (as string)",t)</code></pre> </tr> </tbody> </table> +<pre class="r"><code># print out first few rows +ht <- head(MTUSW6UKdiaryEps_DT) + +kable(caption = "First few rows of saved episode data", + ht + )</code></pre> +<table> +<caption>First few rows of saved episode data</caption> +<thead> +<tr class="header"> +<th align="right">survey</th> +<th align="right">ba_survey</th> +<th align="left">ba_diarypid</th> +<th align="left">ba_pid</th> +<th align="left">main</th> +<th align="left">sec</th> +<th align="left">inout</th> +<th align="left">eloc</th> +<th align="left">mtrav</th> +<th align="left">r_date</th> +<th align="left">r_dow</th> +<th align="right">time</th> +<th align="left">r_epStartTimeStr</th> +<th align="left">r_epStartDateTime</th> +<th align="left">r_epEndDateTime</th> +</tr> +</thead> +<tbody> +<tr class="odd"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">sleep and naps</td> +<td align="left">no recorded activity</td> +<td align="left">location not collected by study</td> +<td align="left">at own home</td> +<td align="left">not travelling</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">90</td> +<td 
align="left">4:0</td> +<td align="left">1974-08-14 04:00:00</td> +<td align="left">1974-08-14 05:30:00</td> +</tr> +<tr class="even"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">wash, dress, care for self</td> +<td align="left">no recorded activity</td> +<td align="left">location not collected by study</td> +<td align="left">at own home</td> +<td align="left">not travelling</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">30</td> +<td align="left">5:30</td> +<td align="left">1974-08-14 05:30:00</td> +<td align="left">1974-08-14 07:30:00</td> +</tr> +<tr class="odd"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">wash, dress, care for self</td> +<td align="left">listen to radio</td> +<td align="left">location not collected by study</td> +<td align="left">at own home</td> +<td align="left">not travelling</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">30</td> +<td align="left">6:0</td> +<td align="left">1974-08-14 06:00:00</td> +<td align="left">1974-08-14 08:30:00</td> +</tr> +<tr class="even"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">meals or snacks in other places</td> +<td align="left">listen to radio</td> +<td align="left">location not collected by study</td> +<td align="left">at own home</td> +<td align="left">not travelling</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">30</td> +<td align="left">6:30</td> +<td align="left">1974-08-14 06:30:00</td> +<td align="left">1974-08-14 09:30:00</td> +</tr> +<tr class="odd"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">travel to/from work</td> +<td align="left">no recorded activity</td> +<td align="left">location not collected by study</td> +<td align="left">travelling</td> +<td align="left">other/unspecified transport</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">30</td> +<td align="left">7:0</td> +<td align="left">1974-08-14 07:00:00</td> +<td align="left">1974-08-14 10:30:00</td> +</tr> +<tr class="even"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">paid work-main job (not at home)</td> +<td align="left">no recorded activity</td> +<td align="left">location not collected by study</td> +<td align="left">at workplace</td> +<td align="left">not travelling</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">150</td> +<td align="left">7:30</td> +<td align="left">1974-08-14 07:30:00</td> +<td align="left">1974-08-14 13:30:00</td> +</tr> +</tbody> +</table> </div> </div> <div id="descriptive-statistics" class="section level1"> @@ -3700,7 +3814,7 @@ kable( <h2><span class="header-section-number">4.4</span> Episodes by actual month recorded</h2> <p>Remember that the number of episodes relates to the number of respondents, the 
length of the diary time slots in each survey, the number of days on which diaries were completed and the number of possible different activity codes… so they are not directly comparable across years (surveys).</p> <pre class="r"><code>kable( - table(as.POSIXlt(MTUSW6UKdiaryEps_DT$r_date)$mon, + table(month(MTUSW6UKdiaryEps_DT$r_date,label = TRUE, abbr = TRUE), # requires lubridate MTUSW6UKdiaryEps_DT$r_year, useNA = "always" ) @@ -3723,7 +3837,7 @@ kable( </thead> <tbody> <tr class="odd"> -<td>0</td> +<td>Jan</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3736,7 +3850,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>1</td> +<td>Feb</td> <td align="right">0</td> <td align="right">129080</td> <td align="right">0</td> @@ -3749,7 +3863,7 @@ kable( <td align="right">0</td> </tr> <tr class="odd"> -<td>2</td> +<td>Mar</td> <td align="right">0</td> <td align="right">49613</td> <td align="right">0</td> @@ -3762,7 +3876,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>3</td> +<td>Apr</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3775,7 +3889,7 @@ kable( <td align="right">0</td> </tr> <tr class="odd"> -<td>4</td> +<td>May</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3788,7 +3902,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>5</td> +<td>Jun</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3801,7 +3915,7 @@ kable( <td align="right">0</td> </tr> <tr class="odd"> -<td>6</td> +<td>Jul</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3814,7 +3928,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>7</td> +<td>Aug</td> <td align="right">96472</td> <td align="right">0</td> <td align="right">0</td> @@ -3827,7 +3941,7 @@ kable( <td align="right">0</td> </tr> <tr class="odd"> -<td>8</td> +<td>Sep</td> <td align="right">26421</td> <td align="right">0</td> <td align="right">257</td> @@ -3840,7 +3954,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>9</td> +<td>Oct</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3853,7 +3967,7 @@ kable( <td align="right">0</td> </tr> <tr class="odd"> -<td>10</td> +<td>Nov</td> <td align="right">0</td> <td align="right">0</td> <td align="right">101961</td> @@ -3866,7 +3980,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>11</td> +<td>Dec</td> <td align="right">0</td> <td align="right">0</td> <td align="right">5462</td> @@ -3894,7 +4008,7 @@ kable( </tbody> </table> <hr /> -<p><strong>Meta:</strong> Analysis completed in: 7.073 seconds using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a>.</p> +<p><strong>Meta:</strong> Analysis completed in: 6.662 seconds using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a>.</p> </div> </div> <div id="references" class="section level1 unnumbered"> diff --git a/MTUS-W6-adult-survey-data-processing.Rmd b/MTUS-W6-adult-survey-data-processing.Rmd index b72411f..5baeac8 100644 --- a/MTUS-W6-adult-survey-data-processing.Rmd +++ b/MTUS-W6-adult-survey-data-processing.Rmd @@ -142,26 +142,8 @@ We now delete the non-UK data leaving us with `r format(nrow(MTUSW6UKsurvey_DT), print("-> Create uniq id for diaries (for matching) and persons") # Create unique ids ---- -# diarypid -MTUSW6UKsurvey_DT$ba_diarypid <- 
group_indices(MTUSW6UKsurvey_DT, - survey, - swave, - msamp, - hldid, - persid, - id - ) - -# pid -MTUSW6UKsurvey_DT$ba_pid <- group_indices(MTUSW6UKsurvey_DT, survey, - swave, - msamp, - hldid, - persid - ) - -# create a reduced survey table with the few variables we need so joins -# does not break memory +# re-use the same function as for the episode data as, strangely, the survey data has persid & id +MTUSW6UKsurvey_DT <- ba_MTUScreateIds(MTUSW6UKsurvey_DT) t <- MTUSW6UKsurvey_DT[, .("Number of rows" = .N, "Number of diary days" = uniqueN(ba_diarypid), diff --git a/MTUS-W6-adult-survey-data-processing.html b/MTUS-W6-adult-survey-data-processing.html index 96a149c..41fbe5f 100644 --- a/MTUS-W6-adult-survey-data-processing.html +++ b/MTUS-W6-adult-survey-data-processing.html @@ -118,7 +118,7 @@ $(document).ready(function () { <h1 class="title toc-ignore">MTUS World 6 Survey Data Processing</h1> <h4 class="author"><em>Ben Anderson (<a href="mailto:b.anderson@soton.ac.uk">b.anderson@soton.ac.uk</a>, <a href="mailto:/@dataknut">/@dataknut</a>)</em></h4> -<h4 class="date"><em>Last run at: 2017-05-20 09:46:37</em></h4> +<h4 class="date"><em>Last run at: 2017-05-23 12:58:25</em></h4> </div> @@ -523,26 +523,7 @@ $(document).ready(function () { <pre class="r"><code>print("-> Create uniq id for diaries (for matching) and persons")</code></pre> <pre><code>## [1] "-> Create uniq id for diaries (for matching) and persons"</code></pre> <pre class="r"><code># Create unique ids ---- -# diarypid -MTUSW6UKsurvey_DT$ba_diarypid <- group_indices(MTUSW6UKsurvey_DT, - survey, - swave, - msamp, - hldid, - persid, - id - ) - -# pid -MTUSW6UKsurvey_DT$ba_pid <- group_indices(MTUSW6UKsurvey_DT, survey, - swave, - msamp, - hldid, - persid - ) - -# create a reduced survey table with the few variables we need so joins -# does not break memory +MTUSW6UKsurvey_DT <- ba_MTUScreateIds(MTUSW6UKsurvey_DT) t <- MTUSW6UKsurvey_DT[, .("Number of rows" = .N, "Number of diary days" = uniqueN(ba_diarypid), @@ -2803,7 +2784,7 @@ pub_etc <- lm(main39 ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, d </div> <div id="about" class="section level1"> <h1><span class="header-section-number">9</span> About</h1> -<p>Analysis completed in: 40.928 seconds using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a>.</p> +<p>Analysis completed in: 41.108 seconds using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a>.</p> </div> <div id="references" class="section level1 unnumbered"> <h1>References</h1> diff --git a/MTUS-W6-adult-survey-data-processing.md b/MTUS-W6-adult-survey-data-processing.md index 05a5fd1..ca8983d 100644 --- a/MTUS-W6-adult-survey-data-processing.md +++ b/MTUS-W6-adult-survey-data-processing.md @@ -125,26 +125,7 @@ print("-> Create uniq id for diaries (for matching) and persons") ```r # Create unique ids ---- -# diarypid -MTUSW6UKsurvey_DT$ba_diarypid <- group_indices(MTUSW6UKsurvey_DT, - survey, - swave, - msamp, - hldid, - persid, - id - ) - -# pid -MTUSW6UKsurvey_DT$ba_pid <- group_indices(MTUSW6UKsurvey_DT, survey, - swave, - msamp, - hldid, - persid - ) - -# create a reduced survey table with the few variables we need so joins -# does not break memory +MTUSW6UKsurvey_DT <- ba_MTUScreateIds(MTUSW6UKsurvey_DT) t <- MTUSW6UKsurvey_DT[, .("Number of rows" = .N, "Number of diary days" = uniqueN(ba_diarypid), @@ -1439,6 +1420,6 @@ On the basis of these results we seem justified in assuming that we can 
pool 198 # About -Analysis completed in: 40.928 seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com). +Analysis completed in: 41.108 seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com). # References -- GitLab
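
Note on the new helper (not part of the patch): the commit replaces the inline `group_indices()` calls in both scripts with a single `ba_MTUScreateIds()` call that attaches hashed person and diary ids (`ba_pid`, `ba_diarypid`), as seen in the "First few rows of saved episode data" table. The helper itself is defined elsewhere in the repo and is not shown in this diff; the sketch below is only an illustration of what it is assumed to do, inferred from the ids visible in the output (a 32-character hex `ba_pid`, with `ba_diarypid` appending the within-person diary number and a trailing `_`). The md5 hashing and the exact key order are assumptions, not the author's confirmed implementation.

```r
# Sketch only - assumes data.table keys survey/swave/msamp/hldid/persid/id exist,
# as they do in both the episode and survey tables touched by this patch.
library(data.table)
library(openssl) # vectorised md5(); the hashing choice is an assumption

ba_MTUScreateIds <- function(dt) {
  # person id: hash of the concatenated survey / wave / sample / household / person keys
  dt[, ba_pid := as.character(openssl::md5(paste(survey, swave, msamp, hldid, persid,
                                                 sep = "_")))]
  # diary id: person id plus the within-person diary number, e.g. "...54ac" + "1" + "_"
  dt[, ba_diarypid := paste0(ba_pid, id, "_")]
  return(dt) # columns are added by reference, so the return value is optional
}
```

If the helper does work by reference like this, it would explain why the episodes script can call `ba_MTUScreateIds(mtusUKEpsDT)` without re-assignment while the survey script's `MTUSW6UKsurvey_DT <- ba_MTUScreateIds(MTUSW6UKsurvey_DT)` is equally valid: data.table's `:=` modifies the table in place either way.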