diff --git a/MTUS-W6-adult-episodes-data-processing.Rmd b/MTUS-W6-adult-episodes-data-processing.Rmd index 99d13b3eef75771e1d2efdb12f7c83c94731e8ca..746e21b95af046c761ff370eaadc09f6f5f995fc 100644 --- a/MTUS-W6-adult-episodes-data-processing.Rmd +++ b/MTUS-W6-adult-episodes-data-processing.Rmd @@ -127,29 +127,11 @@ We now delete the non-UK data leaving us with `r format(nrow(mtusUKEpsDT), big.m ```{r setKeys} # This works but we'll create a concatenated id to make life easier -setkey(mtusUKEpsDT, countrya, survey, swave, msamp, hldid, persid, id) +setkey(mtusUKEpsDT, countrya, survey, swave, msamp, hldid, persid, id) # sorts into order print("-> Create uniq id for diaries (for matching) and persons") # Create unique ids ---- - -# diarypid -mtusUKEpsDT$ba_diarypid <- - group_indices(mtusUKEpsDT, survey, - swave, - msamp, - hldid, - persid, - id - ) - -# pid -mtusUKEpsDT$ba_pid <- - group_indices(mtusUKEpsDT, survey, - swave, - msamp, - hldid, - persid - ) +ba_MTUScreateIds(mtusUKEpsDT) ``` ## Episode dates @@ -603,13 +585,13 @@ t <- mtusUKEpsDT[, .( N = length(epnum) ), - by = str_epStartTime - ][order(str_epStartTime)] + by = r_epStartTimeStr + ][order(r_epStartTimeStr)] kable(caption = "Check 1/2 hour coding (as string)",t) -print("# Check NAs in str_epStartTime (should be none)") -print(paste0("# N rows with NAs in str_epStartTime:", - nrow(mtusUKEpsDT[is.na(str_epStartTime)])) +print("# Check NAs in r_epStartTimeStr (should be none)") +print(paste0("# N rows with NAs in r_epStartTimeStr:", + nrow(mtusUKEpsDT[is.na(r_epStartTimeStr)])) ) ``` @@ -705,7 +687,7 @@ kable(caption = "Distribution of missing location", inout, eloc, mtrav, r_date, r_dow, time, - str_epStartTime, r_epStartDateTime, r_epEndDateTime)] + r_epStartTimeStr, r_epStartDateTime, r_epEndDateTime)] mtusUKEpsDT <- NULL # Save out the working file for later use (saves re-running) @@ -725,6 +707,15 @@ Variables retained in processed episodes file: `r kable(caption = "Variables retained in episodes file", names(MTUSW6UKdiaryEps_DT))` +```{r firstEps} +# print out first few rows +ht <- head(MTUSW6UKdiaryEps_DT) + +kable(caption = "First few rows of saved episode data", + ht + ) +``` + # Descriptive Statistics ## Number of episodes recorded @@ -771,7 +762,7 @@ Remember that the number of episodes relates to the number of respondents, the l ```{r episodesByActualYearMonth} kable( - table(as.POSIXlt(MTUSW6UKdiaryEps_DT$r_date)$mon, + table(month(MTUSW6UKdiaryEps_DT$r_date,label = TRUE, abbr = TRUE), # requires lubridate MTUSW6UKdiaryEps_DT$r_year, useNA = "always" ) diff --git a/MTUS-W6-adult-episodes-data-processing.html b/MTUS-W6-adult-episodes-data-processing.html index 0b1f17d619c1a3d4eb4fb12b773952c4ab25382e..bfc97a8e544d9287fe95de3cc882e04000ff4541 100644 --- a/MTUS-W6-adult-episodes-data-processing.html +++ b/MTUS-W6-adult-episodes-data-processing.html @@ -118,7 +118,7 @@ $(document).ready(function () { <h1 class="title toc-ignore">MTUS World 6 Data Processing</h1> <h4 class="author"><em>Ben Anderson (<a href="mailto:b.anderson@soton.ac.uk">b.anderson@soton.ac.uk</a>, <a href="mailto:/@dataknut">/@dataknut</a>)</em></h4> -<h4 class="date"><em>Last run at: 2017-05-22 12:42:23</em></h4> +<h4 class="date"><em>Last run at: 2017-05-23 13:28:18</em></h4> </div> @@ -188,7 +188,7 @@ $(document).ready(function () { ## Unrecognized record type 7, subtype 18 encountered in system file</code></pre> <pre><code>## re-encoding from CP1252</code></pre> <pre><code>## user system elapsed -## 225.091 12.120 260.905</code></pre> +## 225.749 10.703 260.696</code></pre> <p>We have loaded 11,501,221 rows of data for 11 countries.</p> <pre class="r"><code>kable(caption = "Number of diaries per year", ba_tidyNum(table(mtusEpsW6DT$survey,droplevels(mtusEpsW6DT$countrya)) # removes unused countries @@ -512,30 +512,12 @@ $(document).ready(function () { <pre class="r"><code>mtusUKEpsDT <- subset(mtusEpsW6DT, countrya == "United Kingdom")</code></pre> <p>We now delete the non-UK data leaving us with 1,392,007 rows of survey data.</p> <pre class="r"><code># This works but we'll create a concatenated id to make life easier -setkey(mtusUKEpsDT, countrya, survey, swave, msamp, hldid, persid, id) +setkey(mtusUKEpsDT, countrya, survey, swave, msamp, hldid, persid, id) # sorts into order print("-> Create uniq id for diaries (for matching) and persons")</code></pre> <pre><code>## [1] "-> Create uniq id for diaries (for matching) and persons"</code></pre> <pre class="r"><code># Create unique ids ---- - -# diarypid -mtusUKEpsDT$ba_diarypid <- - group_indices(mtusUKEpsDT, survey, - swave, - msamp, - hldid, - persid, - id - ) - -# pid -mtusUKEpsDT$ba_pid <- - group_indices(mtusUKEpsDT, survey, - swave, - msamp, - hldid, - persid - )</code></pre> +ba_MTUScreateIds(mtusUKEpsDT)</code></pre> <div id="episode-dates" class="section level2"> <h2><span class="header-section-number">3.1</span> Episode dates</h2> <pre class="r"><code># Check cday - the day the episode starts. It could span midnight (2 days) @@ -1531,14 +1513,14 @@ print("# Check distributions of episode starts (all surveys pooled)")< .( N = length(epnum) ), - by = str_epStartTime - ][order(str_epStartTime)] + by = r_epStartTimeStr + ][order(r_epStartTimeStr)] kable(caption = "Check 1/2 hour coding (as string)",t)</code></pre> <table> <caption>Check 1/2 hour coding (as string)</caption> <thead> <tr class="header"> -<th align="left">str_epStartTime</th> +<th align="left">r_epStartTimeStr</th> <th align="right">N</th> </tr> </thead> @@ -2689,12 +2671,12 @@ kable(caption = "Check 1/2 hour coding (as string)",t)</code></pre> </tr> </tbody> </table> -<pre class="r"><code>print("# Check NAs in str_epStartTime (should be none)")</code></pre> -<pre><code>## [1] "# Check NAs in str_epStartTime (should be none)"</code></pre> -<pre class="r"><code>print(paste0("# N rows with NAs in str_epStartTime:", - nrow(mtusUKEpsDT[is.na(str_epStartTime)])) +<pre class="r"><code>print("# Check NAs in r_epStartTimeStr (should be none)")</code></pre> +<pre><code>## [1] "# Check NAs in r_epStartTimeStr (should be none)"</code></pre> +<pre class="r"><code>print(paste0("# N rows with NAs in r_epStartTimeStr:", + nrow(mtusUKEpsDT[is.na(r_epStartTimeStr)])) )</code></pre> -<pre><code>## [1] "# N rows with NAs in str_epStartTime:0"</code></pre> +<pre><code>## [1] "# N rows with NAs in r_epStartTimeStr:0"</code></pre> </div> <div id="create-pooled-survey-variable" class="section level2"> <h2><span class="header-section-number">3.4</span> Create pooled survey variable</h2> @@ -3308,7 +3290,7 @@ kable(caption = "Check 1/2 hour coding (as string)",t)</code></pre> inout, eloc, mtrav, r_date, r_dow, time, - str_epStartTime, r_epStartDateTime, r_epEndDateTime)] + r_epStartTimeStr, r_epStartDateTime, r_epEndDateTime)] mtusUKEpsDT <- NULL # Save out the working file for later use (saves re-running) @@ -3366,7 +3348,7 @@ kable(caption = "Check 1/2 hour coding (as string)",t)</code></pre> <td align="left">time</td> </tr> <tr class="odd"> -<td align="left">str_epStartTime</td> +<td align="left">r_epStartTimeStr</td> </tr> <tr class="even"> <td align="left">r_epStartDateTime</td> @@ -3376,6 +3358,138 @@ kable(caption = "Check 1/2 hour coding (as string)",t)</code></pre> </tr> </tbody> </table> +<pre class="r"><code># print out first few rows +ht <- head(MTUSW6UKdiaryEps_DT) + +kable(caption = "First few rows of saved episode data", + ht + )</code></pre> +<table> +<caption>First few rows of saved episode data</caption> +<thead> +<tr class="header"> +<th align="right">survey</th> +<th align="right">ba_survey</th> +<th align="left">ba_diarypid</th> +<th align="left">ba_pid</th> +<th align="left">main</th> +<th align="left">sec</th> +<th align="left">inout</th> +<th align="left">eloc</th> +<th align="left">mtrav</th> +<th align="left">r_date</th> +<th align="left">r_dow</th> +<th align="right">time</th> +<th align="left">r_epStartTimeStr</th> +<th align="left">r_epStartDateTime</th> +<th align="left">r_epEndDateTime</th> +</tr> +</thead> +<tbody> +<tr class="odd"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">sleep and naps</td> +<td align="left">no recorded activity</td> +<td align="left">location not collected by study</td> +<td align="left">at own home</td> +<td align="left">not travelling</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">90</td> +<td align="left">4:0</td> +<td align="left">1974-08-14 04:00:00</td> +<td align="left">1974-08-14 05:30:00</td> +</tr> +<tr class="even"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">wash, dress, care for self</td> +<td align="left">no recorded activity</td> +<td align="left">location not collected by study</td> +<td align="left">at own home</td> +<td align="left">not travelling</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">30</td> +<td align="left">5:30</td> +<td align="left">1974-08-14 05:30:00</td> +<td align="left">1974-08-14 07:30:00</td> +</tr> +<tr class="odd"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">wash, dress, care for self</td> +<td align="left">listen to radio</td> +<td align="left">location not collected by study</td> +<td align="left">at own home</td> +<td align="left">not travelling</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">30</td> +<td align="left">6:0</td> +<td align="left">1974-08-14 06:00:00</td> +<td align="left">1974-08-14 08:30:00</td> +</tr> +<tr class="even"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">meals or snacks in other places</td> +<td align="left">listen to radio</td> +<td align="left">location not collected by study</td> +<td align="left">at own home</td> +<td align="left">not travelling</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">30</td> +<td align="left">6:30</td> +<td align="left">1974-08-14 06:30:00</td> +<td align="left">1974-08-14 09:30:00</td> +</tr> +<tr class="odd"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">travel to/from work</td> +<td align="left">no recorded activity</td> +<td align="left">location not collected by study</td> +<td align="left">travelling</td> +<td align="left">other/unspecified transport</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">30</td> +<td align="left">7:0</td> +<td align="left">1974-08-14 07:00:00</td> +<td align="left">1974-08-14 10:30:00</td> +</tr> +<tr class="even"> +<td align="right">1974</td> +<td align="right">1974</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac1_</td> +<td align="left">b1c3cf3103b5fdb1924e7526998c54ac</td> +<td align="left">paid work-main job (not at home)</td> +<td align="left">no recorded activity</td> +<td align="left">location not collected by study</td> +<td align="left">at workplace</td> +<td align="left">not travelling</td> +<td align="left">1974-08-14</td> +<td align="left">Wednesday</td> +<td align="right">150</td> +<td align="left">7:30</td> +<td align="left">1974-08-14 07:30:00</td> +<td align="left">1974-08-14 13:30:00</td> +</tr> +</tbody> +</table> </div> </div> <div id="descriptive-statistics" class="section level1"> @@ -3700,7 +3814,7 @@ kable( <h2><span class="header-section-number">4.4</span> Episodes by actual month recorded</h2> <p>Remember that the number of episodes relates to the number of respondents, the length of the diary time slots in each survey, the number of days on which diaries were completed and the number of possible different activity codes… so they are not directly comparable across years (surveys).</p> <pre class="r"><code>kable( - table(as.POSIXlt(MTUSW6UKdiaryEps_DT$r_date)$mon, + table(month(MTUSW6UKdiaryEps_DT$r_date,label = TRUE, abbr = TRUE), # requires lubridate MTUSW6UKdiaryEps_DT$r_year, useNA = "always" ) @@ -3723,7 +3837,7 @@ kable( </thead> <tbody> <tr class="odd"> -<td>0</td> +<td>Jan</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3736,7 +3850,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>1</td> +<td>Feb</td> <td align="right">0</td> <td align="right">129080</td> <td align="right">0</td> @@ -3749,7 +3863,7 @@ kable( <td align="right">0</td> </tr> <tr class="odd"> -<td>2</td> +<td>Mar</td> <td align="right">0</td> <td align="right">49613</td> <td align="right">0</td> @@ -3762,7 +3876,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>3</td> +<td>Apr</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3775,7 +3889,7 @@ kable( <td align="right">0</td> </tr> <tr class="odd"> -<td>4</td> +<td>May</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3788,7 +3902,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>5</td> +<td>Jun</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3801,7 +3915,7 @@ kable( <td align="right">0</td> </tr> <tr class="odd"> -<td>6</td> +<td>Jul</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3814,7 +3928,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>7</td> +<td>Aug</td> <td align="right">96472</td> <td align="right">0</td> <td align="right">0</td> @@ -3827,7 +3941,7 @@ kable( <td align="right">0</td> </tr> <tr class="odd"> -<td>8</td> +<td>Sep</td> <td align="right">26421</td> <td align="right">0</td> <td align="right">257</td> @@ -3840,7 +3954,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>9</td> +<td>Oct</td> <td align="right">0</td> <td align="right">0</td> <td align="right">0</td> @@ -3853,7 +3967,7 @@ kable( <td align="right">0</td> </tr> <tr class="odd"> -<td>10</td> +<td>Nov</td> <td align="right">0</td> <td align="right">0</td> <td align="right">101961</td> @@ -3866,7 +3980,7 @@ kable( <td align="right">0</td> </tr> <tr class="even"> -<td>11</td> +<td>Dec</td> <td align="right">0</td> <td align="right">0</td> <td align="right">5462</td> @@ -3894,7 +4008,7 @@ kable( </tbody> </table> <hr /> -<p><strong>Meta:</strong> Analysis completed in: 7.073 seconds using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a>.</p> +<p><strong>Meta:</strong> Analysis completed in: 6.662 seconds using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a>.</p> </div> </div> <div id="references" class="section level1 unnumbered"> diff --git a/MTUS-W6-adult-survey-data-processing.Rmd b/MTUS-W6-adult-survey-data-processing.Rmd index b72411fb34ba987e3e7c2602825fd64b335b10f1..5baeac804ef89e48cbfc0d30412fcc598f0dbbd8 100644 --- a/MTUS-W6-adult-survey-data-processing.Rmd +++ b/MTUS-W6-adult-survey-data-processing.Rmd @@ -142,26 +142,8 @@ We now delete the non-UK data leaving us with `r format(nrow(MTUSW6UKsurvey_DT), print("-> Create uniq id for diaries (for matching) and persons") # Create unique ids ---- -# diarypid -MTUSW6UKsurvey_DT$ba_diarypid <- group_indices(MTUSW6UKsurvey_DT, - survey, - swave, - msamp, - hldid, - persid, - id - ) - -# pid -MTUSW6UKsurvey_DT$ba_pid <- group_indices(MTUSW6UKsurvey_DT, survey, - swave, - msamp, - hldid, - persid - ) - -# create a reduced survey table with the few variables we need so joins -# does not break memory +# re-use the same function as for the episode data as, strangely, the survey data has persid & id +MTUSW6UKsurvey_DT <- ba_MTUScreateIds(MTUSW6UKsurvey_DT) t <- MTUSW6UKsurvey_DT[, .("Number of rows" = .N, "Number of diary days" = uniqueN(ba_diarypid), diff --git a/MTUS-W6-adult-survey-data-processing.html b/MTUS-W6-adult-survey-data-processing.html index 96a149cd879c12b32c4e35f949883e88e0b9f840..41fbe5f42446627e23ae664e2759b767771cd838 100644 --- a/MTUS-W6-adult-survey-data-processing.html +++ b/MTUS-W6-adult-survey-data-processing.html @@ -118,7 +118,7 @@ $(document).ready(function () { <h1 class="title toc-ignore">MTUS World 6 Survey Data Processing</h1> <h4 class="author"><em>Ben Anderson (<a href="mailto:b.anderson@soton.ac.uk">b.anderson@soton.ac.uk</a>, <a href="mailto:/@dataknut">/@dataknut</a>)</em></h4> -<h4 class="date"><em>Last run at: 2017-05-20 09:46:37</em></h4> +<h4 class="date"><em>Last run at: 2017-05-23 12:58:25</em></h4> </div> @@ -523,26 +523,7 @@ $(document).ready(function () { <pre class="r"><code>print("-> Create uniq id for diaries (for matching) and persons")</code></pre> <pre><code>## [1] "-> Create uniq id for diaries (for matching) and persons"</code></pre> <pre class="r"><code># Create unique ids ---- -# diarypid -MTUSW6UKsurvey_DT$ba_diarypid <- group_indices(MTUSW6UKsurvey_DT, - survey, - swave, - msamp, - hldid, - persid, - id - ) - -# pid -MTUSW6UKsurvey_DT$ba_pid <- group_indices(MTUSW6UKsurvey_DT, survey, - swave, - msamp, - hldid, - persid - ) - -# create a reduced survey table with the few variables we need so joins -# does not break memory +MTUSW6UKsurvey_DT <- ba_MTUScreateIds(MTUSW6UKsurvey_DT) t <- MTUSW6UKsurvey_DT[, .("Number of rows" = .N, "Number of diary days" = uniqueN(ba_diarypid), @@ -2803,7 +2784,7 @@ pub_etc <- lm(main39 ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, d </div> <div id="about" class="section level1"> <h1><span class="header-section-number">9</span> About</h1> -<p>Analysis completed in: 40.928 seconds using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a>.</p> +<p>Analysis completed in: 41.108 seconds using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a>.</p> </div> <div id="references" class="section level1 unnumbered"> <h1>References</h1> diff --git a/MTUS-W6-adult-survey-data-processing.md b/MTUS-W6-adult-survey-data-processing.md index 05a5fd186213dd77ec2b9a652b0c232ad4d1419f..ca8983d67831b6873f351f5265ebc9f24e201c26 100644 --- a/MTUS-W6-adult-survey-data-processing.md +++ b/MTUS-W6-adult-survey-data-processing.md @@ -125,26 +125,7 @@ print("-> Create uniq id for diaries (for matching) and persons") ```r # Create unique ids ---- -# diarypid -MTUSW6UKsurvey_DT$ba_diarypid <- group_indices(MTUSW6UKsurvey_DT, - survey, - swave, - msamp, - hldid, - persid, - id - ) - -# pid -MTUSW6UKsurvey_DT$ba_pid <- group_indices(MTUSW6UKsurvey_DT, survey, - swave, - msamp, - hldid, - persid - ) - -# create a reduced survey table with the few variables we need so joins -# does not break memory +MTUSW6UKsurvey_DT <- ba_MTUScreateIds(MTUSW6UKsurvey_DT) t <- MTUSW6UKsurvey_DT[, .("Number of rows" = .N, "Number of diary days" = uniqueN(ba_diarypid), @@ -1439,6 +1420,6 @@ On the basis of these results we seem justified in assuming that we can pool 198 # About -Analysis completed in: 40.928 seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com). +Analysis completed in: 41.108 seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com). # References