diff --git a/MTUS-W9-adult-episodes-data-processing.Rmd b/MTUS-W9-adult-episodes-data-processing.Rmd
index 60941d64cf6fbdc690b1fad07bce4e53dd651089..e2fcecf97cea6a2ef386324ad4351ac602963ab6 100644
--- a/MTUS-W9-adult-episodes-data-processing.Rmd
+++ b/MTUS-W9-adult-episodes-data-processing.Rmd
@@ -68,6 +68,7 @@ reqLibsLocal <- c("foreign", # loading SPSS/STATA
                   "data.table", # fast data manipulation
                   "dplyr", # data manipulation
                   "dtplyr", # data table & dplyr code
+                  "ggplot2", # slick graphs
                   "stargazer", # for pretty tables
                   "knitr" # for kable
 )
@@ -636,49 +637,71 @@ First the datetime version:
 Now the HH:MM version:
 
 ```{r createAllEpisodeStartEndTimes}
-  # now the HH:MM version
-  # hours
-  mtusUKEpsDT$r_epStartH <- 4 + floor(mtusUKEpsDT$ba_startm/60)
-  print("# Check start hours")
-  summary(mtusUKEpsDT$r_epStartH)
-  print("# This has created hours > 23 (tomorrow). Need to fix them")
-  mtusUKEpsDT[, r_epStartHf := ifelse(r_epStartH > 23,
-                                      r_epStartH - 24,
-                                      r_epStartH)
-              ]
-  print("# check")
-  table(mtusUKEpsDT$r_epStartHf[mtusUKEpsDT$r_epStartH > 22],
-        mtusUKEpsDT$r_epStartH[mtusUKEpsDT$r_epStartH > 22])
-
-  # minutes
-  mtusUKEpsDT$r_epStartM <- 0 + (mtusUKEpsDT$ba_startm %% 60) # this is R's way of calculating the remainder!!
-  print("# Check start minutes")
-  summary(mtusUKEpsDT$r_epStartM)
-
-
-  # concatenate & make POSIXct
-  mtusUKEpsDT[,r_epStartTime := as.POSIXct(paste0(r_epStartHf,
-                                                  ":",
-                                                  r_epStartM),
-                                           format = "%H:%M")
-              ]
-  print("# Check distributions of episode starts (all surveys pooled)")
-  t <- mtusUKEpsDT[,
-                   .(
-                     N = length(epnum)
-                   ),
-                   by = r_epStartTime
-                   ]
-  plot(t)
+# HH:MM (clock time) version of the episode start & end
+# hours: ba_startm is split into whole hours and offset by 4
+# NOTE(review): presumably the diary day starts at 04:00 - confirm
+mtusUKEpsDT$r_epStartH <- 4 + floor(mtusUKEpsDT$ba_startm/60)
+print("# Check start hours")
+summary(mtusUKEpsDT$r_epStartH)
+print("# This has created hours > 23 (tomorrow). Need to fix them")
+# wrap hours of 24 and over back on to the 0-23 clock
+mtusUKEpsDT[, r_epStartHf := ifelse(r_epStartH > 23,
+                                    r_epStartH - 24,
+                                    r_epStartH)
+            ]
+print("# check")
+# cross-tab wrapped hour against raw hour, for raw hours > 22 only
+table(mtusUKEpsDT$r_epStartHf[mtusUKEpsDT$r_epStartH > 22],
+      mtusUKEpsDT$r_epStartH[mtusUKEpsDT$r_epStartH > 22])
+
+# minutes
+mtusUKEpsDT$r_epStartM <- 0 + (mtusUKEpsDT$ba_startm %% 60) # this is R's way of calculating the remainder!!
+print("# Check start minutes")
+summary(mtusUKEpsDT$r_epStartM)
+
+
+# concatenate & make POSIXct
+# no date is supplied, so as.POSIXct() fills in the current date in the session time zone
+mtusUKEpsDT[,r_epStartTime := as.POSIXct(paste0(r_epStartHf,
+                                                ":",
+                                                r_epStartM),
+                                         format = "%H:%M")
+            ]
+
+# create episode end: start time plus `end` minutes (POSIXct arithmetic is in seconds)
+# NOTE(review): this treats `end` as a length in minutes, not a clock position - confirm
+mtusUKEpsDT$r_epEndTime <- mtusUKEpsDT$r_epStartTime + (mtusUKEpsDT$end*60)
+
+print("# Check NAs in time version (should be none)")
+print("# -> StartTime")
+summary(mtusUKEpsDT$r_epStartTime)
+print("# -> EndTime")
+summary(mtusUKEpsDT$r_epEndTime)
+```
+
+```{r plotStartTimes, fig.cap = "Check distributions of episode starts by survey"}
+# one histogram panel per survey, x axis labelled as clock time
+ggplot(mtusUKEpsDT) +
+  geom_histogram(aes(x = r_epStartTime)) +
+  scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hours") +
+  facet_grid(survey ~ .) +
+  labs(
+    caption = "MTUS UK samples"
+  )
 
-  # create episode end
-  mtusUKEpsDT$r_epEndTime <- mtusUKEpsDT$r_epStartTime + (mtusUKEpsDT$end*60)
+```
+
+```{r plotDurations, fig.cap = "Check distributions of episode durations by survey"}
+# subtracting two POSIXct values gives a difftime whose units (secs/mins/hours/days) are
+# picked from the smallest gap in the data, so ask for minutes explicitly & keep a plain number
+# NOTE(review): by construction this just recovers `end` - confirm that is what should be plotted
+mtusUKEpsDT <- mtusUKEpsDT[, epDurationMins := as.numeric(difftime(r_epEndTime, r_epStartTime, units = "mins"))]
+
+ggplot(mtusUKEpsDT) +
+  geom_histogram(aes(x = epDurationMins)) +
+  facet_grid(survey ~ .) +
+  labs(x = "Episode duration (minutes)", caption = "MTUS UK samples")
 
-  print("# Check NAs in time version (should be none)")
-  print("# -> StartTime")
-  summary(mtusUKEpsDT$r_epStartTime)
-  print("# -> EndTime")
-  summary(mtusUKEpsDT$r_epEndTime)
 ```
 
 ## Create half hours