From 138895382307caedfd8c4c278e1fe10ef3dce8f5 Mon Sep 17 00:00:00 2001
From: Ben Anderson <dataknut@icloud.com>
Date: Fri, 19 May 2017 18:04:23 +0100
Subject: [PATCH] amended yaml; amended setup; removed many derived time
 variables

---
 MTUS-W6-adult-episodes-data-processing.Rmd | 164 +++++++++------------
 MTUS-W6-adult-survey-data-processing.Rmd   | 113 ++++++++------
 2 files changed, 133 insertions(+), 144 deletions(-)

diff --git a/MTUS-W6-adult-episodes-data-processing.Rmd b/MTUS-W6-adult-episodes-data-processing.Rmd
index 0da1f89..0cd85ae 100644
--- a/MTUS-W6-adult-episodes-data-processing.Rmd
+++ b/MTUS-W6-adult-episodes-data-processing.Rmd
@@ -1,11 +1,12 @@
 ---
 title: "MTUS World 6 Data Processing"
-author: "Ben Anderson (b.anderson@soton.ac.uk/@dataknut)"
+author: "Ben Anderson (b.anderson@soton.ac.uk, @dataknut)"
 date: 'Last run at: `r Sys.time()`'
 output:
   html_document:
     fig_caption: yes
     number_sections: yes
+    self_contained: no
     theme: journal
     toc: yes
     toc_depth: 3
@@ -16,13 +17,54 @@ output:
 bibliography: ~/bibliography.bib
 ---
 
-```{r setupKnitr, include=FALSE}
-# set default echo to FALSE (code not in output)
-knitr::opts_chunk$set(echo = FALSE)
-knitr::opts_chunk$set(warning = TRUE)
-knitr::opts_chunk$set(message = FALSE)
-knitr::opts_chunk$set(fig_caption = TRUE)
-knitr::opts_chunk$set(tidy = TRUE)
+```{r codeSetup, include=FALSE}
+# Housekeeping (chunk runs with include=FALSE so nothing here is shown in the output) ----
+rm(list=ls(all=TRUE)) # remove all objects from workspace
+
+# Set start time (used at the end of the document to report the run time) ----
+startTime <- Sys.time()
+
+# Where are we? (look the fields up by name rather than by position)
+sysInfo <- Sys.info()
+sysName <- sysInfo[["sysname"]]
+nodeName <- sysInfo[["nodename"]]
+
+# default code location - needed to load functions & parameters correctly
+projLoc <- "~/github/MTUS/" # <- this seems to work on windows as long as you put the MTUS repo in mydocuments/github
+
+# if necessary set correct path for a different platform
+if(startsWith(nodeName, "octomac")) # => BA laptop
+  projLoc <- "~/github/dataknut/MTUS/"
+
+# give feedback
+print(paste0("Running on ", sysName, " with projLoc = ", projLoc))
+
+# Functions ----
+print(paste0("Loading functions from ", projLoc,"mtusFunctions.R"))
+source(paste0(projLoc,"mtusFunctions.R"))
+
+# Load libraries ----
+# NB libraries required by mtusFunctions.R (sourced above) should already be loaded
+reqLibsLocal <- c("foreign", # loading SPSS/STATA
+       "data.table", # fast data manipulation
+       "lubridate", # dates & times made easy
+       "dplyr", # data manipulation
+       "dtplyr", # data table & dplyr code
+       "ggplot2", # slick graphs
+       "stargazer", # for pretty tables
+       "knitr" # for kable
+       )
+print(paste0("Loading the following libraries using lb_myRequiredPackages: ", paste(reqLibsLocal, collapse = ", ")))
+# Use Luke's function to require/install/load (presumably defined in mtusFunctions.R - confirm)
+lb_myRequiredPackages(reqLibsLocal,"http://cran.rstudio.com/")
+
+# Set paths ----
+mtusPath <- "~/Data/MTUS/World_6/" # location of MTUS SPSS file
+mtusProcPath <- paste0(mtusPath,"processed/") # where to put the processed .csv file(s)
+
+# Set file names ----
+episodesFile <- "MTUS-adult-episode.sav"
+eFile <- paste0(mtusPath, episodesFile) # full path to the episodes file, read later by read.spss()
+```
 
 # Introduction
@@ -44,11 +86,8 @@ This work was funded by RCUK through the End User Energy Demand Centres Programm
 Code:
  * https://github.com/dataknut/MTUS
  
-`License:`
-
-`The R code embedded in this document is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.`
-    
-`This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.`
+License:
+ * https://github.com/dataknut/MTUS/blob/master/LICENSE
     
 >YMMV - http://en.wiktionary.org/wiki/YMMV
 
@@ -61,46 +100,13 @@ Key packages used:
  * data.table - for fast (big) data handling [@data.table]
  * knitr - to create this document [@knitr]
  * dplyr & dtplyr - for data manipulation [@dplyr][@dtplyr]
- 
-````{r houseKeeping}
-# Clear out all old objects etc ----
-# to avoid confusion
-rm(list = ls()) 
-
-# Set time ----
-starttime <- Sys.time()
-
-# Load required packages ----
-packs <- c("foreign", # loading SPSS/STATA
-       "data.table", # fast data manipulation
-       "dplyr", # data manipulation
-       "dtplyr", # data table & dplyr code
-       "knitr" # for kable
-       )
-
-# do this to install them if needed
-# install.packages(x)
-print("Loading required packages:")
-print(packs)
-
-lapply(packs, require, character.only = T)
-
-# Set paths ----
-mtusPath <- "~/Data/MTUS/World_6/" # location of MTUS SPSS file
-mtusProcPath <- "~/Data/MTUS/World_6/processed/" # where to put the processed .csv file(s)
-
-# Set file names ----
-episodesFile <- "MTUS-adult-episode.sav"
-
-efile <- paste0(mtusPath, episodesFile)
-````
 
 # Load and process episodes file
-Loading and processing `r efile`.
+Loading and processing `r eFile`.
 
 ```{r loadEpisodesFile}
 system.time(
-  mtusEpsW6DT <- as.data.table(read.spss(efile))
+  mtusEpsW6DT <- as.data.table(read.spss(eFile))
 )
 ```
 
@@ -108,8 +114,7 @@ We have loaded `r format(nrow(mtusEpsW6DT), big.mark=",",scientific=FALSE)` rows
 
 ```{r basicStats}
 kable(caption = "Number of diaries per year",
-      ba_tidyNum(table(mtusEpsW6DT$survey, droplevels(mtusEpsW6DT$countrya) # removes unused countries
-            )
+      ba_tidyNum(table(mtusEpsW6DT$survey,droplevels(mtusEpsW6DT$countrya)) # removes unused countries
       )
 )
 ```
@@ -595,7 +600,7 @@ First the datetime version:
   summary(mtusUKEpsDT$r_epEndDateTime)
 ```
 
-Now the HH:MM version:
+Now the HH:MM version. This is created as a string to avoid having to set a nonsense date.
 
 ```{r createAllEpisodeStartEndTimes}
   # now the HH:MM version
@@ -619,61 +624,26 @@ Now the HH:MM version:
   
   
   # concatenate & make POSIXct
-  mtusUKEpsDT[,r_epStartTime := as.POSIXct(paste0(r_epStartHf,
+  mtusUKEpsDT[,str_epStartTime := paste0(r_epStartHf,
                                               ":",
-                                              r_epStartM), 
-                                       format = "%H:%M")
+                                              r_epStartM)
               ]
   print("# Check distributions of episode starts (all surveys pooled)")
   t <- mtusUKEpsDT[,
                     .(
                       N = length(epnum)
                     ),
-                    by = r_epStartTime
-                      ]
-  plot(t)
+                    by = str_epStartTime
+                      ][order(str_epStartTime)]
+  kable(caption = "Check 1/2 hour coding (as string)",t)
   
-  # create episode end
-  mtusUKEpsDT$r_epEndTime <- mtusUKEpsDT$r_epStartTime + (mtusUKEpsDT$end*60)
+  print("# Check NAs in str_epStartTime (should be none)")
+  print(paste0("# N rows with NAs in str_epStartTime:",
+               nrow(mtusUKEpsDT[is.na(str_epStartTime)]))
+  )
   
-  print("# Check NAs in time version (should be none)")
-  print("# -> StartTime")
-  summary(mtusUKEpsDT$r_epStartTime)
-  print("# -> EndTime")
-  summary(mtusUKEpsDT$r_epEndTime)
 ```
 
-## Create half hours
-Create and check the distribution of half hours. This uses the epStartTime variable to make sure it is set for all cases, even those which still have no date:
-
-```{r createHalfhours}  
-  # Add hour & half hour in which the episode starts
-# Use the fake start time we created earlier which includes all cases
-  mtusUKEpsDT[, st_hour := as.POSIXlt(mtusUKEpsDT$r_epStartTime)$hour]
-  mtusUKEpsDT[, st_hour := ifelse(mtusUKEpsDT$st_hour < 10 , 
-                                        paste0("0",mtusUKEpsDT$st_hour), # if true - add leading 0
-                                        mtusUKEpsDT$st_hour # if not
-                                  )
-  ]
-  mtusUKEpsDT[, st_mins := as.POSIXlt(mtusUKEpsDT$r_epStartTime)$min]
-  mtusUKEpsDT[, st_hh := ifelse(mtusUKEpsDT$st_mins < 30 , 
-                                      "00", # if true
-                                      "30" # if not
-                                )
-  ]
-  mtusUKEpsDT[, st_halfhour := paste0(mtusUKEpsDT$st_hour, 
-                                            ":",
-                                            mtusUKEpsDT$st_hh
-                                      )
-              ]
-  # check
- kable(
-   table(mtusUKEpsDT$st_halfhour,
-         mtusUKEpsDT$survey, 
-         useNA = "always"
-         )
- )
-```
 
 ## Create pooled survey variable
 
@@ -764,7 +734,7 @@ kable(caption = "Distribution of missing location",
   MTUSW6UKdiaryEps_DT <- mtusUKEpsDT[, .(survey, ba_survey, ba_diarypid, ba_pid, main, sec,
                                          inout, eloc, mtrav,
                                          r_date, r_dow, 
-                                         time, r_epStartTime, r_epEndTime, st_halfhour,
+                                         time, str_epStartTime,
                                          r_epStartDateTime, r_epEndDateTime)]
   
   mtusUKEpsDT <- NULL
@@ -840,6 +810,6 @@ kable(
 
 ***
 __Meta:__
-Analysis completed in: `r round(Sys.time() - starttime, 3)` seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com).
+Analysis completed in: `r round(Sys.time() - startTime, 3)` seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com).
 
 # References
diff --git a/MTUS-W6-adult-survey-data-processing.Rmd b/MTUS-W6-adult-survey-data-processing.Rmd
index 693773c..b72411f 100644
--- a/MTUS-W6-adult-survey-data-processing.Rmd
+++ b/MTUS-W6-adult-survey-data-processing.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "MTUS World 6 Survey Data Processing"
-author: "Ben Anderson (b.anderson@soton.ac.uk/@dataknut)"
+author: "Ben Anderson (b.anderson@soton.ac.uk, @dataknut)"
 date: 'Last run at: `r Sys.time()`'
 output:
   html_document:
@@ -19,13 +19,55 @@ output:
 bibliography: ~/bibliography.bib
 ---
 
-```{r setupKnitr, include=FALSE}
-# set default echo to FALSE (code not in output)
-knitr::opts_chunk$set(echo = FALSE)
-knitr::opts_chunk$set(warning = TRUE)
-knitr::opts_chunk$set(message = FALSE)
-knitr::opts_chunk$set(fig_caption = TRUE)
-knitr::opts_chunk$set(tidy = TRUE)
+```{r codeSetup, include=FALSE}
+# Housekeeping (chunk runs with include=FALSE so nothing here is shown in the output) ----
+rm(list=ls(all=TRUE)) # remove all objects from workspace
+
+# Set start time (used at the end of the document to report the run time) ----
+startTime <- Sys.time()
+
+# Where are we? (look the fields up by name rather than by position)
+sysInfo <- Sys.info()
+sysName <- sysInfo[["sysname"]]
+nodeName <- sysInfo[["nodename"]]
+
+# default code location - needed to load functions & parameters correctly
+projLoc <- "~/github/MTUS/" # <- this seems to work on windows as long as you put the MTUS repo in mydocuments/github
+
+# if necessary set correct path for a different platform
+if(startsWith(nodeName, "octomac")) # => BA laptop
+  projLoc <- "~/github/dataknut/MTUS/"
+
+# give feedback
+print(paste0("Running on ", sysName, " with projLoc = ", projLoc))
+
+# Functions ----
+print(paste0("Loading functions from ", projLoc,"mtusFunctions.R"))
+source(paste0(projLoc,"mtusFunctions.R"))
+
+# Load libraries ----
+# NB libraries required by mtusFunctions.R (sourced above) should already be loaded
+reqLibsLocal <- c("foreign", # loading SPSS/STATA
+       "data.table", # fast data manipulation
+       "dplyr", # data manipulation
+       "dtplyr", # data table & dplyr code
+       "ggplot2", # slick graphs
+       "stargazer", # for pretty tables
+       "car", # regression diagnostics
+       "knitr" # for kable
+       )
+print(paste0("Loading the following libraries using lb_myRequiredPackages: ", paste(reqLibsLocal, collapse = ", ")))
+# Use Luke's function to require/install/load (presumably defined in mtusFunctions.R - confirm)
+lb_myRequiredPackages(reqLibsLocal,"http://cran.rstudio.com/")
+
+# Set paths ----
+mtusPath <- "~/Data/MTUS/World_6/" # location of MTUS SPSS file
+mtusProcPath <- paste0(mtusPath,"processed/") # where to put the processed .csv file(s)
+
+# Set file names ----
+surveyFile <- "MTUS-adult-aggregate.sav"
+sfile <- paste0(mtusPath, surveyFile) # full path to the survey file; name kept as 'sfile' because the text below uses it
+
+```
 
 # Introduction
@@ -69,41 +111,6 @@ Key packages used:
  * knitr - to create this document [@knitr]
  * dplyr & dtplyr - for data manipulation [@dplyr][@dtplyr]
  * car - regression diagnostics [@fox_car]
-
-```{r houseKeeping}
-# Clear out all old objects etc ----
-# to avoid confusion
-rm(list = ls()) 
-
-# Set time ----
-starttime <- Sys.time()
-
-# Load required packages ----
-packs <- c("foreign", # loading SPSS/STATA
-       "dplyr", # data manipulation
-       "data.table", # fast data manipulation
-       "dtplyr", # data table & dplyr code
-       "car", # regression diagnostics
-       "knitr" # for kable
-       )
-
-# do this to install them if needed
-# install.packages(x)
-print("Loading required packages:")
-print(packs)
-
-lapply(packs, require, character.only = T)
-
-# Set paths ----
-mtusPath <- "~/Data/MTUS/World_6/" # location of MTUS SPSS file
-mtusProcPath <- "~/Data/MTUS/World_6/processed/" # where to put the processed .csv file(s)
-
-# Set file names ----
-surveyFile <- "MTUS-adult-aggregate.sav"
-
-sfile <- paste0(mtusPath, surveyFile)
-```
-
  
 # Load original survey data
 Loading `r sfile`.
@@ -131,7 +138,7 @@ We now delete the non-UK data leaving us with `r format(nrow(MTUSW6UKsurvey_DT),
 
 # Process UK survey data
 
-```{r processSurveyData}
+```{r uniqueIDs}
 print("-> Create uniq id for diaries (for matching) and persons")
 
 # Create unique ids ----
@@ -156,6 +163,18 @@ MTUSW6UKsurvey_DT$ba_pid <- group_indices(MTUSW6UKsurvey_DT, survey,
 # create a reduced survey table with the few variables we need so joins
 # does not break memory
 
+t <- MTUSW6UKsurvey_DT[, .("Number of rows" = .N,
+                          "Number of diary days" = uniqueN(ba_diarypid),
+                          "Number of respondents" = uniqueN(ba_pid)
+                          ), by = survey]
+
+kable(caption = "Summary of surveys",
+      t)
+```
+
+Loaded `r ba_tidyNum(nrow(MTUSW6UKsurvey_DT))` rows of data from `r ba_tidyNum(uniqueN(MTUSW6UKsurvey_DT$ba_pid))` individuals and `r ba_tidyNum(uniqueN(MTUSW6UKsurvey_DT$ba_diarypid))` diaries.
+
+```{r hhAttributes}
 # Rename original day/month/year as we think these may be suspect in some years
 print("-> Renaming day/month/year variables")
 setnames(MTUSW6UKsurvey_DT, "day", "mtus_day") # "Original MTUS day - incorrect for 1983/4"
@@ -499,8 +518,8 @@ qqnorm(pub_etc$residuals); qqline(pub_etc$residuals, col = 2)
 
 On the basis of these results we seem justified in assuming that we can pool 1983 & 1987.
 
-***
-__Meta:__
-Analysis completed in: `r round(Sys.time() - starttime, 3)` seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com).
+# About
+
+Analysis completed in: `r round(Sys.time() - startTime, 3)` seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com).
 
 # References
-- 
GitLab