From 138895382307caedfd8c4c278e1fe10ef3dce8f5 Mon Sep 17 00:00:00 2001 From: Ben Anderson <dataknut@icloud.com> Date: Fri, 19 May 2017 18:04:23 +0100 Subject: [PATCH] amended yaml; amended setup; removed many derived time variables --- MTUS-W6-adult-episodes-data-processing.Rmd | 164 +++++++++------------ MTUS-W6-adult-survey-data-processing.Rmd | 113 ++++++++------ 2 files changed, 133 insertions(+), 144 deletions(-) diff --git a/MTUS-W6-adult-episodes-data-processing.Rmd b/MTUS-W6-adult-episodes-data-processing.Rmd index 0da1f89..0cd85ae 100644 --- a/MTUS-W6-adult-episodes-data-processing.Rmd +++ b/MTUS-W6-adult-episodes-data-processing.Rmd @@ -1,11 +1,12 @@ --- title: "MTUS World 6 Data Processing" -author: "Ben Anderson (b.anderson@soton.ac.uk/@dataknut)" +author: "Ben Anderson (b.anderson@soton.ac.uk, /@dataknut)" date: 'Last run at: `r Sys.time()`' output: html_document: fig_caption: yes number_sections: yes + self_contained: no theme: journal toc: yes toc_depth: 3 @@ -16,13 +17,54 @@ output: bibliography: ~/bibliography.bib --- -```{r setupKnitr, include=FALSE} -# set default echo to FALSE (code not in output) -knitr::opts_chunk$set(echo = FALSE) -knitr::opts_chunk$set(warning = TRUE) -knitr::opts_chunk$set(message = FALSE) -knitr::opts_chunk$set(fig_caption = TRUE) -knitr::opts_chunk$set(tidy = TRUE) +```{r codeSetup, include=FALSE} +# Housekeeping ---- +rm(list = ls(all.names = TRUE)) # remove all objects (including hidden ones) from workspace + +# Set start time ---- +startTime <- Sys.time() + +# Where are we? 
+sysInfo <- Sys.info() +sysName <- sysInfo[["sysname"]] +nodeName <- sysInfo[["nodename"]] + +# default code location - needed to load functions & parameters correctly +projLoc <- "~/github/MTUS/" # <- this seems to work on windows as long as you put the MTUS repo in mydocuments/github + +# if necessary set correct path for a different platform +if(startsWith(nodeName, "octomac")) # => BA laptop + projLoc <- "~/github/dataknut/MTUS/" + +# give feedback +print(paste0("Running on ", sysName, " with projLoc = ", projLoc)) + +# Functions ---- +print(paste0("Loading functions from ", projLoc,"mtusFunctions.R")) +source(paste0(projLoc,"mtusFunctions.R")) + +# Load libraries ---- +# NB libraries required by mtusFunctions.R should already be loaded +reqLibsLocal <- c("foreign", # loading SPSS/STATA + "data.table", # fast data manipulation + "lubridate", # dates & times made easy + "dplyr", # data manipulation + "dtplyr", # data table & dplyr code + "ggplot2", # slick graphs + "stargazer", # for pretty tables + "knitr" # for kable + ) +print(paste0("Loading the following libraries using lb_myRequiredPackages: ", paste(reqLibsLocal, collapse = ", "))) +# Use Luke's function to require/install/load +lb_myRequiredPackages(reqLibsLocal,"http://cran.rstudio.com/") + +# Set paths ---- +mtusPath <- "~/Data/MTUS/World_6/" # location of MTUS SPSS file +mtusProcPath <- paste0(mtusPath,"processed/") # where to put the processed .csv file(s) + +# Set file names ---- +episodesFile <- "MTUS-adult-episode.sav" +eFile <- paste0(mtusPath, episodesFile) ``` # Introduction @@ -44,11 +86,8 @@ This work was funded by RCUK through the End User Energy Demand Centres Programm Code: * https://github.com/dataknut/MTUS -`License:` - -`The R code embedded in this document is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.` 
- -`This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.` +License: + * https://github.com/dataknut/MTUS/blob/master/LICENSE >YMMV - http://en.wiktionary.org/wiki/YMMV @@ -61,46 +100,13 @@ Key packages used: * data.table - for fast (big) data handling [@data.table] * knitr - to create this document [@knitr] * dplyr & dtplyr - for data manipulation [@dplyr][@dtplyr] - -````{r houseKeeping} -# Clear out all old objects etc ---- -# to avoid confusion -rm(list = ls()) - -# Set time ---- -starttime <- Sys.time() - -# Load required packages ---- -packs <- c("foreign", # loading SPSS/STATA - "data.table", # fast data manipulation - "dplyr", # data manipulation - "dtplyr", # data table & dplyr code - "knitr" # for kable - ) - -# do this to install them if needed -# install.packages(x) -print("Loading required packages:") -print(packs) - -lapply(packs, require, character.only = T) - -# Set paths ---- -mtusPath <- "~/Data/MTUS/World_6/" # location of MTUS SPSS file -mtusProcPath <- "~/Data/MTUS/World_6/processed/" # where to put the processed .csv file(s) - -# Set file names ---- -episodesFile <- "MTUS-adult-episode.sav" - -efile <- paste0(mtusPath, episodesFile) -```` # Load and process episodes file -Loading and processing `r efile`. +Loading and processing `r eFile`. 
```{r loadEpisodesFile} system.time( - mtusEpsW6DT <- as.data.table(read.spss(efile)) + mtusEpsW6DT <- as.data.table(read.spss(eFile)) ) ``` @@ -108,8 +114,7 @@ We have loaded `r format(nrow(mtusEpsW6DT), big.mark=",",scientific=FALSE)` rows ```{r basicStats} kable(caption = "Number of diaries per year", - ba_tidyNum(table(mtusEpsW6DT$survey, droplevels(mtusEpsW6DT$countrya) # removes unused countries - ) + ba_tidyNum(table(mtusEpsW6DT$survey,droplevels(mtusEpsW6DT$countrya)) # removes unused countries ) ) ``` @@ -595,7 +600,7 @@ First the datetime version: summary(mtusUKEpsDT$r_epEndDateTime) ``` -Now the HH:MM version: +Now the HH:MM version. This is created as a string to avoid having to set a nonsense date. ```{r createAllEpisodeStartEndTimes} # now the HH:MM version @@ -619,61 +624,26 @@ Now the HH:MM version: # concatenate & make POSIXct - mtusUKEpsDT[,r_epStartTime := as.POSIXct(paste0(r_epStartHf, + mtusUKEpsDT[,str_epStartTime := paste0(r_epStartHf, ":", - r_epStartM), - format = "%H:%M") + r_epStartM) ] print("# Check distributions of episode starts (all surveys pooled)") t <- mtusUKEpsDT[, .( N = length(epnum) ), - by = r_epStartTime - ] - plot(t) + by = str_epStartTime + ][order(str_epStartTime)] + kable(caption = "Check 1/2 hour coding (as string)",t) - # create episode end - mtusUKEpsDT$r_epEndTime <- mtusUKEpsDT$r_epStartTime + (mtusUKEpsDT$end*60) + print("# Check NAs in str_epStartTime (should be none)") + print(paste0("# N rows with NAs in str_epStartTime:", + nrow(mtusUKEpsDT[is.na(str_epStartTime) | grepl("NA", str_epStartTime, fixed = TRUE)])) # paste0() turns NA into the text "NA", so match that too + ) - print("# Check NAs in time version (should be none)") - print("# -> StartTime") - summary(mtusUKEpsDT$r_epStartTime) - print("# -> EndTime") - summary(mtusUKEpsDT$r_epEndTime) ``` -## Create half hours -Create and check the distribution of half hours. 
This uses the epStartTime variable to make sure it is set for all cases, even those which still have no date: - -```{r createHalfhours} - # Add hour & half hour in which the episode starts -# Use the fake start time we created earlier which includes all cases - mtusUKEpsDT[, st_hour := as.POSIXlt(mtusUKEpsDT$r_epStartTime)$hour] - mtusUKEpsDT[, st_hour := ifelse(mtusUKEpsDT$st_hour < 10 , - paste0("0",mtusUKEpsDT$st_hour), # if true - add leading 0 - mtusUKEpsDT$st_hour # if not - ) - ] - mtusUKEpsDT[, st_mins := as.POSIXlt(mtusUKEpsDT$r_epStartTime)$min] - mtusUKEpsDT[, st_hh := ifelse(mtusUKEpsDT$st_mins < 30 , - "00", # if true - "30" # if not - ) - ] - mtusUKEpsDT[, st_halfhour := paste0(mtusUKEpsDT$st_hour, - ":", - mtusUKEpsDT$st_hh - ) - ] - # check - kable( - table(mtusUKEpsDT$st_halfhour, - mtusUKEpsDT$survey, - useNA = "always" - ) - ) -``` ## Create pooled survey variable @@ -764,7 +734,7 @@ kable(caption = "Distribution of missing location", MTUSW6UKdiaryEps_DT <- mtusUKEpsDT[, .(survey, ba_survey, ba_diarypid, ba_pid, main, sec, inout, eloc, mtrav, r_date, r_dow, - time, r_epStartTime, r_epEndTime, st_halfhour, + time, str_epStartTime, r_epStartDateTime, r_epEndDateTime)] mtusUKEpsDT <- NULL @@ -840,6 +810,6 @@ kable( *** __Meta:__ -Analysis completed in: `r round(Sys.time() - starttime, 3)` seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com). +Analysis completed in: `r round(as.numeric(difftime(Sys.time(), startTime, units = "secs")), 3)` seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com). 
# References diff --git a/MTUS-W6-adult-survey-data-processing.Rmd b/MTUS-W6-adult-survey-data-processing.Rmd index 693773c..b72411f 100644 --- a/MTUS-W6-adult-survey-data-processing.Rmd +++ b/MTUS-W6-adult-survey-data-processing.Rmd @@ -1,6 +1,6 @@ --- title: "MTUS World 6 Survey Data Processing" -author: "Ben Anderson (b.anderson@soton.ac.uk/@dataknut)" +author: "Ben Anderson (b.anderson@soton.ac.uk, /@dataknut)" date: 'Last run at: `r Sys.time()`' output: html_document: @@ -19,13 +19,55 @@ output: bibliography: ~/bibliography.bib --- -```{r setupKnitr, include=FALSE} -# set default echo to FALSE (code not in output) -knitr::opts_chunk$set(echo = FALSE) -knitr::opts_chunk$set(warning = TRUE) -knitr::opts_chunk$set(message = FALSE) -knitr::opts_chunk$set(fig_caption = TRUE) -knitr::opts_chunk$set(tidy = TRUE) +```{r codeSetup, include=FALSE} +# Housekeeping ---- +rm(list = ls(all.names = TRUE)) # remove all objects (including hidden ones) from workspace + +# Set start time ---- +startTime <- Sys.time() + +# Where are we? 
+sysInfo <- Sys.info() +sysName <- sysInfo[["sysname"]] +nodeName <- sysInfo[["nodename"]] + +# default code location - needed to load functions & parameters correctly +projLoc <- "~/github/MTUS/" # <- this seems to work on windows as long as you put the MTUS repo in mydocuments/github + +# if necessary set correct path for a different platform +if(startsWith(nodeName, "octomac")) # => BA laptop + projLoc <- "~/github/dataknut/MTUS/" + +# give feedback +print(paste0("Running on ", sysName, " with projLoc = ", projLoc)) + +# Functions ---- +print(paste0("Loading functions from ", projLoc,"mtusFunctions.R")) +source(paste0(projLoc,"mtusFunctions.R")) + +# Load libraries ---- +# NB libraries required by mtusFunctions.R should already be loaded +reqLibsLocal <- c("foreign", # loading SPSS/STATA + "data.table", # fast data manipulation + "dplyr", # data manipulation + "dtplyr", # data table & dplyr code + "ggplot2", # slick graphs + "stargazer", # for pretty tables + "car", # regression diagnostics + "knitr" # for kable + ) +print(paste0("Loading the following libraries using lb_myRequiredPackages: ", paste(reqLibsLocal, collapse = ", "))) +# Use Luke's function to require/install/load +lb_myRequiredPackages(reqLibsLocal,"http://cran.rstudio.com/") + +# Set paths ---- +mtusPath <- "~/Data/MTUS/World_6/" # location of MTUS SPSS file +mtusProcPath <- paste0(mtusPath,"processed/") # where to put the processed .csv file(s) + +# Set file names ---- +surveyFile <- "MTUS-adult-aggregate.sav" +sfile <- paste0(mtusPath, surveyFile) + ``` # Introduction @@ -69,41 +111,6 @@ Key packages used: * knitr - to create this document [@knitr] * dplyr & dtplyr - for data manipulation [@dplyr][@dtplyr] * car - regression diagnostics [@fox_car] - -```{r houseKeeping} -# Clear out all old objects etc ---- -# to avoid confusion -rm(list = ls()) - -# Set time ---- -starttime <- Sys.time() - -# Load required packages ---- -packs <- c("foreign", # loading SPSS/STATA - "dplyr", # data manipulation - "data.table", # fast data 
manipulation - "dtplyr", # data table & dplyr code - "car", # regression diagnostics - "knitr" # for kable - ) - -# do this to install them if needed -# install.packages(x) -print("Loading required packages:") -print(packs) - -lapply(packs, require, character.only = T) - -# Set paths ---- -mtusPath <- "~/Data/MTUS/World_6/" # location of MTUS SPSS file -mtusProcPath <- "~/Data/MTUS/World_6/processed/" # where to put the processed .csv file(s) - -# Set file names ---- -surveyFile <- "MTUS-adult-aggregate.sav" - -sfile <- paste0(mtusPath, surveyFile) -``` - # Load original survey data Loading `r sfile`. @@ -131,7 +138,7 @@ We now delete the non-UK data leaving us with `r format(nrow(MTUSW6UKsurvey_DT), # Process UK survey data -```{r processSurveyData} +```{r uniqueIDs} print("-> Create uniq id for diaries (for matching) and persons") # Create unique ids ---- @@ -156,6 +163,18 @@ MTUSW6UKsurvey_DT$ba_pid <- group_indices(MTUSW6UKsurvey_DT, survey, # create a reduced survey table with the few variables we need so joins # does not break memory +t <- MTUSW6UKsurvey_DT[, .("Number of rows" = .N, + "Number of diary days" = uniqueN(ba_diarypid), + "Number of respondents" = uniqueN(ba_pid) + ), by = survey] + +kable(caption = "Summary of surveys", + t) +``` + +Loaded `r ba_tidyNum(nrow(MTUSW6UKsurvey_DT))` rows of data from `r ba_tidyNum(uniqueN(MTUSW6UKsurvey_DT$ba_pid))` individuals and `r ba_tidyNum(uniqueN(MTUSW6UKsurvey_DT$ba_diarypid))` diaries. + +```{r hhAttributes} # Rename original day/month/year as we think these may be suspect in some years print("-> Renaming day/month/year variables") setnames(MTUSW6UKsurvey_DT, "day", "mtus_day") # "Original MTUS day - incorrect for 1983/4" @@ -499,8 +518,8 @@ qqnorm(pub_etc$residuals); qqline(pub_etc$residuals, col = 2) On the basis of these results we seem justified in assuming that we can pool 1983 & 1987. 
-*** -__Meta:__ -Analysis completed in: `r round(Sys.time() - starttime, 3)` seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com). +# About + +Analysis completed in: `r round(as.numeric(difftime(Sys.time(), startTime, units = "secs")), 3)` seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com). # References -- GitLab