diff --git a/MTUS-W9-adult-episodes-data-processing.Rmd b/MTUS-W9-adult-episodes-data-processing.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..5c0174c73eaab8e22bc8d7b8e16a72cd2e4d363a
--- /dev/null
+++ b/MTUS-W9-adult-episodes-data-processing.Rmd
@@ -0,0 +1,890 @@
+---
+title: "MTUS World 9 Data Processing"
+author: "Ben Anderson (b.anderson@soton.ac.uk/@dataknut)"
+date: 'Last run at: `r Sys.time()`'
+output:
+  html_document:
+    fig_caption: yes
+    number_sections: yes
+    theme: journal
+    toc: yes
+    toc_depth: 3
+  pdf_document:
+    number_sections: yes
+    toc: yes
+    toc_depth: 3
+bibliography: ~/bibliography.bib
+---
+
+```{r setupKnitr, include=FALSE}
+# Global knitr chunk defaults.
+# These used to be set in two consecutive chunks, the second of which silently
+# overrode the first (e.g. warning = TRUE -> FALSE); only the effective values
+# are kept here.
+knitr::opts_chunk$set(
+  echo = FALSE,    # do not echo code in the output
+  warning = FALSE, # do not show warnings in the output
+  message = FALSE, # do not show messages in the output
+  fig.height = 6,  # default, make it bigger to stretch vertical axis
+  fig.width = 8,   # full width
+  tidy = TRUE      # tidy up code in case echo = TRUE
+)
+# NB: knitr chunk options use dotted names (fig.height, fig.width, fig.cap).
+# The underscore forms (fig_height, fig_width, fig_caption) are YAML output
+# options and have no effect when passed to opts_chunk$set(). Figure captions
+# are switched on by `fig_caption: yes` in the YAML header.
+```
+
+```{r codeSetup, include=FALSE}
+# Housekeeping ----
+rm(list = ls(all.names = TRUE)) # remove all objects from workspace
+
+# Set start time ----
+# used at the end of the document to report the run time
+startTime <- Sys.time()
+
+# Where are we? ----
+# Sys.info() is a named character vector: index it by name, not by position
+sysInfo <- Sys.info()
+sysName <- sysInfo[["sysname"]]
+nodeName <- sysInfo[["nodename"]]
+
+# default code location - needed to load functions & parameters correctly
+projLoc <- "~/github/MTUS/" # <- this seems to work on windows as long as you put the repo in mydocuments/github
+
+# if necessary set correct path for a different platform
+if (startsWith(nodeName, "octomac")) { # => BA laptop
+  projLoc <- "~/github/dataknut/MTUS/"
+}
+
+# give feedback
+print(paste0("Running on ", sysName, " with projLoc = ", projLoc))
+
+# Functions ----
+print(paste0("Loading functions from ", projLoc,"mtusFunctions.R"))
+source(paste0(projLoc,"mtusFunctions.R"))
+
+# Load libraries ----
+# NB libraries required by mtusFunctions.R should already be loaded
+reqLibsLocal <- c("foreign", # loading SPSS/STATA
+       "data.table", # fast data manipulation
+       "dplyr", # data manipulation
+       "dtplyr", # data table & dplyr code
+       "stargazer", # for pretty tables
+       "knitr" # for kable
+       )
+# collapse the vector so that a single message is printed rather than one per package
+print(paste0("Loading the following libraries using lb_myRequiredPackages: ",
+             paste(reqLibsLocal, collapse = ", ")))
+# Use Luke's function to require/install/load
+lb_myRequiredPackages(reqLibsLocal,"http://cran.rstudio.com/")
+
+# Set paths ----
+mtusPath <- "~/Data/MTUS/World_9/" # location of MTUS SPSS file
+mtusProcPath <- paste0(mtusPath,"processed/") # where to put the processed .csv file(s)
+
+# Set file names ----
+episodesFileName <- "MTUS adult SPSS episode file.sav"
+
+# /scratch/ba1e12/Data/MTUS/World_9/MTUS Files/Adult aggregate and episode files/Episode - adult episode files - July 2013
+
+# full path to the SPSS episodes file
+eFile <- paste0(mtusPath,
+                "MTUS Files/Adult aggregate and episode files/Episode - adult episode files - July 2013/",
+                episodesFileName)
+```
+
+# Introduction
+
+Purpose:
+
+* To process the MTUS World 9 episode data
+* To extract the UK 2000 sample & save for reference
+* To save a processed & gzipped .csv file of all UK samples for future use
+
+Data:
+
+* MTUS [World 9](http://www.timeuse.org/mtus) - episodes file
+
+This work was funded by RCUK through the End User Energy Demand Centres Programme via the "DEMAND: Dynamics of Energy, Mobility and Demand" Centre:
+
+ * http://www.demand.ac.uk 
+ * http://gow.epsrc.ac.uk/NGBOViewGrant.aspx?GrantRef=EP/K011723/1
+
+Code:
+
+ * https://github.com/dataknut/MTUS
+ 
+> NB: This code is based on similar code to [process W6](https://github.com/dataknut/MTUS/blob/master/MTUS-W6-adult-episodes-data-processing.Rmd) and as there are no apparent differences between the W6 & W9 MTUS files that I have, the code is almost identical and has not been extensively checked with respect to changes from W6 to W9.
+
+`License:`
+
+`The R code embedded in this document is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.`
+    
+`This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.`
+    
+>YMMV - http://en.wiktionary.org/wiki/YMMV
+ 
+
+# Load and process episodes file
+Load `r eFile` and save out the UK 2000 sample in original form.
+
+```{r loadEpisodesFile}
+# Load the SPSS episodes file as a data.table and report how long it took
+system.time(
+  mtusEpsW9DT <- as.data.table(read.spss(eFile))
+)
+```
+
+```{r summariseEpisodesFile, results='asis'}
+# stargazer emits raw HTML, so this chunk needs results='asis' for knitr to
+# pass the table through as markup instead of showing it as verbatim text.
+# It is kept separate from the load chunk so the timing output is unaffected.
+stargazer(mtusEpsW9DT, 
+          title = "MTUS W9 Summary",
+          type = "html")
+```
+
+We have loaded `r ba_tidyNum(nrow(mtusEpsW9DT))` rows of data for `r ba_tidyNum(uniqueN(mtusEpsW9DT$countrya))` countries.
+
+```{r basicStats}
+# cross-tabulate survey by country; droplevels() removes unused countries
+episodeCounts <- table(mtusEpsW9DT$survey, droplevels(mtusEpsW9DT$countrya))
+# format the counts with thousands separators before tabulating
+kable(ba_tidyNum(episodeCounts),
+      caption = "Number of diary episodes per year")
+```
+
+We now select the UK data and save the 2000 sample in original form for reference.
+
+```{r keepUKOnly}
+# keep the UK rows only
+mtusUKEpsDT <- mtusEpsW9DT[countrya == "United Kingdom"]
+
+# pull out the 2000 survey as an untouched reference copy
+mtusEpsUk2000W9DT <- subset(mtusUKEpsDT,
+                            countrya == "United Kingdom" & survey == "2000")
+
+# the reference copy is written into the folder that holds the original SPSS file
+ukRefDir <- "MTUS Files/Adult aggregate and episode files/Episode - adult episode files - July 2013/"
+outF <- paste0(mtusPath, ukRefDir, "MTUS adult SPSS episode file_uk2000.csv")
+write.csv(x = mtusEpsUk2000W9DT, file = outF)
+```
+
+This leaves us with `r format(nrow(mtusUKEpsDT), big.mark=",",scientific=FALSE)` rows of survey data.
+
+```{r setKeys}
+# Sort (key) the table by the identifying variables.
+# This works but we'll create a concatenated id to make life easier
+setkey(mtusUKEpsDT, countrya, survey, swave, msamp, hldid, persid, id)
+
+print("-> Create uniq id for diaries (for matching) and persons")
+# Create unique ids ----
+# .GRP is data.table's group counter. Groups are numbered in order of first
+# appearance, which on this keyed (sorted) single-country table is the sorted
+# order of the grouping variables. This replaces dplyr::group_indices(data, ...)
+# whose use with grouping arguments is deprecated.
+
+# diarypid: one id per survey/wave/sample/household/person/diary
+mtusUKEpsDT[, ba_diarypid := .GRP,
+            by = .(survey, swave, msamp, hldid, persid, id)]
+
+# pid: one id per survey/wave/sample/household/person
+mtusUKEpsDT[, ba_pid := .GRP,
+            by = .(survey, swave, msamp, hldid, persid)]
+```
+
+## Episode dates
+
+```{r checkEpisodeDates}
+# cday is the day the episode starts. It could span midnight (2 days)
+summary(mtusUKEpsDT$cday)
+
+# Negative codes (-9 or -8) mean unknown: mostly 2005 data (cday unknown) but
+# also some of 1995 & 2001. Count them by year.
+mtusUKEpsDT[cday < 0,
+            .(Count = .N, Min = min(cday), Max = max(cday)),
+            by = year]
+
+# recode the unknown codes to missing, then recheck
+mtusUKEpsDT[, cday := ifelse(cday %in% c(-9, -8), NA, cday)]
+summary(mtusUKEpsDT$cday)
+
+# create a string date (month-day-year) where cday is known ----
+mtusUKEpsDT[!is.na(cday), str_date := paste(month, cday, year, sep = "-")]
+
+# imputed date flag:
+# 1 = date unknown at this point (we may be able to impute it), 0 = date known
+mtusUKEpsDT[, str_date_imputed := as.numeric(is.na(str_date))]
+
+# check which year these fall in
+table(mtusUKEpsDT$str_date_imputed, mtusUKEpsDT$year)
+```
+
+Fix missing 1995 dates as far as possible.
+
+```{r fix1995Dates}
+# fix 1995
+print("-> Check which days we don't know dates for (1995)")
+mtusUKEpsDT[is.na(str_date) & survey == 1995,
+            .(Count = .N),
+            by = .(day, month, year)]
+
+# can't do anything about the missing but can fix the others.
+# Assume it was the first day X of the month (they are all May):
+# look-up from day of the week to the first such date in May 1995
+firstWeekMay1995 <- c(Monday = "May-1-1995",
+                      Tuesday = "May-2-1995",
+                      Wednesday = "May-3-1995",
+                      Thursday = "May-4-1995",
+                      Friday = "May-5-1995",
+                      Saturday = "May-6-1995",
+                      Sunday = "May-7-1995")
+
+# one pass per day of the week; rows that do not match keep their current value
+for (dayName in names(firstWeekMay1995)) {
+  mtusUKEpsDT[, str_date := ifelse(str_date_imputed == 1 & survey == 1995 & day == dayName,
+                                   firstWeekMay1995[[dayName]],
+                                   str_date)]
+}
+
+print("-> Check how many are still unknown (1995)")
+mtusUKEpsDT[is.na(str_date) & survey == 1995,
+            .(Count = .N),
+            by = .(day, month, year)]
+```
+
+Now fix 2000.
+
+```{r fix2000Dates}
+# fix 2000 (the survey spans 2000 & 2001)
+print("-> Check which days we don't know dates for (2000)")
+mtusUKEpsDT[is.na(str_date) & survey == 2000,
+            .(Count = .N),
+            by = .(day, month, year)]
+
+# the months & years vary
+# in all cases we assume it is the first such day of the month:
+# one row per year/month/day-of-week combination that needs a date
+firstDays2000 <- data.frame(
+  fixYear = c(2000, 2000, 2000, 2000, 2001, 2001),
+  fixMonth = c("August", "August", "September", "September", "March", "March"),
+  fixDay = c("Saturday", "Monday", "Wednesday", "Saturday", "Friday", "Sunday"),
+  fixDate = c("August-5-2000", "August-7-2000", "September-6-2000",
+              "September-2-2000", "March-2-2001", "March-4-2001"),
+  stringsAsFactors = FALSE
+)
+
+# apply each fix in turn; rows that do not match keep their current value
+for (i in seq_len(nrow(firstDays2000))) {
+  thisFix <- firstDays2000[i, ]
+  mtusUKEpsDT[, str_date := ifelse(str_date_imputed == 1 & year == thisFix$fixYear &
+                                     month == thisFix$fixMonth & day == thisFix$fixDay,
+                                   thisFix$fixDate,
+                                   str_date)]
+}
+
+print("-> Re-check which days we don't know dates for (2000)")
+mtusUKEpsDT[is.na(str_date) & survey == 2000,
+            .(Count = .N),
+            by = .(day, month, year)]
+```
+
+And now we fix 2005. For some reason no dates are available at all (why not?). So we have to impute them all using the first day specified (Mon <-> Sun) of the month specified.
+
+```{r fix2005Dates}
+# Fixing episode dates for 2005
+
+# Need to create a proper start/end time but 2005 does not have a valid date
+# Why not???
+# so we are forced to impute them ALL
+# For each 2005 month we build a synthetic week running Sunday -> Saturday:
+# March: 6th -> 12th
+# June: 5th -> 11th
+# September: 4th -> 10th
+# November: 6th -> 12th
+syntheticSundays <- c(March = "2005-03-06",
+                      June = "2005-06-05",
+                      September = "2005-09-04",
+                      November = "2005-11-06")
+weekDayNames <- c("Sunday", "Monday", "Tuesday", "Wednesday",
+                  "Thursday", "Friday", "Saturday")
+
+# start from the "na" placeholder; it is overwritten wherever a rule matches
+mtusUKEpsDT$date_2005 <- "na"
+
+for (monthName in names(syntheticSundays)) {
+  # the seven dates of this month's synthetic week as dd/mm/yyyy strings
+  weekDates <- format(as.Date(syntheticSundays[[monthName]]) + 0:6, "%d/%m/%Y")
+  for (d in seq_along(weekDayNames)) {
+    mtusUKEpsDT$date_2005 <- ifelse(
+      mtusUKEpsDT$survey == 2005 & mtusUKEpsDT$month == monthName
+      & mtusUKEpsDT$day == weekDayNames[d],
+      weekDates[d],
+      mtusUKEpsDT$date_2005
+    )
+  }
+}
+
+```
+
+At this point we should have fixed all the dates that we can so we need to combine them into a date that R understands.
+
+```{r fixFinalDates}
+# 2005 dates are dd/mm/yyyy strings; rows without an imputed 2005 date hold a
+# value that does not match the format and so become NA
+# set to POSIXct (local time zone) - can't use fasttime
+mtusUKEpsDT[, r_date_2005 := as.POSIXct(date_2005, tz = "", format = "%d/%m/%Y")]
+
+# All other dates are currently strings such as "May-1-1995":
+# %B = full month name, %e = day of the month, %Y = year with century
+# NB: %B is matched against the month names of the current locale
+mtusUKEpsDT[, r_date := as.POSIXct(str_date, format = "%B-%e-%Y")]
+
+# Add the 2005 dates
+# Assigning by reference into the 2005 rows keeps r_date as POSIXct;
+# ifelse() would strip the class and leave seconds that need re-converting
+mtusUKEpsDT[survey == 2005, r_date := r_date_2005]
+
+# check
+print("-> r_date values:")
+head(mtusUKEpsDT$r_date)
+print("-> check for missing")
+summary(mtusUKEpsDT$r_date)
+print("-> check dates per year")
+kable(
+  mtusUKEpsDT[,
+              .( Min = min(r_date, na.rm = TRUE), # to avoid the remaining NAs in 1995
+                 Max = max(r_date, na.rm = TRUE) # to avoid the remaining NAs in 1995
+                 ),
+              by = survey
+              ]
+)
+```
+
+
+Where we had missing dates we have now calculated them based on the cday, month and year variables in the original file and created a new r_date variable which is the date the episode starts. As part of this process we have also imputed _all_ of the 2005 dates based on the recorded day of the week (day) and the month. This process has resulted in unknown dates in the following number of cases because the original diary cday was missing:
+
+`r kable(mtusUKEpsDT[is.na(r_date), .(N = length(r_date)), by = survey])`
+
+## Check days of the week
+
+Now we check that the 'day of the week' from the original and the one derived from our imputed dates match:
+
+```{r checkEpisodeDays}
+# impute day of the week from r_date and set to a labelled factor
+# POSIXlt $wday is 0 (Sunday) to 6 (Saturday). The levels are given explicitly
+# so the labels line up even if some day of the week is absent from the data;
+# without `levels`, factor() errors when fewer than 7 distinct values occur
+mtusUKEpsDT[, r_dow := factor(as.POSIXlt(r_date)$wday,
+                              levels = 0:6,
+                              labels = c(
+                                "Sunday",
+                                "Monday",
+                                "Tuesday",
+                                "Wednesday",
+                                "Thursday",
+                                "Friday",
+                                "Saturday"
+                              )
+)]
+# check matches: derived day of the week (rows) vs original 'day' (columns)
+  kable(
+    table(mtusUKEpsDT$r_dow, 
+        mtusUKEpsDT$day, 
+        useNA = "always"
+    )
+  )
+  
+  
+```
+
+There are still the few unset cases from 1995 but also many of the episodes don't match. It appears that there is an error in the coding of 'day' in the MTUS World 6 '[1983](http://www.timeuse.org/mtus/surveys/GBR/5352.html)' file. There is [code available to correct this error](http://www.timeuse.org/sites/default//files/mtus/study/5352/fix-day-of-week-variable.doc) but since we have correctly set r_date in most cases without using the original 'day' variable we can just use our new r_dow variable that we have derived from r_date.
+
+However we used 'day' to:
+
+ * impute dates for 2005 as cday was missing
+ * impute dates where cday was missing in 1995 & 2000/1
+ 
+We have to assume that day is correct in these instances.
+
+## Check episode start times
+
+We now create two start times for each episode:
+ 
+ * a full datetime which will be NA in those few cases where r_date could not be imputed
+ * a clock time (HH:MM) for all episodes
+ 
+In the UK the diaries are supposed to start at 04:00. Check:
+
+```{r checkStartDiary}
+# start and end are minutes from the beginning of the diary - these vary across the surveys but for the UK they are all 04:00
+# check this using the first episode of each diary (epnum = 1)
+firstEpisodesDT <- mtusUKEpsDT[epnum == 1]
+# clockst is a weird clock hour. decimal time why why why?!!
+firstClockTab <- table(firstEpisodesDT$survey,
+                       firstEpisodesDT$clockst,
+                       useNA = "always")
+kable(firstClockTab,
+      caption = "Clock time of first episode by survey")
+```
+
+In theory the episode start times should align to the diary slots. Check (warning, long table):
+
+```{r checkStartEpisodes}
+# start minute (rows) by survey (columns); named so as not to mask base::t()
+startBySurvey <- table(mtusUKEpsDT$start,
+                       mtusUKEpsDT$survey,
+                       useNA = "always")
+# only the first 25 start values are shown
+kable(head(startBySurvey, 25),
+      caption = "Start times in minutes of episodes by survey")
+```
+
+Some strange things seem to be happening.
+
+For 1974, which was a 30 minute diary, there are a lot where start = 1 (should be 0) and then mostly 30, 60, 70, 80, 90 (1/2 hours as per documentation) but  there are some in-betweens. Fix them.
+
+```{r fix1974StartTimes}
+# ba_startm is the corrected copy of 'start'; the original is left untouched
+mtusUKEpsDT[, ba_startm := start]
+# 1974: a start of 1 should be 0
+mtusUKEpsDT[, ba_startm := ifelse(survey == 1974 & ba_startm == 1, 0, ba_startm)]
+print("# -> Check")
+startCheck1974 <- table(mtusUKEpsDT$ba_startm,
+                        mtusUKEpsDT$survey,
+                        useNA = "always")
+head(startCheck1974)
+```
+
+1983 is 15 minute intervals but have +1 for all periods (so 1, 16, 31, 61). Fix them.
+
+```{r fix1983StartTimes}
+# 1983: every start minute is 1 too high, so shift them all back by 1
+mtusUKEpsDT[, ba_startm := ba_startm - ifelse(survey == 1983, 1, 0)]
+print("# -> Check 1983")
+startCheck1983 <- table(mtusUKEpsDT$ba_startm,
+                        mtusUKEpsDT$survey,
+                        useNA = "always")
+head(startCheck1983)
+```
+
+1987 appears to be 15 minutes but some in betweens on 5 minutes. Nothing to be fixed.
+
+1995 is 15 minutes but has +1 (so 16,31,46 etc) but 0 is correct. Fix them.
+
+```{r fix1995StartTimes}
+# 1995: start minutes above 0 are 1 too high; 0 itself is already correct
+mtusUKEpsDT[, ba_startm := ba_startm - ifelse(survey == 1995 & ba_startm > 0, 1, 0)]
+print("# -> Check 1995")
+startCheck1995 <- table(mtusUKEpsDT$ba_startm,
+                        mtusUKEpsDT$survey,
+                        useNA = "always")
+head(startCheck1995)
+```
+
+2000 10 minutes, clean so no processing required.
+
+2005 10 minutes, clean so no processing required
+	
+Having corrected the episode start times we can now set up start/end times that R understands:
+ 
+ * start & end as a datetime which will be NA for those few cases where r_date is still unknown
+ * start & end as HH:MM to include those few cases where r_date is still unknown
+ 
+First the datetime version:
+
+```{r createAllEpisodeStartEndDateTimes}
+  # Setting up corrected start and end timestamps
+  # all diaries start at 04:00 on the date given
+  # first the datetime version
+  # POSIXct works in seconds
+  diaryStartDateTime <- mtusUKEpsDT$r_date + (4*60*60) # will be NA for those where r_date is unknown
+  # now add the corrected minutes * 60 ( = seconds) up to the start of the episode
+  mtusUKEpsDT$r_epStartDateTime <- diaryStartDateTime + (mtusUKEpsDT$ba_startm*60)
+  # same for episode end: 'end' is also minutes from the beginning of the diary,
+  # so it is added to the diary start, NOT to the episode start (which would
+  # count the start offset twice)
+  # NOTE(review): 'end' is used as supplied; confirm whether it carries the same
+  # +1 offsets that were corrected in ba_startm for 1974/1983/1995
+  mtusUKEpsDT$r_epEndDateTime <- diaryStartDateTime + (mtusUKEpsDT$end*60)
+  
+  print("# Check NAs in datetime version")
+  print("# -> StartDateTime")
+  summary(mtusUKEpsDT$r_epStartDateTime)
+  print("# -> EndDateTime")
+  summary(mtusUKEpsDT$r_epEndDateTime)
+```
+
+Now the HH:MM version:
+
+```{r createAllEpisodeStartEndTimes}
+  # now the HH:MM version
+  # hours: diaries start at 04:00 so add 4 to the whole hours since diary start
+  mtusUKEpsDT$r_epStartH <- 4 + floor(mtusUKEpsDT$ba_startm/60)
+  print("# Check start hours")
+  summary(mtusUKEpsDT$r_epStartH)
+  print("# This has created hours > 23 (tomorrow). Need to fix them")
+  mtusUKEpsDT[, r_epStartHf := ifelse(r_epStartH > 23,
+                                     r_epStartH - 24,
+                                     r_epStartH)
+              ]
+  print("# check")
+  table(mtusUKEpsDT$r_epStartHf[mtusUKEpsDT$r_epStartH > 22], 
+        mtusUKEpsDT$r_epStartH[mtusUKEpsDT$r_epStartH > 22])
+  
+  # minutes
+  mtusUKEpsDT$r_epStartM <- 0 + (mtusUKEpsDT$ba_startm %% 60) # this is R's way of calculating the remainder!!
+  print("# Check start minutes")
+  summary(mtusUKEpsDT$r_epStartM)
+  
+  
+  # concatenate & make POSIXct (no date is given so today's date is used)
+  mtusUKEpsDT[,r_epStartTime := as.POSIXct(paste0(r_epStartHf,
+                                              ":",
+                                              r_epStartM), 
+                                       format = "%H:%M")
+              ]
+  print("# Check distributions of episode starts (all surveys pooled)")
+  # named so as not to mask base::t()
+  startCountsDT <- mtusUKEpsDT[,
+                    .(
+                      N = length(epnum)
+                    ),
+                    by = r_epStartTime
+                      ]
+  plot(startCountsDT)
+  
+  # create episode end
+  # 'end' is minutes from the beginning of the diary (like 'start'), so the end
+  # clock time is built the same way as the start clock time. Adding 'end' to
+  # the episode start would count the start offset twice.
+  # NOTE(review): 'end' is used as supplied; confirm whether it carries the same
+  # +1 offsets that were corrected in ba_startm for 1974/1983/1995
+  epEndH <- (4 + floor(mtusUKEpsDT$end/60)) %% 24 # wrap hours > 23 (tomorrow)
+  epEndM <- mtusUKEpsDT$end %% 60
+  mtusUKEpsDT[, r_epEndTime := as.POSIXct(paste0(epEndH, ":", epEndM),
+                                          format = "%H:%M")
+              ]
+  
+  print("# Check NAs in time version (should be none)")
+  print("# -> StartTime")
+  summary(mtusUKEpsDT$r_epStartTime)
+  print("# -> EndTime")
+  summary(mtusUKEpsDT$r_epEndTime)
+```
+
+## Create half hours
+Create and check the distribution of half hours. This uses the epStartTime variable to make sure it is set for all cases, even those which still have no date:
+
+```{r createHalfhours}
+  # Add hour & half hour in which the episode starts
+  # Use the fake start time we created earlier which includes all cases
+  # break the start time into its components once
+  startLt <- as.POSIXlt(mtusUKEpsDT$r_epStartTime)
+  startHour <- startLt$hour
+  # two-character hour: add a leading 0 to single digit hours
+  mtusUKEpsDT[, st_hour := ifelse(startHour < 10,
+                                  paste0("0", startHour),
+                                  startHour)]
+  mtusUKEpsDT[, st_mins := startLt$min]
+  # first half of the hour -> "00", second half -> "30"
+  mtusUKEpsDT[, st_hh := ifelse(st_mins < 30, "00", "30")]
+  mtusUKEpsDT[, st_halfhour := paste0(st_hour, ":", st_hh)]
+  # check
+  halfHourTab <- table(mtusUKEpsDT$st_halfhour,
+                       mtusUKEpsDT$survey,
+                       useNA = "always")
+  kable(halfHourTab)
+```
+
+## Create pooled survey variable
+
+```{r createBAsurveyVar}
+  # Pool the original surveys into ba_survey:
+  # 1974 -> 1974
+  # 1983 & 1987 -> 1985
+  # 1995 -> 1995
+  # 2000 -> 2000
+  # 2005 -> 2005
+  # any other (or missing) survey value -> NA
+  surveyPool <- c("1974" = 1974,
+                  "1983" = 1985,
+                  "1987" = 1985,
+                  "1995" = 1995,
+                  "2000" = 2000,
+                  "2005" = 2005)
+  # look each survey up by name; values with no entry come back as NA
+  mtusUKEpsDT$ba_survey <- unname(surveyPool[as.character(mtusUKEpsDT$survey)])
+```
+
+New ba_survey variable created:
+
+```{r ba_surveyTable}
+  # check the pooled ba_survey (rows) against the original survey (columns)
+  pooledSurveyTab <- table(mtusUKEpsDT$ba_survey,
+                           mtusUKEpsDT$survey,
+                           useNA = "always")
+  kable(pooledSurveyTab)
+```
+
+See survey processing file/code for justification.
+
+## Check for missing location
+
+```{r locationCheck}
+
+# episode location (rows) by survey (columns), including missing values
+locationTab <- table(mtusUKEpsDT$eloc,
+                     mtusUKEpsDT$survey,
+                     useNA = "always")
+kable(locationTab,
+      caption = "Distribution of missing location")
+```
+
+## Remove & check for badcases.
+
+```{r removeBadcases}
+  # Drop everything that is not flagged as a good case
+  mtusUKEpsDT <- subset(mtusUKEpsDT, badcase == "good case")
+  #mtusUKEpsDT <- mtusUKEpsDT[ba_survey %in% c("1985","2005")] # keep only 1985 & 2005
+  
+  # check: only good cases should remain in each pooled survey
+  badcaseTab <- table(mtusUKEpsDT$badcase,
+                      mtusUKEpsDT$ba_survey,
+                      useNA = "always")
+  kable(badcaseTab)
+```
+
+## Keep the core variables and save the file
+
+```{r saveEpisodesFile}
+  # Keep the episode vars we need ----
+  # - saves memory etc
+  MTUSW9UKdiaryEps_DT <- mtusUKEpsDT[, .(survey, ba_survey, ba_diarypid, ba_pid, main, sec,
+                                         inout, eloc, mtrav,
+                                         r_date, r_dow, 
+                                         time, r_epStartTime, r_epEndTime, st_halfhour,
+                                         r_epStartDateTime, r_epEndDateTime)]
+  
+  # release the full table
+  mtusUKEpsDT <- NULL
+  # Save out the working file for later use (saves re-running)
+  # make sure the output folder exists (does nothing if it already does)
+  dir.create(mtusProcPath, showWarnings = FALSE, recursive = TRUE)
+  episodes_DT <- paste0(mtusProcPath, "MTUSW9UKdiaryEps_DT.csv.gz")
+  print(paste0("Saving gzipped processed episodes file in: ", episodes_DT))
+  # Write straight into a gzip connection. This replaces setwd() plus a
+  # backgrounded system("gzip -f ... &") call, which needed an external gzip
+  # binary, changed the working directory and could still be running when the
+  # document finished. write.csv() opens the connection and closes it when done.
+  write.csv(MTUSW9UKdiaryEps_DT,
+            file = gzfile(episodes_DT), row.names = FALSE
+  )
+```
+Variables retained in processed episodes file:
+
+`r kable(caption = "Variables retained in episodes file", names(MTUSW9UKdiaryEps_DT))`
+
+# Descriptive Statistics
+
+## Number of episodes recorded
+Remember that the number of episodes relates to the number of respondents, the length of the diary time slots in each survey, the number of days on which diaries were completed and the number of possible different activity codes...
+```{r episodesBySurvey}
+# original survey (rows) by pooled ba_survey (columns)
+episodesBySurveyTab <- table(MTUSW9UKdiaryEps_DT$survey,
+                             MTUSW9UKdiaryEps_DT$ba_survey,
+                             useNA = "always")
+kable(episodesBySurveyTab,
+      caption = "Number of episodes per survey/ba_survey")
+```
+
+## Episodes by day of the week
+Remember that the number of episodes relates to the number of respondents, the length of the diary time slots in each survey, the number of days on which diaries were completed and the number of possible different activity codes... so they are not directly comparable across years (surveys).
+
+```{r episodesByDay}
+# survey (rows) by derived day of the week (columns)
+episodesByDayTab <- table(MTUSW9UKdiaryEps_DT$survey,
+                          MTUSW9UKdiaryEps_DT$r_dow,
+                          useNA = "always")
+kable(episodesByDayTab)
+```
+
+## Episodes by actual year recorded
+Remember that the number of episodes relates to the number of respondents, the length of the diary time slots in each survey, the number of days on which diaries were completed and the number of possible different activity codes... so they are not directly comparable across years (surveys).
+
+
+```{r episodesByActualYear}
+# define year (started): POSIXlt counts years from 1900 (0 = 1900)
+MTUSW9UKdiaryEps_DT[, r_year := as.POSIXlt(r_date)$year + 1900]
+
+# survey (rows) by actual year the episode started (columns)
+episodesByYearTab <- table(MTUSW9UKdiaryEps_DT$survey,
+                           MTUSW9UKdiaryEps_DT$r_year,
+                           useNA = "always")
+kable(episodesByYearTab)
+```
+
+## Episodes by actual month recorded
+Remember that the number of episodes relates to the number of respondents, the length of the diary time slots in each survey, the number of days on which diaries were completed and the number of possible different activity codes... so they are not directly comparable across years (surveys).
+
+```{r episodesByActualYearMonth}
+# month (rows) by actual year (columns)
+# POSIXlt $mon is 0-based (0 = January), so add 1 to show calendar month
+# numbers 1-12 in the table
+kable(
+  table(as.POSIXlt(MTUSW9UKdiaryEps_DT$r_date)$mon + 1,
+        MTUSW9UKdiaryEps_DT$r_year,
+        useNA = "always"
+        )
+)
+```
+
+# About
+
+Analysis completed in: `r round(as.numeric(difftime(Sys.time(), startTime, units = "secs")), 3)` seconds using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com).
+
+Key packages used:
+
+ * base R - for the basics [@baseR]
+ * foreign - for loading SPSS data [@foreign]
+ * data.table - for fast (big) data handling [@data.table]
+ * knitr - to create this document [@knitr]
+ * dplyr & dtplyr - for data manipulation [@dplyr; @dtplyr]
+ 
+# References
diff --git a/MTUS-W9-adult-episodes-data-processing_files/figure-html/createAllEpisodeStartEndTimes-1.png b/MTUS-W9-adult-episodes-data-processing_files/figure-html/createAllEpisodeStartEndTimes-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..2fae0401eb5114fcf2e9e2507349d835afc61d58
Binary files /dev/null and b/MTUS-W9-adult-episodes-data-processing_files/figure-html/createAllEpisodeStartEndTimes-1.png differ
diff --git a/UK_surveys_summary_table.docx b/UK_surveys_summary_table.docx
new file mode 100644
index 0000000000000000000000000000000000000000..98ae433652ff82de281a35054aaae4c6e10ce8a6
Binary files /dev/null and b/UK_surveys_summary_table.docx differ
diff --git a/mtusFunctions.R b/mtusFunctions.R
new file mode 100644
index 0000000000000000000000000000000000000000..2d4cc76339284e130200bd25fa9a7f9e1cce58e4
--- /dev/null
+++ b/mtusFunctions.R
@@ -0,0 +1,47 @@
+# Load (attach) every package named in x, first installing from repository y any that is missing.
+# Especially useful when running on virtual machines where package installation is not persistent. Like UoS sve :-(
+# x = character vector of package names, y = repository URL passed to install.packages(repos = ).
+# Stops with an error if a package still cannot be loaded after the install attempt. Courtesy of Luke Blunden
+lb_myRequiredPackages <- function(x,y){
+  for( i in x ){
+    #  require returns TRUE if it was able to load package
+    if( ! require( i , character.only = TRUE ) ){
+      #  Package could not be loaded so (re-)install it; type is left at its
+      #  default (not "win.binary") so that this also runs on OS X etc
+      install.packages( i , repos=y ,
+                        quiet=TRUE , dependencies = TRUE , verbose = FALSE )
+      #  Load after installing: library() errors where require() would only return FALSE
+      library( i , character.only = TRUE, quietly = TRUE )
+    }
+  }
+}
+
+# Libraries required by this code (add further package names to this vector)
+reqLibs <- c("data.table" # fast data munching
+)
+# collapse the names so that one feedback line is printed however many libraries are listed
+print(paste0("Loading the following libraries using lb_myRequiredPackages: ", paste(reqLibs, collapse = ", ")))
+# Use Luke's function to require/install/load
+lb_myRequiredPackages(reqLibs,"http://cran.rstudio.com/")
+
+####
+ba_tidyNum <- function(number) {
+  # display helper: thousands separated by commas, never scientific notation
+  return(format(number, scientific = FALSE, big.mark = ","))
+}
+
+###
+# not needed - use wday(x, label = TRUE, abbr = FALSE) from lubridate
+# ba_setDayOfWeekLabel <- function(var) {
+#   # changes 0 - 6 to labels
+#   factor(var, labels = c(
+#     "Sun",
+#     "Mon",
+#     "Tue",
+#     "Wed",
+#     "Thu",
+#     "Fri",
+#     "Sat"
+#     )
+#   )
+# }
\ No newline at end of file
diff --git a/tmp-pdfcrop-2623.tex b/tmp-pdfcrop-2623.tex
new file mode 100644
index 0000000000000000000000000000000000000000..bdf1c7c315ae2bb2e956e3c19d0c552ac16078e1
--- /dev/null
+++ b/tmp-pdfcrop-2623.tex
@@ -0,0 +1,130 @@
+\catcode37 14 % percent -> comment character (this preamble sets explicit category codes for the characters used below)
+\catcode33 12 % exclam -> other
+\catcode34 12 % quote -> other
+\catcode35  6 % hash -> macro parameter
+\catcode39 12 % apostrophe -> other
+\catcode40 12 % left parenthesis -> other
+\catcode41 12 % right parenthesis -> other
+\catcode45 12 % minus -> other
+\catcode46 12 % period -> other
+\catcode60 12 % less -> other
+\catcode61 12 % equals -> other
+\catcode62 12 % greater -> other
+\catcode64 12 % at -> other
+\catcode91 12 % left square -> other
+\catcode93 12 % right square -> other
+\catcode96 12 % back tick -> other
+\catcode123 1 % left curly brace -> begin group
+\catcode125 2 % right curly brace -> end group
+\catcode126 12 % tilde -> other
+\catcode`\#=6 % hash -> macro parameter again, this time addressed through the character itself
+\escapechar=92 % character code 92 (backslash) is printed in front of control sequence names
+\def\IfUndefined#1#2#3{% #1 = control sequence name without backslash; runs #2 if it is undefined, otherwise #3
+  \begingroup\expandafter\expandafter\expandafter\endgroup % the test below is expanded inside this group, so the \relax meaning that \csname gives an undefined name is discarded at \endgroup
+  \expandafter\ifx\csname#1\endcsname\relax
+    #2% taken when the name is equivalent to \relax, i.e. was not defined
+  \else
+    #3% taken when the name already has a meaning
+  \fi
+}
+\def\pdffilehex{4D5455532D57362D6164756C742D7375727665792D646174612D70726F63657373696E675F66696C65732F6669677572652D6C617465782F6561744D6F64656C2D322E706466} % hex of: MTUS-W6-adult-survey-data-processing_files/figure-latex/eatModel-2.pdf
+\IfUndefined{pdfunescapehex}{% no \pdfunescapehex primitive: decode the hex string by hand into \pdffile
+  \begingroup
+    \gdef\pdffile{}% start from an empty global result
+    \def\do#1#2{% consumes two hex digits per call
+      \ifx\relax#2\relax % true once #2 is the \relax terminator
+        \ifx\relax#1\relax % both arguments are terminators: clean end of string
+        \else
+          \errmessage{Invalid hex string, should not happen!}% a digit was left without a partner
+        \fi
+      \else
+        \lccode`0="#1#2\relax % make the character 0 lowercase to the byte with hex code #1#2
+        \lowercase{% so that the 0 below becomes that byte
+          \xdef\pdffile{\pdffile0}% append the decoded character globally
+        }%
+        \expandafter\do % close the conditional first, then process the next pair
+      \fi
+    }%
+    \expandafter\do\pdffilehex\relax\relax % two \relax tokens terminate the loop
+  \endgroup
+}{% primitive available: let it do the decoding
+  \edef\pdffile{\pdfunescapehex{\pdffilehex}}%
+}
+% Write the decoded input file name to the log, then select PDF output with full compression.
+% An empty \pdfmapfile is requested via \csname so that a missing primitive is not an error.
+\immediate\write-1{Input file: \pdffile}
+\pdfoutput=1 %
+\pdfcompresslevel=9 %
+\csname pdfmapfile\endcsname{}
+\def\setpdfversion#1{% #1 = PDF minor version number to write
+  \IfUndefined{pdfobjcompresslevel}{% primitive missing: nothing to set
+  }{%
+    \ifnum#1<5 % versions below 5: switch object compression off
+      \pdfobjcompresslevel=0 %
+    \else
+      \pdfobjcompresslevel=2 %
+    \fi
+  }%
+  \IfUndefined{pdfminorversion}{% no \pdfminorversion: try the alternative primitive name
+    \IfUndefined{pdfoptionpdfminorversion}{% neither exists: leave the version alone
+    }{%
+      \pdfoptionpdfminorversion=#1\relax
+    }%
+  }{%
+    \pdfminorversion=#1\relax
+  }%
+}
+\def\page #1 [#2 #3 #4 #5]{% #1 = page number; #2..#5 = box in bp, used below as left, bottom, right, top
+  \count0=#1\relax % record the page number in \count0
+  \setbox0=\hbox{% box 0 holds the imported page
+    \pdfximage page #1 mediabox{\pdffile}% load page #1 of \pdffile using its media box
+    \pdfrefximage\pdflastximage % place the image just loaded
+  }%
+  \pdfhorigin=-#2bp\relax % horizontal origin = minus the left edge
+  \pdfvorigin=#3bp\relax % vertical origin = the bottom edge
+  \pdfpagewidth=#4bp\relax % page width = #4 ...
+  \advance\pdfpagewidth by -#2bp\relax % ... minus #2
+  \pdfpageheight=#5bp\relax % page height = #5 ...
+  \advance\pdfpageheight by -#3bp\relax % ... minus #3
+  \ht0=\pdfpageheight % force the box height to the new page height
+  \shipout\box0\relax % emit box 0 as a page
+}
+\def\pageclip #1 [#2 #3 #4 #5][#6 #7 #8 #9]{% #1 = page number; #2..#5 = box as in \page; #6..#9 = amounts added to the page size around the image
+  \count0=#1\relax
+  \dimen0=#4bp\relax \advance\dimen0 by -#2bp\relax
+  \edef\imagewidth{\the\dimen0}% image width = #4 - #2
+  \dimen0=#5bp\relax \advance\dimen0 by -#3bp\relax
+  \edef\imageheight{\the\dimen0}% image height = #5 - #3
+  \pdfximage page #1 mediabox{\pdffile}%
+  \setbox0=\hbox{%
+    \kern -#2bp\relax % shift left by the left edge
+    \lower #3bp\hbox{\pdfrefximage\pdflastximage}% shift down by the bottom edge
+  }%
+  \wd0=\imagewidth\relax % box dimensions are forced to the image size
+  \ht0=\imageheight\relax
+  \dp0=0pt\relax
+  \pdfhorigin=#6pt\relax % NOTE(review): unit is pt here but bp everywhere else - confirm this is intended
+  \pdfvorigin=#7bp\relax
+  \pdfpagewidth=\imagewidth
+  \advance\pdfpagewidth by #6bp\relax % page width = image width + #6 + #8
+  \advance\pdfpagewidth by #8bp\relax
+  \pdfpageheight=\imageheight\relax
+  \advance\pdfpageheight by #7bp\relax % page height = image height + #7 + #9
+  \advance\pdfpageheight by #9bp\relax
+  \pdfxform0\relax % turn box 0 into a form XObject
+  \shipout\hbox{\pdfrefxform\pdflastxform}% and ship that form out as the page
+}%
+\def\pageinclude#1{% #1 = page number; ships the page out at the size of its image box
+  \pdfhorigin=0pt\relax % no origin offset
+  \pdfvorigin=0pt\relax
+  \pdfximage page #1 mediabox{\pdffile}%
+  \setbox0=\hbox{\pdfrefximage\pdflastximage}%
+  \pdfpagewidth=\wd0\relax % page size is taken from the image box
+  \pdfpageheight=\ht0\relax
+  \advance\pdfpageheight by \dp0\relax % total height = height + depth
+  \shipout\hbox{%
+    \raise\dp0\box0\relax % lift the box by its depth
+  }%
+}
+\setpdfversion{4} % request PDF minor version 4
+\page 1 [1 15 439 309] % ship out page 1 with the box 1 15 439 309 (bp)
+\csname @@end\endcsname % runs \@@end if the format defines it; otherwise \csname makes this a no-op
+\end % end the job when the line above did not