title: "MTUS World 6 Survey Data Processing"
author: "Ben Anderson (b.anderson@soton.ac.uk/@dataknut)"
date: 'Last run at: `r Sys.time()`'
output:
  html_document:
    fig_caption: yes
    keep_md: yes
    number_sections: yes
    self_contained: no
    theme: journal
    toc: yes
  pdf_document:
    toc: yes
    toc_depth: '3'
  word_document:
    toc: yes
    toc_depth: '3'
bibliography: ~/bibliography.bib
# set default echo to FALSE (code not in output)
knitr::opts_chunk$set(echo = FALSE)
knitr::opts_chunk$set(warning = TRUE)
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(fig_caption = TRUE)
knitr::opts_chunk$set(tidy = TRUE)

Introduction

Purpose:

  • To process the MTUS World 6 Survey data

A processed & gzipped .csv file containing data for just the UK is saved.

Data:

Things that are NOT fixed here:

  • the diary day of the week, which is not correct in the 1983/84 survey - this is fixed in the episode data processing

This work was funded by RCUK through the End User Energy Demand Centres Programme via the "DEMAND: Dynamics of Energy, Mobility and Demand" Centre:

Code:

License:

The R code embedded in this document is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

YMMV - http://en.wiktionary.org/wiki/YMMV

Set up R environment

Key packages used:

  • base R - for the basics [@baseR]
  • foreign - for loading SPSS data [@foreign]
  • data.table - for fast (big) data handling [@data.table]
  • knitr - to create this document [@knitr]
  • dplyr & dtplyr - for data manipulation [@dplyr][@dtplyr]
  • car - regression diagnostics [@fox_car]
# Clear out all old objects etc ----
# to avoid confusion
rm(list = ls()) 

# Set time ----
starttime <- Sys.time()

# Load required packages ----
packs <- c("foreign", # loading SPSS/STATA
       "dplyr", # data manipulation
       "data.table", # fast data manipulation
       "dtplyr", # data table & dplyr code
       "car", # regression diagnostics
       "knitr" # for kable
       )

# to install any that are missing, uncomment & run:
# install.packages(packs)
print("Loading required packages:")
print(packs)

lapply(packs, require, character.only = TRUE)

# Set paths ----
mtusPath <- "~/Data/MTUS/World_6/" # location of MTUS SPSS file
mtusProcPath <- "~/Data/MTUS/World_6/processed/" # where to put the processed .csv file(s)

# Set file names ----
surveyFile <- "MTUS-adult-aggregate.sav"

sfile <- paste0(mtusPath, surveyFile)

Load original survey data

Loading `r sfile`.

MTUSW6survey_DT <- as.data.table(read.spss(sfile)) # load from SPSS

setkey(MTUSW6survey_DT, countrya, survey, swave, msamp, hldid, persid, id)

We have loaded `r format(nrow(MTUSW6survey_DT), big.mark=",", scientific=FALSE)` rows of data for `r format(uniqueN(MTUSW6survey_DT$countrya), big.mark=",", scientific=FALSE)` countries.

kable(caption = "Number of diaries per year",
      table(MTUSW6survey_DT$survey, droplevels(MTUSW6survey_DT$countrya) # removes unused countries
            )
      )
MTUSW6UKsurvey_DT <- subset(MTUSW6survey_DT, countrya == "United Kingdom")

We now delete the non-UK data, leaving us with `r format(nrow(MTUSW6UKsurvey_DT), big.mark=",", scientific=FALSE)` rows of survey data.

Process UK survey data

print("-> Create uniq id for diaries (for matching) and persons")

# Create unique ids ----
# diarypid
MTUSW6UKsurvey_DT$ba_diarypid <- group_indices(MTUSW6UKsurvey_DT, 
                survey,
                swave,
                msamp,
                hldid,
                persid,
                id
       )

# pid
MTUSW6UKsurvey_DT$ba_pid <- group_indices(MTUSW6UKsurvey_DT, survey,
                swave,
                msamp,
                hldid,
                persid
       )
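# check: each row of the aggregate file should be a single diary day, so ba_diarypid
# should be unique per row while ba_pid repeats across a person's diary days
# uniqueN(MTUSW6UKsurvey_DT$ba_diarypid); uniqueN(MTUSW6UKsurvey_DT$ba_pid)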

# create a reduced survey table with the few variables we need so that joins
# do not break memory

# Rename original day/month/year as we think these may be suspect in some years
print("-> Renaming day/month/year variables")
setnames(MTUSW6UKsurvey_DT, "day", "mtus_day") # "Original MTUS day - incorrect for 1983/4"
setnames(MTUSW6UKsurvey_DT, "month", "mtus_month")
setnames(MTUSW6UKsurvey_DT, "year", "mtus_year")

# HH size variable ----
print("-> Creating new HH size variable")
MTUSW6UKsurvey_DT$ba_npeople[MTUSW6UKsurvey_DT$hhldsize == 1] <- 1
MTUSW6UKsurvey_DT$ba_npeople[MTUSW6UKsurvey_DT$hhldsize == 2] <- 2
MTUSW6UKsurvey_DT$ba_npeople[MTUSW6UKsurvey_DT$hhldsize == 3] <- 3
MTUSW6UKsurvey_DT$ba_npeople[MTUSW6UKsurvey_DT$hhldsize == 4] <- 4
MTUSW6UKsurvey_DT$ba_npeople[MTUSW6UKsurvey_DT$hhldsize > 4] <- "5+"

# check
#table(MTUSW6UKsurvey_DT$hhldsize, MTUSW6UKsurvey_DT$ba_npeople, useNA = "always")

# Set up n child & n people variables ----
print("-> Creating new n children/n people variables")
MTUSW6UKsurvey_DT$ba_nchild[MTUSW6UKsurvey_DT$nchild == 0] <- 0
MTUSW6UKsurvey_DT$ba_nchild[MTUSW6UKsurvey_DT$nchild == 1] <- 1
MTUSW6UKsurvey_DT$ba_nchild[MTUSW6UKsurvey_DT$nchild == 2] <- 2
MTUSW6UKsurvey_DT$ba_nchild[MTUSW6UKsurvey_DT$nchild > 2] <- "3+"

# check
#table(MTUSW6UKsurvey_DT$nchild, MTUSW6UKsurvey_DT$ba_nchild, useNA = "always")
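# A sketch of an equivalent recode using cut() (commented out so the recodes above stand);
# the same pattern would work for ba_npeople with breaks c(-Inf, 1, 2, 3, 4, Inf)
# MTUSW6UKsurvey_DT$ba_nchild <- cut(MTUSW6UKsurvey_DT$nchild,
#                                    breaks = c(-Inf, 0, 1, 2, Inf),
#                                    labels = c("0", "1", "2", "3+"))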

# Age categories ----
print("-> Creating new age group variable from:")
MTUSW6UKsurvey_DT[,
                  .(
                    min = min(age),
                    max = max(age)
                  ),
                  by = survey
                  ]
print("# NB: some are missing in 1974 & 1995 (min = -8):")
t <- MTUSW6UKsurvey_DT[age < 0]
table(t$age,t$survey)
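# NB: the negative ages (-8) are missing-value codes; cut() below will turn them into NA
# anyway, but they could be recoded explicitly first, e.g.:
# MTUSW6UKsurvey_DT$age[MTUSW6UKsurvey_DT$age < 0] <- NA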

# create age group
MTUSW6UKsurvey_DT$ba_age_r <- cut(MTUSW6UKsurvey_DT$age, 
	                                      c(16,25,35,45,55,65,75,85,105)
	                                      )
# check
#table(MTUSW6UKsurveyCore_DT$age, MTUSW6UKsurveyCore_DT$ba_age_r, useNA = "always")

# Age cohorts ----
# NB - max age = 80 so older cohorts missing from 2005
print("-> Creating new age cohort  variable")
MTUSW6UKsurvey_DT[, ba_birthyear:= mtus_year - age]

MTUSW6UKsurvey_DT$ba_age_cohort <- cut(MTUSW6UKsurvey_DT$ba_birthyear, 
	                                      c(1890,1900,1910,1920,1930,1940,1950,1960,1970,1980,1990,2005)
	                                      )
# check
# table(MTUSW6UKsurveyCore_DT$ba_age_cohort, MTUSW6UKsurveyCore_DT$ba_age_r, useNA = "always")
# XXX needs re-labelling!
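# One possible (sketch) re-labelling via cut()'s labels argument - the approximate decade
# labels are assumptions, so this is commented out until checked:
# MTUSW6UKsurvey_DT$ba_age_cohort <- cut(MTUSW6UKsurvey_DT$ba_birthyear,
#                                        c(1890,1900,1910,1920,1930,1940,1950,1960,1970,1980,1990,2005),
#                                        labels = c("1890s","1900s","1910s","1920s","1930s","1940s",
#                                                   "1950s","1960s","1970s","1980s","1990-2005"))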

Pooling 1983 & 1987

We now look to pool the 1983 & 1987 data to form a full-year '1985' sample. This is explicitly recommended in the MTUS Guidebook, p13 (XX url XX).

Before we do this, we test for significant differences on core time-use dimensions of interest. We do not test for differences in timing - although we could.

# Check distributions for 1st diary day
subset1983_1987_DT <- MTUSW6UKsurvey_DT[(survey == 1983 | survey == 1987) & diary == "1st diary day"]
kable(caption = "Months data collected",
  table(subset1983_1987_DT$mtus_month, subset1983_1987_DT$survey)
)
# essentially 1983 = autumn/winter, 1987 = spring/summer
kable(caption = "Days data collected (day may be incorrect)",
  table(subset1983_1987_DT$mtus_day, subset1983_1987_DT$survey)
)

kable(caption = "Mean minutes per day by 1983/87 survey - 1st diary day",
      subset1983_1987_DT[,
           .(
             sleep = mean(main2),
             wash_dress = mean(main4),
             eating = mean(main5 + main6),
             paid_work = mean(main7 + main8),
             cooking = mean(main18),
             laundry = mean(main21),
             pub_etc = mean(main39)
           ),
           by = survey
           ]
)

XX Provide t tests? Need to log transform first. Also should really use weighted survey analysis XX
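As a sketch of the kind of weighted test flagged above, a design-weighted two-sample comparison could be run with the survey package (not loaded in the setup chunk); log1p is just one candidate transformation and the design object name (des) is arbitrary:

# design-weighted t-test of (log-transformed) sleep minutes between the two surveys
library(survey)
des <- svydesign(ids = ~1, weights = ~propwt, data = subset1983_1987_DT)
svyttest(log1p(main2) ~ factor(survey), des)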

On the basis of this analysis, and the detailed regression results in the Annex, we conclude that we are justified in pooling 1983/87.

print("-> Creating new ba_survey variable to pool 1983/7")
  MTUSW6UKsurvey_DT$ba_survey <- ifelse(
    MTUSW6UKsurvey_DT$survey == 1974, 
    1974, # if true
    NA # if not
  )
  
  MTUSW6UKsurvey_DT$ba_survey <- ifelse(
    MTUSW6UKsurvey_DT$survey == 1983 | 
      MTUSW6UKsurvey_DT$survey == 1987 , 
    1985, # if true
    MTUSW6UKsurvey_DT$ba_survey # if not
  )
  
  MTUSW6UKsurvey_DT$ba_survey <- ifelse(
    MTUSW6UKsurvey_DT$survey == 1995 , 
    1995, # if true
    MTUSW6UKsurvey_DT$ba_survey # if not
  )
  
  MTUSW6UKsurvey_DT$ba_survey <- ifelse(
    MTUSW6UKsurvey_DT$survey == 2000 , 
    2000, # if true
    MTUSW6UKsurvey_DT$ba_survey # if not
  )
  
  MTUSW6UKsurvey_DT$ba_survey <- ifelse(
    MTUSW6UKsurvey_DT$survey == 2005 , 
    2005, # if true
    MTUSW6UKsurvey_DT$ba_survey # if not
  )
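# A more compact (sketch) equivalent, assuming survey is numeric or coercible to numeric:
# MTUSW6UKsurvey_DT[, ba_survey := ifelse(survey %in% c(1983, 1987), 1985,
#                                         ifelse(survey %in% c(1974, 1995, 2000, 2005),
#                                                as.numeric(as.character(survey)), NA))]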

Save out processed file

# Keep the survey vars we need ----
print("-> Keeping core survey variables")
MTUSW6UKsurveyCore_DT <- MTUSW6UKsurvey_DT[, .(countrya, survey, swave, msamp, hldid, persid, id, diary,
                                               mtus_day, mtus_month, mtus_year,
                                               empstat, occup, urban, 
                                               badcase, sex, hhtype, income, propwt,
                                               ba_survey, ba_diarypid, ba_pid, 
                                               age, ba_age_cohort, ba_age_r, ba_nchild, ba_npeople) # important to keep age to enable labour market filtering etc
                                           ]

  # Save out the core file for later use (saves re-running) ----
  print("-> Save out core survey file")
  coreSurvey_DT <- paste0(mtusProcPath, "MTUSW6UKsurveyCore_DT.csv")
  print(paste0("--> Saving processed file in: ", coreSurvey_DT))
  write.csv(MTUSW6UKsurveyCore_DT, 
            file = coreSurvey_DT, row.names = FALSE
  )
  
  dir <- getwd()
  setwd(mtusProcPath)
  print("--> Now gzip the file")
  system("gzip -f MTUSW6UKsurveyCore_DT.csv &") # gzip & force over-write, shame can't do this directly as part of write
  setwd(dir) # set back to working directory otherwise R will save .RData in an odd place
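  # NB: an alternative (sketch) is to write the gzipped .csv directly via a gzfile()
  # connection, avoiding the system() call, e.g.:
  # gzOut <- gzfile(paste0(mtusProcPath, "MTUSW6UKsurveyCore_DT.csv.gz"), "w")
  # write.csv(MTUSW6UKsurveyCore_DT, file = gzOut, row.names = FALSE)
  # close(gzOut)
  # the file can later be read back with read.csv(gzfile(...))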
  

Survey variables retained in the processed file:

`r kable(caption = "Survey variables retained", align = "l", names(MTUSW6UKsurveyCore_DT))`

Descriptive analysis

Check countries in this dataset

kable(
  table(MTUSW6UKsurveyCore_DT$countrya,
        MTUSW6UKsurveyCore_DT$survey,
        useNA = "always"
        )
)

Number of diary days completed

kable(caption = "Number of diary days by survey",
  table(MTUSW6UKsurveyCore_DT$diary,
        MTUSW6UKsurveyCore_DT$survey,
        useNA = "always"
        )
)

As we can see, 1974-1987 were full-week diaries, 2000 was a two-day diary and 1995/2005 were one-day diaries.

From this point on in this section we use only unique individual records. Note that results do not necessarily match the number of cases recorded in the MTUS user guide as the user guide includes all cases (i.e. both adults and children).

setkey(MTUSW6UKsurveyCore_DT, ba_pid)
gMTUSW6UKsurveyCoreUniq_DT <- unique(MTUSW6UKsurveyCore_DT, by = "ba_pid") # one row per person
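# check: expect exactly one row per person
# stopifnot(uniqueN(gMTUSW6UKsurveyCoreUniq_DT$ba_pid) == nrow(gMTUSW6UKsurveyCoreUniq_DT))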

Urban/rural

kable(caption = "Urban/rural distributions by survey",
  table(gMTUSW6UKsurveyCoreUniq_DT$urban, 
        gMTUSW6UKsurveyCoreUniq_DT$survey,
        useNA = "always")
)

Occupation

kable(caption = "Occupation distributions by survey",
  table(gMTUSW6UKsurveyCoreUniq_DT$occup,
        gMTUSW6UKsurveyCoreUniq_DT$survey,
        useNA = "always"
        )
)

Age group

kable(caption = "Age distributions by survey",
  table(gMTUSW6UKsurveyCoreUniq_DT$ba_age_r,
        gMTUSW6UKsurveyCoreUniq_DT$survey,
        useNA = "always"
        )
)

Age Cohort

kable(caption = "Age cohort distributions by survey",
  table(gMTUSW6UKsurveyCoreUniq_DT$ba_age_cohort,
        gMTUSW6UKsurveyCoreUniq_DT$survey,
        useNA = "always"
        )
)

Might need some factor labels!

Pooled survey year

kable(caption = "Survey year & pooled survey year (ba_survey)",
  table(gMTUSW6UKsurveyCoreUniq_DT$ba_survey,
        gMTUSW6UKsurveyCoreUniq_DT$survey,
        useNA = "always"
        )
)

Annex

Analysis in support of 1983/87 pooling

There do not appear to be large differences, but we will test whether survey year significantly predicts minutes per day in these activities, given other characteristics (which may themselves have varied between the two samples).

The following analyses use a subset of the main data which only contains 1983 & 1987 data.

kable(caption="Cases for 1983/87 pooling testing",
      table(subset1983_1987_DT$survey,
            subset1983_1987_DT$sex,
            useNA = "always"
      )
      )

subset1983_1987_DT[, survey:= as.factor(survey)]

# Transformations based on spread-level plots of the original, untransformed data
# NB: on the LHS of the formula, main2*main2 is evaluated arithmetically, i.e. sleep minutes squared
sleep <- lm((main2*main2) ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

wash_dress <- lm(sqrt(main4) ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

subset1983_1987_DT$eat <- subset1983_1987_DT$main5 + subset1983_1987_DT$main6
eat <- lm(eat ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

subset1983_1987_DT$paid_work <- subset1983_1987_DT$main7 + subset1983_1987_DT$main8
paid_work <- lm(paid_work ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

cook <- lm(main18 ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

laundry <- lm(main21 ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

pub_etc <- lm(main39 ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

We report the model results below. In each case we also report basic regression diagnostics - in most cases these will indicate heteroskedasticity, as is normal with time-diary data, which is usually highly skewed.

First sleep:

summary(sleep)
qqnorm(sleep$residuals); qqline(sleep$residuals, col = 2)
spreadLevelPlot(sleep)
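# a formal check (sketch) could also be added for each model using car's score test
# for non-constant error variance, e.g.:
# ncvTest(sleep)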

Washing/dressing:

summary(wash_dress)
qqnorm(wash_dress$residuals); qqline(wash_dress$residuals, col = 2)
spreadLevelPlot(wash_dress)

Eating:

summary(eat)
qqnorm(eat$residuals); qqline(eat$residuals, col = 2)
spreadLevelPlot(eat)

Cooking:

summary(cook)
qqnorm(cook$residuals); qqline(cook$residuals, col = 2)
spreadLevelPlot(cook)

Paid work:

summary(paid_work)
qqnorm(paid_work$residuals); qqline(paid_work$residuals, col = 2)
spreadLevelPlot(paid_work)

Laundry:

summary(laundry)
qqnorm(laundry$residuals); qqline(laundry$residuals, col = 2)
spreadLevelPlot(laundry)

Pub etc:

summary(pub_etc)
qqnorm(pub_etc$residuals); qqline(pub_etc$residuals, col = 2)
#spreadLevelPlot(pub_etc)

On the basis of these results we seem justified in pooling 1983 & 1987.


Meta: Analysis completed in: `r round(difftime(Sys.time(), starttime, units = "secs"), 3)` seconds using knitr in RStudio.

References