title: "MTUS World 6 Survey Data Processing"
author: "Ben Anderson (b.anderson@soton.ac.uk/@dataknut)"
date: 'Last run at: `r Sys.time()`'
output:
  html_document:
    fig_caption: yes
    keep_md: yes
    number_sections: yes
    self_contained: no
    theme: journal
    toc: yes
  pdf_document:
    toc: yes
    toc_depth: '3'
  word_document:
    toc: yes
    toc_depth: '3'
bibliography: ~/bibliography.bib
# set default echo to FALSE (code not in output)
knitr::opts_chunk$set(echo = FALSE)
knitr::opts_chunk$set(warning = TRUE)
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(fig_caption = TRUE)
knitr::opts_chunk$set(tidy = TRUE)

Introduction

Purpose:

  • To process the MTUS World 6 Survey data

A processed & gzipped .csv file containing data for just the UK is saved.

Data:

Things that are NOT fixed here:

  • the diary day of the week, which is not correct in the 1983/84 survey - this is fixed in the episode data processing

This work was funded by RCUK through the End User Energy Demand Centres Programme via the "DEMAND: Dynamics of Energy, Mobility and Demand" Centre:

Code:

License:

The R code embedded in this document is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

YMMV - http://en.wiktionary.org/wiki/YMMV

Set up R environment

Key packages used:

  • base R - for the basics [@baseR]
  • foreign - for loading SPSS data [@foreign]
  • data.table - for fast (big) data handling [@data.table]
  • knitr - to create this document [@knitr]
  • dplyr & dtplyr - for data manipulation [@dplyr][@dtplyr]
  • car - regression diagnostics [@fox_car]
# Clear out all old objects etc ----
# to avoid confusion
rm(list = ls()) 

# Set time ----
starttime <- Sys.time()

# Load required packages ----
packs <- c("foreign", # loading SPSS/STATA
       "dplyr", # data manipulation
       "data.table", # fast data manipulation
       "dtplyr", # data table & dplyr code
       "car", # regression diagnostics
       "knitr" # for kable
       )

# to install any that are missing, uncomment & run:
# install.packages(packs)
print("Loading required packages:")
print(packs)

lapply(packs, require, character.only = TRUE)

# Set paths ----
mtusPath <- "~/Data/MTUS/World_6/" # location of MTUS SPSS file
mtusProcPath <- "~/Data/MTUS/World_6/processed/" # where to put the processed .csv file(s)

# Set file names ----
surveyFile <- "MTUS-adult-aggregate.sav"

sfile <- paste0(mtusPath, surveyFile)

Load original survey data

Loading `r sfile`.

MTUSW6survey_DT <- as.data.table(read.spss(sfile)) # load from SPSS

setkey(MTUSW6survey_DT, countrya, survey, swave, msamp, hldid, persid, id)

We have loaded `r format(nrow(MTUSW6survey_DT), big.mark=",", scientific=FALSE)` rows of data for `r format(uniqueN(MTUSW6survey_DT$countrya), big.mark=",", scientific=FALSE)` countries.

kable(caption = "Number of diaries per year",
      table(MTUSW6survey_DT$survey, droplevels(MTUSW6survey_DT$countrya) # removes unused countries
            )
      )
MTUSW6UKsurvey_DT <- subset(MTUSW6survey_DT, countrya == "United Kingdom")

We now delete the non-UK data, leaving us with `r format(nrow(MTUSW6UKsurvey_DT), big.mark=",", scientific=FALSE)` rows of survey data.

Process UK survey data

print("-> Create uniq id for diaries (for matching) and persons")

# Create unique ids ----
# diarypid
MTUSW6UKsurvey_DT$ba_diarypid <- group_indices(MTUSW6UKsurvey_DT, 
                survey,
                swave,
                msamp,
                hldid,
                persid,
                id
       )

# pid
MTUSW6UKsurvey_DT$ba_pid <- group_indices(MTUSW6UKsurvey_DT, survey,
                swave,
                msamp,
                hldid,
                persid
       )
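# check: each row of the aggregate file should be a single diary day, so ba_diarypid
# should be unique per row while ba_pid repeats across a person's diary days
# uniqueN(MTUSW6UKsurvey_DT$ba_diarypid); uniqueN(MTUSW6UKsurvey_DT$ba_pid)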

# create a reduced survey table with the few variables we need so that joins
# do not break memory

# Rename original day/month/year as we think these may be suspect in some years
print("-> Renaming day/month/year variables")
setnames(MTUSW6UKsurvey_DT, "day", "mtus_day") # "Original MTUS day - incorrect for 1983/4"
setnames(MTUSW6UKsurvey_DT, "month", "mtus_month")
setnames(MTUSW6UKsurvey_DT, "year", "mtus_year")

# HH size variable ----
print("-> Creating new HH size variable")
MTUSW6UKsurvey_DT$ba_npeople[MTUSW6UKsurvey_DT$hhldsize == 1] <- 1
MTUSW6UKsurvey_DT$ba_npeople[MTUSW6UKsurvey_DT$hhldsize == 2] <- 2
MTUSW6UKsurvey_DT$ba_npeople[MTUSW6UKsurvey_DT$hhldsize == 3] <- 3
MTUSW6UKsurvey_DT$ba_npeople[MTUSW6UKsurvey_DT$hhldsize == 4] <- 4
MTUSW6UKsurvey_DT$ba_npeople[MTUSW6UKsurvey_DT$hhldsize > 4] <- "5+"

# check
#table(MTUSW6UKsurvey_DT$hhldsize, MTUSW6UKsurvey_DT$ba_npeople, useNA = "always")

# Set up n child & n people variables ----
print("-> Creating new n children/n people variables")
MTUSW6UKsurvey_DT$ba_nchild[MTUSW6UKsurvey_DT$nchild == 0] <- 0
MTUSW6UKsurvey_DT$ba_nchild[MTUSW6UKsurvey_DT$nchild == 1] <- 1
MTUSW6UKsurvey_DT$ba_nchild[MTUSW6UKsurvey_DT$nchild == 2] <- 2
MTUSW6UKsurvey_DT$ba_nchild[MTUSW6UKsurvey_DT$nchild > 2] <- "3+"

# check
#table(MTUSW6UKsurvey_DT$nchild, MTUSW6UKsurvey_DT$ba_nchild, useNA = "always")
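# A sketch of an equivalent recode using cut() (commented out so the recodes above stand);
# the same pattern would work for ba_npeople with breaks c(-Inf, 1, 2, 3, 4, Inf)
# MTUSW6UKsurvey_DT$ba_nchild <- cut(MTUSW6UKsurvey_DT$nchild,
#                                    breaks = c(-Inf, 0, 1, 2, Inf),
#                                    labels = c("0", "1", "2", "3+"))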

# Age categories ----
print("-> Creating new age group variable from:")
MTUSW6UKsurvey_DT[,
                  .(
                    min = min(age),
                    max = max(age)
                  ),
                  by = survey
                  ]
print("# NB: some are missing in 1974 & 1995 (min = -8):")
t <- MTUSW6UKsurvey_DT[age < 0]
table(t$age,t$survey)
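# NB: the negative ages (-8) are missing-value codes; cut() below will turn them into NA
# anyway, but they could be recoded explicitly first, e.g.:
# MTUSW6UKsurvey_DT$age[MTUSW6UKsurvey_DT$age < 0] <- NA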

# create age group
MTUSW6UKsurvey_DT$ba_age_r <- cut(MTUSW6UKsurvey_DT$age, 
	                                      c(16,25,35,45,55,65,75,85,105)
	                                      )
# check
#table(MTUSW6UKsurveyCore_DT$age, MTUSW6UKsurveyCore_DT$ba_age_r, useNA = "always")

# Age cohorts ----
# NB - max age = 80 so older cohorts missing from 2005
print("-> Creating new age cohort  variable")
MTUSW6UKsurvey_DT[, ba_birthyear:= mtus_year - age]

MTUSW6UKsurvey_DT$ba_age_cohort <- cut(MTUSW6UKsurvey_DT$ba_birthyear, 
	                                      c(1890,1900,1910,1920,1930,1940,1950,1960,1970,1980,1990,2005)
	                                      )
# check
# table(MTUSW6UKsurveyCore_DT$ba_age_cohort, MTUSW6UKsurveyCore_DT$ba_age_r, useNA = "always")
# XXX needs re-labelling!
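# One possible (sketch) re-labelling via cut()'s labels argument - the approximate decade
# labels are assumptions, so this is commented out until checked:
# MTUSW6UKsurvey_DT$ba_age_cohort <- cut(MTUSW6UKsurvey_DT$ba_birthyear,
#                                        c(1890,1900,1910,1920,1930,1940,1950,1960,1970,1980,1990,2005),
#                                        labels = c("1890s","1900s","1910s","1920s","1930s","1940s",
#                                                   "1950s","1960s","1970s","1980s","1990-2005"))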

Pooling 1983 & 1987

We now look to pool the 1983 & 1987 data to form a full-year '1985' sample. This is explicitly recommended in the MTUS Guidebook, p13 (XX url XX).

Before we do this, we test for significant differences on core time-use dimensions of interest. We do not test for differences in timing - although we could.

# Check distributions for 1st diary day
subset1983_1987_DT <- MTUSW6UKsurvey_DT[(survey == 1983 | survey == 1987) & diary == "1st diary day"]
kable(caption = "Months data collected",
  table(subset1983_1987_DT$mtus_month, subset1983_1987_DT$survey)
)
# essentially 1983 = autumn/winter, 1987 = spring/summer
kable(caption = "Days data collected (day may be incorrect)",
  table(subset1983_1987_DT$mtus_day, subset1983_1987_DT$survey)
)

kable(caption = "Mean minutes per day by 1983/87 survey - 1st diary day",
      subset1983_1987_DT[,
           .(
             sleep = mean(main2),
             wash_dress = mean(main4),
             eating = mean(main5 + main6),
             paid_work = mean(main7 + main8),
             cooking = mean(main18),
             laundry = mean(main21),
             pub_etc = mean(main39)
           ),
           by = survey
           ]
)

XX Provide t tests? Need to log transform first. Also should really use weighted survey analysis XX
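As a sketch of the kind of weighted test flagged above, a design-weighted two-sample comparison could be run with the survey package (not loaded in the setup chunk); log1p is just one candidate transformation and the design object name (des) is arbitrary:

# design-weighted t-test of (log-transformed) sleep minutes between the two surveys
library(survey)
des <- svydesign(ids = ~1, weights = ~propwt, data = subset1983_1987_DT)
svyttest(log1p(main2) ~ factor(survey), des)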

On the basis of this analysis, and the detailed regression results in the Annex, we conclude that we are justified in pooling 1983/87.

print("-> Creating new ba_survey variable to pool 1983/7")
  MTUSW6UKsurvey_DT$ba_survey <- ifelse(
    MTUSW6UKsurvey_DT$survey == 1974, 
    1974, # if true
    NA # if not
  )
  
  MTUSW6UKsurvey_DT$ba_survey <- ifelse(
    MTUSW6UKsurvey_DT$survey == 1983 | 
      MTUSW6UKsurvey_DT$survey == 1987 , 
    1985, # if true
    MTUSW6UKsurvey_DT$ba_survey # if not
  )
  
  MTUSW6UKsurvey_DT$ba_survey <- ifelse(
    MTUSW6UKsurvey_DT$survey == 1995 , 
    1995, # if true
    MTUSW6UKsurvey_DT$ba_survey # if not
  )
  
  MTUSW6UKsurvey_DT$ba_survey <- ifelse(
    MTUSW6UKsurvey_DT$survey == 2000 , 
    2000, # if true
    MTUSW6UKsurvey_DT$ba_survey # if not
  )
  
  MTUSW6UKsurvey_DT$ba_survey <- ifelse(
    MTUSW6UKsurvey_DT$survey == 2005 , 
    2005, # if true
    MTUSW6UKsurvey_DT$ba_survey # if not
  )
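# A more compact (sketch) equivalent, assuming survey is numeric or coercible to numeric:
# MTUSW6UKsurvey_DT[, ba_survey := ifelse(survey %in% c(1983, 1987), 1985,
#                                         ifelse(survey %in% c(1974, 1995, 2000, 2005),
#                                                as.numeric(as.character(survey)), NA))]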

Save out processed file

# Keep the survey vars we need ----
print("-> Keeping core survey variables")
MTUSW6UKsurveyCore_DT <- MTUSW6UKsurvey_DT[, .(countrya, survey, swave, msamp, hldid, persid, id, diary,
                                               mtus_day, mtus_month, mtus_year,
                                               empstat, occup, urban, 
                                               badcase, sex, hhtype, income, propwt,
                                               ba_survey, ba_diarypid, ba_pid, 
                                               age, ba_age_cohort, ba_age_r, ba_nchild, ba_npeople) # important to keep age to enable labour market filtering etc
                                           ]

  # Save out the core file for later use (saves re-running) ----
  print("-> Save out core survey file")
  coreSurvey_DT <- paste0(mtusProcPath, "MTUSW6UKsurveyCore_DT.csv")
  print(paste0("--> Saving processed file in: ", coreSurvey_DT))
  write.csv(MTUSW6UKsurveyCore_DT, 
            file = coreSurvey_DT, row.names = FALSE
  )
  
  dir <- getwd()
  setwd(mtusProcPath)
  print("--> Now gzip the file")
  system("gzip -f MTUSW6UKsurveyCore_DT.csv &") # gzip & force over-write, shame can't do this directly as part of write
  setwd(dir) # set back to working directory otherwise R will save .RData in an odd place
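  # NB: an alternative (sketch) is to write the gzipped .csv directly via a gzfile()
  # connection, avoiding the system() call, e.g.:
  # gzOut <- gzfile(paste0(mtusProcPath, "MTUSW6UKsurveyCore_DT.csv.gz"), "w")
  # write.csv(MTUSW6UKsurveyCore_DT, file = gzOut, row.names = FALSE)
  # close(gzOut)
  # the file can later be read back with read.csv(gzfile(...))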
  

Survey variables retained in the processed file:

`r kable(caption = "Survey variables retained", align = "l", names(MTUSW6UKsurveyCore_DT))`

Descriptive analysis

Check countries in this dataset

kable(
  table(MTUSW6UKsurveyCore_DT$countrya,
        MTUSW6UKsurveyCore_DT$survey,
        useNA = "always"
        )
)

Number of diary days completed

kable(caption = "Number of diary days by survey",
  table(MTUSW6UKsurveyCore_DT$diary,
        MTUSW6UKsurveyCore_DT$survey,
        useNA = "always"
        )
)

As we can see, 1974-1987 were full-week diaries, 2000 was a two-day diary and 1995/2005 were one-day diaries.

From this point on in this section we use only unique individual records. Note that results do not necessarily match the number of cases recorded in the MTUS user guide as the user guide includes all cases (i.e. both adults and children).

setkey(MTUSW6UKsurveyCore_DT, ba_pid)
gMTUSW6UKsurveyCoreUniq_DT <- unique(MTUSW6UKsurveyCore_DT, by = "ba_pid") # one row per person
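# check: expect exactly one row per person
# stopifnot(uniqueN(gMTUSW6UKsurveyCoreUniq_DT$ba_pid) == nrow(gMTUSW6UKsurveyCoreUniq_DT))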

Urban/rural

kable(caption = "Urban/rural distributions by survey",
  table(gMTUSW6UKsurveyCoreUniq_DT$urban, 
        gMTUSW6UKsurveyCoreUniq_DT$survey,
        useNA = "always")
)

Occupation

kable(caption = "Occupation distributions by survey",
  table(gMTUSW6UKsurveyCoreUniq_DT$occup,
        gMTUSW6UKsurveyCoreUniq_DT$survey,
        useNA = "always"
        )
)

Age group

kable(caption = "Age distributions by survey",
  table(gMTUSW6UKsurveyCoreUniq_DT$ba_age_r,
        gMTUSW6UKsurveyCoreUniq_DT$survey,
        useNA = "always"
        )
)

Age Cohort

kable(caption = "Age cohort distributions by survey",
  table(gMTUSW6UKsurveyCoreUniq_DT$ba_age_cohort,
        gMTUSW6UKsurveyCoreUniq_DT$survey,
        useNA = "always"
        )
)

Might need some factor labels!

Pooled survey year

kable(caption = "Survey year & pooled survey year (ba_survey)",
  table(gMTUSW6UKsurveyCoreUniq_DT$ba_survey,
        gMTUSW6UKsurveyCoreUniq_DT$survey,
        useNA = "always"
        )
)

Annex

Analysis in support of 1983/87 pooling

There do not appear to be large differences, but we will test whether survey year significantly predicts minutes per day in these activities, given other characteristics (which may themselves have varied between the two samples).

The following analyses use a subset of the main data which only contains 1983 & 1987 data.

kable(caption="Cases for 1983/87 pooling testing",
      table(subset1983_1987_DT$survey,
            subset1983_1987_DT$sex,
            useNA = "always"
      )
      )

subset1983_1987_DT[, survey:= as.factor(survey)]

# Transformations based on spread-level plots of the original, untransformed data
# NB: on the LHS of the formula, main2*main2 is evaluated arithmetically, i.e. sleep minutes squared
sleep <- lm((main2*main2) ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

wash_dress <- lm(sqrt(main4) ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

subset1983_1987_DT$eat <- subset1983_1987_DT$main5 + subset1983_1987_DT$main6
eat <- lm(eat ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

subset1983_1987_DT$paid_work <- subset1983_1987_DT$main7 + subset1983_1987_DT$main8
paid_work <- lm(paid_work ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

cook <- lm(main18 ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

laundry <- lm(main21 ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

pub_etc <- lm(main39 ~ survey + mtus_month + ba_age_r + ba_nchild + hhtype, data = subset1983_1987_DT)

We report the model results below. In each case we also report basic regression diagnostics - in most cases these will indicate heteroskedasticity, as is normal with time-diary data, which is usually highly skewed.

First sleep:

summary(sleep)
qqnorm(sleep$residuals); qqline(sleep$residuals, col = 2)
spreadLevelPlot(sleep)
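# a formal check (sketch) could also be added for each model using car's score test
# for non-constant error variance, e.g.:
# ncvTest(sleep)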

Washing/dressing:

summary(wash_dress)
qqnorm(wash_dress$residuals); qqline(wash_dress$residuals, col = 2)
spreadLevelPlot(wash_dress)

Eating:

summary(eat)
qqnorm(eat$residuals); qqline(eat$residuals, col = 2)
spreadLevelPlot(eat)

Cooking:

summary(cook)
qqnorm(cook$residuals); qqline(cook$residuals, col = 2)
spreadLevelPlot(cook)

Paid work:

summary(paid_work)
qqnorm(paid_work$residuals); qqline(paid_work$residuals, col = 2)
spreadLevelPlot(paid_work)

Laundry:

summary(laundry)
qqnorm(laundry$residuals); qqline(laundry$residuals, col = 2)
spreadLevelPlot(laundry)

Pub etc:

summary(pub_etc)
qqnorm(pub_etc$residuals); qqline(pub_etc$residuals, col = 2)
#spreadLevelPlot(pub_etc)

On the basis of these results we seem justified in pooling 1983 & 1987.


Meta: Analysis completed in: `r round(difftime(Sys.time(), starttime, units = "secs"), 3)` seconds using knitr in RStudio.

References