diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000000000000000000000000000000000000..b1b8c3dcb44af9a237b2aadd2204fea0123d294d --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@ +^saveData\.Rproj$ +^\.Rproj\.user$ diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000000000000000000000000000000000000..4e1f26defe2cd34c5eabaff2fe3da6f33c0ccc54 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,11 @@ +Package: saveData +Title: Package to support the re-use of the SAVE household demand response trial data +Version: 0.1 +Authors@R: c(person("Ben", "Anderson", email = "b.anderson@soton.ac.uk", role = c("aut", "cre")), + person("Michael", "Harper", email = "m.harper@soton.ac.uk", role = "aut"), + person("Tom", "Rushby", email = "twrushby@soton.ac.uk", role = "aut")) +Description: A range of scripts for data cleaning and processing. + This work was funded by the Low Carbon Network Fund (LCNF) Tier 2 Programme project "Solent Achieving Value from Efficiency (SAVE)" http://www.energy.soton.ac.uk/tag/save/. +License: file LICENCE +Encoding: UTF-8 +LazyData: true diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000000000000000000000000000000000000..884a6312ae55caf2d5bac006d43fab89d8545080 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,2 @@ +# Generated by roxygen2: fake comment so roxygen2 overwrites silently. +exportPattern("^[^\\.]") diff --git a/R/surveyData.R b/R/surveyData.R new file mode 100644 index 0000000000000000000000000000000000000000..57542797d248cccdbe198f6a48e46f1be683b2cf --- /dev/null +++ b/R/surveyData.R @@ -0,0 +1,1183 @@ +### ---- Functions used for survey data processing ---- #### + +# Helper/utility functions are found in bmgDataUtils.R + +#' Process the household response sheet from the BMG data file +#' +#' \code{getBmgAll} reads in and processes the household response sheet, the recruitment survey, the TP1 update survey +#' and combines them returning a data.table. +#' +#' This only gets called if it is needed (or you can run it manually). \code{input} over-rides the .xlsx file to load. +#' +#' Recoding variables: only recode recruitment survey variables in this function that have +#' have been updated via the TP1 or TP2 (etc) update surveys. +#' +#' @param input .xlsx file to load +#' @param message should messages be printed. Default is TRUE +#' @param updateRounds specifies how many update surveys to apply e.g. "2" uses TP1 and TP2 update surveys. Default 3 (all). +#' @param saveFile should messages be printed. Default is TRUE +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk} (updated) +#' +#' @import plyr +#' @import data.table +#' @import readxl +#' +#' @family bmgData functions +#' +#' @export +#' +getBmgAll <- function(input = bmgCompleteFile, updateRounds = 3, message = TRUE, saveFile = FALSE){ + if(isTRUE(message)) outputMessage("...BMG files have been updated or forceUpdate = TRUE. Recompiling.") + if(isTRUE(message)) outputMessage(paste0("Load BMG data from: ", input)) + + + bmgResponseDT <- getBmgResponseFile(input) + data.table::setkey(bmgResponseDT, bmg_id) + + bmgRecruitSurveyDT <- getBmgRecruitSurvey(input) + data.table::setkey(bmgRecruitSurveyDT, bmg_id) + + if(isTRUE(message)) outputMessage("Loading update survey: round 1") + bmgTP1SurveyDT <- getBmgUpdateSurvey(input, updateRound = "TP1") + data.table::setkey(bmgTP1SurveyDT, bmg_id) + + # Create a DT called dt which contains all addresses ever contacted and all response and survey data. 
This function will also pdate full recruitment survey where possible + # This code sets the relevant values to be the 'most recent' (xx.latest). i.e. if we have a value for the TP1 (update) survey then we use that, if not we use the original survey value + # NB the update survey did not ask all possible questions so some values can't be updated (yet) + # the function also does some error checking + + + if(isTRUE(message)) outputMessage("Merging updated variables: round 1") + # Merge response file & recruitment survey file ---- + # use suffixes to show source of the variables that have the same names + dt <- merge(bmgResponseDT, bmgRecruitSurveyDT, all = TRUE, suffixes = c(".respf", ".fullSurvey")) + # This appears to add about 40 who are not in the response file? + data.table::setkey(dt, bmg_id) + + # Add TP1 update survey file ---- + # use suffixes to show source of the variables that have the same names + # careful with suffixes as merging will fail for subsequent updates ... + if (updateRounds == 1){ + dt <- merge(dt,bmgTP1SurveyDT, all = TRUE, suffixes = c(".fullSurvey", ".tp1Survey")) + data.table::setkey(dt, bmg_id) + } + + if (updateRounds >= 2){ + if(isTRUE(message)) outputMessage("Loading update survey: round 2") + # First merge TP1 update survey and preserve suffixes (first arg "" in suffixes) + dt <- merge(dt,bmgTP1SurveyDT, all = TRUE, suffixes = c("", ".tp1Survey")) + data.table::setkey(dt, bmg_id) + # Add TP2 update survey file ---- + bmgTP2SurveyDT <- getBmgUpdateSurvey(input, updateRound = "TP2") + data.table::setkey(bmgTP2SurveyDT, bmg_id) + + if(isTRUE(message)) outputMessage("Merging updated variables: round 2") + + if (updateRounds == 2){ + # Merge TP2 update survey and suffixes for full survey (first arg in suffixes) + dt <- merge(dt,bmgTP2SurveyDT, all = TRUE, suffixes = c(".fullSurvey", ".tp2Survey")) + data.table::setkey(dt, bmg_id) + } + + if (updateRounds == 3){ + # Merge TP2 update survey and preserve suffixes (first arg "" in suffixes) + dt <- merge(dt,bmgTP2SurveyDT, all = TRUE, suffixes = c("", ".tp2Survey")) + data.table::setkey(dt, bmg_id) + } + } + + if (updateRounds == 3){ + if(isTRUE(message)) outputMessage("Loading update survey: round 3") + # Add TP3 update survey file ---- + bmgTP3SurveyDT <- getBmgUpdateSurvey(input, updateRound = "TP3") + data.table::setkey(bmgTP3SurveyDT, bmg_id) + # Merge TP3 update survey and suffixes for full survey (first arg in suffixes) + if(isTRUE(message)) outputMessage("Merging updated variables: round 3") + dt <- merge(dt,bmgTP3SurveyDT, all = TRUE, suffixes = c(".fullSurvey", ".tp3Survey")) + data.table::setkey(dt, bmg_id) + } + + # Check for duplicates & remove any ---- + nHHs <- data.table::uniqueN(dt$bmg_id) + nObs <- nrow(dt) + + outputMessage("Before duplicate removal:") + outputMessage("Number of bmg_ids: ",nHHs) + outputMessage("Number of observations: ",nObs) + + if(nHHs < nObs){ + # This means there must be duplicates. We have checked these for errors and have discovered that they are exact duplicates so we remove them. + dt <- dt[!duplicated(dt$bmg_id)] + } + + nHHs <- uniqueN(dt$bmg_id) + nObs <- nrow(dt) + + outputMessage("After duplicate removal:") + outputMessage("Number of bmg_ids: ",nHHs) + outputMessage("Number of observations: ",nObs) + + # Update recruitment survey data ---- + # This code sets the relevant values to be the 'most recent' (xx.latest). i.e. 
if we have a value for the TP1 (update) survey then we use that, if not we use the original survey value + # NB the update survey did not ask all possible questions so some values can't be updated (yet) + + outputMessage(paste0("Updating survey variables with ",updateRounds, " update rounds")) + + if (updateRounds == 3){ + # > update children ---- + dt <- dt[, ba_presenceChildren.latest := ifelse(!is.na(ba_presenceChildren.tp3Survey), # if non-missing TP3 update + ba_presenceChildren.tp3Survey, # use TP3 update, else ... + ifelse(!is.na(ba_presenceChildren.tp2Survey), # if non-missing TP2 update + ba_presenceChildren.tp2Survey, # use TP2 update, else ... + ifelse(!is.na(ba_presenceChildren.tp1Survey), # if non-missing TP1 update + ba_presenceChildren.tp1Survey, # use TP1 update, else ... + ba_presenceChildren.fullSurvey))) # use full recruitment + ] + + # recode N children to make regression tables etc easier to understand + dt <- dt[, ba_presenceChildren.latest := ifelse(ba_presenceChildren.latest == "No children", "0 children", "1+ child")] + + # > update HRP work status ---- + dt <- dt[, ba_Q2D_HRPemplType.latest := ifelse(!is.na(ba_Q2D_HRPemplType.tp3Survey), # if non-missing TP3 update + ba_Q2D_HRPemplType.tp3Survey, # use TP3 update, else ... + ifelse(!is.na(ba_Q2D_HRPemplType.tp2Survey), # if non-missing TP2 update + ba_Q2D_HRPemplType.tp2Survey, # use TP2 update, else ... + ifelse(!is.na(ba_Q2D_HRPemplType.tp1Survey), # if non-missing TP1 update + ba_Q2D_HRPemplType.tp1Survey, # use TP1 update, else ... + ba_Q2D_HRPemplType.fullSurvey))) # use full recruitment + ] # 'other' should be coded so NA really is NA + + # > update n people age ---- + # people come & go + + dt <- dt[, ba_Q2_npeople.latest := ifelse(!is.na(ba_Q2_npeople.tp3Survey), # if non-missing TP3 update + ba_Q2_npeople.tp3Survey, # use TP3 update, else ... + ifelse(!is.na(ba_Q2_npeople.tp2Survey), # if non-missing TP2 update + ba_Q2_npeople.tp2Survey, # use TP2 update, else ... + ifelse(!is.na(ba_Q2_npeople.tp1Survey), # if non-missing TP1 update + ba_Q2_npeople.tp1Survey, # use TP1 update, else ... + ba_Q2_npeople.fullSurvey))) # use full recruitment + ] + + dt <- dt[, ba_Q2_npeople_reduced.latest := ifelse(!is.na(ba_Q2_npeople_reduced.tp3Survey), # if non-missing TP3 update + ba_Q2_npeople_reduced.tp3Survey, # use TP3 update, else ... + ifelse(!is.na(ba_Q2_npeople_reduced.tp2Survey), # if non-missing TP2 update + ba_Q2_npeople_reduced.tp2Survey, # use TP2 update, else ... + ifelse(!is.na(ba_Q2_npeople_reduced.tp1Survey), # if non-missing TP1 update + ba_Q2_npeople_reduced.tp1Survey, # use TP1 update, else ... + ba_Q2_npeople_reduced.fullSurvey))) # use full recruitment + ] + + #t <- table(dt$ba_censusNpeople.fullSurvey, dt$ba_censusNpeople.tp1Survey, useNA = "always") + dt <- dt[, ba_censusNpeople.latest := ifelse(!is.na(ba_censusNpeople.tp3Survey), # if non-missing TP3 update + ba_censusNpeople.tp3Survey, # use TP3 update, else ... + ifelse(!is.na(ba_censusNpeople.tp2Survey), # if non-missing TP2 update + ba_censusNpeople.tp2Survey, # use TP2 update, else ... + ifelse(!is.na(ba_censusNpeople.tp1Survey), # if non-missing TP1 update + ba_censusNpeople.tp1Survey, # use TP1 update, else ... + ba_censusNpeople.fullSurvey))) # use full recruitment + ] + + # > update electric vehicles ---- + dt <- dt[, Q6_1.latest := ifelse(!is.na(Q6_1.tp3Survey), # if non-missing TP3 update + Q6_1.tp3Survey, # use TP3 update, else ... + ifelse(!is.na(Q6_1.tp2Survey), # if non-missing TP2 update + Q6_1.tp2Survey, # use TP2 update, else ... 
+ ifelse(!is.na(Q6_1.tp1Survey), # if non-missing TP1 update + Q6_1.tp1Survey, # use TP1 update, else ... + Q6_1.fullSurvey))) # use full recruitment + ] + + + } + + if (updateRounds == 2){ + # > update children ---- + dt <- dt[, ba_presenceChildren.latest := ifelse(!is.na(ba_presenceChildren.tp2Survey), # if non-missing TP2 update + ba_presenceChildren.tp2Survey, # use TP2 update, else ... + ifelse(!is.na(ba_presenceChildren.tp1Survey), # if non-missing TP1 update + ba_presenceChildren.tp1Survey, # use TP1 update, else ... + ba_presenceChildren.fullSurvey)) # use full recruitment + ] + + # recode N children to make regression tables etc easier to understand + dt <- dt[, ba_presenceChildren.latest := ifelse(ba_presenceChildren.latest == "No children", "0 children", "1+ child")] + + # > update HRP work status ---- + dt <- dt[, ba_Q2D_HRPemplType.latest := ifelse(!is.na(ba_Q2D_HRPemplType.tp2Survey), # if non-missing TP2 update + ba_Q2D_HRPemplType.tp2Survey, # use TP2 update, else ... + ifelse(!is.na(ba_Q2D_HRPemplType.tp1Survey), # if non-missing TP1 update + ba_Q2D_HRPemplType.tp1Survey, # use TP1 update, else ... + ba_Q2D_HRPemplType.fullSurvey)) # use full recruitment + ] # 'other' should be coded so NA really is NA + + # > update n people age ---- + # people come & go + + dt <- dt[, ba_Q2_npeople.latest := ifelse(!is.na(ba_Q2_npeople.tp2Survey), # if non-missing TP2 update + ba_Q2_npeople.tp2Survey, # use TP2 update, else ... + ifelse(!is.na(ba_Q2_npeople.tp1Survey), # if non-missing TP1 update + ba_Q2_npeople.tp1Survey, # use TP1 update, else ... + ba_Q2_npeople.fullSurvey)) # use full recruitment + ] + + dt <- dt[, ba_Q2_npeople_reduced.latest := ifelse(!is.na(ba_Q2_npeople_reduced.tp2Survey), # if non-missing TP2 update + ba_Q2_npeople_reduced.tp2Survey, # use TP2 update, else ... + ifelse(!is.na(ba_Q2_npeople_reduced.tp1Survey), # if non-missing TP1 update + ba_Q2_npeople_reduced.tp1Survey, # use TP1 update, else ... + ba_Q2_npeople_reduced.fullSurvey)) # use full recruitment + ] + + #t <- table(dt$ba_censusNpeople.fullSurvey, dt$ba_censusNpeople.tp1Survey, useNA = "always") + dt <- dt[, ba_censusNpeople.latest := ifelse(!is.na(ba_censusNpeople.tp2Survey), # if non-missing TP2 update + ba_censusNpeople.tp2Survey, # use TP2 update, else ... + ifelse(!is.na(ba_censusNpeople.tp1Survey), # if non-missing TP1 update + ba_censusNpeople.tp1Survey, # use TP1 update, else ... + ba_censusNpeople.fullSurvey)) # use full recruitment + ] + + # > update electric vehicles ---- + dt <- dt[, Q6_1.latest := ifelse(!is.na(Q6_1.tp2Survey), # if non-missing TP2 update + Q6_1.tp2Survey, # use TP2 update, else ... + ifelse(!is.na(Q6_1.tp1Survey), # if non-missing TP1 update + Q6_1.tp1Survey, # use TP1 update, else ... + Q6_1.fullSurvey)) # use full recruitment + ] + } + + if (updateRounds == 1){ + # > update children ---- + dt <- dt[, ba_presenceChildren.latest := ifelse(!is.na(ba_presenceChildren.tp1Survey), # if non-missing TP1 update + ba_presenceChildren.tp1Survey, # use TP1 update, else ... + ba_presenceChildren.fullSurvey) # use full recruitment + ] + + # recode N children to make regression tables etc easier to understand + dt <- dt[, ba_presenceChildren.latest := ifelse(ba_presenceChildren.latest == "No children", "0 children", "1+ child")] + + # > update HRP work status ---- + dt <- dt[, ba_Q2D_HRPemplType.latest := ifelse(!is.na(ba_Q2D_HRPemplType.tp1Survey), # if non-missing TP1 update + ba_Q2D_HRPemplType.tp1Survey, # use TP1 update, else ... 
+ ba_Q2D_HRPemplType.fullSurvey) # use full recruitment + ] # 'other' should be coded so NA really is NA + + # > update n people age ---- + # people come & go + + dt <- dt[, ba_Q2_npeople.latest := ifelse(!is.na(ba_Q2_npeople.tp1Survey), # if non-missing TP1 update + ba_Q2_npeople.tp1Survey, # use TP1 update, else ... + ba_Q2_npeople.fullSurvey) # use full recruitment + ] + + dt <- dt[, ba_Q2_npeople_reduced.latest := ifelse(!is.na(ba_Q2_npeople_reduced.tp1Survey), # if non-missing TP1 update + ba_Q2_npeople_reduced.tp1Survey, # use TP1 update, else ... + ba_Q2_npeople_reduced.fullSurvey) # use full recruitment + ] + + #t <- table(dt$ba_censusNpeople.fullSurvey, dt$ba_censusNpeople.tp1Survey, useNA = "always") + dt <- dt[, ba_censusNpeople.latest := ifelse(!is.na(ba_censusNpeople.tp1Survey), # if non-missing TP1 update + ba_censusNpeople.tp1Survey, # use TP1 update, else ... + ba_censusNpeople.fullSurvey) # use full recruitment + ] + + # > update electric vehicles ---- + dt <- dt[, Q6_1.latest := ifelse(!is.na(Q6_1.tp1Survey), # if non-missing TP1 update + Q6_1.tp1Survey, # use TP1 update, else ... + Q6_1.fullSurvey) # use full recruitment + ] + + } + + # > update HRP age ---- + # do we really want to do this? + # it might just be that someone else answered the update survey? + # could check by looking at gender + #t <- table(dt$ba_Q2B_2_HRPage.fullSurvey, dt$ba_Q2B_2_HRPage.tp1Survey, useNA = "always") + #kable(caption = "Check age of HRP in each survey", t) + + # Recode Variables --- + # Only recode 'latest' variables here - i.e. those where we need to combine various surveys + # Recode variables asked in only 1 survey in the relevant function + + + # Clamp installed flag ---- + dt <- dt[, ba_clampInstalled := ifelse(bmgInstallType %like% "install" | + bmgInstallType %like% "Opt" | + bmgInstallType %like% "Removals", + "Yes", + "No")] + + # Install type var ---- + dt <- dt[, ba_installType := ifelse(bmgInstallType %like% "install", + "Installation complete", + NA)] + dt <- dt[, ba_installType := ifelse(bmgInstallType %like% "Opt Out", + "Installation complete", + ba_installType)] # we're coding these as installed for now despite the fact that they then opted out + dt <- dt[, ba_installType := ifelse(Outcome_Label %like% "Removals from Navetas", + "Installation complete", + ba_installType)] # we're coding these as installed for now despite the fact that they then opted out + dt <- dt[, ba_installType := ifelse(bmgInstallType %like% "Refused", + "Refusal", + ba_installType)] + #No contacts + dt <- dt[, ba_installType := ifelse(Outcome_Label %like% "knocks" | + Outcome_Label %like% "Knocks" | + Outcome_Label %like% "No reply" | + Outcome_Label %like% "No response", + "No contact",ba_installType)] + + # Overall outcome var ---- + dt <- dt[, ba_saveOutcome := ifelse(Outcome_Label %like% "No reply", + "No contact", + NA)] + dt <- dt[, ba_saveOutcome := ifelse(Outcome_Label %like% "No response", + "No contact", # does this differ?? + ba_saveOutcome)] + dt <- dt[, ba_saveOutcome := ifelse(Outcome_Label %like% "Empty", + "No contact", + ba_saveOutcome)] + dt <- dt[, ba_saveOutcome := ifelse(ba_installType %like% "Refusal", + "Refusal", + ba_saveOutcome)] + dt <- dt[, ba_saveOutcome := ifelse(bmgInstallType %like% "Unable", + "Unable to fit clamp", + ba_saveOutcome)] + dt <- dt[, ba_saveOutcome := ifelse(Outcome_Label %like% "Not Used", + "Unused address", # i.e. 
not issued + ba_saveOutcome)] + dt <- dt[, ba_saveOutcome := ifelse(Outcome_Label %like% "agrees" | + Outcome_Label %like% "Agrees" | + Outcome_Label %like% "Self Install" | + Outcome_Label %like% "Opt Out" | + Outcome_Label %like% "Removals" | + ba_surveyMode.fullSurvey %like% "Install", # adds households not found in Outcome_Label + "Installation completed", # Installed (they may then have opted out) + ba_saveOutcome)] + dt <- dt[, ba_saveOutcome := ifelse(is.na(ba_saveOutcome), + "Other non-response", # The rest + ba_saveOutcome)] + + # set levels order + dt <- dt[, ba_saveOutcome := factor(ba_saveOutcome, + levels = c("Unused address", + "No contact", + "Refusal", + "Other non-response", + "Unable to fit clamp", + "Installation completed" + ) + ) + ] + + + + # Survey year as a year ---- + dt <- dt[, surveyYear := lubridate::year(InterviewDate.fullSurvey)] + + # Set key ---- + if(isTRUE(message)) outputMessage("Set key to 'bmg_id'") + dt <- dt[, bmg_id := as.character(bmg_id)] # to match power data + setkey(dt, bmg_id) + + if(isTRUE(saveFile)){ + # Save the file for future use each time this runs ---- + ofile <- paste0(dPath,"processed/", + "hhAttributesLatest_", + Sys.Date(), + ".csv") + outputMessage("Saving ", ofile, " for future use...") + write_csv(dt, ofile) + + # remove old gzip file + cmd <- paste0("rm ", ofile, ".gz") # will produce a warning if it does not exist - yes, we could make this tidyer + try(system(cmd)) + + # now gzip new one + outputMessage("gzipping file to: ", paste0(ofile, ".gz")) + cmd <- paste0("gzip ", ofile) + try(system(cmd)) # in case it fails - if it does there will just be .csv files (not gzipped) - e.g. under windows + } + + # Finish ----- + + outputMessage("All BMG data loaded, processed & returned") + return(dt) +} + +#' Process the household response sheet from the BMG data file +#' +#' \code{getBmgResponseFile} reads in and processes the original household response sheet. This contains all bmg_ids ever issued of +#' which many were not used. +#' +#' Links to a lot of area-level variables as OA & LSOA level. +#' +#' Recoding variables: recode original response file variables here (not in \code{getBmgAll()}). 
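#'
#' A minimal usage sketch (assuming \code{bmgCompleteFile} points at the BMG .xlsx workbook,
#' as in \code{getBmgAll()}):
#' \preformatted{
#' respDT <- getBmgResponseFile(bmgCompleteFile)
#' respDT[, .N, keyby = bmgInstallType]  # counts by derived install type
#' }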
+#' +#' Returns a processed data.table with > 40,000 rows +#' +#' @param file .xlsx file to load +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' @import readxl +#' +#' @family bmgData functions +#' +#' @export +#' +getBmgResponseFile <- function(file){ + message("getBmgResponseFile: Load complete response data from: ", file, ", sheet = AllIssued") + hhRespDTorig <- data.table::as.data.table(readxl::read_xlsx(file, sheet = "AllIssued")) + hhRespDTorig <- hhRespDTorig[, bmg_id := as.character(BMG_ID)] + print(paste0("N rows in ", file, ": AllIssued sheet (before processing) = ", tidyNum(nrow(hhRespDTorig)))) + + dt <- hhRespDTorig + # set bmg_id to char for matching -- + dt <- dt[,bmg_id := as.character(BMG_ID)] + + # set trial groups -- + dt <- dt[,bmgIntervention := Intervention] + + dt <- dt[,bmgIntervention := factor(bmgIntervention, + labels = c("1: Control", + "2: LED", + "3: Messaging", + "4: Messaging & £" + ) + ) + ] + # check + #print(with(dt, table(bmgIntervention, Intervention, useNA = "always"))) + + # set trial group ---- + dt <- dt[, INTERVENTIONTYPE := Intervention] # fix + dt <- SAVEr::setBmgTrialGroups(dt) # set using function for reproducibility + + # to avoid confusion set name to this file + dt <- dt[, bmgGroup.respF := bmgGroup] + dt <- dt[, bmgGroup:= NULL] + + # check + #print(with(dt, table(bmgGroup, Intervention, useNA = "always"))) + + # set outcome labels ---- + dt <- dt[,bmgOutcomeLabel := Outcome_Label] + + # derived/simplified - do not change the name of this variable as the navetas processing code relies on it + dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Self Install", "Self install", "Other code")] # other carries forward to replace NA + dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Not Used", "Address not used", bmgInstallType)] + dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "new participant", "New install", bmgInstallType)] + dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "new ocupier", "Re-install (new occupier)", bmgInstallType)] # beware typo in original! 
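  # Sketch (not run): the ifelse() chain above and below applies rules in sequence, with later
  # matches overwriting earlier ones. The same logic could be written with data.table::fcase()
  # (data.table >= 1.13.0), which uses first-match-wins semantics, so the later / more specific
  # patterns must be listed first and behaviour for missing Outcome_Label values re-checked:
  # dt[, bmgInstallType := fcase(Outcome_Label %like% "Refused", "Refused (various)",
  #                              Outcome_Label %like% "Unable", "Unable to fit clamp",
  #                              Outcome_Label %like% "Opt Out", "Opt Out",
  #                              Outcome_Label %like% "Removals from Navetas", "Removals from Navetas",
  #                              Outcome_Label %like% "previous occupier", "Re-install (same occupier)",
  #                              Outcome_Label %like% "new ocupier", "Re-install (new occupier)",
  #                              Outcome_Label %like% "new participant", "New install",
  #                              Outcome_Label %like% "Not Used", "Address not used",
  #                              Outcome_Label %like% "Self Install", "Self install",
  #                              default = "Other code")]
  # The nested ifelse() ".latest" updates in getBmgAll() could similarly use data.table::fcoalesce(),
  # e.g. dt[, Q6_1.latest := fcoalesce(Q6_1.tp3Survey, Q6_1.tp2Survey, Q6_1.tp1Survey, Q6_1.fullSurvey)],
  # provided the coalesced columns share a common type.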
+ dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "previous occupier", "Re-install (same occupier)", bmgInstallType)] + dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Removals from Navetas", "Removals from Navetas", bmgInstallType)] + dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Opt Out", "Opt Out", bmgInstallType)] + dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Unable", "Unable to fit clamp", bmgInstallType)] + dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Refused", "Refused (various)", bmgInstallType)] + + # check + #print(dt[, .N, keyby = bmgInstallType]) # better than using table as it always prints NA + + dt[, bmgClampInstall := ifelse(bmgInstallType %like% "install", "Yes", "No")] + # check + #print(with(dt, table(bmgInstallType,bmgInstall, useNA = "always"))) + + # fix installation dates ---- + dt <- dt[, optOutDate := lubridate::ymd(`Opt Out Date`)] # requires lubridate + dt$`Opt Out Date` <- NULL + dt <- dt[, navetasSimRouterInstallDate := lubridate::ymd(as_date(New_Sim_Router_Complete_Date))] # requires lubridate + dt$New_Sim_Router_Complete_Date <- NULL + dt <- dt[, newNavetasInstallDate := lubridate::ymd(navetasInstallDate)] # requires lubridate + dt$navetasInstallDate <- NULL + dt$bmgInstallYear <- year(dt$wmgInstallDate) + dt <- dt[, bmgInstallYear := ifelse(is.na(bmgInstallYear), lubridate::year(dt$wmgCompleteDate), bmgInstallYear)] + dt <- dt[, bmgInstallYear := ifelse(is.na(bmgInstallYear), lubridate::year(dt$navetasSimRouterInstallDate), bmgInstallYear)] + dt <- dt[, bmgInstallYear := ifelse(is.na(bmgInstallYear), lubridate::year(dt$newNavetasInstallDate), bmgInstallYear)] + + # check + #print(with(dt, table(bmgInstallType,bmgInstallYear, useNA = "always"))) + # not got all dates - most of these are caused by the self-installs + + # set internet mode ---- + dt$bmgInternetType[dt$bmgInstallType %like% "install" | + dt$bmgInstallType %like% "Opt Out" | + dt$bmgInstallType %like% "Removals"] <- "BMG: ADSL?" 
# default - has (or had) an installation of some sort + dt$bmgInternetType[dt$Sim_Installs == 1] <- "BMG: SIM" + + # check + #print(with(dt, table(Sim_Installs, bmgInternetType, useNA = "always"))) + + # get household non-response weight ---- + # wgtNonResponse + + # Add geography vars ---- + # index via OA code + dt <- dt[, bmgOA2011 := Oa11] + + # > Urban vs rural ---- + message("Adding urban/rural codes") + oa2011urbanDT <- data.table::fread(rurFile) + # change to make future matching easier + oa2011urbanDT$oa11cd <- oa2011urbanDT$OA11CD + data.table::setkey(oa2011urbanDT, oa11cd) + oa2011urbanDT[, OA11CD := NULL] + + # > IMD ---- + message("Adding IMD deciles etc") + imdnames <- c("lsoacode", "lsoaname","la_code", "la_name", "imd_rank", "imd_score_dec") + + lsoaImd2015DT <- data.table::fread(lsoaImd2015File) + data.table::setnames(lsoaImd2015DT, imdnames) + lsoaImd2015DT[, lsoacode := lsoacode] + data.table::setkey(lsoaImd2015DT, lsoacode) + + # > OA <-> LSOA ---- + message("Adding LSOA etc codes") + oa2011lutDT <- data.table::fread(oaLutFile) + oa2011lutDT$lsoacode <- oa2011lutDT$LSOA11CD # for ease of merging + oa2011lutDT$oa11cd <- oa2011lutDT$OA11CD # for ease of merging + data.table::setkey(oa2011lutDT, oa11cd) + #print(paste0("Number of OAs in OA-LSOA LUT file", oaLutFile)) + data.table::uniqueN(oa2011lutDT) + + # > OA <-> County ---- + message("Adding County codes") + oaCountylutDT <- data.table::fread(oaCountyFile) + oaCountylutDT$oa11cd <- oaCountylutDT$OACD # for ease of merging + # N OAs + #print(paste0("Number of OAs in county file: ", oaCountyFile)) + data.table::uniqueN(oaCountylutDT) + data.table::setkey(oaCountylutDT, oa11cd) + + oa2011lutDT <- merge(oa2011lutDT,oaCountylutDT, all = TRUE) # there are OAs without county names & vice versa + # N OAs with a county + data.table::uniqueN(oa2011lutDT) + + oa2011lutDT[, ba_WiderSolentOA := ifelse(LAD11NM %like% "Southampton" | + LAD11NM %like% "Portsmouth" | + LAD11NM %like% "Isle of Wight" | + LAD11NM %like% "Hampshire", + "Wider Solent (SAVE)", # Hampshire, Southampton, Portsmouth, Isle of Wight + "not Wider Solent") + ] + #table(oa2011lutDT$ba_WiderSolentOA, useNA = "always") + + # > link the OA urban/rural and census data ---- + data.table::setkey(oa2011lutDT, oa11cd) + oaAllDataDT <- merge(oa2011lutDT,oa2011urbanDT, all.x = TRUE) # keep all OAs + + #print(oaAllDataDT[, .(nOAs = uniqueN(oa11cd)), by = ba_WiderSolentOA]) + #print(oaAllDataDT[, .(nLSOAs = uniqueN(lsoacode)), by = ba_WiderSolentOA]) + + # > add the IMD data ---- + data.table::setkey(oaAllDataDT, lsoacode) + oaAllDataDT <- merge(oaAllDataDT,lsoaImd2015DT, all.x = TRUE) # keep all OAs + + #t <- table(oaAllDataDT$ba_WiderSolentOA, useNA = "always") + oaAllDataDT[, .(nOAs = .N), by = ba_WiderSolentOA] + + # oaAllDataDT is what it says on the tin (no geography filtering) + + # now merge that to the response file via the OA code + dt <- dt[, oa11cd := bmgOA2011] # fix for merge + data.table::setkey(dt,oa11cd) + data.table::setkey(oaAllDataDT, oa11cd) + + dt <- merge(dt, oaAllDataDT, all.x = TRUE) # only keeps rows in response file + + # Keep Census 2011 OA but beware: potentially disclosive + # > Fix names ---- + + dt$lsoacode_2011 <- dt$LSOA11CD + dt$lsoaname_2011 <- dt$LSO11ANM # beware typo in original! 
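  # Sketch (not run): the all.x merge onto oaAllDataDT above should add area-level columns
  # without adding rows, which only holds if oa11cd is unique in the lookup. Possible checks:
  # stopifnot(anyDuplicated(oaAllDataDT$oa11cd) == 0)  # one lookup row per OA
  # stopifnot(nrow(dt) == nrow(hhRespDTorig))          # merge has not duplicated response rows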
+ dt$countyname_2011 <- dt$FCTYNM + dt$countycode_2011 <- dt$FCTYCD + dt$lacode_2011 <- dt$LAD11CD + dt$laname_2011 <- dt$LAD11NM + + data.table::setkey(dt, bmg_id) + + # Finish ---- + print("Done response file") + return(dt) +} + +#' Process the household recruitment survey sheet from the BMG data file +#' +#' \code{getBmgRecruitSurvey} reads in and processes the original household recruitment survey data sheet. This contains all bmg_ids who completed +#' a recruitment survey. +#' +#' Recoding variables: recode original recruitment survey variables here (not in \code{getBmgAll()}). +#' +#' Returns a processed data.table +#' +#' @param file .xlsx file to load +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' @import readxl +#' +#' @family bmgData functions +#' +#' @export +#' +getBmgRecruitSurvey <- function(file){ + # BMG recruitment survey data -- + print(paste0("getBmgRecruitSurvey: Load full recruitment survey data from: ", file, ", sheet = FullRecruit")) + dt <- data.table::as.data.table(readxl::read_xlsx(file, sheet = "FullRecruit")) + dt <- dt[, bmg_id := as.character(BMG_ID)] + print(paste0("N rows in ", file, ": FullRecruit sheet (before processing) = ", + tidyNum(nrow(dt)))) + + data.table::setkey(dt, bmg_id) + # function to to process the BMG recruitment survey file + dt <- dt[, bmg_id := BMG_ID] + dt <- dt[, bmg_id := as.character(bmg_id)] + + # Set trial groups -- + dt <- SAVEr::setBmgTrialGroups(dt) + + # to avoid confusion + dt <- dt[, bmgGroup.fullSurvey := bmgGroup] + dt <- dt[, bmgGroup:= NULL] + + # Set survey type flag -- + dt <- dt[surveyMode %like% "CATI", ba_surveyMode := "Install_CATI"] + dt <- dt[surveyMode %like% "CAWI", ba_surveyMode := "Install_CAWI"] + dt <- dt[surveyMode %like% "CAPI", ba_surveyMode := "Install_f2f"] + dt <- dt[is.na(surveyMode), ba_surveyMode := "No survey data"] + + # Set survey source flag -- + dt <- dt[ba_surveyMode != "No survey data", source := "Full recruitment survey"] + dt <- dt[ba_surveyMode == "No survey data", ba_recruitSurveySource := bmgCompleteFile] + + print("Test full survey file before duplicate tests:") + print(paste0("Number of rows of data : ", tidyNum(nrow(dt)), " (includes all issued addresses)")) + print(paste0("Number of unique bmg_id: ", tidyNum(uniqueN(dt$bmg_id)))) + + # Are there duplicates & does it matter? 
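  # Sketch (not run): before dropping duplicates it may be worth confirming that repeated
  # bmg_ids really are exact copies (getBmgAll() removes them on that assumption), e.g.:
  # dupIds <- dt[duplicated(bmg_id), unique(bmg_id)]
  # dt[bmg_id %in% dupIds][order(bmg_id)]  # inspect repeated bmg_ids side by side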
+ + print("Removing any duplicates") + + dt <- dt[!duplicated(dt$bmg_id)] + print("Test full survey file after duplicate tests:") + print(paste0("Number of rows of data : ", tidyNum(nrow(dt)), " (includes all issued addresses)")) + print(paste0("Number of unique bmg_id: ", tidyNum(uniqueN(dt$bmg_id)))) + print(paste0("Number of recruitment survey responses ever collected: ", + tidyNum(uniqueN(dt[ba_surveyMode != "No survey data"]$bmg_id)))) + + data.table::setkey(dt, bmg_id) + + + # Fix interview date -- + print("Fixing recruitment interview date - this may break with future versions of the data") + # This whole section should probably be inside an if version = v5 + # (some values are badly formatted in some of the BMG files) + # if there is an issue the some of the dates look like strings of numbers and some look like "08/12/2017" char strings + # presumably becasue read_xlsx doesn't deal with variable date formats very well + + # first try to force them into integers + # this will convert any that were dates (which got turned into integers by read_xlsx + # It leave a lot of NAs including those that were character strings + print("# -> Before fix (to test if fix required):") + print(summary(dt$InterviewDate)) + dt$InterviewDate_orig <- dt$InterviewDate + # dt <- dt[, InterviewDate_Fixed := ymd_hms(InterviewDate)] # uses lubridate to pick up the char strings + # + # dt <- dt[!is.na(InterviewDate_Fixed), InterviewDate_orig := NA] # remove the ones we fixed + # # now force the conversion of the remainder to numeric and then convert using MS Windows excel epoch + # dt <- dt[, InterviewDate_xlFixed := as.Date(as.numeric(InterviewDate_orig), origin = "1900-01-01")] # this is unreformed Windows Excel so epoch = 1/1/1900 + # dt <- dt[is.na(InterviewDate_Fixed), InterviewDate_Fixed := InterviewDate_xlFixed] # add the ones we fixed + # dt <- dt[, InterviewDate := InterviewDate_Fixed] + dt <- dt[, InterviewDate := as.Date(InterviewDate_orig)] + print("# -> After fix:") + print(summary(dt$InterviewDate)) + + # Recode variables -- + # Do this using functions that can be re-used for the TP1/TP2 etc surveys to ensure conformity (assumes original coding is harmonised!) + + # > HRP Age -- + dt <- SAVEr::recodeHrpAge(dt) + + # > HRP employment -- + dt <- SAVEr::recodeHrpEmplType(dt) + + # > HRP Ethnicity -- + dt <- SAVEr::recodeHrpEthnicity(dt) + + # > HRP gender -- + dt <- SAVEr::recodeHrpGender(dt) + + # > HRP NS_SEC -- + dt <- SAVEr::codeHrpNSSEC(dt) + + # > Mains gas etc + dt$ba_Q3_6_mainsGas <- dt$Q3_6 + + dt <- dt[, ba_Q3_6_mainsGas := ifelse(ba_Q3_6_mainsGas == 1, + "Mains gas", + ba_Q3_6_mainsGas)] + dt <- dt[, ba_Q3_6_mainsGas := ifelse(ba_Q3_6_mainsGas == 2, + "LPG gas", + ba_Q3_6_mainsGas)] + dt <- dt[, ba_Q3_6_mainsGas := ifelse(ba_Q3_6_mainsGas == 3, + "No gas", + ba_Q3_6_mainsGas)] + dt <- dt[, ba_Q3_6_mainsGas := ifelse(ba_Q3_6_mainsGas == 4, + "Don't know", + ba_Q3_6_mainsGas)] + # > Heat source -- + # recode heat source from full survey (not on update survey - assume is still correct!) + dt <- SAVEr::recodeHeatSource(dt) + dt <- SAVEr::recodeHeatSourceExt(dt) + + # > Presence of children ---- + dt <- SAVEr::recodePresenceChildren(dt) + + # > Source of electricity + dt$ba_Q3_3_Mains <- dt$Q3_3_1 + dt$ba_Q3_3_PV <- dt$Q3_3_2 + dt$ba_Q3_3_Wind <- dt$Q3_3_3 + dt$ba_Q3_3_Hydro <- dt$Q3_3_4 + dt$ba_Q3_3_CHP <- dt$Q3_3_5 + dt$ba_Q3_3_Gen <- dt$Q3_3_6 + dt$ba_Q3_3_Oth <- dt$Q3_3_7 + dt$ba_Q3_3_dk <- dt$Q3_3_8 + + # > Water heating ---- + #Q3_18A 151 Q3.18. 
How is your water heated?: Electric immersion heater (to hot water tank) + dt$ba_Q3_12_hotWaterImm <- dt$Q3_18_1 + #Q3_18B 152 Q3.18. How is your water heated?: Gas or oil direct heating (combi) + dt$ba_Q3_12_hotWaterCombi <- dt$Q3_18_2 + #Q3_18C 153 Q3.18. How is your water heated?: Gas, oil or solid fuel (including wood) indirect heating (to hot water tank) + dt$ba_Q3_12_hotWaterInDirect <- dt$Q3_18_3 + #Q3_18D 154 Q3.18. How is your water heated?: Solar heating (to hot water tank) + dt$ba_Q3_12_hotWaterSolar <- dt$Q3_18_4 + #Q3_18E 155 Q3.18. How is your water heated?: Other + dt$ba_Q3_12_hotWaterOther <- dt$Q3_18_5 + #table(dt$ba_Q3_12_hotWaterOther) + dt$ba_Q3_12_hotWaterOtherDetail <- dt$Q3_18OTHER # actual text recorded - has 119 entries could be recoded in to the other categories? + #Q3_18F 156 Q3.18. How is your water heated?: Don't know + dt$ba_Q3_12_hotWaterDK <- dt$Q3_18_6 + #Q3_18G 157 Q3.18. How is your water heated?: Refused + dt$ba_Q3_12_hotWaterRefused <- dt$Q3_18_7 + + # > N cars ---- + dt <- SAVEr::recodeNcars(dt) + + # > Tenure (Census 2011 form) -- + + # > Dwelling type ---- + # Q8_2 (dwelling type from survey) to make labels easier + # Lys suggests is more reliable - why? + dt$ba_Q8_2_dwelling <- factor(dt$Q8_2, + labels = c( + "Detached", + "Semi-detached", + "Terrace or end terrace", + "Flat - purpose built", + "Flat - converted", + "Commercial", + "Caravan etc", + "Refused" + ) + ) + + # Merge flat types + dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 1, 1, NA)] + dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 2, 2, ba_Q8_2_dwellingRecoded)] + dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 3, 3, ba_Q8_2_dwellingRecoded)] + dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 4 | Q8_2 == 5, 4, ba_Q8_2_dwellingRecoded)] + dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 6, 5, ba_Q8_2_dwellingRecoded)] + dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 7, 6, ba_Q8_2_dwellingRecoded)] + dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 8, 7, ba_Q8_2_dwellingRecoded)] + + dt$ba_Q8_2_dwellingRecoded <- factor(dt$ba_Q8_2_dwellingRecoded, + labels = c( + "Detached", + "Semi-detached", + "Terrace or end terrace", + "Flat", + "Commercial", + "Caravan etc", + "Refused" + ) + ) + # check + # with(dt, table(ba_Q8_2_dwelling,ba_Q8_2_dwellingRecoded, useNA = "always" )) + + # dictionary to convert dwelling type + namesDwelling <- + c("Detached" = "Detached", + "Semi-detached" = "Semi", + "Terrace or end terrace" = "Terrace", + "Flat" = "Flat/Other", + "Commercial" = "Flat/Other", + "Caravan etc" = "Flat/Other", + "Refused" = "Flat/Other") + + dt <- dt[, Dwelling := plyr::revalue(ba_Q8_2_dwellingRecoded, namesDwelling)] # <- only asked at recruitment + + # > N people -- + dt <- SAVEr::recodeNpeople(dt) + + # > N rooms ---- + + # match to USOC by subtracting n bathrooms + dt <- dt[, Q8_6 := as.numeric(Q8_6)] + dt <- dt[, Q8_8 := as.numeric(Q8_8)] + dt <- dt[, ba_nroomsUSOCtemp := Q8_6 - Q8_8] + dt <- dt[, ba_nroomsUSOC := as.character(ba_nroomsUSOCtemp)] + dt <- dt[ba_nroomsUSOCtemp > 8, ba_nroomsUSOC := "9+"] + + # force correspondence to census + # https://www.nomisweb.co.uk/census/2011/qs407ew + dt <- dt[, ba_nrooms := as.character(Q8_6)] + dt <- dt[Q8_6 > 8, ba_nrooms := "9+"] + + # > Bedrooms ---- + + # Bedrooms recoded to 4 collapsed categories + # used in CustomerTypes (bmgDataUtils - SAVEr::codeCustomerTypes) + dt <- dt[, Bedrooms := NA] + dt <- dt[, Bedrooms := ifelse(Q8_7 <= 2, "0-2", Bedrooms)] + dt <- dt[, Bedrooms := ifelse(Q8_7 >= 3 & 
Q8_7 <= 4, Q8_7, Bedrooms)] + dt <- dt[, Bedrooms := ifelse(Q8_7 >= 5, "5+", Bedrooms)] + + # table(dt$ba_Q8_6_nrooms, dt$Q8_6, useNA = "always") + + # > Tenure ---- + dt <- SAVEr::recodeTenure(dt) + + # > Income ---- + dt <- SAVEr::recodeIncome(dt) + + # > recode eco satisfaction score -- + dt$ba_Q7_1_ecoSatis[dt$Q7_1 == 1] <- "1: I'm happy with what I do at the moment" + dt$ba_Q7_1_ecoSatis[dt$Q7_1 == 2] <- "2: I'd like to do a bit more to help the environment" + dt$ba_Q7_1_ecoSatis[dt$Q7_1 == 3] <- "3: I'd like to do a lot more to help the environment" + dt$ba_Q7_1_ecoSatis[dt$Q7_1 == 4] <- "4: Don't know" + dt$ba_Q7_1_ecoSatis[dt$Q7_1 == 5] <- "5: Refused" + table(dt$ba_Q7_1_ecoSatis, dt$Q7_1, useNA = "always") + + # Create aggregate eco behaviours index ---- + + # > recode so high = more eco ---- + + # Q7_2_x all coded 1 = Always -> 5 = Never & 6 = NA + # Q7_2_1 281 Leave a TV on standby for the night + + dt$Q7_2_1r[dt$Q7_2_1 == 1] <- 1 + dt$Q7_2_1r[dt$Q7_2_1 == 2] <- 2 + dt$Q7_2_1r[dt$Q7_2_1 == 3] <- 3 + dt$Q7_2_1r[dt$Q7_2_1 == 4] <- 4 + dt$Q7_2_1r[dt$Q7_2_1 == 5] <- 5 + dt$Q7_2_1r[dt$Q7_2_1 == 6] <- NA + + #table(dt$Q7_2_1, dt$Q7_2_1r, useNA = "always") + + # Q7_2_2 282 Switch off lights in rooms that aren't being used + # reverse + dt$Q7_2_2r[dt$Q7_2_2 == 1] <- 5 + dt$Q7_2_2r[dt$Q7_2_2 == 2] <- 4 + dt$Q7_2_2r[dt$Q7_2_2 == 3] <- 3 + dt$Q7_2_2r[dt$Q7_2_2 == 4] <- 2 + dt$Q7_2_2r[dt$Q7_2_2 == 5] <- 1 + dt$Q7_2_2r[dt$Q7_2_2 == 6] <- NA + + #table(dt$Q7_2_2, dt$Q7_2_2r, useNA = "always") + + # Q7_2_3 283 Decide not to buy something because it has too much packaging + # reverse + dt$Q7_2_3r[dt$Q7_2_3 == 1] <- 5 + dt$Q7_2_3r[dt$Q7_2_3 == 2] <- 4 + dt$Q7_2_3r[dt$Q7_2_3 == 3] <- 3 + dt$Q7_2_3r[dt$Q7_2_3 == 4] <- 2 + dt$Q7_2_3r[dt$Q7_2_3 == 5] <- 1 + dt$Q7_2_3r[dt$Q7_2_3 == 6] <- NA + + #table(dt$Q7_2_3, dt$Q7_2_3r, useNA = "always") + + # Q7_2_4 284 Buy recycled paper products such as toilet paper or tissues + # reverse + dt$Q7_2_4r[dt$Q7_2_4 == 1] <- 5 + dt$Q7_2_4r[dt$Q7_2_4 == 2] <- 4 + dt$Q7_2_4r[dt$Q7_2_4 == 3] <- 3 + dt$Q7_2_4r[dt$Q7_2_4 == 4] <- 2 + dt$Q7_2_4r[dt$Q7_2_4 == 5] <- 1 + dt$Q7_2_4r[dt$Q7_2_4 == 6] <- NA + + #table(dt$Q7_2_4, dt$Q7_2_4r, useNA = "always") + + # Q7_2_5 285 Use public transport (e.g. 
bus, train) rather than travel by car + # reverse + dt$Q7_2_5r[dt$Q7_2_5 == 1] <- 5 + dt$Q7_2_5r[dt$Q7_2_5 == 2] <- 4 + dt$Q7_2_5r[dt$Q7_2_5 == 3] <- 3 + dt$Q7_2_5r[dt$Q7_2_5 == 4] <- 2 + dt$Q7_2_5r[dt$Q7_2_5 == 5] <- 1 + dt$Q7_2_5r[dt$Q7_2_5 == 6] <- NA + + + #table(dt$Q7_2_5, dt$Q7_2_5r, useNA = "always") + + # Q7_2_6 286 Walk or cycle for short journeys less than 2 or 3 miles + # reverse + dt$Q7_2_6r[dt$Q7_2_6 == 1] <- 5 + dt$Q7_2_6r[dt$Q7_2_6 == 2] <- 4 + dt$Q7_2_6r[dt$Q7_2_6 == 3] <- 3 + dt$Q7_2_6r[dt$Q7_2_6 == 4] <- 2 + dt$Q7_2_6r[dt$Q7_2_6 == 5] <- 1 + dt$Q7_2_6r[dt$Q7_2_6 == 6] <- NA + + #table(dt$Q7_2_6, dt$Q7_2_6r, useNA = "always") + + # this will have created NAs from two sources: 'Not Applicable' in question and NA (no survey response) + + # > create sum & mean indicators ---- + # there has to be an easier way than this + ecoDTs <- dt[, .(bmg_id, Q7_2_1r, Q7_2_2r, Q7_2_3r, Q7_2_4r, Q7_2_5r, Q7_2_6r)] + + # use this DT to create the sum of all non-NA values + ecoDTs <- ecoDTs[, ba_Q7_2_ecoSum := apply(.SD, 1, sum, na.rm = TRUE), by = bmg_id] + # has the effect of setting those with all NA to 0 so recode + ecoDTs <- ecoDTs[, ba_Q7_2_ecoSum := ifelse(ba_Q7_2_ecoSum == 0, NA, ba_Q7_2_ecoSum)] + data.table::setkey(ecoDTs, bmg_id) + + # now the mean + ecoDTm <- dt[, .(bmg_id, Q7_2_1r, Q7_2_2r, Q7_2_3r, Q7_2_4r, Q7_2_5r, Q7_2_6r)] + # use this DT to create the mean of all non-NA values + ecoDTm <- ecoDTm[, ba_Q7_2_ecoMean := apply(.SD, 1, mean, na.rm = TRUE), by = bmg_id] + data.table::setkey(ecoDTm, bmg_id) + # add the columns back to the original survey file + data.table::setkey(dt, bmg_id) + dt <- merge(dt,ecoDTs) + dt <- merge(dt,ecoDTm) + + # Finish ---- + print("Done recruitment survey file") + return(dt) +} + +#' Process the TP1 household update survey sheet from the BMG data file +#' +#' \code{getBmgTP1UpdateSurvey} reads in and processes the TP1 household update survey. This contains only those bmg_ids who +#' were contacted during this period. +#' +#' Recoding variables: recode original TP1 update survey variables here (not in \code{getBmgAll()}). 
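#'
#' Note that \code{getBmgAll()} loads the update rounds via the generalised
#' \code{\link{getBmgUpdateSurvey}} rather than this TP1-specific function, so the two should
#' be kept in step. A minimal usage sketch (assuming \code{bmgCompleteFile} points at the
#' BMG .xlsx workbook):
#' \preformatted{
#' tp1DT <- getBmgTP1UpdateSurvey(bmgCompleteFile)
#' tp1DT[, .N, keyby = ba_surveyMode]  # TP1 responses by survey mode
#' }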
+#' +#' Returns a processed data.table +#' +#' @param file .xlsx file to load +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' @import readxl +#' +#' @family bmgData functions +#' +#' @export +#' +getBmgTP1UpdateSurvey <- function(file){ + # function to to process the BMG TP1 update survey file + print(paste0("getBmgTP1UpdateSurvey: Load TP1 survey data from: ", file, ", sheet = TP1Update")) + dt <- data.table::as.data.table(readxl::read_xlsx(file, sheet = "TP1Update")) + dt$bmg_id <- dt$BMG_ID + dt$source <- "TP1 update survey" + dt$tp1UpdateSurvey <- "TP1 update survey" + dt <- dt[, bmg_id := as.character(BMG_ID)] + + # Set trial groups ---- + dt <- SAVEr::setBmgTrialGroups(dt) + + # to avoid confusion + dt <- dt[, bmgGroup.tp1Survey := bmgGroup] + dt <- dt[, bmgGroup:= NULL] + + # Duplicate check ---- + print("Check for duplicates in TP1UpdateSurveyFile:") + nHHs <- uniqueN(dt$bmg_id) + nObs <- nrow(dt) + print(paste0("Number of bmg_ids: ",nHHs)) + print(paste0("Number of observations: ",nObs)) + + data.table::setkey(dt, bmg_id) + + # Survey type flag ---- + # Name clash with main survey is dealt with before merging them + dt <- dt[surveyMode %like% "CATI", ba_surveyMode := "Install_CATI"] + dt <- dt[surveyMode %like% "CAWI", ba_surveyMode := "Install_CAWI"] + dt <- dt[surveyMode %like% "CAPI", ba_surveyMode := "Install_f2f"] + dt <- dt[is.na(surveyMode), ba_surveyMode := "No survey data"] + print(paste0("Number of TP1 update survey responses: ", uniqueN(dt[ba_surveyMode != "No survey data"]$bmg_id))) + + #table(dt$surveyMode,dt$ba_surveyMode, useNA = "always") + dt <- dt[ba_surveyMode == "No survey data", ba_tp1UpdateSource := bmgCompleteFile] + + # Fix interview date ---- + print("Fixing TP1 interview date - this may break with future versions of the data") + # This whole section should probably be inside an if version = v5 + # (some values are badly formatted in some of the BMG files) + # if there is an issue the some of the dates look like strings of numbers and some look like "08/12/2017" char strings + # presumably becasue read_xlsx doesn't deal with variable date formats very well + + # first try to force them into integers + # this will convert any that were dates (which got turned into integers by read_xlsx + # It leave a lot of NAs including those that were character strings + print("# -> Before fix (to test if fix required):") + print(summary(dt$InterviewDate)) + dt$InterviewDate_orig <- dt$InterviewDate + # dt <- dt[, InterviewDate_Fixed := dmy(dt$InterviewDate)] # uses lubridate to pick up the char strings + # + # dt <- dt[!is.na(InterviewDate_Fixed), InterviewDate_orig := NA] # remove the ones we fixed + # # now force the conversion of the remainder to numeric and then convert using MS Windows excel epoch + # dt <- dt[, InterviewDate_xlFixed := as.Date(as.numeric(InterviewDate_orig), origin = "1900-01-01")] # this is unreformed Windows Excel so epoch = 1/1/1900 + # dt <- dt[is.na(InterviewDate_Fixed), InterviewDate_Fixed := InterviewDate_xlFixed] # add the ones we fixed + + dt <- dt[, InterviewDate := as.Date(InterviewDate_orig)] + print("# -> After fix:") + print(summary(dt$InterviewDate)) + # Recode variables -- + + # > HRP Age ---- + # in theory should be c 1 year older but perhaps different person responds? + dt <- SAVEr::recodeHrpAge(dt) + + # > HRP gender ---- + # why might this have changed? Different person responds? 
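  # One possible check (not run here): after the merge in getBmgAll(), cross-tabulate the
  # recruitment and TP1 values, assuming the suffixed column names produced by that merge, e.g.
  # table(dt$ba_Q2C_HRPgender.fullSurvey, dt$ba_Q2C_HRPgender.tp1Survey, useNA = "always")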
+ dt <- SAVEr::recodeHrpGender(dt) + + # > HRP employment ---- + dt <- SAVEr::recodeHrpEmplType(dt) + + # > N people ---- + # people may have left/joined + dt <- SAVEr::recodeNpeople(dt) + + # > Presence of children ---- + dt <- SAVEr::recodePresenceChildren(dt) + + # Finish ---- + print("Done TP1 survey file") + return(dt) + +} + + +#' Process the household update survey sheets from the BMG data file +#' +#' \code{getBmgUpdateSurvey} reads in and processes a single household update survey. This contains only those bmg_ids who +#' were contacted during this period. +#' +#' Recoding variables: recode original update survey variables here (not in \code{getBmgAll()}). +#' +#' Returns a processed data.table +#' +#' @param file .xlsx file to load +#' @param updateRound round of update survey to process, defaults to "TP1" +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} (original function) +#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk} (updated function) +#' +#' @import data.table +#' @import readxl +#' +#' @family bmgData functions +#' +#' @export +#' +getBmgUpdateSurvey <- function(file, updateRound = "TP1"){ + + # loop over update rounds + upDate <- updateRound + + # function to to process each BMG update survey file + print(paste0("Load ",upDate," survey data from: ", file, ", sheet = ",upDate,"Update")) + dt <- data.table::as.data.table(readxl::read_xlsx(file, sheet = paste0(upDate,"Update"))) + dt$bmg_id <- dt$BMG_ID + dt$source <- paste0(upDate," update survey") + #dt$tp1UpdateSurvey <- "TP1 update survey" # replaced below but may not work + dt$UpdateSurvey <- paste0(upDate," update survey") + dt <- dt[, bmg_id := as.character(BMG_ID)] + + # Set trial groups ---- + dt <- SAVEr::setBmgTrialGroups(dt) + + # to avoid confusion + bmgGroupVarName <- paste0("bmgGroup.",tolower(upDate),"Survey") + dt <- dt[, (bmgGroupVarName) := bmgGroup] # wrap bmgGroupVarName in parentheses to pass name + dt <- dt[, bmgGroup:= NULL] + + # Duplicate check ---- + print(paste0("Check for duplicates in ",upDate,"UpdateSurveyFile:")) + nHHs <- uniqueN(dt$bmg_id) + nObs <- nrow(dt) + print(paste0("Number of bmg_ids: ",nHHs)) + print(paste0("Number of observations: ",nObs)) + + data.table::setkey(dt, bmg_id) + + # Survey type flag ---- + # Name clash with main survey is dealt with before merging them + dt <- dt[surveyMode %like% "CATI", ba_surveyMode := "Install_CATI"] + dt <- dt[surveyMode %like% "CAWI", ba_surveyMode := "Install_CAWI"] + dt <- dt[surveyMode %like% "CAPI", ba_surveyMode := "Install_f2f"] + dt <- dt[is.na(surveyMode), ba_surveyMode := "No survey data"] + print(paste0("Number of ",upDate," update survey responses: ", uniqueN(dt[ba_surveyMode != "No survey data"]$bmg_id))) + + #table(dt$surveyMode,dt$ba_surveyMode, useNA = "always") + dt <- dt[ba_surveyMode == "No survey data", ba_tp1UpdateSource := bmgCompleteFile] + + # Fix interview date ---- + print(paste0("Fixing ",upDate," interview date - this may break with future versions of the data")) + # This whole section should probably be inside an if version = v5 + # (some values are badly formatted in some of the BMG files) + # if there is an issue the some of the dates look like strings of numbers and some look like "08/12/2017" char strings + # presumably becasue read_xlsx doesn't deal with variable date formats very well + + # first try to force them into integers + # this will convert any that were dates (which got turned into integers by read_xlsx + # It leave a lot of NAs including those that were character strings + print("# 
-> Before fix (to test if fix required):") + print(summary(dt$InterviewDate)) + dt$InterviewDate_orig <- dt$InterviewDate + # dt <- dt[, InterviewDate_Fixed := dmy(dt$InterviewDate)] # uses lubridate to pick up the char strings + # + # dt <- dt[!is.na(InterviewDate_Fixed), InterviewDate_orig := NA] # remove the ones we fixed + # # now force the conversion of the remainder to numeric and then convert using MS Windows excel epoch + # dt <- dt[, InterviewDate_xlFixed := as.Date(as.numeric(InterviewDate_orig), origin = "1900-01-01")] # this is unreformed Windows Excel so epoch = 1/1/1900 + # dt <- dt[is.na(InterviewDate_Fixed), InterviewDate_Fixed := InterviewDate_xlFixed] # add the ones we fixed + + dt <- dt[, InterviewDate := as.Date(InterviewDate_orig)] + print("# -> After fix:") + print(summary(dt$InterviewDate)) + # Recode variables -- + + # > HRP Age ---- + # in theory should be c 1 year older but perhaps different person responds? + dt <- SAVEr::recodeHrpAge(dt) + + # > HRP gender ---- + # why might this have changed? Different person responds? + dt <- SAVEr::recodeHrpGender(dt) + + # > HRP employment ---- + dt <- SAVEr::recodeHrpEmplType(dt) + + # > N people ---- + # people may have left/joined + dt <- SAVEr::recodeNpeople(dt) + + # > Presence of children ---- + dt <- SAVEr::recodePresenceChildren(dt) + + # Finish ---- + print(paste0("Done ",upDate," survey file")) + return(dt) + +} diff --git a/R/surveyDataUtils.R b/R/surveyDataUtils.R new file mode 100644 index 0000000000000000000000000000000000000000..a42d99656f6ea050a7a8314d1a64b5d00f92dde9 --- /dev/null +++ b/R/surveyDataUtils.R @@ -0,0 +1,929 @@ +### ---- Utility functions used in survey data processing ---- #### +#' Sets trial groups +#' +#' \code{setBmgTrialGroups} sets trial groups (`bmgGroup`) using `Intervention` variable +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export +#' +setBmgTrialGroups <- function(dt) { + # function to add BMG group label to dt. 
Expects 'INTERVENTIONTYPE' + dt <- dt[, bmgGroup := ifelse(Intervention == 1 | Intervention %like% "Control", # handles text labels + "BMG Group 1: Control", + NA) + ] + dt <- dt[, bmgGroup := ifelse(Intervention == 2 | Intervention %like% "LED", + "BMG Group 2", + bmgGroup) + ] + dt <- dt[, bmgGroup := ifelse(Intervention == 3 | Intervention %like% "with financial", + "BMG Group 3", + bmgGroup) + ] + dt <- dt[, bmgGroup := ifelse(Intervention == 4 | Intervention %like% "without financial", + "BMG Group 4", + bmgGroup) + ] + return(dt) +} + +#' recodes age variable +#' +#' \code{recodeHrpAge} recodes `Q2B` to give `ba_Q2B_HRPage` with sensible labels +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export +recodeHrpAge <- function(dt){ + # recode age for HRP plus 1st & 2nd persons + dt$ba_Q2B_HRPage <- factor(dt$Q2B, + labels = c( + #"< 18", # no under 18s + "18 - 24", + "25 - 34", + "35 - 44", + "45 - 54", + "55 - 64", + "65 - 74", + "75+", + "Refused" + ) + ) + + dt$ba_Q2B_2_person2age <- ifelse(dt$Q2B_2 == 0,NA,dt$Q2B_2) + dt$ba_Q2B_2_person2age <- factor(dt$ba_Q2B_2_person2age, + labels = c( + "< 16", + "16 - 24", + "25 - 34", + "35 - 44", + "45 - 54", + "55 - 64", + "65 - 74", + "75+", + "Refused" + ) + ) + # check + #table(dt$ba_Q2B_2_HRPage, dt$Q2B_2, useNA = "always") + + # this breaks for TP1 update survey + # dt$ba_Q2B_3_age <- factor(dt$Q2B_3, + # labels = c( + # "< 16", + # "16 - 24", + # "25 - 34", + # "35 - 44", + # "45 - 54", + # "55 - 64", + # "65 - 74", + # "75+", + # "Refused" + # ) + # ) + # dt$ba_Q2B_4_age <- factor(dt$Q2B_4, + # labels = c( + # "< 16", + # "16 - 24", + # "25 - 34", + # "35 - 44", + # "45 - 54", + # "55 - 64", + # "65 - 74", + # "75+", + # "Refused" + # ) + # ) + + return(dt) +} + +#' recodes HRP employment variable +#' +#' \code{recodeHrpEmplType} recodes `Q2D` to give `ba_Q2D_HRPemplType` with sensible labels +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export +recodeHrpEmplType <- function(dt){ + dt[, ba_Q2D_HRPemplType := ifelse(Q2D == 1, + "HRP in full-time employment", + NA + ) + ] + dt <- dt[, ba_Q2D_HRPemplType := ifelse(Q2D == 2, + "HRP in part-time employment (8-29 hours/week)", + ba_Q2D_HRPemplType + ) + ] + dt <- dt[, ba_Q2D_HRPemplType := ifelse(Q2D == 4, + "HRP self-employed (unkown hours)", + ba_Q2D_HRPemplType + ) + ] + dt <- dt[, ba_Q2D_HRPemplType := ifelse(Q2D == 5 | Q2D == 6, + "Unemployed", + ba_Q2D_HRPemplType + ) + ] + dt <- dt[, ba_Q2D_HRPemplType := ifelse(Q2D == 7, + "HRP retired", + ba_Q2D_HRPemplType + ) + ] + dt <- dt[, ba_Q2D_HRPemplType := ifelse(is.na(ba_Q2D_HRPemplType) & !is.na(Q2D), + "Other", + ba_Q2D_HRPemplType + ) + ] + return(dt) +} + +#' recodes HRP ethnicity variable +#' +#' \code{recodeHrpEthnicity} recodes `Q8_22` to give `ba_Q22_HRPethnicity` with sensible labels +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export +recodeHrpEthnicity <- function(dt) { + # Question has 20 codes, collapse + dt <- dt[, ba_Q22_HRPethnicity := ifelse(Q8_22 >= 1 & Q8_22 <= 4, + 1 , NA)] #"White British/Irish" + dt <- dt[, ba_Q22_HRPethnicity := ifelse(Q8_22 > 4 & Q8_22 <= 9 , + 2 , ba_Q22_HRPethnicity)] # "Mixed" + dt <- dt[, ba_Q22_HRPethnicity := 
ifelse(Q8_22 > 9 & Q8_22 <= 14 , + 3 , ba_Q22_HRPethnicity)] # "Asian/Asian British" + dt <- dt[, ba_Q22_HRPethnicity := ifelse(Q8_22 > 14 & Q8_22 <= 17 , + 4 , ba_Q22_HRPethnicity)] # "Black/Black British" + dt <- dt[, ba_Q22_HRPethnicity := ifelse(Q8_22 > 17 & Q8_22 <= 19 , + 5 , ba_Q22_HRPethnicity)] # "Other" + dt <- dt[, ba_Q22_HRPethnicity := ifelse(Q8_22 == 20 , + 6, ba_Q22_HRPethnicity)] + dt <- dt[, ba_Q22_HRPethnicity := factor(ba_Q22_HRPethnicity, + labels = c( + "White British/Irish", + "Mixed", + "Asian/Asian British", + "Black/Black British", + "Other", + "Refused" + ) + ) + ] + # check + #table(dt$Q8_22, dt$ba_Q22_HRPethnicity) + return(dt) +} + +#' recodes HRP gender variable +#' +#' \code{recodeHrpGender} recodes `Q2C` to give `ba_Q2C_HRPgender` with sensible labels +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export +recodeHrpGender <- function(dt) { + dt$ba_Q2C_HRPgender <- factor(dt$Q2C, + levels = c(1,2,3), + labels=c("Male", "Female", "Refused") + ) + #table(dt$Q2C, dt$ba_Q2C_HRPgender, useNA = "always") + return(dt) +} + +#' codes HRP NS-SEC variable +#' +#' \code{codeHrpNSSEC} uses a number of variable to derive `ba_hrpNSSEC7` and `ba_hrpNSSEC3` with sensible labels +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export +codeHrpNSSEC <- function(dt) { + # https://www.ons.gov.uk/methodology/classificationsandstandards/otherclassifications/thenationalstatisticssocioeconomicclassificationnssecrebasedonsoc2010#deriving-the-ns-sec-full-reduced-and-simplified-methods + # specifically: http://www.ons.gov.uk/ons/guide-method/classifications/current-standard-classifications/soc2010/soc2010-volume-3-ns-sec--rebased-on-soc2010--user-manual/section-14--deriving-the-ns-sec--self-coded-method.pdf + + # returns NS-SEC 7 & NS-SEC 3 + # if self-empl + # empl others? + # Yes: + # > 25 of them -> Empl large org Code 1 + dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 2 & # "Self-employed" + Q8_18 == 2 & # has employees + Q8_19 >= 2 # 25+ employees + , 1, NA) + ] + # < 25 of them -> Empl small org Code 2 + dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 2 & # "Self-employed" & + Q8_18 == 2 & # has employees + Q8_19 == 1 # < 25 employees + , 2, ba_hrpNSSEC7) + ] + # No: self empl, no employees -> Code 3 + dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 2 & # "Self-employed" & + Q8_18 == 1 # no employees + , 3, ba_hrpNSSEC7) + ] + # if empl + # manager? # comes from Q8_13 which is verbatim! + # manager currently not identified, uses supervisor as filter + # Yes: + # > 25 of them -> Managers large org Code 4 + dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 1 & # "employed" & + Q8_16 == 1 & # supervises + Q8_17 >= 2 # 25+ employees + , 4, ba_hrpNSSEC7) + ] + # < 25 of them -> Managers small org Code 5 + dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 1 & # "employed" & + Q8_16 == 1 & # supervisees + Q8_17 == 1 # < 25 employees + , 5, ba_hrpNSSEC7) + ] + # Not manager: + # supervise others? + # 6 or 7 (depending if supervisors?) 
+ dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 1 & # "employed" & + Q8_16 == 2 # no supervisees + , "6 or 7", ba_hrpNSSEC7) + ] + dt <- dt[, ba_hrpNSSEC3 := ifelse(ba_hrpNSSEC7 == 1 | ba_hrpNSSEC7 == 4, + 1, NA) + ] + dt <- dt[, ba_hrpNSSEC3 := ifelse(ba_hrpNSSEC7 == 2 | ba_hrpNSSEC7 == 5, + 2, ba_hrpNSSEC3) + ] + dt <- dt[, ba_hrpNSSEC3 := ifelse(ba_hrpNSSEC7 == 3 | ba_hrpNSSEC7 == "6 or 7", + 3, ba_hrpNSSEC3) + ] + # test + # table(hhAttributesDT$ba_hrpNSSEC3,hhAttributesDT$ba_hrpNSSEC7, useNA = "always") + + # recode NS-SEC to include those who can't be coded (students etc) + dt <- dt[, ba_hrpNSSEC3 := ifelse(is.na(ba_hrpNSSEC3) & # NS-SEC not coded + !is.na(surveyMode), # but did complete survey + "Not coded", + ba_hrpNSSEC3 + ) + ] + dt <- dt[, ba_hrpNSSEC7 := ifelse(is.na(ba_hrpNSSEC7) & # NS-SEC not coded + !is.na(surveyMode), # but did complete survey + "Not coded", + ba_hrpNSSEC7 + ) + ] + return(dt) +} + +#' recodes heat source variable +#' +#' \code{recodeHeatSource} recodes `Q3_12` to give `ba_heatSourceReduced` with sensible labels +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export +recodeHeatSource <- function(dt){ + # recode heat source from full survey (not on update survey - assume is still correct) + dt <- dt[, ba_heatSourceReduced := ifelse(Q3_12 == 4 | + Q3_12 == 5 | + Q3_12 == 6 | + Q3_12 == 7 | + Q3_12 == 8, + "Oil/wood/solid fuels/biomass/other", + NA + ) + ] + dt <- dt[, ba_heatSourceReduced := ifelse(Q3_12 == 1, + "Electricity (storage heaters)", + ba_heatSourceReduced + ) + ] + dt <- dt[, ba_heatSourceReduced := ifelse(Q3_12 == 2, + "Other electricity (e.g. heat pump)", + ba_heatSourceReduced + ) + ] + dt <- dt[, ba_heatSourceReduced := ifelse(Q3_12 == 3, + "Gas boiler", + ba_heatSourceReduced + ) + ] + + if(isTRUE(message)) message("Create derived 'HeatSource' to make chart labels and table names easier to read") + + # recode to make charts easier + namesFuels <- + c('Gas boiler' = "1 Gas", + "Electricity (storage heaters)" = "Electricity:\nstorage heaters", + "Oil/wood/solid fuels/biomass/other" = "Oil, wood\nsolid fuels\nbiomass, other", + "Other electricity (e.g. heat pump)" = "Other electricity\ne.g. ground source\nheat pumps") + + dt <- dt[, HeatSource := plyr::revalue(ba_heatSourceReduced, namesFuels)] # <- only asked at recruitment + return(dt) +} + + +#' recodes heat source variable for additional re-categorisation (multiple variants) +#' +#' \code{recodeHeatSource} recodes `Q3_12` to give: +#' `tr_heatSourceCustType`: 6 categories +#' `tr_heatSourceCensus`: 5 categories (electric combined) +#' `tr_heatSourceSSEN`: 3 categories (Gas, Electric and Other) +#' Recoding gives sensible labels for charts etc +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export +#' +recodeHeatSourceExt <- function(dt){ + # recode heat source from full survey (not on update survey - assume is still correct) + dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 5 | + Q3_12 == 6 | + Q3_12 == 7, + "Solid/biomass", + NA + ) + ] + dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 8, + "Other", + tr_heatSourceCustType + ) + ] + dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 1, + "Elec. storage", + tr_heatSourceCustType + ) + ] + dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 2, + "Elec. 
other", + tr_heatSourceCustType + ) + ] + dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 3, + "Gas", + tr_heatSourceCustType + ) + ] + dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 4, + "Oil", + tr_heatSourceCustType + ) + ] + # now re-order using factors + dt <- dt[, tr_heatSourceCustType := factor(dt$tr_heatSourceCustType, + levels = c("Gas", "Elec. storage", "Elec. other", "Oil", + "Solid/biomass", "Other"), # specify in order required + labels = c("Gas", "Elec. storage", "Elec. other", "Oil", + "Solid/biomass", "Other") # create labels for levels (must be same order) + )] + + # recode original heat source (Q3_12) into census aligned categories + + dt <- dt[, tr_heatSourceCensus := ifelse(Q3_12 == 5 | + Q3_12 == 6 | + Q3_12 == 7, + "Solid/biomass", + NA + ) + ] + dt <- dt[, tr_heatSourceCensus := ifelse(Q3_12 == 8, + "Other", + tr_heatSourceCensus + ) + ] + dt <- dt[, tr_heatSourceCensus := ifelse(Q3_12 == 1 | + Q3_12 == 2, + "Electric", + tr_heatSourceCensus + ) + ] + dt <- dt[, tr_heatSourceCensus := ifelse(Q3_12 == 3, + "Gas", + tr_heatSourceCensus + ) + ] + dt <- dt[, tr_heatSourceCensus := ifelse(Q3_12 == 4, + "Oil", + tr_heatSourceCensus + ) + ] + # now re-order using factors + dt <- dt[, tr_heatSourceCensus := factor(dt$tr_heatSourceCensus, + levels = c("Gas", "Electric", "Oil", + "Solid/biomass", "Other"), # specify in order required + labels = c("Gas", "Electric", "Oil", + "Solid/biomass", "Other") # create labels for levels (must be same order) + )] + + # recode original heat source (Q3_12) into 3 categories (request by SSEN) + + dt <- dt[, tr_heatSourceSSEN := ifelse(Q3_12 == 4 | + Q3_12 == 5 | + Q3_12 == 6 | + Q3_12 == 7 | + Q3_12 == 8, + "Other", + NA + ) + ] + dt <- dt[, tr_heatSourceSSEN := ifelse(Q3_12 == 1 | + Q3_12 == 2, + "Electric", + tr_heatSourceSSEN + ) + ] + dt <- dt[, tr_heatSourceSSEN := ifelse(Q3_12 == 3, + "Gas", + tr_heatSourceSSEN + ) + ] + # now re-order using factors + dt <- dt[, tr_heatSourceSSEN := factor(dt$tr_heatSourceSSEN, + levels = c("Gas", "Electric", "Other"), # specify in order required + labels = c("Gas", "Electric", "Other") # create labels for levels (must be same order) + )] + + return(dt) +} + + +#' recodes n people +#' +#' \code{recodeNpeople} recodes `Q2` to give `ba_Q2_npeople_reduced` & `ba_censusNpeople` (matches UK Census) +#' with sensible labels. 
+#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export +recodeNpeople <- function(dt){ + dt$ba_Q2_npeople <- dt$Q2 # n people + # reduced form + dt <- dt[, ba_Q2_npeople_reduced := ifelse(ba_Q2_npeople != "Refused", as.numeric(ba_Q2_npeople), ba_Q2_npeople) ] + dt <- dt[, ba_Q2_npeople_reduced := ifelse(ba_Q2_npeople_reduced != "Refused" & ba_Q2_npeople_reduced > 4, "5+", ba_Q2_npeople_reduced) ] + # dt <- dt[, ba_Q2_npeople_reduced := ifelse(ba_Q2_npeople > 4, "5+",ba_Q2_npeople) ] - old code retained for roll-back + # census form + dt <- dt[, ba_censusNpeople := ifelse(ba_Q2_npeople != "Refused", as.numeric(ba_Q2_npeople), ba_Q2_npeople) ] + dt <- dt[, ba_censusNpeople := ifelse(ba_censusNpeople != "Refused" & ba_censusNpeople > 7, "8+",ba_censusNpeople) ] + dt <- dt[, ba_censusNpeople := ifelse(ba_censusNpeople == 10 | ba_censusNpeople == 13, "8+",ba_censusNpeople) ] + # dt <- dt[, ba_censusNpeople := ifelse(ba_Q2_npeople > 7, "8+",ba_Q2_npeople) ] - old code retained for roll-back + # check + #table(dt$ba_censusNpeople, dt$ba_Q2_npeople, useNA = "always") + return(dt) +} + +#' recodes n cars variable +#' +#' \code{recodeNcars} recodes `Q8_9` to give `ba_censusNcars` with sensible labels. +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export +recodeNcars <- function(dt){ + dt$ba_Q8_9_ncars <- dt$Q8_9 # n cars - not asked at update + # census form + dt <- dt[, ba_censusNcars := ifelse(ba_Q8_9_ncars == 1, "0",NA) ] + dt <- dt[, ba_censusNcars := ifelse(ba_Q8_9_ncars == 2, "1",ba_censusNcars) ] + dt <- dt[, ba_censusNcars := ifelse(ba_Q8_9_ncars == 3, "2",ba_censusNcars) ] + dt <- dt[, ba_censusNcars := ifelse(ba_Q8_9_ncars == 4, "3",ba_censusNcars) ] + dt <- dt[, ba_censusNcars := ifelse(ba_Q8_9_ncars >= 5 , "4+",ba_censusNcars) ] + # check + #table(dt$ba_censusNcars, dt$ba_Q8_9_ncars, useNA = "always") + return(dt) +} + +#' recodes n children variable +#' +#' \code{recodePresenceChildren} recodes `Q2B_2` to give `ba_presenceChildren` with sensible labels. +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export +recodePresenceChildren <- function(dt){ + dt <- dt[, ba_presenceChildren := ifelse(dt$Q2B == 1 | dt$Q2B_2 == 1 | dt$Q2B_3 == 1 | + dt$Q2B_4 == 1 | dt$Q2B_5 == 1 | + dt$Q2B_6 == 1 | dt$Q2B_7 == 1 | + dt$Q2B_8 == 1, + "At least one child < 16", + "No children" + ) + ] + # this produces NAs where there are no children AND also where we have no survey + dt$ba_presenceChildren[is.na(dt$ba_presenceChildren)] <- "No children" + dt$ba_presenceChildren[is.na(dt$surveyMode)] <- NA # set non-survey to NA - redundant but works + return(dt) +} + +#' recodes tenure variable +#' +#' \code{recodeTenure} recodes `Q3_1` to give `ba_censusTenure` with sensible labels. Format matches UK Census. 
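+#'
+#' A usage sketch (illustrative only; assumes `dt` carries the recruitment survey
+#' tenure questions `Q3_1` and `Q3_2`):
+#' \preformatted{
+#' dt <- recodeTenure(dt)
+#' # categories produced: "Own", "Social rent", "Private rent", "x.Refused/dk/Other"
+#' table(dt$ba_censusTenure, useNA = "always")
+#' }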
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodeTenure <- function(dt){
+  # Not asked at update (perhaps should have been)
+  dt <- dt[, ba_censusTenure := ifelse(Q3_1 == 1 | Q3_1 == 2, # "Own outright" or "Own with mortgage"
+                                       "Own", NA)]
+  dt <- dt[Q3_2 == 1 | # "Housing association, housing co-operative, charitable trust, registered social landlord"
+             Q3_2 == 2, # "Council (local authority)"
+           ba_censusTenure := "Social rent"
+           ]
+
+  dt <- dt[Q3_2 == 3 | # "Private landlord or letting agency"
+             Q3_2 == 4 | # "Employer of a household member"
+             Q3_2 == 5 | # "Relative or friend of a household member"
+             Q3_2 == 6 , # "Other" (note: re-assigned to "x.Refused/dk/Other" by the rule below)
+           ba_censusTenure := "Private rent"
+           ]
+  dt <- dt[Q3_1 == 7 | # "other"
+             Q3_1 == 8 | # "refused"
+             Q3_2 == 7 | # "dk"
+             Q3_2 == 6 , # "Other"
+           ba_censusTenure := "x.Refused/dk/Other"
+           ]
+  # check
+  #table(dt$ba_censusTenure, dt$Q3_1, useNA = "always")
+  return(dt)
+}
+
+#' recodes income variable
+#'
+#' \code{recodeIncome} recodes `Q8_27` to give `ba_Q8_27_Income` with sensible labels.
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodeIncome <- function(dt){
+  # Question has 16 income codes plus "Don't know" and "Refused", collapse into 10
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 >= 1 & Q8_27 <= 4,
+                                       1 , NA)] # < 10k
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 4 & Q8_27 <= 5 ,
+                                       2 , ba_Q8_27_Income)] # 10-20
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 5 & Q8_27 <= 9 ,
+                                       3 , ba_Q8_27_Income)] # 20-30
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 9 & Q8_27 <= 11 ,
+                                       4 , ba_Q8_27_Income)] # 30-40
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 11 & Q8_27 <= 12 ,
+                                       5 , ba_Q8_27_Income)] # 40-50
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 12 & Q8_27 <= 13 ,
+                                       6, ba_Q8_27_Income)] # 50-60
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 13 & Q8_27 <= 14 ,
+                                       7, ba_Q8_27_Income)] # 60-80
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 14 & Q8_27 <= 16 ,
+                                       8, ba_Q8_27_Income)] # 80+
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 == 17 ,
+                                       9, ba_Q8_27_Income)] # DK
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 == 18 ,
+                                       10, ba_Q8_27_Income)] # Ref
+  dt <- dt[, ba_Q8_27_Income := factor(ba_Q8_27_Income,
+                                       labels = c(
+                                         "< £10k p.a",
+                                         "£10-£20k",
+                                         "£20-£30k",
+                                         "£30-£40k",
+                                         "£40-£50k",
+                                         "£50-60k",
+                                         "£60-80k",
+                                         ">£80k",
+                                         "Don't know",
+                                         "Refused"
+                                       )
+                                       )
+             ]
+  # check
+  #table(dt$Q8_27, dt$ba_Q8_27_Income, useNA = "always")
+  return(dt)
+}
+
+
+#' recodes household survey questions related to high-power electric appliances
+#'
+#' \code{recodeAppliances} takes `Q6_1`, `Q3_18_1` and the `Q3_19_*` appliance ownership questions
+#' to return yes/no responses for types of appliance (e.g. EV or electric immersion heater).
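+#'
+#' A usage sketch (illustrative only; assumes the update-survey variable `Q6_1.latest`
+#' and the `Q3_18_1`/`Q3_19_*` columns are present in `dt`):
+#' \preformatted{
+#' dt <- recodeAppliances(dt)
+#' # each derived variable (electricVehicle, waterHeat, plugInHeat, airCon, powerShower,
+#' # tumbleDrier, washTumble, tumbleDryAny, dishWasher, electricHob, electricOven)
+#' # is a factor with "No" as the first (reference) level
+#' summary(dt$tumbleDryAny)
+#' }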
+#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export + +recodeAppliances <- function(dt){ + +# Electric vehicles (EV) + + dt <- dt[, electricVehicle := Q6_1.latest] + dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 1, "Yes", electricVehicle)] + dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 2, "No", electricVehicle)] + dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 3, "Don't know", electricVehicle)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, electricVehicle := factor(dt$electricVehicle, + levels = c("No","Yes","Don't know"))] + +# Water heating - immersion heater (elec) + + dt <- dt[, waterHeat := Q3_18_1] + dt <- dt[, waterHeat := ifelse(Q3_18_1 >= 1, "Yes", waterHeat)] + dt <- dt[, waterHeat := ifelse(Q3_18_1 == 0, "No", waterHeat)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, waterHeat := factor(dt$waterHeat, + levels = c("No","Yes"))] + +# Plug-in heater + + dt <- dt[, plugInHeat := Q3_19_27] + dt <- dt[, plugInHeat := ifelse(Q3_19_27 >= 1, "Yes", plugInHeat)] + dt <- dt[, plugInHeat := ifelse(Q3_19_27 == 0, "No", plugInHeat)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, plugInHeat := factor(dt$plugInHeat, + levels = c("No","Yes"))] + +# Air con + + dt <- dt[, airCon := Q3_19_23] + dt <- dt[, airCon := ifelse(Q3_19_23 >= 1, "Yes", airCon)] + dt <- dt[, airCon := ifelse(Q3_19_23 == 0, "No", airCon)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, airCon := factor(dt$airCon, + levels = c("No","Yes"))] + +# Power shower + + dt <- dt[, powerShower := Q3_19_30] + dt <- dt[, powerShower := ifelse(Q3_19_30 >= 1, "Yes", powerShower)] + dt <- dt[, powerShower := ifelse(Q3_19_30 == 0, "No", powerShower)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, powerShower := factor(dt$powerShower, + levels = c("No","Yes"))] + +# Tumble drier + + dt <- dt[, tumbleDrier := Q3_19_3] + dt <- dt[, tumbleDrier := ifelse(Q3_19_3 >= 1, "Yes", tumbleDrier)] + dt <- dt[, tumbleDrier := ifelse(Q3_19_3 == 0, "No", tumbleDrier)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, tumbleDrier := factor(dt$tumbleDrier, + levels = c("No","Yes"))] + +# Combined washing maching and tumble + + dt <- dt[, washTumble := Q3_19_2] + dt <- dt[, washTumble := ifelse(Q3_19_2 >= 1, "Yes", washTumble)] + dt <- dt[, washTumble := ifelse(Q3_19_2 == 0, "No", washTumble)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, washTumble := factor(dt$washTumble, + levels = c("No","Yes"))] + +# Tumble drier or combined washer drier + + dt <- dt[, tumbleDryAny := ifelse(tumbleDrier == "Yes" | washTumble == "Yes", "Yes", "No")] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, tumbleDryAny := factor(dt$tumbleDryAny, + levels = c("No","Yes"))] + +# Dishwasher + + dt <- dt[, dishWasher := Q3_19_5] + dt <- dt[, dishWasher := ifelse(Q3_19_5 >= 1, "Yes", dishWasher)] + dt <- dt[, dishWasher := ifelse(Q3_19_5 == 0, "No", dishWasher)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, dishWasher := factor(dt$dishWasher, + levels = c("No","Yes"))] + +# Electric Hob + + dt <- dt[, electricHob := Q3_19_6] + dt <- dt[, electricHob := ifelse(Q3_19_6 >= 1, "Yes", electricHob)] + dt <- dt[, electricHob := ifelse(Q3_19_6 == 0, "No", electricHob)] + # finally, re-order ("No" will now be contrast in 
models) + dt <- dt[, electricHob := factor(dt$electricHob, + levels = c("No","Yes"))] + +# Electric Oven + + dt <- dt[, electricOven := Q3_19_7] + dt <- dt[, electricOven := ifelse(Q3_19_7 >= 1, "Yes", electricOven)] + dt <- dt[, electricOven := ifelse(Q3_19_7 == 0, "No", electricOven)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, electricOven := factor(dt$electricOven, + levels = c("No","Yes"))] + + return(dt) + +} + +#' recodes household survey questions related to low-carbon technologies (LCTs) +#' +#' \code{recodeLCTs} takes `Q6_1`, `Q3_3_2`, `Q3_3_5` and `Q3_12` +#' to return yes/no responses for types of installed LCT (e.g. EV, PV, heat-pump or CHP). +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export + +recodeLCTs <- function(dt){ + + # Electric vehicles (EV) + + dt <- dt[, electricVehicle := Q6_1.latest] + dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 1, "Yes", electricVehicle)] + dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 2, "No", electricVehicle)] + dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 3, "Don't know", electricVehicle)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, electricVehicle := factor(dt$electricVehicle, + levels = c("No","Yes","Don't know"))] + + # Electricity provision + + # solar voltaic panels (PV) + dt <- dt[, solarPV := Q3_3_2] + dt <- dt[, solarPV := ifelse(Q3_3_2 == 1, "Yes", solarPV)] + dt <- dt[, solarPV := ifelse(Q3_3_2 == 0, "No", solarPV)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, solarPV := factor(dt$solarPV, + levels = c("No","Yes"))] + + # conbined heat and power (CHP) + dt <- dt[, chp := Q3_3_5] + dt <- dt[, chp := ifelse(Q3_3_5 == 1, "Yes", chp)] + dt <- dt[, chp := ifelse(Q3_3_5 == 0, "No", chp)] + # finally, re-order ("No" will now be contrast in models) + dt <- dt[, chp := factor(dt$chp, + levels = c("No","Yes"))] + + # heat pump (inferred from 'other electric' heat source Q3_12) + dt <- dt[, heatPump := "No"] + dt <- dt[, heatPump := ifelse(Q3_12 == 2, "Yes", heatPump)] + dt <- dt[, heatPump := ifelse(Q3_12 == 9, "Don't know", heatPump)] + dt <- dt[, heatPump := factor(dt$heatPump, + levels = c("No","Yes","Don't know"))] + + return(dt) + +} + + +#' recodes household survey responses related to qualification (education) of HRP +#' +#' \code{recodeQuals} takes `Q8_20` and provides recoded responses as character +#' rather than numeric categories. 
Two variants are returned: +#' `tr_qualRecoded` - a full recoding (13 categories) +#' `Qualification` - a reduced coding (5 categories) +#' +#' Returns a processed data.table +#' +#' @param dt data.table to process +#' +#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk} +#' +#' @import data.table +#' +#' @export + +recodeQuals <- function(dt) { + # recode Q8_20 highest qualifications (respondent) + # first map values + valuesQualifications <- c( + `1` = "Higher Degree", + `2` = "Degree", + `3` = "A/AS level, Scottish Higher, ONC/OND", + `4` = "HNC/HND", + `5` = "HNC/HND", + `6` = "A/AS level, Scottish Higher, ONC/OND", + `7` = "A/AS level, Scottish Higher, ONC/OND", + `8` = "A/AS level, Scottish Higher, ONC/OND", + `9` = "GCSE/O level", + `10` = "GCSE/O level", + `11` = "Other", + `12` = "No qualifications", + `13` = "Don't know") + + # now revalue Q8_20 to create new variable + dt <- dt[, tr_qualRecoded := plyr::revalue(as.factor(Q8_20), valuesQualifications)] + + # finally, re-order ("No qualifications" will now be contrast in models) + dt <- dt[, tr_qualRecoded := factor(dt$tr_qualRecoded, + levels = c("No qualifications", "GCSE/O level", + "A/AS level, Scottish Higher, ONC/OND", + "HNC/HND", "Degree", "Higher Degree", + "Other", "Don't know"))] + + # recode Q8_20 highest qualifications (respondent) + # first map values + valuesQualsReduced <- c( + `1` = "HNC/D, degree and higher", + `2` = "HNC/D, degree and higher", + `3` = "A/AS level, Scottish Higher, ONC/OND", + `4` = "HNC/D, degree and higher", + `5` = "HNC/D, degree and higher", + `6` = "A/AS level, Scottish Higher, ONC/OND", + `7` = "A/AS level, Scottish Higher, ONC/OND", + `8` = "A/AS level, Scottish Higher, ONC/OND", + `9` = "GCSE/O level or below", + `10` = "GCSE/O level or below", + `11` = "Other", + `12` = "GCSE/O level or below", + `13` = "Don't know") + + # now revalue Q8_20 to create new variable + dt <- dt[, Qualification := plyr::revalue(as.factor(Q8_20), valuesQualsReduced)] + + # finally, re-order ("No qualifications" will now be contrast in models) + dt <- dt[, Qualification := factor(dt$Qualification, + levels = c("GCSE/O level or below", + "A/AS level, Scottish Higher, ONC/OND", + "HNC/D, degree and higher", + "Other", "Don't know"))] + + return(dt) + +} + diff --git a/R/utilities.R b/R/utilities.R new file mode 100644 index 0000000000000000000000000000000000000000..061d6a24079de4534359956e785d4d73c3c0e157 --- /dev/null +++ b/R/utilities.R @@ -0,0 +1,295 @@ +#' Tidy long numbers +#' +#' \code{tidyNum} reformats long numbers to include commas and prevents scientific formats, +#' making them suitable for printing within R Markdown reports and inline text. 
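+#'
+#' Note that the return value is a character vector (produced by \code{format()}), so it
+#' is intended for printing rather than further calculation; for example `tidyNum(1234567)`
+#' returns the string `"1,234,567"`.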
+#' +#' @param number an input number or list +#' +#' @examples +#' tidyNum(123456789) +#' tidyNum(10^6) +#' tidyNum(c(10^6, 10^7, 10^8)) +#' +#' @author Ben Anderson, \email{banderson@@soton.ac.uk} +#' @export +#' @family Utilities + +tidyNum <- function(number) { + format(number, big.mark=",", scientific=FALSE) +} + +#' Return middle 98\% of a distribution +#' +#' \code{getP98} filters top & bottom 1\% and returns the middle 98\% +#' +#' @param dt the data table +#' @param var the variable to filter +#' @return the middle 98pc as a data.table +#' +#' @author Ben Anderson, \email{banderson@@soton.ac.uk} +#' @export +#' @family Utilities + +getP98 <-function(dt,var){ + # expects a data.table + # returns a data.table with the middle 98% of var + + # Calculate mean & sd for use in calculating bounds + attach(dt) + tempMean <- mean(get(var)) + tempSD <- sd(get(var)) + + # p98 extract + p01 <- quantile(get(var), probs = 0.01) # central 98% + p99 <- quantile(get(var), probs = 0.99) + dt <- subset(dt, p01 < get(var) & get(var) < p99) + return(dt) +} + +#' Return middle 95\% of a distribution using z scores +#' +#' \code{getZ95} filters top & bottom 2.5\% using z scores and returns the middle 95\% +#' +#' @param dt the data table +#' @param var the variable to filter +#' +#' +#' @author Ben Anderson, \email{banderson@@soton.ac.uk} +#' @export +#' @family Utilities + +getZ95 <-function(dt,var){ + # expects a data.table + # returns a data.table with only +/- 95% z for var + + # Calculate mean & sd for use in calculating bounds + attach(dt) + tempMean <- mean(get(var)) + tempSD <- sd(get(var)) + + # z95 extract + z95Lower <- tempMean - (qnorm(0.975) * tempSD) # +/- 2.5% + z95Upper <- tempMean + (qnorm(0.975) * tempSD) + + dt <- subset(dt, z95Lower < get(var) & get(var) < z95Upper) + + return(dt) +} + +#' Prints information about the R session +#' +#' \code{DisplaySystemInfo} provides a summary printout of the +#' R session, including the operating system, user info and project location. +#' +#' Before running this, the user should have defined the \code{projLoc} variable +#' using the \code{SAVEr::findParentDirectory} +#' +#' @param printMessage TRUE/FALSE. States whether the parameter should be printed. +#' +#' @author Mikey Harper, \email{m.harper@@soton.ac.uk} +#' @seealso \code{\link{findParentDirectory}} +#' @seealso \code{\link{assignUserSystemInfo}} +#' @export +#' @family Utilities + +displaySystemInfo <- function(printMessage = TRUE){ + + if(is.na(sysName)) assignUserSystemInfo() + if(is.na(projLoc)) stop("Please specify `projLoc` in the Global Environment") + + if(isTRUE(printMessage)){ + cat(paste0("Running on ", sysName, " with projLoc = ", projLoc, "\n \n", + " User: ",Sys.info()[[7]]), "\n", + "Platform: ",Sys.info()[[4]], "\n", + "Input data path: ",dPath, "\n") + } + +} + +#' Installs and loads packages +#' +#' \code{myRequiredPackages} checks whether the package is already installed, +#' installing those which are not preinstalled. All the libraries are then load. +#' +#' Especially useful when running on virtual machines where package installation +#' is not persistent (Like UoS sve). It will fail if the packages need to be +#' installed but there is no internet access +#' @param ... A list of packages +#' @param repository The repository to load functions from. 
Defaults to "https://cran.ma.imperial.ac.uk/" +#' @importFrom utils install.packages +#' +#' @author Luke Blunden, \email{lsb@@soton.ac.uk} (original) +#' @author Michael Harper \email{m.harper@@soton.ac.uk} (revised version) +#' @export +#' @family Utilities + +myRequiredPackages <- function(..., repository = "https://cran.rstudio.com/"){ + + packages <- c(...) + + # Find if package isn't installed + newPackages <- packages[!(packages %in% utils::installed.packages()[,1])] + + # Install if required + if (length(newPackages)){utils::install.packages(newPackages, dependencies = TRUE)} + + # Load packages + sapply(packages, require, character.only = TRUE) +} + + +#' Installs and loads packages from Github +#' +#' \code{myRequiredPackagesGithub} checks whether the package is already installed. +#' If the package is not detected, it will attempt to install it from GitHub, +#' before loading the packages. +#' +#' +#' @param ... A list of packages +#' @importFrom utils install.packages +#' +#' @author Michael Harper \email{m.harper@@soton.ac.uk} +#' @export +#' @family Utilities + +myRequiredPackagesGithub <- function(..., quiet = FALSE){ + + packages <- c(...) + + # Only keep package name + packageName <- gsub("^.*/", "", packages) + + # Find if package isn't installed + newPackages <- packages[!(packageName %in% utils::installed.packages()[,1])] + + # Install if required + if (length(newPackages)){devtools::install_github(newPackages)} + + # Load packages + sapply(packageName, require, character.only = TRUE) +} + +#' Compresses any file to .gz file +#' +#' The function will check and delete any existing compressed file +#' +#' @param file path to file +#' @param output file +#' @author Ben Anderson +#' +#' @family Utilities +gzipIt <- function(file){ + + # Check if input exists + if(!file.exists(file)) stop("Input `file` not located.") + + # Path of output file + zipped <- paste0(file, ".gz") + + # Rremove old gzip file if exists + if(file.exists(zipped)){ + message("Deleting old File...") + file.remove(zipped) + message("DONE\n") + } + + # Gzip new one + # in case it fails (it will on windows - you will be left with the file) + message(paste0("gzipping file to: ", zipped)) + try(system( paste0("gzip ", file))) + message("Gzipped file created.") +} + +#' Extract a compressed .7z file +#' +#' The function will unpack a compressed file +#' +#' @param inFile name of archive file +#' @param fileDir directory containing file +#' +#' @author Tom Rushby, \email{(t.w.rushby@@soton.ac.uk)} +#' @export +#' +#' @family Utilities +#' @examples +#' extract7z(fileDir = "/SotonGitLab/SERG/SAVE", inFile = "LED_installs.7z") +#' extract7z(fileDir = outFileDir, inFile = inFile) +extract7z <- function(fileDir, inFile){ + + filePath <- paste0(fileDir,inFile) + # Check if input file exists + if(!file.exists(filePath)) stop("Input `file` does not exist.") + + archive::archive(filePath) + + # create output file directory + print(paste0("Extracting file to location: ",fileDir,"\n")) + + archive::archive_extract(filePath, fileDir, file = NULL) + +} + + +#' Find user information +#' +#' \code{UserSystemInfo} lists the user system information. No parameters have to +#' be provided +#' +#' @return The system name, username and computer. 
+#' @author Mikey Harper, \email{m.harper@@soton.ac.uk}
+#' @export
+#' @family Utilities
+assignUserSystemInfo <- function(){
+
+  assign("sysInfo", Sys.info(), envir = globalenv())
+  assign("sysName", sysInfo[[1]], envir = globalenv())
+  assign("nodeName", sysInfo[[4]], envir = globalenv())
+  assign("userName", sysInfo[[7]], envir = globalenv())
+
+  rm(sysInfo, envir = globalenv())
+}
+
+#' Converts timestamps between local times
+#'
+#' Overrides the timezone of a timestamp, converting the clock time to a different
+#' local time.
+#'
+#' @details
+#' All data collected within the SAVE project is collected in UTC time. For the analysis
+#' of behaviour patterns, it is important that the local time is used.
+#'
+#' @param timestamp a POSIXct date-time object (other classes are rejected with an error)
+#' @param from,to valid timezone names (see \code{OlsonNames()})
+#' @param message print a message if the timezone attribute has to be overwritten? Default TRUE
+#' @param warning print a warning if the input timezone does not match \code{from}? Default TRUE
+#' @return the timestamp converted to the \code{to} timezone
+#' @export
+#' @author Michael Harper
+#'
+#' @family Utilities
+convertTimezone <- function(timestamp, from = "UTC", to = "Europe/London", message = TRUE, warning = TRUE){
+
+  # check valid inputs
+  if(!lubridate::is.POSIXct(timestamp)) stop("timestamp must be a POSIXct object")
+  if(!(from %in% OlsonNames())) stop("specified 'from' is not a valid timezone")
+  if(!(to %in% OlsonNames())) stop("specified 'to' is not a valid timezone")
+
+  # Check the timezone attached to the input timestamp
+  inputTimezone <- attr(timestamp, "tzone")
+  if(is.null(inputTimezone)) inputTimezone <- "" # no tzone attribute set
+
+  if(from != inputTimezone){
+
+    # Prints warning if specified
+    if(warning) warning("Input timezone does not match the specified object.",
+                        "Timezone of object: ", inputTimezone, "\n",
+                        "Timezone specified: ", from, "\n")
+
+    if(message) message("Timezone overwritten by function without conversion of the clock time")
+
+    timestamp <- lubridate::force_tz(time = timestamp, tzone = from)
+  }
+
+  # Convert the clock time to the target timezone (the underlying instant is unchanged)
+  timestamp2 <- lubridate::with_tz(time = timestamp, tzone = to)
+
+  return(timestamp2)
+
+}
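+
+# A usage sketch for convertTimezone() (illustrative only):
+# ts <- as.POSIXct("2017-06-01 12:00:00", tz = "UTC")
+# convertTimezone(ts, from = "UTC", to = "Europe/London")
+# # the same instant, now printed as "2017-06-01 13:00:00 BST"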