diff --git a/.Rbuildignore b/.Rbuildignore
new file mode 100644
index 0000000000000000000000000000000000000000..b1b8c3dcb44af9a237b2aadd2204fea0123d294d
--- /dev/null
+++ b/.Rbuildignore
@@ -0,0 +1,2 @@
+^saveData\.Rproj$
+^\.Rproj\.user$
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000000000000000000000000000000000000..4e1f26defe2cd34c5eabaff2fe3da6f33c0ccc54
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,11 @@
+Package: saveData
+Title: Package to support the re-use of the SAVE household demand response trial data
+Version: 0.1
+Authors@R: c(person("Ben", "Anderson", email = "b.anderson@soton.ac.uk", role = c("aut", "cre")), 
+             person("Michael", "Harper", email = "m.harper@soton.ac.uk", role = "aut"),
+             person("Tom", "Rushby", email = "twrushby@soton.ac.uk", role = "aut"))
+Description: A range of scripts for data cleaning and processing. 
+    This work was funded by the Low Carbon Network Fund (LCNF) Tier 2 Programme  project "Solent Achieving Value from Efficiency (SAVE)" http://www.energy.soton.ac.uk/tag/save/.
+License: file LICENCE
+Encoding: UTF-8
+LazyData: true
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000000000000000000000000000000000000..884a6312ae55caf2d5bac006d43fab89d8545080
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,2 @@
+# Generated by roxygen2: fake comment so roxygen2 overwrites silently.
+exportPattern("^[^\\.]")
diff --git a/R/surveyData.R b/R/surveyData.R
new file mode 100644
index 0000000000000000000000000000000000000000..57542797d248cccdbe198f6a48e46f1be683b2cf
--- /dev/null
+++ b/R/surveyData.R
@@ -0,0 +1,1183 @@
+### ---- Functions used for survey data processing ---- ####
+
+# Helper/utility functions are found in bmgDataUtils.R
+
+#' Process the household response sheet from the BMG data file
+#'
+#' \code{getBmgAll} reads in and processes the household response sheet, the recruitment survey, the TP1 update survey
+#'     and combines them returning a data.table.
+#'
+#'     This only gets called if it is needed (or you can run it manually). \code{input} over-rides the .xlsx file to load.
+#'
+#'     Recoding variables: only recode recruitment survey variables in this function that have
+#'     have been updated via the TP1 or TP2 (etc) update surveys.
+#'
+#' @param input .xlsx file to load
+#' @param message should messages be printed. Default is TRUE
+#' @param updateRounds specifies how many update surveys to apply e.g. "2" uses TP1 and TP2 update surveys. Default 3 (all).
+#' @param saveFile should messages be printed. Default is TRUE
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk} (updated)
+#'
+#' @import plyr
+#' @import data.table
+#' @import readxl
+#'
+#' @family bmgData functions
+#'
+#' @export
+#'
+getBmgAll <- function(input = bmgCompleteFile, updateRounds = 3, message = TRUE, saveFile = FALSE){
+  if(isTRUE(message)) outputMessage("...BMG files have been updated or forceUpdate = TRUE. Recompiling.")
+  if(isTRUE(message)) outputMessage(paste0("Load BMG data from: ", input))
+
+
+  bmgResponseDT <- getBmgResponseFile(input)
+  data.table::setkey(bmgResponseDT, bmg_id)
+
+  bmgRecruitSurveyDT <- getBmgRecruitSurvey(input)
+  data.table::setkey(bmgRecruitSurveyDT, bmg_id)
+
+  if(isTRUE(message)) outputMessage("Loading update survey: round 1")
+  bmgTP1SurveyDT <- getBmgUpdateSurvey(input, updateRound = "TP1")
+  data.table::setkey(bmgTP1SurveyDT, bmg_id)
+
+  # Create a DT called dt which contains all addresses ever contacted and all response and survey data. This function will also pdate full recruitment survey where possible
+  # This code sets the relevant values to be the 'most recent' (xx.latest). i.e. if we have a value for the TP1 (update) survey then we use that, if not we use the original survey value
+  # NB the update survey did not ask all possible questions so some values can't be updated (yet)
+  # the function also does some error checking
+
+
+  if(isTRUE(message)) outputMessage("Merging updated variables: round 1")
+  # Merge response file & recruitment survey file ----
+  # use suffixes to show source of the variables that have the same names
+  dt <- merge(bmgResponseDT, bmgRecruitSurveyDT, all = TRUE, suffixes = c(".respf", ".fullSurvey"))
+  # This appears to add about 40 who are not in the response file?
+  data.table::setkey(dt, bmg_id)
+
+  # Add TP1 update survey file ----
+  # use suffixes to show source of the variables that have the same names
+  # careful with suffixes as merging will fail for subsequent updates ...
+  if (updateRounds == 1){
+    dt <- merge(dt,bmgTP1SurveyDT, all = TRUE, suffixes = c(".fullSurvey", ".tp1Survey"))
+    data.table::setkey(dt, bmg_id)
+  }
+
+  if (updateRounds >= 2){
+    if(isTRUE(message)) outputMessage("Loading update survey: round 2")
+    # First merge TP1 update survey and preserve suffixes (first arg "" in suffixes)
+    dt <- merge(dt,bmgTP1SurveyDT, all = TRUE, suffixes = c("", ".tp1Survey"))
+    data.table::setkey(dt, bmg_id)
+    # Add TP2 update survey file ----
+    bmgTP2SurveyDT <- getBmgUpdateSurvey(input, updateRound = "TP2")
+    data.table::setkey(bmgTP2SurveyDT, bmg_id)
+
+    if(isTRUE(message)) outputMessage("Merging updated variables: round 2")
+
+    if (updateRounds == 2){
+      # Merge TP2 update survey and suffixes for full survey (first arg in suffixes)
+      dt <- merge(dt,bmgTP2SurveyDT, all = TRUE, suffixes = c(".fullSurvey", ".tp2Survey"))
+      data.table::setkey(dt, bmg_id)
+    }
+
+    if (updateRounds == 3){
+      # Merge TP2 update survey and preserve suffixes (first arg "" in suffixes)
+      dt <- merge(dt,bmgTP2SurveyDT, all = TRUE, suffixes = c("", ".tp2Survey"))
+      data.table::setkey(dt, bmg_id)
+    }
+  }
+
+  if (updateRounds == 3){
+    if(isTRUE(message)) outputMessage("Loading update survey: round 3")
+    # Add TP3 update survey file ----
+    bmgTP3SurveyDT <- getBmgUpdateSurvey(input, updateRound = "TP3")
+    data.table::setkey(bmgTP3SurveyDT, bmg_id)
+    # Merge TP3 update survey and suffixes for full survey (first arg in suffixes)
+    if(isTRUE(message)) outputMessage("Merging updated variables: round 3")
+    dt <- merge(dt,bmgTP3SurveyDT, all = TRUE, suffixes = c(".fullSurvey", ".tp3Survey"))
+    data.table::setkey(dt, bmg_id)
+  }
+
+  # Check for duplicates & remove any ----
+  nHHs <- data.table::uniqueN(dt$bmg_id)
+  nObs <- nrow(dt)
+
+  outputMessage("Before duplicate removal:")
+  outputMessage("Number of bmg_ids: ",nHHs)
+  outputMessage("Number of observations: ",nObs)
+
+  if(nHHs < nObs){
+    # This means there must be duplicates. We have checked these for errors and have discovered that they are exact duplicates so we remove them.
+    dt <- dt[!duplicated(dt$bmg_id)]
+  }
+
+  nHHs <- uniqueN(dt$bmg_id)
+  nObs <- nrow(dt)
+
+  outputMessage("After duplicate removal:")
+  outputMessage("Number of bmg_ids: ",nHHs)
+  outputMessage("Number of observations: ",nObs)
+
+  # Update recruitment survey data ----
+  # This code sets the relevant values to be the 'most recent' (xx.latest). i.e. if we have a value for the TP1 (update) survey then we use that, if not we use the original survey value
+  # NB the update survey did not ask all possible questions so some values can't be updated (yet)
+
+  outputMessage(paste0("Updating survey variables with ",updateRounds, " update rounds"))
+
+  if (updateRounds == 3){
+    # > update children ----
+    dt <- dt[, ba_presenceChildren.latest := ifelse(!is.na(ba_presenceChildren.tp3Survey),               # if non-missing TP3 update
+                                                    ba_presenceChildren.tp3Survey,                       # use TP3 update, else ...
+                                                    ifelse(!is.na(ba_presenceChildren.tp2Survey),        # if non-missing TP2 update
+                                                           ba_presenceChildren.tp2Survey,                # use TP2 update, else ...
+                                                           ifelse(!is.na(ba_presenceChildren.tp1Survey), # if non-missing TP1 update
+                                                                  ba_presenceChildren.tp1Survey,         # use TP1 update, else ...
+                                                                  ba_presenceChildren.fullSurvey)))      # use full recruitment
+             ]
+
+    # recode N children to make regression tables etc easier to understand
+    dt <- dt[, ba_presenceChildren.latest := ifelse(ba_presenceChildren.latest == "No children", "0 children", "1+ child")]
+
+    # > update HRP work status ----
+    dt <- dt[, ba_Q2D_HRPemplType.latest := ifelse(!is.na(ba_Q2D_HRPemplType.tp3Survey),               # if non-missing TP3 update
+                                                   ba_Q2D_HRPemplType.tp3Survey,                       # use TP3 update, else ...
+                                                   ifelse(!is.na(ba_Q2D_HRPemplType.tp2Survey),        # if non-missing TP2 update
+                                                          ba_Q2D_HRPemplType.tp2Survey,                # use TP2 update, else ...
+                                                          ifelse(!is.na(ba_Q2D_HRPemplType.tp1Survey), # if non-missing TP1 update
+                                                                 ba_Q2D_HRPemplType.tp1Survey,         # use TP1 update, else ...
+                                                                 ba_Q2D_HRPemplType.fullSurvey)))      # use full recruitment
+             ] # 'other' should be coded so NA really is NA
+
+    # > update n people age ----
+    # people come & go
+
+    dt <- dt[, ba_Q2_npeople.latest := ifelse(!is.na(ba_Q2_npeople.tp3Survey),               # if non-missing TP3 update
+                                              ba_Q2_npeople.tp3Survey,                       # use TP3 update, else ...
+                                              ifelse(!is.na(ba_Q2_npeople.tp2Survey),        # if non-missing TP2 update
+                                                     ba_Q2_npeople.tp2Survey,                # use TP2 update, else ...
+                                                     ifelse(!is.na(ba_Q2_npeople.tp1Survey), # if non-missing TP1 update
+                                                            ba_Q2_npeople.tp1Survey,         # use TP1 update, else ...
+                                                            ba_Q2_npeople.fullSurvey)))      # use full recruitment
+             ]
+
+    dt <- dt[, ba_Q2_npeople_reduced.latest := ifelse(!is.na(ba_Q2_npeople_reduced.tp3Survey),               # if non-missing TP3 update
+                                                      ba_Q2_npeople_reduced.tp3Survey,                       # use TP3 update, else ...
+                                                      ifelse(!is.na(ba_Q2_npeople_reduced.tp2Survey),        # if non-missing TP2 update
+                                                             ba_Q2_npeople_reduced.tp2Survey,                # use TP2 update, else ...
+                                                             ifelse(!is.na(ba_Q2_npeople_reduced.tp1Survey), # if non-missing TP1 update
+                                                                    ba_Q2_npeople_reduced.tp1Survey,         # use TP1 update, else ...
+                                                                    ba_Q2_npeople_reduced.fullSurvey)))      # use full recruitment
+             ]
+
+    #t <- table(dt$ba_censusNpeople.fullSurvey, dt$ba_censusNpeople.tp1Survey, useNA = "always")
+    dt <- dt[, ba_censusNpeople.latest := ifelse(!is.na(ba_censusNpeople.tp3Survey),               # if non-missing TP3 update
+                                                 ba_censusNpeople.tp3Survey,                       # use TP3 update, else ...
+                                                 ifelse(!is.na(ba_censusNpeople.tp2Survey),        # if non-missing TP2 update
+                                                        ba_censusNpeople.tp2Survey,                # use TP2 update, else ...
+                                                        ifelse(!is.na(ba_censusNpeople.tp1Survey), # if non-missing TP1 update
+                                                               ba_censusNpeople.tp1Survey,         # use TP1 update, else ...
+                                                               ba_censusNpeople.fullSurvey)))      # use full recruitment
+             ]
+
+    # > update electric vehicles ----
+    dt <- dt[, Q6_1.latest := ifelse(!is.na(Q6_1.tp3Survey),                              # if non-missing TP3 update
+                                                    Q6_1.tp3Survey,                       # use TP3 update, else ...
+                                                    ifelse(!is.na(Q6_1.tp2Survey),        # if non-missing TP2 update
+                                                           Q6_1.tp2Survey,                # use TP2 update, else ...
+                                                           ifelse(!is.na(Q6_1.tp1Survey), # if non-missing TP1 update
+                                                                  Q6_1.tp1Survey,         # use TP1 update, else ...
+                                                                  Q6_1.fullSurvey)))      # use full recruitment
+             ]
+
+
+  }
+
+  if (updateRounds == 2){
+    # > update children ----
+    dt <- dt[, ba_presenceChildren.latest := ifelse(!is.na(ba_presenceChildren.tp2Survey),               # if non-missing TP2 update
+                                                           ba_presenceChildren.tp2Survey,                # use TP2 update, else ...
+                                                           ifelse(!is.na(ba_presenceChildren.tp1Survey), # if non-missing TP1 update
+                                                                  ba_presenceChildren.tp1Survey,         # use TP1 update, else ...
+                                                                  ba_presenceChildren.fullSurvey))       # use full recruitment
+             ]
+
+    # recode N children to make regression tables etc easier to understand
+    dt <- dt[, ba_presenceChildren.latest := ifelse(ba_presenceChildren.latest == "No children", "0 children", "1+ child")]
+
+    # > update HRP work status ----
+    dt <- dt[, ba_Q2D_HRPemplType.latest := ifelse(!is.na(ba_Q2D_HRPemplType.tp2Survey),               # if non-missing TP2 update
+                                                          ba_Q2D_HRPemplType.tp2Survey,                # use TP2 update, else ...
+                                                          ifelse(!is.na(ba_Q2D_HRPemplType.tp1Survey), # if non-missing TP1 update
+                                                                 ba_Q2D_HRPemplType.tp1Survey,         # use TP1 update, else ...
+                                                                 ba_Q2D_HRPemplType.fullSurvey))       # use full recruitment
+             ] # 'other' should be coded so NA really is NA
+
+    # > update n people age ----
+    # people come & go
+
+    dt <- dt[, ba_Q2_npeople.latest := ifelse(!is.na(ba_Q2_npeople.tp2Survey),               # if non-missing TP2 update
+                                                     ba_Q2_npeople.tp2Survey,                # use TP2 update, else ...
+                                                     ifelse(!is.na(ba_Q2_npeople.tp1Survey), # if non-missing TP1 update
+                                                            ba_Q2_npeople.tp1Survey,         # use TP1 update, else ...
+                                                            ba_Q2_npeople.fullSurvey))       # use full recruitment
+             ]
+
+    dt <- dt[, ba_Q2_npeople_reduced.latest := ifelse(!is.na(ba_Q2_npeople_reduced.tp2Survey),               # if non-missing TP2 update
+                                                             ba_Q2_npeople_reduced.tp2Survey,                # use TP2 update, else ...
+                                                             ifelse(!is.na(ba_Q2_npeople_reduced.tp1Survey), # if non-missing TP1 update
+                                                                    ba_Q2_npeople_reduced.tp1Survey,         # use TP1 update, else ...
+                                                                    ba_Q2_npeople_reduced.fullSurvey))       # use full recruitment
+             ]
+
+    #t <- table(dt$ba_censusNpeople.fullSurvey, dt$ba_censusNpeople.tp1Survey, useNA = "always")
+    dt <- dt[, ba_censusNpeople.latest := ifelse(!is.na(ba_censusNpeople.tp2Survey),               # if non-missing TP2 update
+                                                        ba_censusNpeople.tp2Survey,                # use TP2 update, else ...
+                                                        ifelse(!is.na(ba_censusNpeople.tp1Survey), # if non-missing TP1 update
+                                                               ba_censusNpeople.tp1Survey,         # use TP1 update, else ...
+                                                               ba_censusNpeople.fullSurvey))       # use full recruitment
+             ]
+
+    # > update electric vehicles ----
+    dt <- dt[, Q6_1.latest := ifelse(!is.na(Q6_1.tp2Survey),                       # if non-missing TP2 update
+                                                    Q6_1.tp2Survey,                # use TP2 update, else ...
+                                                    ifelse(!is.na(Q6_1.tp1Survey), # if non-missing TP1 update
+                                                           Q6_1.tp1Survey,         # use TP1 update, else ...
+                                                           Q6_1.fullSurvey))       # use full recruitment
+             ]
+  }
+
+  if (updateRounds == 1){
+    # > update children ----
+    dt <- dt[, ba_presenceChildren.latest := ifelse(!is.na(ba_presenceChildren.tp1Survey),        # if non-missing TP1 update
+                                                           ba_presenceChildren.tp1Survey,         # use TP1 update, else ...
+                                                           ba_presenceChildren.fullSurvey)        # use full recruitment
+             ]
+
+    # recode N children to make regression tables etc easier to understand
+    dt <- dt[, ba_presenceChildren.latest := ifelse(ba_presenceChildren.latest == "No children", "0 children", "1+ child")]
+
+    # > update HRP work status ----
+    dt <- dt[, ba_Q2D_HRPemplType.latest := ifelse(!is.na(ba_Q2D_HRPemplType.tp1Survey),        # if non-missing TP1 update
+                                                          ba_Q2D_HRPemplType.tp1Survey,         # use TP1 update, else ...
+                                                          ba_Q2D_HRPemplType.fullSurvey)        # use full recruitment
+             ] # 'other' should be coded so NA really is NA
+
+    # > update n people age ----
+    # people come & go
+
+    dt <- dt[, ba_Q2_npeople.latest := ifelse(!is.na(ba_Q2_npeople.tp1Survey),        # if non-missing TP1 update
+                                                     ba_Q2_npeople.tp1Survey,         # use TP1 update, else ...
+                                                     ba_Q2_npeople.fullSurvey)        # use full recruitment
+             ]
+
+    dt <- dt[, ba_Q2_npeople_reduced.latest := ifelse(!is.na(ba_Q2_npeople_reduced.tp1Survey),        # if non-missing TP1 update
+                                                             ba_Q2_npeople_reduced.tp1Survey,         # use TP1 update, else ...
+                                                             ba_Q2_npeople_reduced.fullSurvey)       # use full recruitment
+             ]
+
+    #t <- table(dt$ba_censusNpeople.fullSurvey, dt$ba_censusNpeople.tp1Survey, useNA = "always")
+    dt <- dt[, ba_censusNpeople.latest := ifelse(!is.na(ba_censusNpeople.tp1Survey),        # if non-missing TP1 update
+                                                        ba_censusNpeople.tp1Survey,         # use TP1 update, else ...
+                                                        ba_censusNpeople.fullSurvey)       # use full recruitment
+             ]
+
+    # > update electric vehicles ----
+    dt <- dt[, Q6_1.latest := ifelse(!is.na(Q6_1.tp1Survey),                # if non-missing TP1 update
+                                                    Q6_1.tp1Survey,         # use TP1 update, else ...
+                                                    Q6_1.fullSurvey)        # use full recruitment
+             ]
+
+  }
+
+  # > update HRP age ----
+  # do we really want to do this?
+  # it might just be that someone else answered the update survey?
+  # could check by looking at gender
+  #t <- table(dt$ba_Q2B_2_HRPage.fullSurvey, dt$ba_Q2B_2_HRPage.tp1Survey, useNA = "always")
+  #kable(caption = "Check age of HRP in each survey", t)
+
+  # Recode Variables ---
+  # Only recode 'latest' variables here - i.e. those where we need to combine various surveys
+  # Recode variables asked in only 1 survey in the relevant function
+
+
+  # Clamp installed flag ----
+  dt <- dt[, ba_clampInstalled := ifelse(bmgInstallType %like% "install" |
+                                           bmgInstallType %like% "Opt" |
+                                           bmgInstallType %like% "Removals",
+                                         "Yes",
+                                         "No")]
+
+  # Install type var ----
+  dt <- dt[, ba_installType := ifelse(bmgInstallType %like% "install",
+                                      "Installation complete",
+                                      NA)]
+  dt <- dt[, ba_installType := ifelse(bmgInstallType %like% "Opt Out",
+                                      "Installation complete",
+                                      ba_installType)] # we're coding these as installed for now despite the fact that they then opted out
+  dt <- dt[, ba_installType := ifelse(Outcome_Label %like% "Removals from Navetas",
+                                      "Installation complete",
+                                      ba_installType)] # we're coding these as installed for now despite the fact that they then opted out
+  dt <- dt[, ba_installType := ifelse(bmgInstallType %like% "Refused",
+                                      "Refusal",
+                                      ba_installType)]
+  #No contacts
+  dt <- dt[, ba_installType := ifelse(Outcome_Label %like% "knocks" |
+                                                             Outcome_Label %like% "Knocks" |
+                                                             Outcome_Label %like% "No reply" |
+                                                             Outcome_Label %like% "No response",
+                                                           "No contact",ba_installType)]
+
+  # Overall outcome var ----
+  dt <- dt[, ba_saveOutcome := ifelse(Outcome_Label %like% "No reply",
+                                      "No contact",
+                                      NA)]
+  dt <- dt[, ba_saveOutcome := ifelse(Outcome_Label %like% "No response",
+                                      "No contact",   # does this differ??
+                                      ba_saveOutcome)]
+  dt <- dt[, ba_saveOutcome := ifelse(Outcome_Label %like% "Empty",
+                                      "No contact",
+                                      ba_saveOutcome)]
+  dt <- dt[, ba_saveOutcome := ifelse(ba_installType %like% "Refusal",
+                                      "Refusal",
+                                      ba_saveOutcome)]
+  dt <- dt[, ba_saveOutcome := ifelse(bmgInstallType %like% "Unable",
+                                      "Unable to fit clamp",
+                                      ba_saveOutcome)]
+  dt <- dt[, ba_saveOutcome := ifelse(Outcome_Label %like% "Not Used",
+                                      "Unused address",   # i.e. not issued
+                                      ba_saveOutcome)]
+  dt <- dt[, ba_saveOutcome := ifelse(Outcome_Label %like% "agrees" |
+                                        Outcome_Label %like% "Agrees" |
+                                        Outcome_Label %like% "Self Install" |
+                                        Outcome_Label %like% "Opt Out" |
+                                        Outcome_Label %like% "Removals" |
+                                        ba_surveyMode.fullSurvey %like% "Install", # adds households not found in Outcome_Label
+                                      "Installation completed",   # Installed (they may then have opted out)
+                                      ba_saveOutcome)]
+  dt <- dt[, ba_saveOutcome := ifelse(is.na(ba_saveOutcome),
+                                      "Other non-response",   # The rest
+                                      ba_saveOutcome)]
+
+  # set levels order
+  dt <- dt[, ba_saveOutcome := factor(ba_saveOutcome,
+                                      levels = c("Unused address",
+                                                 "No contact",
+                                                 "Refusal",
+                                                 "Other non-response",
+                                                 "Unable to fit clamp",
+                                                 "Installation completed"
+                                                 )
+                                      )
+           ]
+
+
+
+  # Survey year as a year ----
+  dt <- dt[, surveyYear := lubridate::year(InterviewDate.fullSurvey)]
+
+  # Set key ----
+  if(isTRUE(message)) outputMessage("Set key to 'bmg_id'")
+  dt <- dt[, bmg_id := as.character(bmg_id)] # to match power data
+  setkey(dt, bmg_id)
+
+  if(isTRUE(saveFile)){
+    # Save the file for future use each time this runs ----
+    ofile <- paste0(dPath,"processed/",
+                    "hhAttributesLatest_",
+                    Sys.Date(),
+                    ".csv")
+    outputMessage("Saving ", ofile, " for future use...")
+    write_csv(dt, ofile)
+
+    # remove old gzip file
+    cmd <- paste0("rm ", ofile, ".gz") # will produce a warning if it does not exist - yes, we could make this tidyer
+    try(system(cmd))
+
+    # now gzip new one
+    outputMessage("gzipping file to: ", paste0(ofile, ".gz"))
+    cmd <- paste0("gzip ", ofile)
+    try(system(cmd)) # in case it fails - if it does there will just be .csv files (not gzipped) - e.g. under windows
+  }
+
+  # Finish  -----
+
+  outputMessage("All BMG data loaded, processed & returned")
+  return(dt)
+}
+
+#' Process the household response sheet from the BMG data file
+#'
+#' \code{getBmgResponseFile} reads in and processes the original household response sheet. This contains all bmg_ids ever issued of
+#'     which many were not used.
+#'
+#'     Links to a lot of area-level variables as OA & LSOA level.
+#'
+#'     Recoding variables: recode original response file variables here (not in \code{getBmgAll()}).
+#'
+#'     Returns a processed data.table with > 40,000 rows
+#'
+#' @param file .xlsx file to load
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#' @import readxl
+#'
+#' @family bmgData functions
+#'
+#' @export
+#'
+getBmgResponseFile <- function(file){
+  message("getBmgResponseFile: Load complete response data from: ", file, ", sheet = AllIssued")
+  hhRespDTorig <- data.table::as.data.table(readxl::read_xlsx(file, sheet = "AllIssued"))
+  hhRespDTorig <- hhRespDTorig[, bmg_id := as.character(BMG_ID)]
+  print(paste0("N rows in ", file, ": AllIssued sheet (before processing) = ", tidyNum(nrow(hhRespDTorig))))
+
+  dt <- hhRespDTorig
+  # set bmg_id to char for matching --
+  dt <- dt[,bmg_id := as.character(BMG_ID)]
+
+  # set trial groups --
+  dt <- dt[,bmgIntervention := Intervention]
+
+  dt <- dt[,bmgIntervention := factor(bmgIntervention,
+                                      labels = c("1: Control",
+                                                 "2: LED",
+                                                 "3: Messaging",
+                                                 "4: Messaging & £"
+                                      )
+  )
+  ]
+  # check
+  #print(with(dt, table(bmgIntervention, Intervention, useNA = "always")))
+
+  # set trial group ----
+  dt <- dt[, INTERVENTIONTYPE := Intervention] # fix
+  dt <- SAVEr::setBmgTrialGroups(dt) # set using function for reproducibility
+
+  # to avoid confusion set name to this file
+  dt <- dt[, bmgGroup.respF := bmgGroup]
+  dt <- dt[, bmgGroup:= NULL]
+
+  # check
+  #print(with(dt, table(bmgGroup, Intervention, useNA = "always")))
+
+  # set outcome labels ----
+  dt <- dt[,bmgOutcomeLabel := Outcome_Label]
+
+  # derived/simplified - do not change the name of this variable as the navetas processing code relies on it
+  dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Self Install", "Self install", "Other code")] # other carries forward to replace NA
+  dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Not Used", "Address not used", bmgInstallType)]
+  dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "new participant", "New install", bmgInstallType)]
+  dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "new ocupier", "Re-install (new occupier)", bmgInstallType)] # beware typo in original!
+  dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "previous occupier", "Re-install (same occupier)", bmgInstallType)]
+  dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Removals from Navetas", "Removals from Navetas", bmgInstallType)]
+  dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Opt Out", "Opt Out", bmgInstallType)]
+  dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Unable", "Unable to fit clamp", bmgInstallType)]
+  dt <- dt[,bmgInstallType := ifelse(Outcome_Label %like% "Refused", "Refused  (various)", bmgInstallType)]
+
+  # check
+  #print(dt[, .N, keyby = bmgInstallType]) # better than using table as it always prints NA
+
+  dt[, bmgClampInstall := ifelse(bmgInstallType %like% "install", "Yes", "No")]
+  # check
+  #print(with(dt, table(bmgInstallType,bmgInstall, useNA = "always")))
+
+  # fix installation dates ----
+  dt <- dt[, optOutDate := lubridate::ymd(`Opt Out Date`)] # requires lubridate
+  dt$`Opt Out Date` <- NULL
+  dt <- dt[, navetasSimRouterInstallDate := lubridate::ymd(as_date(New_Sim_Router_Complete_Date))] # requires lubridate
+  dt$New_Sim_Router_Complete_Date <- NULL
+  dt <- dt[, newNavetasInstallDate := lubridate::ymd(navetasInstallDate)] # requires lubridate
+  dt$navetasInstallDate <- NULL
+  dt$bmgInstallYear <- year(dt$wmgInstallDate)
+  dt <- dt[, bmgInstallYear := ifelse(is.na(bmgInstallYear), lubridate::year(dt$wmgCompleteDate), bmgInstallYear)]
+  dt <- dt[, bmgInstallYear := ifelse(is.na(bmgInstallYear), lubridate::year(dt$navetasSimRouterInstallDate), bmgInstallYear)]
+  dt <- dt[, bmgInstallYear := ifelse(is.na(bmgInstallYear), lubridate::year(dt$newNavetasInstallDate), bmgInstallYear)]
+
+  # check
+  #print(with(dt, table(bmgInstallType,bmgInstallYear, useNA = "always")))
+  # not got all dates - most of these are caused by the self-installs
+
+  # set internet mode ----
+  dt$bmgInternetType[dt$bmgInstallType %like% "install" |
+                       dt$bmgInstallType %like% "Opt Out" |
+                       dt$bmgInstallType %like% "Removals"] <- "BMG: ADSL?" # default - has (or had) an installation of some sort
+  dt$bmgInternetType[dt$Sim_Installs == 1] <- "BMG: SIM"
+
+  # check
+  #print(with(dt, table(Sim_Installs, bmgInternetType, useNA = "always")))
+
+  # get household non-response weight ----
+  # wgtNonResponse
+
+  # Add geography vars ----
+  # index via OA code
+  dt <- dt[, bmgOA2011 := Oa11]
+
+  # > Urban vs rural ----
+  message("Adding urban/rural codes")
+  oa2011urbanDT <- data.table::fread(rurFile)
+  # change to make future matching easier
+  oa2011urbanDT$oa11cd <- oa2011urbanDT$OA11CD
+  data.table::setkey(oa2011urbanDT, oa11cd)
+  oa2011urbanDT[, OA11CD := NULL]
+
+  # > IMD ----
+  message("Adding IMD deciles etc")
+  imdnames <- c("lsoacode", "lsoaname","la_code", "la_name", "imd_rank", "imd_score_dec")
+
+  lsoaImd2015DT <- data.table::fread(lsoaImd2015File)
+  data.table::setnames(lsoaImd2015DT, imdnames)
+  lsoaImd2015DT[, lsoacode := lsoacode]
+  data.table::setkey(lsoaImd2015DT, lsoacode)
+
+  # > OA <-> LSOA ----
+  message("Adding LSOA etc codes")
+  oa2011lutDT <- data.table::fread(oaLutFile)
+  oa2011lutDT$lsoacode <- oa2011lutDT$LSOA11CD # for ease of merging
+  oa2011lutDT$oa11cd <- oa2011lutDT$OA11CD # for ease of merging
+  data.table::setkey(oa2011lutDT, oa11cd)
+  #print(paste0("Number of OAs in OA-LSOA LUT file", oaLutFile))
+  data.table::uniqueN(oa2011lutDT)
+
+  # > OA <-> County ----
+  message("Adding County codes")
+  oaCountylutDT <- data.table::fread(oaCountyFile)
+  oaCountylutDT$oa11cd <- oaCountylutDT$OACD # for ease of merging
+  # N OAs
+  #print(paste0("Number of OAs in county file: ", oaCountyFile))
+  data.table::uniqueN(oaCountylutDT)
+  data.table::setkey(oaCountylutDT, oa11cd)
+
+  oa2011lutDT <- merge(oa2011lutDT,oaCountylutDT, all = TRUE) # there are OAs without county names & vice versa
+  # N OAs with a county
+  data.table::uniqueN(oa2011lutDT)
+
+  oa2011lutDT[, ba_WiderSolentOA := ifelse(LAD11NM %like% "Southampton" |
+                                             LAD11NM %like% "Portsmouth" |
+                                             LAD11NM %like% "Isle of Wight" |
+                                             LAD11NM %like% "Hampshire",
+                                           "Wider Solent (SAVE)", # Hampshire, Southampton, Portsmouth, Isle of Wight
+                                           "not Wider Solent")
+              ]
+  #table(oa2011lutDT$ba_WiderSolentOA, useNA = "always")
+
+  # > link the OA urban/rural and census data ----
+  data.table::setkey(oa2011lutDT, oa11cd)
+  oaAllDataDT <- merge(oa2011lutDT,oa2011urbanDT, all.x = TRUE) # keep all OAs
+
+  #print(oaAllDataDT[, .(nOAs = uniqueN(oa11cd)), by = ba_WiderSolentOA])
+  #print(oaAllDataDT[, .(nLSOAs = uniqueN(lsoacode)), by = ba_WiderSolentOA])
+
+  # > add the IMD data ----
+  data.table::setkey(oaAllDataDT, lsoacode)
+  oaAllDataDT <- merge(oaAllDataDT,lsoaImd2015DT, all.x = TRUE) # keep all OAs
+
+  #t <- table(oaAllDataDT$ba_WiderSolentOA, useNA = "always")
+  oaAllDataDT[, .(nOAs = .N), by = ba_WiderSolentOA]
+
+  # oaAllDataDT is what it says on the tin (no geography filtering)
+
+  # now merge that to the response file via the OA code
+  dt <- dt[, oa11cd := bmgOA2011] # fix for merge
+  data.table::setkey(dt,oa11cd)
+  data.table::setkey(oaAllDataDT, oa11cd)
+
+  dt <- merge(dt, oaAllDataDT, all.x = TRUE) # only keeps rows in response file
+
+  # Keep Census 2011 OA but beware: potentially disclosive
+  # > Fix names ----
+
+  dt$lsoacode_2011 <- dt$LSOA11CD
+  dt$lsoaname_2011 <- dt$LSO11ANM # beware typo in original!
+  dt$countyname_2011 <- dt$FCTYNM
+  dt$countycode_2011 <- dt$FCTYCD
+  dt$lacode_2011 <- dt$LAD11CD
+  dt$laname_2011 <- dt$LAD11NM
+
+  data.table::setkey(dt, bmg_id)
+
+  # Finish ----
+  print("Done response file")
+  return(dt)
+}
+
+#' Process the household recruitment survey sheet from the BMG data file
+#'
+#' \code{getBmgRecruitSurvey} reads in and processes the original household recruitment survey data sheet. This contains all bmg_ids who completed
+#'     a recruitment survey.
+#'
+#'     Recoding variables: recode original recruitment survey variables here (not in \code{getBmgAll()}).
+#'
+#' Returns a processed data.table
+#'
+#' @param file .xlsx file to load
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#' @import readxl
+#'
+#' @family bmgData functions
+#'
+#' @export
+#'
+getBmgRecruitSurvey <- function(file){
+  # BMG recruitment survey data --
+  print(paste0("getBmgRecruitSurvey: Load full recruitment survey data from: ", file, ", sheet = FullRecruit"))
+  dt <- data.table::as.data.table(readxl::read_xlsx(file, sheet = "FullRecruit"))
+  dt <- dt[, bmg_id := as.character(BMG_ID)]
+  print(paste0("N rows in ", file, ": FullRecruit sheet (before processing) = ",
+               tidyNum(nrow(dt))))
+
+  data.table::setkey(dt, bmg_id)
+  # function to to process the BMG recruitment survey file
+  dt <- dt[, bmg_id := BMG_ID]
+  dt <- dt[, bmg_id := as.character(bmg_id)]
+
+  # Set trial groups --
+  dt <- SAVEr::setBmgTrialGroups(dt)
+
+  # to avoid confusion
+  dt <- dt[, bmgGroup.fullSurvey := bmgGroup]
+  dt <- dt[, bmgGroup:= NULL]
+
+  # Set survey type flag --
+  dt <- dt[surveyMode %like% "CATI", ba_surveyMode := "Install_CATI"]
+  dt <- dt[surveyMode %like% "CAWI", ba_surveyMode := "Install_CAWI"]
+  dt <- dt[surveyMode %like% "CAPI", ba_surveyMode := "Install_f2f"]
+  dt <- dt[is.na(surveyMode), ba_surveyMode := "No survey data"]
+
+  # Set survey source flag --
+  dt <- dt[ba_surveyMode != "No survey data", source := "Full recruitment survey"]
+  dt <- dt[ba_surveyMode == "No survey data", ba_recruitSurveySource := bmgCompleteFile]
+
+  print("Test full survey file before duplicate tests:")
+  print(paste0("Number of rows of data : ", tidyNum(nrow(dt)), " (includes all issued addresses)"))
+  print(paste0("Number of unique bmg_id: ", tidyNum(uniqueN(dt$bmg_id))))
+
+  # Are there duplicates & does it matter?
+
+  print("Removing any duplicates")
+
+  dt <- dt[!duplicated(dt$bmg_id)]
+  print("Test full survey file after duplicate tests:")
+  print(paste0("Number of rows of data : ", tidyNum(nrow(dt)), " (includes all issued addresses)"))
+  print(paste0("Number of unique bmg_id: ", tidyNum(uniqueN(dt$bmg_id))))
+  print(paste0("Number of recruitment survey responses ever collected: ",
+               tidyNum(uniqueN(dt[ba_surveyMode != "No survey data"]$bmg_id))))
+
+  data.table::setkey(dt, bmg_id)
+
+
+  # Fix interview date --
+  print("Fixing recruitment interview date - this may break with future versions of the data")
+  # This whole section should probably be inside an if version = v5
+  # (some values are badly formatted in some of the BMG files)
+  # if there is an issue the some of the dates look like strings of numbers and some look like "08/12/2017" char strings
+  # presumably becasue read_xlsx doesn't deal with variable date formats very well
+
+  # first try to force them into integers
+  # this will convert any that were dates (which got turned into integers by read_xlsx
+  # It leave a lot of NAs including those that were character strings
+  print("# -> Before fix (to test if fix required):")
+  print(summary(dt$InterviewDate))
+  dt$InterviewDate_orig <- dt$InterviewDate
+  # dt <- dt[, InterviewDate_Fixed := ymd_hms(InterviewDate)] # uses lubridate to pick up the char strings
+  #
+  # dt <- dt[!is.na(InterviewDate_Fixed), InterviewDate_orig := NA] # remove the ones we fixed
+  # # now force the conversion of the remainder to numeric and then convert using MS Windows excel epoch
+  # dt <- dt[, InterviewDate_xlFixed := as.Date(as.numeric(InterviewDate_orig), origin = "1900-01-01")] # this is unreformed Windows Excel so epoch = 1/1/1900
+  # dt <- dt[is.na(InterviewDate_Fixed), InterviewDate_Fixed := InterviewDate_xlFixed] # add the ones we fixed
+  # dt <- dt[, InterviewDate := InterviewDate_Fixed]
+  dt <- dt[, InterviewDate := as.Date(InterviewDate_orig)]
+  print("# -> After fix:")
+  print(summary(dt$InterviewDate))
+
+  # Recode variables --
+  # Do this using functions that can be re-used for the TP1/TP2 etc surveys to ensure conformity (assumes original coding is harmonised!)
+
+  # > HRP Age --
+  dt <- SAVEr::recodeHrpAge(dt)
+
+  # > HRP employment --
+  dt <- SAVEr::recodeHrpEmplType(dt)
+
+  # > HRP Ethnicity --
+  dt <- SAVEr::recodeHrpEthnicity(dt)
+
+  # > HRP gender --
+  dt <- SAVEr::recodeHrpGender(dt)
+
+  # > HRP NS_SEC --
+  dt <- SAVEr::codeHrpNSSEC(dt)
+
+  # > Mains gas etc
+  dt$ba_Q3_6_mainsGas <- dt$Q3_6
+
+  dt <- dt[, ba_Q3_6_mainsGas := ifelse(ba_Q3_6_mainsGas == 1,
+                                        "Mains gas",
+                                        ba_Q3_6_mainsGas)]
+  dt <- dt[, ba_Q3_6_mainsGas := ifelse(ba_Q3_6_mainsGas == 2,
+                                        "LPG gas",
+                                        ba_Q3_6_mainsGas)]
+  dt <- dt[, ba_Q3_6_mainsGas := ifelse(ba_Q3_6_mainsGas == 3,
+                                        "No gas",
+                                        ba_Q3_6_mainsGas)]
+  dt <- dt[, ba_Q3_6_mainsGas := ifelse(ba_Q3_6_mainsGas == 4,
+                                        "Don't know",
+                                        ba_Q3_6_mainsGas)]
+  # > Heat source --
+  # recode heat source from full survey (not on update survey - assume is still correct!)
+  dt <- SAVEr::recodeHeatSource(dt)
+  dt <- SAVEr::recodeHeatSourceExt(dt)
+
+   # > Presence of children ----
+  dt <- SAVEr::recodePresenceChildren(dt)
+
+  # > Source of electricity
+  dt$ba_Q3_3_Mains <- dt$Q3_3_1
+  dt$ba_Q3_3_PV <- dt$Q3_3_2
+  dt$ba_Q3_3_Wind <- dt$Q3_3_3
+  dt$ba_Q3_3_Hydro <- dt$Q3_3_4
+  dt$ba_Q3_3_CHP <- dt$Q3_3_5
+  dt$ba_Q3_3_Gen <- dt$Q3_3_6
+  dt$ba_Q3_3_Oth <- dt$Q3_3_7
+  dt$ba_Q3_3_dk <- dt$Q3_3_8
+
+  # > Water heating ----
+  #Q3_18A	151	Q3.18. How is your water heated?: Electric immersion heater (to hot water tank)
+  dt$ba_Q3_12_hotWaterImm <- dt$Q3_18_1
+  #Q3_18B	152	Q3.18. How is your water heated?: Gas or oil direct heating (combi)
+  dt$ba_Q3_12_hotWaterCombi <- dt$Q3_18_2
+  #Q3_18C	153	Q3.18. How is your water heated?: Gas, oil or solid fuel (including wood) indirect heating (to hot water tank)
+  dt$ba_Q3_12_hotWaterInDirect <- dt$Q3_18_3
+  #Q3_18D	154	Q3.18. How is your water heated?: Solar heating (to hot water tank)
+  dt$ba_Q3_12_hotWaterSolar <- dt$Q3_18_4
+  #Q3_18E	155	Q3.18. How is your water heated?: Other
+  dt$ba_Q3_12_hotWaterOther <- dt$Q3_18_5
+  #table(dt$ba_Q3_12_hotWaterOther)
+  dt$ba_Q3_12_hotWaterOtherDetail <- dt$Q3_18OTHER # actual text recorded - has 119 entries could be recoded in to the other categories?
+  #Q3_18F	156	Q3.18. How is your water heated?: Don't know
+  dt$ba_Q3_12_hotWaterDK <- dt$Q3_18_6
+  #Q3_18G	157	Q3.18. How is your water heated?: Refused
+  dt$ba_Q3_12_hotWaterRefused <- dt$Q3_18_7
+
+  # > N cars ----
+  dt <- SAVEr::recodeNcars(dt)
+
+  # > Tenure (Census 2011 form) --
+
+  # > Dwelling type ----
+  # Q8_2 (dwelling type from survey) to make labels easier
+  # Lys suggests is more reliable - why?
+  dt$ba_Q8_2_dwelling <- factor(dt$Q8_2,
+                                labels = c(
+                                  "Detached",
+                                  "Semi-detached",
+                                  "Terrace or end terrace",
+                                  "Flat - purpose built",
+                                  "Flat - converted",
+                                  "Commercial",
+                                  "Caravan etc",
+                                  "Refused"
+                                )
+  )
+
+  # Merge flat types
+  dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 1, 1, NA)]
+  dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 2, 2, ba_Q8_2_dwellingRecoded)]
+  dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 3, 3, ba_Q8_2_dwellingRecoded)]
+  dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 4 | Q8_2 == 5, 4, ba_Q8_2_dwellingRecoded)]
+  dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 6, 5, ba_Q8_2_dwellingRecoded)]
+  dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 7, 6, ba_Q8_2_dwellingRecoded)]
+  dt <- dt[, ba_Q8_2_dwellingRecoded := ifelse(Q8_2 == 8, 7, ba_Q8_2_dwellingRecoded)]
+
+  dt$ba_Q8_2_dwellingRecoded <- factor(dt$ba_Q8_2_dwellingRecoded,
+                                       labels = c(
+                                         "Detached",
+                                         "Semi-detached",
+                                         "Terrace or end terrace",
+                                         "Flat",
+                                         "Commercial",
+                                         "Caravan etc",
+                                         "Refused"
+                                       )
+  )
+  # check
+  # with(dt, table(ba_Q8_2_dwelling,ba_Q8_2_dwellingRecoded, useNA = "always" ))
+
+  # dictionary to convert dwelling type
+  namesDwelling <-
+    c("Detached" = "Detached",
+      "Semi-detached" = "Semi",
+      "Terrace or end terrace" = "Terrace",
+      "Flat" = "Flat/Other",
+      "Commercial" = "Flat/Other",
+      "Caravan etc" = "Flat/Other",
+      "Refused" = "Flat/Other")
+
+  dt <- dt[, Dwelling := plyr::revalue(ba_Q8_2_dwellingRecoded, namesDwelling)] # <- only asked at recruitment
+
+  # > N people --
+  dt <- SAVEr::recodeNpeople(dt)
+
+  # > N rooms ----
+
+  # match to USOC by subtracting n bathrooms
+  dt <- dt[, Q8_6 := as.numeric(Q8_6)]
+  dt <- dt[, Q8_8 := as.numeric(Q8_8)]
+  dt <- dt[, ba_nroomsUSOCtemp := Q8_6 - Q8_8]
+  dt <- dt[, ba_nroomsUSOC := as.character(ba_nroomsUSOCtemp)]
+  dt <- dt[ba_nroomsUSOCtemp > 8, ba_nroomsUSOC := "9+"]
+
+  # force correspondence to census
+  # https://www.nomisweb.co.uk/census/2011/qs407ew
+  dt <- dt[, ba_nrooms := as.character(Q8_6)]
+  dt <- dt[Q8_6 > 8, ba_nrooms := "9+"]
+
+  # > Bedrooms ----
+
+  # Bedrooms recoded to 4 collapsed categories
+  # used in CustomerTypes (bmgDataUtils - SAVEr::codeCustomerTypes)
+  dt <- dt[, Bedrooms := NA]
+  dt <- dt[, Bedrooms := ifelse(Q8_7 <= 2, "0-2", Bedrooms)]
+  dt <- dt[, Bedrooms := ifelse(Q8_7 >= 3 & Q8_7 <= 4, Q8_7, Bedrooms)]
+  dt <- dt[, Bedrooms := ifelse(Q8_7 >= 5, "5+", Bedrooms)]
+
+  # table(dt$ba_Q8_6_nrooms, dt$Q8_6, useNA = "always")
+
+  # > Tenure ----
+  dt <- SAVEr::recodeTenure(dt)
+
+  # > Income ----
+  dt <- SAVEr::recodeIncome(dt)
+
+  # > recode eco satisfaction score --
+  dt$ba_Q7_1_ecoSatis[dt$Q7_1 == 1] <- "1: I'm happy with what I do at the moment"
+  dt$ba_Q7_1_ecoSatis[dt$Q7_1 == 2] <- "2: I'd like to do a bit more to help the environment"
+  dt$ba_Q7_1_ecoSatis[dt$Q7_1 == 3] <- "3: I'd like to do a lot more to help the environment"
+  dt$ba_Q7_1_ecoSatis[dt$Q7_1 == 4] <- "4: Don't know"
+  dt$ba_Q7_1_ecoSatis[dt$Q7_1 == 5] <- "5: Refused"
+  table(dt$ba_Q7_1_ecoSatis, dt$Q7_1, useNA = "always")
+
+  # Create aggregate eco behaviours index ----
+
+  # > recode so high = more eco ----
+
+  # Q7_2_x all coded 1 = Always -> 5  = Never & 6 = NA
+  # Q7_2_1	281	Leave a TV on standby for the night
+
+  dt$Q7_2_1r[dt$Q7_2_1 == 1] <- 1
+  dt$Q7_2_1r[dt$Q7_2_1 == 2] <- 2
+  dt$Q7_2_1r[dt$Q7_2_1 == 3] <- 3
+  dt$Q7_2_1r[dt$Q7_2_1 == 4] <- 4
+  dt$Q7_2_1r[dt$Q7_2_1 == 5] <- 5
+  dt$Q7_2_1r[dt$Q7_2_1 == 6] <- NA
+
+  #table(dt$Q7_2_1, dt$Q7_2_1r, useNA = "always")
+
+  # Q7_2_2	282	Switch off lights in rooms that aren't being used
+  # reverse
+  dt$Q7_2_2r[dt$Q7_2_2 == 1] <- 5
+  dt$Q7_2_2r[dt$Q7_2_2 == 2] <- 4
+  dt$Q7_2_2r[dt$Q7_2_2 == 3] <- 3
+  dt$Q7_2_2r[dt$Q7_2_2 == 4] <- 2
+  dt$Q7_2_2r[dt$Q7_2_2 == 5] <- 1
+  dt$Q7_2_2r[dt$Q7_2_2 == 6] <- NA
+
+  #table(dt$Q7_2_2, dt$Q7_2_2r, useNA = "always")
+
+  # Q7_2_3	283	Decide not to buy something because it has too much packaging
+  # reverse
+  dt$Q7_2_3r[dt$Q7_2_3 == 1] <- 5
+  dt$Q7_2_3r[dt$Q7_2_3 == 2] <- 4
+  dt$Q7_2_3r[dt$Q7_2_3 == 3] <- 3
+  dt$Q7_2_3r[dt$Q7_2_3 == 4] <- 2
+  dt$Q7_2_3r[dt$Q7_2_3 == 5] <- 1
+  dt$Q7_2_3r[dt$Q7_2_3 == 6] <- NA
+
+  #table(dt$Q7_2_3, dt$Q7_2_3r, useNA = "always")
+
+  # Q7_2_4	284	Buy recycled paper products such as toilet paper or tissues
+  # reverse
+  dt$Q7_2_4r[dt$Q7_2_4 == 1] <- 5
+  dt$Q7_2_4r[dt$Q7_2_4 == 2] <- 4
+  dt$Q7_2_4r[dt$Q7_2_4 == 3] <- 3
+  dt$Q7_2_4r[dt$Q7_2_4 == 4] <- 2
+  dt$Q7_2_4r[dt$Q7_2_4 == 5] <- 1
+  dt$Q7_2_4r[dt$Q7_2_4 == 6] <- NA
+
+  #table(dt$Q7_2_4, dt$Q7_2_4r, useNA = "always")
+
+  # Q7_2_5	285	Use public transport (e.g. bus, train) rather than travel by car
+  # reverse
+  dt$Q7_2_5r[dt$Q7_2_5 == 1] <- 5
+  dt$Q7_2_5r[dt$Q7_2_5 == 2] <- 4
+  dt$Q7_2_5r[dt$Q7_2_5 == 3] <- 3
+  dt$Q7_2_5r[dt$Q7_2_5 == 4] <- 2
+  dt$Q7_2_5r[dt$Q7_2_5 == 5] <- 1
+  dt$Q7_2_5r[dt$Q7_2_5 == 6] <- NA
+
+
+  #table(dt$Q7_2_5, dt$Q7_2_5r, useNA = "always")
+
+  # Q7_2_6	286	Walk or cycle for short journeys less than 2 or 3 miles
+  # reverse
+  dt$Q7_2_6r[dt$Q7_2_6 == 1] <- 5
+  dt$Q7_2_6r[dt$Q7_2_6 == 2] <- 4
+  dt$Q7_2_6r[dt$Q7_2_6 == 3] <- 3
+  dt$Q7_2_6r[dt$Q7_2_6 == 4] <- 2
+  dt$Q7_2_6r[dt$Q7_2_6 == 5] <- 1
+  dt$Q7_2_6r[dt$Q7_2_6 == 6] <- NA
+
+  #table(dt$Q7_2_6, dt$Q7_2_6r, useNA = "always")
+
+  # this will have created NAs from two sources: 'Not Applicable' in question and NA (no survey response)
+
+  # > create sum & mean indicators ----
+  # there has to be an easier way than this
+  ecoDTs <- dt[, .(bmg_id, Q7_2_1r, Q7_2_2r, Q7_2_3r, Q7_2_4r, Q7_2_5r, Q7_2_6r)]
+
+  # use this DT to create the sum of all non-NA values
+  ecoDTs <- ecoDTs[, ba_Q7_2_ecoSum := apply(.SD, 1, sum, na.rm = TRUE), by = bmg_id]
+  # has the effect of setting those with all NA to 0 so recode
+  ecoDTs <- ecoDTs[, ba_Q7_2_ecoSum := ifelse(ba_Q7_2_ecoSum == 0, NA, ba_Q7_2_ecoSum)]
+  data.table::setkey(ecoDTs, bmg_id)
+
+  # now the mean
+  ecoDTm <- dt[, .(bmg_id, Q7_2_1r, Q7_2_2r, Q7_2_3r, Q7_2_4r, Q7_2_5r, Q7_2_6r)]
+  # use this DT to create the mean of all non-NA values
+  ecoDTm <- ecoDTm[, ba_Q7_2_ecoMean := apply(.SD, 1, mean, na.rm = TRUE), by = bmg_id]
+  data.table::setkey(ecoDTm, bmg_id)
+  # add the columns back to the original survey file
+  data.table::setkey(dt, bmg_id)
+  dt <- merge(dt,ecoDTs)
+  dt <- merge(dt,ecoDTm)
+
+  # Finish ----
+  print("Done recruitment survey file")
+  return(dt)
+}
+
+#' Process the TP1 household update survey sheet from the BMG data file
+#'
+#' \code{getBmgTP1UpdateSurvey} reads in and processes the TP1 household update survey. This contains only those bmg_ids who
+#'     were contacted during this period.
+#'
+#'     Recoding variables: recode original TP1 update survey variables here (not in \code{getBmgAll()}).
+#'
+#' Returns a processed data.table
+#'
+#' @param file .xlsx file to load
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#' @import readxl
+#'
+#' @family bmgData functions
+#'
+#' @export
+#'
+getBmgTP1UpdateSurvey <- function(file){
+  # function to to process the BMG TP1 update survey file
+  print(paste0("getBmgTP1UpdateSurvey: Load TP1 survey data from: ", file, ", sheet = TP1Update"))
+  dt <- data.table::as.data.table(readxl::read_xlsx(file, sheet = "TP1Update"))
+  dt$bmg_id <- dt$BMG_ID
+  dt$source <- "TP1 update survey"
+  dt$tp1UpdateSurvey <- "TP1 update survey"
+  dt <- dt[, bmg_id := as.character(BMG_ID)]
+
+  # Set trial groups ----
+  dt <- SAVEr::setBmgTrialGroups(dt)
+
+  # to avoid confusion
+  dt <- dt[, bmgGroup.tp1Survey := bmgGroup]
+  dt <- dt[, bmgGroup:= NULL]
+
+  # Duplicate check ----
+  print("Check for duplicates in TP1UpdateSurveyFile:")
+  nHHs <- uniqueN(dt$bmg_id)
+  nObs <- nrow(dt)
+  print(paste0("Number of bmg_ids: ",nHHs))
+  print(paste0("Number of observations: ",nObs))
+
+  data.table::setkey(dt, bmg_id)
+
+  # Survey type flag ----
+  # Name clash with main survey is dealt with before merging them
+  dt <- dt[surveyMode %like% "CATI", ba_surveyMode := "Install_CATI"]
+  dt <- dt[surveyMode %like% "CAWI", ba_surveyMode := "Install_CAWI"]
+  dt <- dt[surveyMode %like% "CAPI", ba_surveyMode := "Install_f2f"]
+  dt <- dt[is.na(surveyMode), ba_surveyMode := "No survey data"]
+  print(paste0("Number of TP1 update survey responses: ", uniqueN(dt[ba_surveyMode != "No survey data"]$bmg_id)))
+
+  #table(dt$surveyMode,dt$ba_surveyMode, useNA = "always")
+  dt <- dt[ba_surveyMode == "No survey data", ba_tp1UpdateSource := bmgCompleteFile]
+
+  # Fix interview date ----
+  print("Fixing TP1 interview date - this may break with future versions of the data")
+  # This whole section should probably be inside an if version = v5
+  # (some values are badly formatted in some of the BMG files)
+  # if there is an issue the some of the dates look like strings of numbers and some look like "08/12/2017" char strings
+  # presumably becasue read_xlsx doesn't deal with variable date formats very well
+
+  # first try to force them into integers
+  # this will convert any that were dates (which got turned into integers by read_xlsx
+  # It leave a lot of NAs including those that were character strings
+  print("# -> Before fix (to test if fix required):")
+  print(summary(dt$InterviewDate))
+  dt$InterviewDate_orig <- dt$InterviewDate
+  # dt <- dt[, InterviewDate_Fixed := dmy(dt$InterviewDate)] # uses lubridate to pick up the char strings
+  #
+  # dt <- dt[!is.na(InterviewDate_Fixed), InterviewDate_orig := NA] # remove the ones we fixed
+  # # now force the conversion of the remainder to numeric and then convert using MS Windows excel epoch
+  # dt <- dt[, InterviewDate_xlFixed := as.Date(as.numeric(InterviewDate_orig), origin = "1900-01-01")] # this is unreformed Windows Excel so epoch = 1/1/1900
+  # dt <- dt[is.na(InterviewDate_Fixed), InterviewDate_Fixed := InterviewDate_xlFixed] # add the ones we fixed
+
+  dt <- dt[, InterviewDate := as.Date(InterviewDate_orig)]
+  print("# -> After fix:")
+  print(summary(dt$InterviewDate))
+  # Recode variables --
+
+  # > HRP Age ----
+  # in theory should be c 1 year older but perhaps different person responds?
+  dt <- SAVEr::recodeHrpAge(dt)
+
+  # > HRP gender ----
+  # why might this have changed? Different person responds?
+  dt <- SAVEr::recodeHrpGender(dt)
+
+  # > HRP employment ----
+  dt <- SAVEr::recodeHrpEmplType(dt)
+
+  # > N people ----
+  # people may have left/joined
+  dt <- SAVEr::recodeNpeople(dt)
+
+  # > Presence of children ----
+  dt <- SAVEr::recodePresenceChildren(dt)
+
+  # Finish ----
+  print("Done TP1 survey file")
+  return(dt)
+
+}
+
+
+#' Process the household update survey sheets from the BMG data file
+#'
+#' \code{getBmgUpdateSurvey} reads in and processes a single household update survey. This contains only those bmg_ids who
+#'     were contacted during this period.
+#'
+#'     Recoding variables: recode original update survey variables here (not in \code{getBmgAll()}).
+#'
+#' Returns a processed data.table
+#'
+#' @param file .xlsx file to load
+#' @param updateRound round of update survey to process, defaults to "TP1"
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk} (original function)
+#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk} (updated function)
+#'
+#' @import data.table
+#' @import readxl
+#'
+#' @family bmgData functions
+#'
+#' @export
+#'
+getBmgUpdateSurvey <- function(file, updateRound = "TP1"){
+
+  # loop over update rounds
+  upDate  <-  updateRound
+
+  # function to to process each BMG update survey file
+  print(paste0("Load ",upDate," survey data from: ", file, ", sheet = ",upDate,"Update"))
+  dt <- data.table::as.data.table(readxl::read_xlsx(file, sheet = paste0(upDate,"Update")))
+  dt$bmg_id <- dt$BMG_ID
+  dt$source <- paste0(upDate," update survey")
+  #dt$tp1UpdateSurvey <- "TP1 update survey" # replaced below but may not work
+  dt$UpdateSurvey <- paste0(upDate," update survey")
+  dt <- dt[, bmg_id := as.character(BMG_ID)]
+
+  # Set trial groups ----
+  dt <- SAVEr::setBmgTrialGroups(dt)
+
+  # to avoid confusion
+  bmgGroupVarName <- paste0("bmgGroup.",tolower(upDate),"Survey")
+  dt <- dt[, (bmgGroupVarName) := bmgGroup] # wrap bmgGroupVarName in parentheses to pass name
+  dt <- dt[, bmgGroup:= NULL]
+
+  # Duplicate check ----
+  print(paste0("Check for duplicates in ",upDate,"UpdateSurveyFile:"))
+  nHHs <- uniqueN(dt$bmg_id)
+  nObs <- nrow(dt)
+  print(paste0("Number of bmg_ids: ",nHHs))
+  print(paste0("Number of observations: ",nObs))
+
+  data.table::setkey(dt, bmg_id)
+
+  # Survey type flag ----
+  # Name clash with main survey is dealt with before merging them
+  dt <- dt[surveyMode %like% "CATI", ba_surveyMode := "Install_CATI"]
+  dt <- dt[surveyMode %like% "CAWI", ba_surveyMode := "Install_CAWI"]
+  dt <- dt[surveyMode %like% "CAPI", ba_surveyMode := "Install_f2f"]
+  dt <- dt[is.na(surveyMode), ba_surveyMode := "No survey data"]
+  print(paste0("Number of ",upDate," update survey responses: ", uniqueN(dt[ba_surveyMode != "No survey data"]$bmg_id)))
+
+  #table(dt$surveyMode,dt$ba_surveyMode, useNA = "always")
+  dt <- dt[ba_surveyMode == "No survey data", ba_tp1UpdateSource := bmgCompleteFile]
+
+  # Fix interview date ----
+  print(paste0("Fixing ",upDate," interview date - this may break with future versions of the data"))
+  # This whole section should probably be inside an if version = v5
+  # (some values are badly formatted in some of the BMG files)
+  # if there is an issue the some of the dates look like strings of numbers and some look like "08/12/2017" char strings
+  # presumably becasue read_xlsx doesn't deal with variable date formats very well
+
+  # first try to force them into integers
+  # this will convert any that were dates (which got turned into integers by read_xlsx
+  # It leave a lot of NAs including those that were character strings
+  print("# -> Before fix (to test if fix required):")
+  print(summary(dt$InterviewDate))
+  dt$InterviewDate_orig <- dt$InterviewDate
+  # dt <- dt[, InterviewDate_Fixed := dmy(dt$InterviewDate)] # uses lubridate to pick up the char strings
+  #
+  # dt <- dt[!is.na(InterviewDate_Fixed), InterviewDate_orig := NA] # remove the ones we fixed
+  # # now force the conversion of the remainder to numeric and then convert using MS Windows excel epoch
+  # dt <- dt[, InterviewDate_xlFixed := as.Date(as.numeric(InterviewDate_orig), origin = "1900-01-01")] # this is unreformed Windows Excel so epoch = 1/1/1900
+  # dt <- dt[is.na(InterviewDate_Fixed), InterviewDate_Fixed := InterviewDate_xlFixed] # add the ones we fixed
+
+  dt <- dt[, InterviewDate := as.Date(InterviewDate_orig)]
+  print("# -> After fix:")
+  print(summary(dt$InterviewDate))
+  # Recode variables --
+
+  # > HRP Age ----
+  # in theory should be c 1 year older but perhaps different person responds?
+  dt <- SAVEr::recodeHrpAge(dt)
+
+  # > HRP gender ----
+  # why might this have changed? Different person responds?
+  dt <- SAVEr::recodeHrpGender(dt)
+
+  # > HRP employment ----
+  dt <- SAVEr::recodeHrpEmplType(dt)
+
+  # > N people ----
+  # people may have left/joined
+  dt <- SAVEr::recodeNpeople(dt)
+
+  # > Presence of children ----
+  dt <- SAVEr::recodePresenceChildren(dt)
+
+  # Finish ----
+  print(paste0("Done ",upDate," survey file"))
+  return(dt)
+
+}
diff --git a/R/surveyDataUtils.R b/R/surveyDataUtils.R
new file mode 100644
index 0000000000000000000000000000000000000000..a42d99656f6ea050a7a8314d1a64b5d00f92dde9
--- /dev/null
+++ b/R/surveyDataUtils.R
@@ -0,0 +1,929 @@
+### ---- Utility functions used in survey data processing ---- ####
+#' Sets trial groups
+#'
+#' \code{setBmgTrialGroups} sets trial groups (`bmgGroup`) using `Intervention` variable
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+#'
+setBmgTrialGroups <- function(dt) {
+  # function to add BMG group label to dt. Expects 'INTERVENTIONTYPE'
+  dt <- dt[, bmgGroup := ifelse(Intervention == 1 | Intervention %like% "Control", # handles text labels
+                                "BMG Group 1: Control",
+                                NA)
+           ]
+  dt <- dt[, bmgGroup := ifelse(Intervention == 2 | Intervention %like% "LED",
+                                "BMG Group 2",
+                                bmgGroup)
+           ]
+  dt <- dt[, bmgGroup := ifelse(Intervention == 3 | Intervention %like% "with financial",
+                                "BMG Group 3",
+                                bmgGroup)
+           ]
+  dt <- dt[, bmgGroup := ifelse(Intervention == 4 | Intervention %like% "without financial",
+                                "BMG Group 4",
+                                bmgGroup)
+           ]
+  return(dt)
+}
+
+#' recodes age variable
+#'
+#' \code{recodeHrpAge} recodes `Q2B` to give `ba_Q2B_HRPage` with sensible labels
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodeHrpAge <- function(dt){
+  # recode age for HRP plus 1st & 2nd persons
+  dt$ba_Q2B_HRPage <- factor(dt$Q2B,
+                             labels = c(
+                               #"< 18", # no under 18s
+                               "18 - 24",
+                               "25 - 34",
+                               "35 - 44",
+                               "45 - 54",
+                               "55 - 64",
+                               "65 - 74",
+                               "75+",
+                               "Refused"
+                             )
+  )
+
+  dt$ba_Q2B_2_person2age <- ifelse(dt$Q2B_2 == 0,NA,dt$Q2B_2)
+  dt$ba_Q2B_2_person2age <- factor(dt$ba_Q2B_2_person2age,
+                                   labels = c(
+                                     "< 16",
+                                     "16 - 24",
+                                     "25 - 34",
+                                     "35 - 44",
+                                     "45 - 54",
+                                     "55 - 64",
+                                     "65 - 74",
+                                     "75+",
+                                     "Refused"
+                                   )
+  )
+  # check
+  #table(dt$ba_Q2B_2_HRPage, dt$Q2B_2, useNA = "always")
+
+  # this breaks for TP1 update survey
+  # dt$ba_Q2B_3_age <- factor(dt$Q2B_3,
+  #                           labels = c(
+  #                             "< 16",
+  #                             "16 - 24",
+  #                             "25 - 34",
+  #                             "35 - 44",
+  #                             "45 - 54",
+  #                             "55 - 64",
+  #                             "65 - 74",
+  #                             "75+",
+  #                             "Refused"
+  #                           )
+  # )
+  # dt$ba_Q2B_4_age <- factor(dt$Q2B_4,
+  #                           labels = c(
+  #                             "< 16",
+  #                             "16 - 24",
+  #                             "25 - 34",
+  #                             "35 - 44",
+  #                             "45 - 54",
+  #                             "55 - 64",
+  #                             "65 - 74",
+  #                             "75+",
+  #                             "Refused"
+  #                           )
+  # )
+
+  return(dt)
+}
+
+#' recodes HRP employment variable
+#'
+#' \code{recodeHrpEmplType} recodes `Q2D` to give `ba_Q2D_HRPemplType` with sensible labels
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodeHrpEmplType <- function(dt){
+  dt[, ba_Q2D_HRPemplType := ifelse(Q2D == 1,
+                                    "HRP in full-time employment",
+                                    NA
+  )
+  ]
+  dt <- dt[, ba_Q2D_HRPemplType := ifelse(Q2D == 2,
+                                          "HRP in part-time employment (8-29 hours/week)",
+                                          ba_Q2D_HRPemplType
+  )
+  ]
+  dt <- dt[, ba_Q2D_HRPemplType := ifelse(Q2D == 4,
+                                          "HRP self-employed (unkown hours)",
+                                          ba_Q2D_HRPemplType
+  )
+  ]
+  dt <- dt[, ba_Q2D_HRPemplType := ifelse(Q2D == 5 | Q2D == 6,
+                                          "Unemployed",
+                                          ba_Q2D_HRPemplType
+  )
+  ]
+  dt <- dt[, ba_Q2D_HRPemplType := ifelse(Q2D == 7,
+                                          "HRP retired",
+                                          ba_Q2D_HRPemplType
+  )
+  ]
+  dt <- dt[, ba_Q2D_HRPemplType := ifelse(is.na(ba_Q2D_HRPemplType)  & !is.na(Q2D),
+                                          "Other",
+                                          ba_Q2D_HRPemplType
+  )
+  ]
+  return(dt)
+}
+
+#' recodes HRP ethnicity variable
+#'
+#' \code{recodeHrpEthnicity} recodes `Q8_22` to give `ba_Q22_HRPethnicity` with sensible labels
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodeHrpEthnicity <- function(dt) {
+  # Question has 20 codes, collapse
+  dt <- dt[, ba_Q22_HRPethnicity := ifelse(Q8_22 >= 1 & Q8_22 <= 4,
+                                           1 , NA)] #"White British/Irish"
+  dt <- dt[, ba_Q22_HRPethnicity := ifelse(Q8_22 > 4 & Q8_22 <= 9 ,
+                                           2 , ba_Q22_HRPethnicity)] # "Mixed"
+  dt <- dt[, ba_Q22_HRPethnicity := ifelse(Q8_22 > 9 & Q8_22 <= 14 ,
+                                           3 , ba_Q22_HRPethnicity)] # "Asian/Asian British"
+  dt <- dt[, ba_Q22_HRPethnicity := ifelse(Q8_22 > 14 & Q8_22 <= 17 ,
+                                           4 , ba_Q22_HRPethnicity)] # "Black/Black British"
+  dt <- dt[, ba_Q22_HRPethnicity := ifelse(Q8_22 > 17 & Q8_22 <= 19 ,
+                                           5 , ba_Q22_HRPethnicity)] # "Other"
+  dt <- dt[, ba_Q22_HRPethnicity := ifelse(Q8_22 == 20 ,
+                                           6, ba_Q22_HRPethnicity)]
+  dt <- dt[, ba_Q22_HRPethnicity := factor(ba_Q22_HRPethnicity,
+                                           labels = c(
+                                             "White British/Irish",
+                                             "Mixed",
+                                             "Asian/Asian British",
+                                             "Black/Black British",
+                                             "Other",
+                                             "Refused"
+                                           )
+  )
+  ]
+  # check
+  #table(dt$Q8_22, dt$ba_Q22_HRPethnicity)
+  return(dt)
+}
+
+#' recodes HRP gender variable
+#'
+#' \code{recodeHrpGender} recodes `Q2C` to give `ba_Q2C_HRPgender` with sensible labels
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodeHrpGender <- function(dt) {
+  dt$ba_Q2C_HRPgender <- factor(dt$Q2C,
+                                levels = c(1,2,3),
+                                labels=c("Male", "Female", "Refused")
+  )
+  #table(dt$Q2C, dt$ba_Q2C_HRPgender, useNA = "always")
+  return(dt)
+}
+
+#' codes HRP NS-SEC variable
+#'
+#' \code{codeHrpNSSEC} uses a number of variable to derive `ba_hrpNSSEC7` and `ba_hrpNSSEC3` with sensible labels
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+codeHrpNSSEC <- function(dt) {
+  # https://www.ons.gov.uk/methodology/classificationsandstandards/otherclassifications/thenationalstatisticssocioeconomicclassificationnssecrebasedonsoc2010#deriving-the-ns-sec-full-reduced-and-simplified-methods
+  # specifically: http://www.ons.gov.uk/ons/guide-method/classifications/current-standard-classifications/soc2010/soc2010-volume-3-ns-sec--rebased-on-soc2010--user-manual/section-14--deriving-the-ns-sec--self-coded-method.pdf
+
+  # returns NS-SEC 7 & NS-SEC 3
+  # if self-empl
+  # empl others?
+  # Yes:
+  # > 25 of them -> Empl large org Code 1
+  dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 2 & # "Self-employed"
+                                      Q8_18 == 2 & # has employees
+                                      Q8_19 >= 2 # 25+ employees
+                                    , 1, NA)
+           ]
+  # < 25 of them -> Empl small org Code 2
+  dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 2 & # "Self-employed" &
+                                      Q8_18 == 2 & # has employees
+                                      Q8_19 == 1 # < 25 employees
+                                    , 2, ba_hrpNSSEC7)
+           ]
+  # No: self empl, no employees -> Code 3
+  dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 2 & # "Self-employed" &
+                                      Q8_18 == 1  # no employees
+                                    , 3, ba_hrpNSSEC7)
+           ]
+  # if empl
+  # manager? # comes from Q8_13 which is verbatim!
+  # manager currently not identified, uses supervisor as filter
+  # Yes:
+  # > 25 of them -> Managers large org Code 4
+  dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 1 & # "employed" &
+                                      Q8_16 == 1 & # supervises
+                                      Q8_17 >= 2 #  25+ employees
+                                    , 4, ba_hrpNSSEC7)
+           ]
+  # < 25 of them -> Managers small org Code 5
+  dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 1 & # "employed" &
+                                      Q8_16 == 1 & # supervisees
+                                      Q8_17 == 1 #  < 25 employees
+                                    , 5, ba_hrpNSSEC7)
+           ]
+  # Not manager:
+  # supervise others?
+  # 6 or 7 (depending if supervisors?)
+  dt <- dt[, ba_hrpNSSEC7 := ifelse(Q8_15 == 1 & # "employed" &
+                                      Q8_16 == 2  # no supervisees
+                                    , "6 or 7", ba_hrpNSSEC7)
+           ]
+  dt <- dt[, ba_hrpNSSEC3 := ifelse(ba_hrpNSSEC7 == 1 | ba_hrpNSSEC7 == 4,
+                                    1, NA)
+           ]
+  dt <- dt[, ba_hrpNSSEC3 := ifelse(ba_hrpNSSEC7 == 2 | ba_hrpNSSEC7 == 5,
+                                    2, ba_hrpNSSEC3)
+           ]
+  dt <- dt[, ba_hrpNSSEC3 := ifelse(ba_hrpNSSEC7 == 3 | ba_hrpNSSEC7 == "6 or 7",
+                                    3, ba_hrpNSSEC3)
+           ]
+  # test
+  # table(hhAttributesDT$ba_hrpNSSEC3,hhAttributesDT$ba_hrpNSSEC7, useNA = "always")
+
+  # recode NS-SEC to include those who can't be coded (students etc)
+  dt <- dt[, ba_hrpNSSEC3 := ifelse(is.na(ba_hrpNSSEC3) & # NS-SEC not coded
+                                      !is.na(surveyMode), # but did complete survey
+                                    "Not coded",
+                                    ba_hrpNSSEC3
+  )
+  ]
+  dt <- dt[, ba_hrpNSSEC7 := ifelse(is.na(ba_hrpNSSEC7) & # NS-SEC not coded
+                                      !is.na(surveyMode), # but did complete survey
+                                    "Not coded",
+                                    ba_hrpNSSEC7
+  )
+  ]
+  return(dt)
+}
+
+#' recodes heat source variable
+#'
+#' \code{recodeHeatSource} recodes `Q3_12` to give `ba_heatSourceReduced` with sensible labels
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodeHeatSource <- function(dt){
+  # recode heat source from full survey (not on update survey - assume is still correct)
+  dt <- dt[, ba_heatSourceReduced := ifelse(Q3_12 == 4 |
+                                              Q3_12 == 5 |
+                                              Q3_12 == 6 |
+                                              Q3_12 == 7 |
+                                              Q3_12 == 8,
+                                            "Oil/wood/solid fuels/biomass/other",
+                                            NA
+  )
+  ]
+  dt <- dt[, ba_heatSourceReduced := ifelse(Q3_12 == 1,
+                                            "Electricity (storage heaters)",
+                                            ba_heatSourceReduced
+  )
+  ]
+  dt <- dt[, ba_heatSourceReduced := ifelse(Q3_12 == 2,
+                                            "Other electricity (e.g. heat pump)",
+                                            ba_heatSourceReduced
+  )
+  ]
+  dt <- dt[, ba_heatSourceReduced := ifelse(Q3_12 == 3,
+                                            "Gas boiler",
+                                            ba_heatSourceReduced
+  )
+  ]
+
+  if(isTRUE(message)) message("Create derived 'HeatSource' to make chart labels and table names easier to read")
+
+  # recode to make charts easier
+  namesFuels <-
+    c('Gas boiler' = "1 Gas",
+      "Electricity (storage heaters)" = "Electricity:\nstorage heaters",
+      "Oil/wood/solid fuels/biomass/other" = "Oil, wood\nsolid fuels\nbiomass, other",
+      "Other electricity (e.g. heat pump)" = "Other electricity\ne.g. ground source\nheat pumps")
+
+  dt <- dt[, HeatSource := plyr::revalue(ba_heatSourceReduced, namesFuels)] # <- only asked at recruitment
+  return(dt)
+}
+
+
+#' recodes heat source variable for additional re-categorisation (multiple variants)
+#'
+#' \code{recodeHeatSource} recodes `Q3_12` to give:
+#' `tr_heatSourceCustType`: 6 categories
+#' `tr_heatSourceCensus`: 5 categories (electric combined)
+#' `tr_heatSourceSSEN`: 3 categories (Gas, Electric and Other)
+#' Recoding gives sensible labels for charts etc
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+#'
+recodeHeatSourceExt <- function(dt){
+  # recode heat source from full survey (not on update survey - assume is still correct)
+  dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 5 |
+                                               Q3_12 == 6 |
+                                               Q3_12 == 7,
+                                             "Solid/biomass",
+                                             NA
+  )
+  ]
+  dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 8,
+                                             "Other",
+                                             tr_heatSourceCustType
+  )
+  ]
+  dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 1,
+                                             "Elec. storage",
+                                             tr_heatSourceCustType
+  )
+  ]
+  dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 2,
+                                             "Elec. other",
+                                             tr_heatSourceCustType
+  )
+  ]
+  dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 3,
+                                             "Gas",
+                                             tr_heatSourceCustType
+  )
+  ]
+  dt <- dt[, tr_heatSourceCustType := ifelse(Q3_12 == 4,
+                                             "Oil",
+                                             tr_heatSourceCustType
+  )
+  ]
+  # now re-order using factors
+  dt <- dt[, tr_heatSourceCustType := factor(dt$tr_heatSourceCustType,
+                                             levels = c("Gas", "Elec. storage", "Elec. other", "Oil",
+                                                        "Solid/biomass", "Other"),  # specify in order required
+                                             labels = c("Gas", "Elec. storage", "Elec. other", "Oil",
+                                                        "Solid/biomass", "Other")   # create labels for levels (must be same order)
+  )]
+
+  # recode original heat source (Q3_12) into census aligned categories
+
+  dt <- dt[, tr_heatSourceCensus := ifelse(Q3_12 == 5 |
+                                             Q3_12 == 6 |
+                                             Q3_12 == 7,
+                                           "Solid/biomass",
+                                           NA
+  )
+  ]
+  dt <- dt[, tr_heatSourceCensus := ifelse(Q3_12 == 8,
+                                           "Other",
+                                           tr_heatSourceCensus
+  )
+  ]
+  dt <- dt[, tr_heatSourceCensus := ifelse(Q3_12 == 1 |
+                                             Q3_12 == 2,
+                                           "Electric",
+                                           tr_heatSourceCensus
+  )
+  ]
+  dt <- dt[, tr_heatSourceCensus := ifelse(Q3_12 == 3,
+                                           "Gas",
+                                           tr_heatSourceCensus
+  )
+  ]
+  dt <- dt[, tr_heatSourceCensus := ifelse(Q3_12 == 4,
+                                           "Oil",
+                                           tr_heatSourceCensus
+  )
+  ]
+  # now re-order using factors
+  dt <- dt[, tr_heatSourceCensus := factor(dt$tr_heatSourceCensus,
+                                           levels = c("Gas", "Electric", "Oil",
+                                                      "Solid/biomass", "Other"),  # specify in order required
+                                           labels = c("Gas", "Electric", "Oil",
+                                                      "Solid/biomass", "Other")   # create labels for levels (must be same order)
+  )]
+
+  # recode original heat source (Q3_12) into 3 categories (request by SSEN)
+
+  dt <- dt[, tr_heatSourceSSEN := ifelse(Q3_12 == 4 |
+                                           Q3_12 == 5 |
+                                           Q3_12 == 6 |
+                                           Q3_12 == 7 |
+                                           Q3_12 == 8,
+                                         "Other",
+                                         NA
+  )
+  ]
+  dt <- dt[, tr_heatSourceSSEN := ifelse(Q3_12 == 1 |
+                                           Q3_12 == 2,
+                                         "Electric",
+                                         tr_heatSourceSSEN
+  )
+  ]
+  dt <- dt[, tr_heatSourceSSEN := ifelse(Q3_12 == 3,
+                                         "Gas",
+                                         tr_heatSourceSSEN
+  )
+  ]
+  # now re-order using factors
+  dt <- dt[, tr_heatSourceSSEN := factor(dt$tr_heatSourceSSEN,
+                                         levels = c("Gas", "Electric", "Other"),  # specify in order required
+                                         labels = c("Gas", "Electric", "Other")   # create labels for levels (must be same order)
+  )]
+
+  return(dt)
+}
+
+
+#' recodes n people
+#'
+#' \code{recodeNpeople} recodes `Q2` to give `ba_Q2_npeople_reduced` & `ba_censusNpeople` (matches UK Census)
+#' with sensible labels.
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodeNpeople <- function(dt){
+  dt$ba_Q2_npeople <- dt$Q2 # n people
+  # reduced form
+  dt <- dt[, ba_Q2_npeople_reduced := ifelse(ba_Q2_npeople != "Refused", as.numeric(ba_Q2_npeople), ba_Q2_npeople) ]
+  dt <- dt[, ba_Q2_npeople_reduced := ifelse(ba_Q2_npeople_reduced != "Refused" & ba_Q2_npeople_reduced > 4, "5+", ba_Q2_npeople_reduced) ]
+  # dt <- dt[, ba_Q2_npeople_reduced := ifelse(ba_Q2_npeople > 4, "5+",ba_Q2_npeople) ] - old code retained for roll-back
+  # census form
+  dt <- dt[, ba_censusNpeople := ifelse(ba_Q2_npeople != "Refused", as.numeric(ba_Q2_npeople), ba_Q2_npeople) ]
+  dt <- dt[, ba_censusNpeople := ifelse(ba_censusNpeople != "Refused" & ba_censusNpeople > 7, "8+",ba_censusNpeople) ]
+  dt <- dt[, ba_censusNpeople := ifelse(ba_censusNpeople == 10 | ba_censusNpeople == 13, "8+",ba_censusNpeople) ]
+  # dt <- dt[, ba_censusNpeople := ifelse(ba_Q2_npeople > 7, "8+",ba_Q2_npeople) ] - old code retained for roll-back
+  # check
+  #table(dt$ba_censusNpeople, dt$ba_Q2_npeople, useNA = "always")
+  return(dt)
+}
+
+#' recodes n cars variable
+#'
+#' \code{recodeNcars} recodes `Q8_9` to give `ba_censusNcars` with sensible labels.
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodeNcars <- function(dt){
+  dt$ba_Q8_9_ncars <- dt$Q8_9 # n cars - not asked at update
+  # census form
+  dt <- dt[, ba_censusNcars := ifelse(ba_Q8_9_ncars == 1, "0",NA) ]
+  dt <- dt[, ba_censusNcars := ifelse(ba_Q8_9_ncars == 2, "1",ba_censusNcars) ]
+  dt <- dt[, ba_censusNcars := ifelse(ba_Q8_9_ncars == 3, "2",ba_censusNcars) ]
+  dt <- dt[, ba_censusNcars := ifelse(ba_Q8_9_ncars == 4, "3",ba_censusNcars) ]
+  dt <- dt[, ba_censusNcars := ifelse(ba_Q8_9_ncars >= 5 , "4+",ba_censusNcars) ]
+  # check
+  #table(dt$ba_censusNcars, dt$ba_Q8_9_ncars, useNA = "always")
+  return(dt)
+}
+
+#' recodes n children variable
+#'
+#' \code{recodePresenceChildren} recodes `Q2B_2` to give `ba_presenceChildren` with sensible labels.
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodePresenceChildren <- function(dt){
+  dt <- dt[, ba_presenceChildren := ifelse(dt$Q2B == 1 | dt$Q2B_2 == 1 | dt$Q2B_3 == 1 |
+                                             dt$Q2B_4 == 1 | dt$Q2B_5 == 1 |
+                                             dt$Q2B_6 == 1 | dt$Q2B_7 == 1 |
+                                             dt$Q2B_8 == 1,
+                                           "At least one child < 16",
+                                           "No children"
+  )
+  ]
+  # this produces NAs where there are no children AND also where we have no survey
+  dt$ba_presenceChildren[is.na(dt$ba_presenceChildren)] <- "No children"
+  dt$ba_presenceChildren[is.na(dt$surveyMode)] <- NA # set non-survey to NA - redundant but works
+  return(dt)
+}
+
+#' recodes tenure variable
+#'
+#' \code{recodeTenure} recodes `Q3_1` to give `ba_censusTenure` with sensible labels. Format matches UK Census.
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodeTenure <- function(dt){
+  # Not asked at update (perhaps should have been)
+  dt <- dt[, ba_censusTenure := ifelse(Q3_1 == 1 | Q3_1 == 2, # "Own outright" | Q3_1 == "Own with mortgage"
+                                       "Own", NA)]
+  dt <- dt[Q3_2 == 1 | # "Housing association, housing co-operative, charitable trust, registered social landlord"
+             Q3_2 == 2,  # "Council (local authority)"
+           ba_censusTenure := "Social rent"
+           ]
+
+  dt <- dt[Q3_2 == 3 | # "Private landlord or letting agency"
+             Q3_2 == 4  | # "Employer of a household member"
+             Q3_2 == 5 | # "Relative or friend of a household member"
+             Q3_2 == 6 , # "Other"
+           ba_censusTenure := "Private rent"
+           ]
+  dt <- dt[Q3_1 == 7 | # "other"
+             Q3_1 == 8  | # "refused"
+             Q3_2 == 7 | # "dk"
+             Q3_2 == 6 , # "Other"
+           ba_censusTenure := "x.Refused/dk/Other"
+           ]
+  table(dt$ba_censusTenure, dt$Q3_1, useNA = "always")
+  return(dt)
+}
+
+#' recodes income variable
+#'
+#' \code{recodeIncome} recodes `Q8_27` to give `ba_Q8_27_Income` with sensible labels.
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Ben Anderson \email{b.anderson@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+recodeIncome <- function(dt){
+  # Question has 16 codes, collapse
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 >= 1 & Q8_27 <= 4,
+                                       1 , NA)] # < 10k
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 1 & Q8_27 <= 5 ,
+                                       2 , ba_Q8_27_Income)] # 10-20
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 5 & Q8_27 <= 9 ,
+                                       3 , ba_Q8_27_Income)] # 20-30
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 9 & Q8_27 <= 11 ,
+                                       4 , ba_Q8_27_Income)] # 30-40
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 11 & Q8_27 <= 12 ,
+                                       5 , ba_Q8_27_Income)] # 40-50
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 12 & Q8_27 <= 13 ,
+                                       6, ba_Q8_27_Income)] # 50-60
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 13 & Q8_27 <= 14 ,
+                                       7, ba_Q8_27_Income)] # 60-80
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 > 14 & Q8_27 <= 16 ,
+                                       8, ba_Q8_27_Income)] # 80+
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 == 17 ,
+                                       9, ba_Q8_27_Income)] # DK
+  dt <- dt[, ba_Q8_27_Income := ifelse(Q8_27 == 18 ,
+                                       10, ba_Q8_27_Income)] # Ref
+  dt <- dt[, ba_Q8_27_Income := factor(ba_Q8_27_Income,
+                                       labels = c(
+                                         "< £10k p.a",
+                                         "£10-£20k",
+                                         "£20-£30k",
+                                         "£30-£40k",
+                                         "£40-£50k",
+                                         "£50-60k",
+                                         "£60-80k",
+                                         ">£80k",
+                                         "Don't know",
+                                         "Refused"
+                                       )
+  )
+  ]
+  # check
+  #table(dt$Q8_27, dt$ba_Q8_27_Income, useNA = "always")
+  return(dt)
+}
+
+
+#' recodes household survey questions related to high-power electric appliances
+#'
+#' \code{recodeAppliances} takes `Q6_1`, `Q3_18_1`, `Q3_19_27`, `Q3_19_23` and `Q3_19_30`
+#' to return yes/no responses for types of appliance (e.g. EV or electric immersion heater).
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+
+recodeAppliances <- function(dt){
+
+# Electric vehicles (EV)
+
+  dt <- dt[, electricVehicle := Q6_1.latest]
+  dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 1, "Yes", electricVehicle)]
+  dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 2, "No", electricVehicle)]
+  dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 3, "Don't know", electricVehicle)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, electricVehicle := factor(dt$electricVehicle,
+                                       levels = c("No","Yes","Don't know"))]
+
+# Water heating - immersion heater (elec)
+
+  dt <- dt[, waterHeat := Q3_18_1]
+  dt <- dt[, waterHeat := ifelse(Q3_18_1 >= 1, "Yes", waterHeat)]
+  dt <- dt[, waterHeat := ifelse(Q3_18_1 == 0, "No", waterHeat)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, waterHeat := factor(dt$waterHeat,
+                                       levels = c("No","Yes"))]
+
+# Plug-in heater
+
+  dt <- dt[, plugInHeat := Q3_19_27]
+  dt <- dt[, plugInHeat := ifelse(Q3_19_27 >= 1, "Yes", plugInHeat)]
+  dt <- dt[, plugInHeat := ifelse(Q3_19_27 == 0, "No", plugInHeat)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, plugInHeat := factor(dt$plugInHeat,
+                                       levels = c("No","Yes"))]
+
+# Air con
+
+  dt <- dt[, airCon := Q3_19_23]
+  dt <- dt[, airCon := ifelse(Q3_19_23 >= 1, "Yes", airCon)]
+  dt <- dt[, airCon := ifelse(Q3_19_23 == 0, "No", airCon)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, airCon := factor(dt$airCon,
+                                       levels = c("No","Yes"))]
+
+# Power shower
+
+  dt <- dt[, powerShower := Q3_19_30]
+  dt <- dt[, powerShower := ifelse(Q3_19_30 >= 1, "Yes", powerShower)]
+  dt <- dt[, powerShower := ifelse(Q3_19_30 == 0, "No", powerShower)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, powerShower := factor(dt$powerShower,
+                                       levels = c("No","Yes"))]
+
+# Tumble drier
+
+  dt <- dt[, tumbleDrier := Q3_19_3]
+  dt <- dt[, tumbleDrier := ifelse(Q3_19_3 >= 1, "Yes", tumbleDrier)]
+  dt <- dt[, tumbleDrier := ifelse(Q3_19_3 == 0, "No", tumbleDrier)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, tumbleDrier := factor(dt$tumbleDrier,
+                                   levels = c("No","Yes"))]
+
+# Combined washing maching and tumble
+
+  dt <- dt[, washTumble := Q3_19_2]
+  dt <- dt[, washTumble := ifelse(Q3_19_2 >= 1, "Yes", washTumble)]
+  dt <- dt[, washTumble := ifelse(Q3_19_2 == 0, "No", washTumble)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, washTumble := factor(dt$washTumble,
+                                  levels = c("No","Yes"))]
+
+# Tumble drier or combined washer drier
+
+  dt <- dt[, tumbleDryAny := ifelse(tumbleDrier == "Yes" | washTumble == "Yes", "Yes", "No")]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, tumbleDryAny := factor(dt$tumbleDryAny,
+                                  levels = c("No","Yes"))]
+
+# Dishwasher
+
+  dt <- dt[, dishWasher := Q3_19_5]
+  dt <- dt[, dishWasher := ifelse(Q3_19_5 >= 1, "Yes", dishWasher)]
+  dt <- dt[, dishWasher := ifelse(Q3_19_5 == 0, "No", dishWasher)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, dishWasher := factor(dt$dishWasher,
+                                   levels = c("No","Yes"))]
+
+# Electric Hob
+
+  dt <- dt[, electricHob := Q3_19_6]
+  dt <- dt[, electricHob := ifelse(Q3_19_6 >= 1, "Yes", electricHob)]
+  dt <- dt[, electricHob := ifelse(Q3_19_6 == 0, "No", electricHob)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, electricHob := factor(dt$electricHob,
+                                   levels = c("No","Yes"))]
+
+# Electric Oven
+
+  dt <- dt[, electricOven := Q3_19_7]
+  dt <- dt[, electricOven := ifelse(Q3_19_7 >= 1, "Yes", electricOven)]
+  dt <- dt[, electricOven := ifelse(Q3_19_7 == 0, "No", electricOven)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, electricOven := factor(dt$electricOven,
+                                   levels = c("No","Yes"))]
+
+  return(dt)
+
+}
+
+#' recodes household survey questions related to low-carbon technologies (LCTs)
+#'
+#' \code{recodeLCTs} takes `Q6_1`, `Q3_3_2`, `Q3_3_5` and `Q3_12`
+#' to return yes/no responses for types of installed LCT (e.g. EV, PV, heat-pump or CHP).
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+
+recodeLCTs <- function(dt){
+
+  # Electric vehicles (EV)
+
+  dt <- dt[, electricVehicle := Q6_1.latest]
+  dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 1, "Yes", electricVehicle)]
+  dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 2, "No", electricVehicle)]
+  dt <- dt[, electricVehicle := ifelse(Q6_1.latest == 3, "Don't know", electricVehicle)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, electricVehicle := factor(dt$electricVehicle,
+                                       levels = c("No","Yes","Don't know"))]
+
+  # Electricity provision
+
+  # solar voltaic panels (PV)
+  dt <- dt[, solarPV := Q3_3_2]
+  dt <- dt[, solarPV := ifelse(Q3_3_2 == 1, "Yes", solarPV)]
+  dt <- dt[, solarPV := ifelse(Q3_3_2 == 0, "No", solarPV)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, solarPV := factor(dt$solarPV,
+                               levels = c("No","Yes"))]
+
+  # conbined heat and power (CHP)
+  dt <- dt[, chp := Q3_3_5]
+  dt <- dt[, chp := ifelse(Q3_3_5 == 1, "Yes", chp)]
+  dt <- dt[, chp := ifelse(Q3_3_5 == 0, "No", chp)]
+  # finally, re-order ("No" will now be contrast in models)
+  dt <- dt[, chp := factor(dt$chp,
+                           levels = c("No","Yes"))]
+
+  # heat pump (inferred from 'other electric' heat source Q3_12)
+  dt <- dt[, heatPump := "No"]
+  dt <- dt[, heatPump := ifelse(Q3_12 == 2, "Yes", heatPump)]
+  dt <- dt[, heatPump := ifelse(Q3_12 == 9, "Don't know", heatPump)]
+  dt <- dt[, heatPump := factor(dt$heatPump,
+                                       levels = c("No","Yes","Don't know"))]
+
+  return(dt)
+
+}
+
+
+#' recodes household survey responses related to qualification (education) of HRP
+#'
+#' \code{recodeQuals} takes `Q8_20` and provides recoded responses as character
+#' rather than numeric categories. Two variants are returned:
+#' `tr_qualRecoded` - a full recoding (13 categories)
+#' `Qualification` - a reduced coding (5 categories)
+#'
+#' Returns a processed data.table
+#'
+#' @param dt data.table to process
+#'
+#' @author Tom Rushby \email{t.w.rushby@@soton.ac.uk}
+#'
+#' @import data.table
+#'
+#' @export
+
+recodeQuals <- function(dt) {
+  # recode Q8_20 highest qualifications (respondent)
+  # first map values
+  valuesQualifications <- c(
+    `1` = "Higher Degree",
+    `2` = "Degree",
+    `3` = "A/AS level, Scottish Higher, ONC/OND",
+    `4` = "HNC/HND",
+    `5` = "HNC/HND",
+    `6` = "A/AS level, Scottish Higher, ONC/OND",
+    `7` = "A/AS level, Scottish Higher, ONC/OND",
+    `8` = "A/AS level, Scottish Higher, ONC/OND",
+    `9` = "GCSE/O level",
+    `10` = "GCSE/O level",
+    `11` = "Other",
+    `12` = "No qualifications",
+    `13` = "Don't know")
+
+  # now revalue Q8_20 to create new variable
+  dt <- dt[, tr_qualRecoded := plyr::revalue(as.factor(Q8_20), valuesQualifications)]
+
+  # finally, re-order ("No qualifications" will now be contrast in models)
+  dt <- dt[, tr_qualRecoded := factor(dt$tr_qualRecoded,
+                                      levels = c("No qualifications", "GCSE/O level",
+                                                 "A/AS level, Scottish Higher, ONC/OND",
+                                                 "HNC/HND", "Degree", "Higher Degree",
+                                                 "Other", "Don't know"))]
+
+  # recode Q8_20 highest qualifications (respondent)
+  # first map values
+  valuesQualsReduced <- c(
+    `1` = "HNC/D, degree and higher",
+    `2` = "HNC/D, degree and higher",
+    `3` = "A/AS level, Scottish Higher, ONC/OND",
+    `4` = "HNC/D, degree and higher",
+    `5` = "HNC/D, degree and higher",
+    `6` = "A/AS level, Scottish Higher, ONC/OND",
+    `7` = "A/AS level, Scottish Higher, ONC/OND",
+    `8` = "A/AS level, Scottish Higher, ONC/OND",
+    `9` = "GCSE/O level or below",
+    `10` = "GCSE/O level or below",
+    `11` = "Other",
+    `12` = "GCSE/O level or below",
+    `13` = "Don't know")
+
+  # now revalue Q8_20 to create new variable
+  dt <- dt[, Qualification := plyr::revalue(as.factor(Q8_20), valuesQualsReduced)]
+
+  # finally, re-order ("No qualifications" will now be contrast in models)
+  dt <- dt[, Qualification := factor(dt$Qualification,
+                                     levels = c("GCSE/O level or below",
+                                                "A/AS level, Scottish Higher, ONC/OND",
+                                                "HNC/D, degree and higher",
+                                                "Other", "Don't know"))]
+
+  return(dt)
+
+}
+
diff --git a/R/utilities.R b/R/utilities.R
new file mode 100644
index 0000000000000000000000000000000000000000..061d6a24079de4534359956e785d4d73c3c0e157
--- /dev/null
+++ b/R/utilities.R
@@ -0,0 +1,295 @@
+#' Tidy long numbers
+#'
+#' \code{tidyNum} reformats long numbers to include commas and prevents scientific formats,
+#'  making them suitable for printing within R Markdown reports and inline text.
+#'
+#' @param number an input number or list
+#'
+#' @examples
+#' tidyNum(123456789)
+#' tidyNum(10^6)
+#' tidyNum(c(10^6, 10^7, 10^8))
+#'
+#' @author Ben Anderson, \email{banderson@@soton.ac.uk}
+#' @export
+#' @family Utilities
+
+tidyNum <- function(number) {
+  format(number, big.mark=",", scientific=FALSE)
+}
+
+#' Return middle 98\% of a distribution
+#'
+#' \code{getP98} filters top & bottom 1\% and returns the middle 98\%
+#'
+#' @param dt the data table
+#' @param var the variable to filter
+#' @return the middle 98pc as a data.table
+#'
+#' @author Ben Anderson, \email{banderson@@soton.ac.uk}
+#' @export
+#' @family Utilities
+
+getP98 <-function(dt,var){
+  # expects a data.table
+  # returns a data.table with the middle 98% of var
+
+  # Calculate mean & sd for use in calculating bounds
+  attach(dt)
+  tempMean <- mean(get(var))
+  tempSD <- sd(get(var))
+
+  # p98 extract
+  p01 <- quantile(get(var), probs = 0.01) # central 98%
+  p99 <- quantile(get(var), probs = 0.99)
+  dt <- subset(dt, p01 < get(var) & get(var) < p99)
+  return(dt)
+}
+
+#' Return  middle 95\% of a distribution using z scores
+#'
+#' \code{getZ95} filters top & bottom 2.5\% using z scores and returns the middle 95\%
+#'
+#' @param dt the data table
+#' @param var the variable to filter
+#'
+#'
+#' @author Ben Anderson, \email{banderson@@soton.ac.uk}
+#' @export
+#' @family Utilities
+
+getZ95 <-function(dt,var){
+  # expects a data.table
+  # returns a data.table with only +/- 95% z for var
+
+  # Calculate mean & sd for use in calculating bounds
+  attach(dt)
+  tempMean <- mean(get(var))
+  tempSD <- sd(get(var))
+
+  # z95 extract
+  z95Lower <- tempMean - (qnorm(0.975) * tempSD) # +/- 2.5%
+  z95Upper <- tempMean + (qnorm(0.975) * tempSD)
+
+  dt <- subset(dt, z95Lower < get(var) & get(var) < z95Upper)
+
+  return(dt)
+}
+
+#' Prints information about the R session
+#'
+#' \code{DisplaySystemInfo} provides a summary printout of the
+#'   R session, including the operating system, user info and project location.
+#'
+#'   Before running this, the user should have defined the \code{projLoc} variable
+#'   using the \code{SAVEr::findParentDirectory}
+#'
+#' @param printMessage TRUE/FALSE. States whether the parameter should be printed.
+#'
+#' @author Mikey Harper, \email{m.harper@@soton.ac.uk}
+#' @seealso  \code{\link{findParentDirectory}}
+#' @seealso \code{\link{assignUserSystemInfo}}
+#' @export
+#' @family Utilities
+
+displaySystemInfo <- function(printMessage = TRUE){
+
+  if(is.na(sysName)) assignUserSystemInfo()
+  if(is.na(projLoc)) stop("Please specify `projLoc` in the Global Environment")
+
+  if(isTRUE(printMessage)){
+    cat(paste0("Running on ", sysName, " with projLoc = ", projLoc, "\n \n",
+               " User: ",Sys.info()[[7]]), "\n",
+        "Platform: ",Sys.info()[[4]], "\n",
+        "Input data path: ",dPath, "\n")
+  }
+
+}
+
+#' Installs and loads packages
+#'
+#' \code{myRequiredPackages} checks whether the package is already installed,
+#'   installing those which are not preinstalled. All the libraries are then load.
+#'
+#'   Especially useful when running on virtual machines where package installation
+#'   is not persistent (Like UoS sve). It will fail if the packages need to be
+#'   installed but there is no internet access
+#' @param ... A list of packages
+#' @param repository The repository to load functions from. Defaults to "https://cran.ma.imperial.ac.uk/"
+#' @importFrom  utils install.packages
+#'
+#' @author Luke Blunden, \email{lsb@@soton.ac.uk} (original)
+#' @author Michael Harper \email{m.harper@@soton.ac.uk} (revised version)
+#' @export
+#' @family Utilities
+
+myRequiredPackages <- function(..., repository = "https://cran.rstudio.com/"){
+
+  packages <- c(...)
+
+  # Find if package isn't installed
+  newPackages <- packages[!(packages %in% utils::installed.packages()[,1])]
+
+  # Install if required
+  if (length(newPackages)){utils::install.packages(newPackages, dependencies = TRUE)}
+
+  # Load packages
+  sapply(packages, require, character.only = TRUE)
+}
+
+
+#' Installs and loads packages from Github
+#'
+#' \code{myRequiredPackagesGithub} checks whether the package is already installed.
+#'   If the package is not detected, it will attempt to install it from GitHub,
+#'   before loading the packages.
+#'
+#'
+#' @param ... A list of packages
+#' @importFrom  utils install.packages
+#'
+#' @author Michael Harper \email{m.harper@@soton.ac.uk}
+#' @export
+#' @family Utilities
+
+myRequiredPackagesGithub <- function(..., quiet = FALSE){
+
+  packages <- c(...)
+
+  # Only keep package name
+  packageName <- gsub("^.*/", "", packages)
+
+  # Find if package isn't installed
+  newPackages <- packages[!(packageName %in% utils::installed.packages()[,1])]
+
+  # Install if required
+  if (length(newPackages)){devtools::install_github(newPackages)}
+
+  # Load packages
+  sapply(packageName, require, character.only = TRUE)
+}
+
+#' Compresses any file to .gz file
+#'
+#' The function will check and delete any existing compressed file
+#'
+#' @param file path to file
+#' @param output file
+#' @author Ben Anderson
+#'
+#' @family Utilities
+gzipIt <- function(file){
+
+  # Check if input exists
+  if(!file.exists(file)) stop("Input `file` not located.")
+
+  # Path of output file
+  zipped <- paste0(file, ".gz")
+
+  # Rremove old gzip file if exists
+  if(file.exists(zipped)){
+    message("Deleting old File...")
+    file.remove(zipped)
+    message("DONE\n")
+  }
+
+  # Gzip new one
+  # in case it fails (it will on windows - you will be left with the file)
+  message(paste0("gzipping file to: ", zipped))
+  try(system( paste0("gzip ", file)))
+  message("Gzipped file created.")
+}
+
+#' Extract a compressed .7z file
+#'
+#' The function will unpack a compressed file
+#'
+#' @param inFile name of archive file
+#' @param fileDir directory containing file
+#'
+#' @author Tom Rushby, \email{(t.w.rushby@@soton.ac.uk)}
+#' @export
+#'
+#' @family Utilities
+#' @examples
+#' extract7z(fileDir = "/SotonGitLab/SERG/SAVE", inFile = "LED_installs.7z")
+#' extract7z(fileDir = outFileDir, inFile = inFile)
+extract7z <- function(fileDir, inFile){
+
+  filePath <- paste0(fileDir,inFile)
+  # Check if input file exists
+  if(!file.exists(filePath)) stop("Input `file` does not exist.")
+
+  archive::archive(filePath)
+
+  # create output file directory
+  print(paste0("Extracting file to location: ",fileDir,"\n"))
+
+  archive::archive_extract(filePath, fileDir, file = NULL)
+
+}
+
+
+#' Find user information
+#'
+#' \code{UserSystemInfo} lists the user system information. No parameters have to
+#' be provided
+#'
+#' @return The system name, username and computer.
+#' @author Mikey Harper, \email{m.harper@@soton.ac.uk}
+#' @export
+#' @family Utilities
+assignUserSystemInfo <- function(){
+
+  assign("sysInfo", Sys.info(), envir = globalenv())
+  assign("sysName", sysInfo[[1]], envir = globalenv())
+  assign("nodeName", sysInfo[[4]], envir = globalenv())
+  assign("userName", sysInfo[[7]], envir = globalenv())
+
+  rm(sysInfo, envir = globalenv())
+}
+
+#' Converts timestamps between local times
+#'
+#' Overrides the timestamp of a timestamp, converting the clock time to a different
+#'  local time.
+#'
+#' @description
+#' All data collected within the SAVE project is collected in UTC time. For the analysis
+#'  of behaviour patterns, it is important that the local time is used
+#'
+#' @param timestamp a POSIXct, POSIXlt, Date, chron date-time object or a data.frame object.
+#' @param from,to valid timezone values
+#' @export
+#' @author Michael Harper
+#'
+#' @family Utilities
+convertTimezone <- function(timestamp, from = "UTC", to =  "Europe/London", message = TRUE, warning = TRUE){
+
+  # check valid timestamps
+  if(!is.POSIXct(timestamp)) stop("timestamp must be a POSIXct object")
+  if(!(from %in% OlsonNames())) stop("specified 'from' is not a valid timezone")
+  if(!(to %in% OlsonNames())) stop("specified 'to' is not a valid timezone")
+
+  # Check the input timestamp
+  inputTimezone <- attr(timestamp, "tzone")
+
+  if(from != inputTimezone){
+
+    # Prints warning if specified
+    if(warning) warning("Input timezone does not match the specified object.",
+                        "Timezone of object: ", inputTimezone, "\n",
+                        "Timezone specified: ", from, "\n")
+
+
+    if(message) message("Timezone overwritten by function without conversion of the clock time")
+
+    timestamp <- lubridate::force_tz(time = timestamp, tzone = from)
+  }
+
+  # Convert ID
+  timestamp2 <- lubridate::with_tz(time = timestamp, tzone = to)
+
+  return(timestamp2)
+
+}