# Functions for cleaning & processing NZ Census data

#' Set different NZ Census years from a data.table
#'
#' \code{setYears} takes a 'long' form data.table and returns the same 
#' data.table with a new 'year' variable. Function is useful when
#' data files have columns for several years (as in public 2013 MB release). 
#' Don't try to pass in a data.frame as the string match syntax is data.table 
#' specific: anything that is not a data.table is rejected with an error. 
#' A future version might coerce a d.f into a d.t and then reverse 
#' that for the results. Maybe.
#' 
#' Requires that the 'variable_orig' column has the original text column 
#' labels. The year is detected by matching the year string anywhere in the 
#' label. Matches are applied in the order 2001, 2006, 2013, so a label that 
#' contains more than one of these strings ends up with the last one matched. 
#' Rows whose label matches none of them are not assigned a year here.
#' 
#' Currently labelled years are: 2001, 2006, 2013
#' 
#' @param dt the input data table. Must contain a 'variable_orig' column.
#' 
#' @return the data.table \code{dt} with the 'year' column set. The column is 
#' assigned with \code{:=}, i.e. by reference.
#' 
#' @import data.table
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
#'
setYears <- function(dt){
  # Fail early with a readable message instead of a cryptic evaluation error
  # from the data.table-specific syntax below.
  if (!inherits(dt, "data.table")) {
    stop(
      "setYears() requires a data.table, not an object of class ",
      paste(class(dt), collapse = "/"),
      call. = FALSE
    )
  }
  if (!("variable_orig" %in% names(dt))) {
    stop(
      "setYears() requires a 'variable_orig' column holding the original ",
      "column labels",
      call. = FALSE
    )
  }

  # Order matters: a later match overwrites the year set by an earlier one.
  dt <- dt[variable_orig %like% "2001", year := 2001]
  dt <- dt[variable_orig %like% "2006", year := 2006]
  dt <- dt[variable_orig %like% "2013", year := 2013]
  dt
}

#' Fix count labels
#'
#' \code{fixCountLabels} returns a data.table with simplified textual labels in 
#' the `countLabel` column. Function is useful when data files have rows for 
#' several years and the year forms the first part of the original column name 
#' (as in public 2013 MB release). Don't try to pass in a data.frame as the 
#' syntax is data.table specific: anything that is not a data.table is 
#' rejected with an error. 
#' 
#' Requires that the `varText` column holds the original labels. The first 12 
#' characters of each label are dropped unconditionally (the length of a 
#' "20XX_Census_" prefix); the label is not checked for actually starting with 
#' such a prefix. Labels of 12 characters or fewer become empty strings.
#' 
#' @param dt the input data table. Must contain a `varText` column (character, 
#' or factor - factors are converted to character first).
#' 
#' @return the data.table \code{dt} with the `countLabel` column set. The 
#' column is assigned with \code{:=}, i.e. by reference.
#' 
#' @import data.table
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
#'
fixCountLabels <- function(dt){
  # Fail early with a readable message instead of a cryptic evaluation error
  # from the data.table-specific syntax below.
  if (!inherits(dt, "data.table")) {
    stop(
      "fixCountLabels() requires a data.table, not an object of class ",
      paste(class(dt), collapse = "/"),
      call. = FALSE
    )
  }
  if (!("varText" %in% names(dt))) {
    stop(
      "fixCountLabels() requires a 'varText' column holding the original ",
      "column labels",
      call. = FALSE
    )
  }

  # nchar() errors on factors, so coerce first; as.character() leaves a
  # character column unchanged.
  # Start at character 13, i.e. chop off "20XX_Census_".
  dt <- dt[, countLabel := substr(
    as.character(varText), 13, nchar(as.character(varText))
  )]
  dt
}

#' Load census data files and do some initial processing
#'
#' \code{loadData} reads a data file into a data.table, adds an integer 
#' 'code.Int' column converted from the input `Code` character column, keys 
#' the table on 'code.Int' and then drops the rows that are not needed: 
#' rows where `Code` could not be converted to an integer, and rows whose 
#' `Area_Code_and_Description` contains "Footnotes".
#' 
#' The input file must therefore provide `Code` and 
#' `Area_Code_and_Description` columns.
#' 
#' @param f the input data file (assumed to be .csv but data.table::fread copes 
#' with most things)
#' 
#' @return a data.table keyed on 'code.Int' with the unwanted rows removed.
#' 
#' @import data.table
#' @import readr
#'
#' @author Ben Anderson, \email{b.anderson@@soton.ac.uk}
#' @export
#'
loadData <- function(f){
  census <- data.table::fread(f)

  # Integer version of the code, used for ordering & matching.
  census[, code.Int := as.integer(Code)]
  data.table::setkey(census, code.Int)

  # Drop rows we don't need - saves memory & future confusion.
  # These are usually totals of selections or subsets of rows (e.g. Auckland
  # Boards / all regions), plus footnotes and explanations. They really belong
  # in a metadata file rather than in the data.
  #
  # First filter: most of these rows have a code that could not be converted
  # to an integer above, so their code.Int is NA.
  # Second filter: footnotes carry the text "Footnotes" in the
  # Area_Code_and_Description column.
  census[!is.na(code.Int)][!(Area_Code_and_Description %like% "Footnotes")]
}