diff --git a/ipf/runIPF.R b/ipf/runIPF.R new file mode 100644 index 0000000000000000000000000000000000000000..0ca97406c47784ade87cda686bc467127ff93b23 --- /dev/null +++ b/ipf/runIPF.R @@ -0,0 +1,180 @@ +# Runs ipf for survey & census inputs +# There are intricacies so it is not easy to convert to a function + +require(data.table) +require(ipfp) + +library(spatialec) # for parameters etc +spatialec::setup() + + +# Expects: ---- + +# local area labels +areasDT <- data.table::fread(p_Params$areasTable2013) + +auListDT <- areasDT[, .(nMBs = .N), keyby = .(AU2013_code, AU2013_label, REGC2013_label)] +auListDT <- auListDT[, AU2013_code := as.character(AU2013_code)] # for easier matching +setkey(auListDT, AU2013_code) + +# > an area level constraints dt : censusDT ---- +# use default pre-processed +censusf <- paste0(p_Params$dataPath, "processed/2013IpfInput.csv") +censusDT <- data.table::fread(censusf) +censusDT[, AU2013_code := as.character(AU2013_code)] # saves a lot of confusion + +# > a survey constraints dt (vars in same order) : surveyDT ---- +# use defualt pre-processed +surveyf <- paste0(p_Params$ggPath, "/safe/survey/ggIpfInput.csv") +surveyDT <- data.table::fread(surveyf) + +# > a dt of area codes ---- +areaCodesDT <- censusDT[, .(AU2013_code)] +# > a dt of household ids ---- +hhIdsDT <- surveyDT[, .(linkID)] + +# Set the constraints & set order ---- + +# relies heavily on: http://robinlovelace.net/spatial-microsim-book/smsim-in-R.html +# ipfp wants: +# y = numeric vector of constraints for each area - the names must match the survey, data as counts, no ids or anything +# which means we have to assume ipfp maintains order +# A = the transposed survey data (row = var) in dummy variable form - names must match constraints, no ids etc, variables that match constraints only +# which means we have to assume ipfp maintains order +# x0 = numeric initial vector of length ncol + +# set order of variables - has to match (probably) + + + +fullConstraintsList <- c("heatSourceWood", "heatSourceElectricity", "heatSourceGas", "heatSourceCoal", "heatSourceOther", + "nKids_0", "nKids_1m", + #"nBedrooms_1_2", "nBedrooms_3", "nBedrooms_4m", + "nRooms1_4", "nRooms5_6", "nRooms7m", + "nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m" # fits last +) + + +# coal = 0 might break the IPF + +testConstraintsList <- c(#"heatSourceWood","heatSourceElectricity","heatSourceGas","heatSourceCoal","heatSourceOther", + "nKids_0", "nKids_1m", + #"nBedrooms_1_2", "nBedrooms_3", "nBedrooms_4m", + "nRooms1_4","nRooms5_6","nRooms7m", + "nPeople_1", "nPeople_2", "nPeople_3", "nPeople_4m" # fits last +) + + + +# Set up census constraints ---- +# check totals are not 0 +#censusDT <- censusDT[, nBedrooms_Total := nBedrooms_1_2 + nBedrooms_3 + nBedrooms_4m] # not yet added +censusDT <- censusDT[, nPeople_Total := nPeople_1 + nPeople_2 + nPeople_3 + nPeople_4m] +censusDT <- censusDT[, nRooms_Total := nRooms1_4 + nRooms5_6 + nRooms7m] +censusDT <- censusDT[, nKids_Total := nKids_0 + nKids_1m] +censusDT <- censusDT[, heatSource_Total := heatSourceWood + heatSourceElectricity + heatSourceGas + heatSourceCoal + heatSourceOther] + +t <- summary(au2013DT[, .SD, .SDcols = names(au2013DT) %like% "_Total"]) +t +message("Removing areas which have total counts for any constraint = 0") + +nOrig <- nrow(censusDT) +zerosDT <- censusDT[nPeople_Total == 0 | is.na(nPeople_Total)| + #nBedrooms_Total == 0 | is.na(nBedrooms_Total)| + nRooms_Total == 0 | is.na(nRooms_Total)| + nKids_Total == 0 | is.na(nKids_Total)| + heatSource_Total == 0 | is.na(heatSource_Total), + .(AU2013_code = as.character(AU2013_code), nRooms_Total,nPeople_Total, nKids_Total, + heatSource_Total + ) + ] + +setkey(zerosDT, AU2013_code) +setkey(areaCodesDT, AU2013_code) +zerosDT <- areaCodesDT[zerosDT] +zerosDT <- zerosDT[, drop := 1] + +test <- zerosDT[censusDT] + +# Remove areas where count is 0 or NA ---- +censusDT <- test[is.na(drop)] +nNonZero <- nrow(censusDT) + +finalAreaCodes <- censusDT[, AU2013_code] # need this as a list + +message("Removed ", nOrig - nNonZero, " areas where the household totals were 0 or NA.") + +# Select just the constraints we want to use ---- +print(paste0("Using ", testConstraintsList)) + +ipfInputSurveyDT <- surveyDT[,testConstraintsList , with=FALSE] +message("Input survey constraints:") +names(ipfInputSurveyDT) + +ipfInputConstraintsDT <- censusDT[, testConstraintsList, with=FALSE] +message("Input census constraints:") +names(ipfInputConstraintsDT) + +# Run IPF ---- +# returns a long form weights dt with oacodes labelled + +# must be numeric +sdt <- apply(ipfInputSurveyDT, 2, function(x) as.numeric(x)) +surveyInputDT <- data.table::as.data.table(sdt) +# transpose for ipf +surv_dtt <- t(surveyInputDT) # <- needs to be numeric too (not integers) - not sure why + +# set up initial vector of the same length as the number of households as a load of 1s (could be the response weights...) +x0 <- rep(1, nrow(surveyInputDT)) + +# census constraints can't be zero +cdt <- apply(ipfInputConstraintsDT, 2, function(x) ifelse(x == 0, 0.001, x)) +censusInputDT <- data.table::as.data.table(cdt) + +# create 2D weighst matrix (households by areas) - this will get filled with the weights +weights = array(NA, dim=c(nrow(surveyInputDT), nrow(censusInputDT))) + +for(i in 1:ncol(weights)){ # for each area (cols of weights) + print(paste0("Running ipfp on area ", i)) + weights[,i] <- ipfp::ipfp(as.numeric(cdt[i,]), # only accepts a vector of numbers + surv_dtt, + x0, + maxit = 10, # maxit = 10 iterations (for now), + verbose = F) # careful with verbose - too much output +} + +# Process results ---- + +# this is where we have to assume ipfp maintained the oaCode order +wDT <- as.data.table(weights) # convert to dt +wDT <- setnames(wDT, as.character(finalAreaCodes)) # add col names + +# this is where we have to assume ipfp maintained the bmg_id order +dt <- cbind(hhIdsDT, wDT) # add linkID to each row of weights (fills down, if you get recycling then there is a problem) +setkey(dt, linkID) + +# need to reshape the weightsDT into 1 column of weights with oacode set to oacode etc +longFormDT <- as.data.table(melt(dt, id=c("linkID"))) +# add correct area code name for future matching +longFormDT <- setnames(longFormDT, c("variable", "value"), c("AU2013_code", "ipfWeight")) +setkey(longFormDT,AU2013_code) +wDT <- NULL # save mem + +#> add region ---- + +longFormDT <- auListDT[longFormDT] + +kableExtra::kable(caption = "First few rows/columns of ipf weights table", head(longFormDT)) + +kableExtra::kable(caption = "Summary of ipf weights table", summary(longFormDT)) + +message("Dimensions of ipf weights table: ", dkUtils::tidyNum(nrow(longFormDT)), " rows x ", ncol(longFormDT), " cols") + +# > remove zero weight cases ---- +message("N rows before removing zero weights: ", nrow(longFormDT)) +longFormDT <- longFormDT[ipfWeight > 0] +message("N rows after removing zero weights: ", nrow(longFormDT)) + +# save the results for future use since they won't change unless we change the constraints ---- +data.table::fwrite(longFormDT, paste0(p_Params$ggPath, "/safe/ipf/nonZeroWeightsAu2013.csv")) +