diff --git a/dataProcessing/processGreenGridSurvey.R b/dataProcessing/processGreenGridSurvey.R
new file mode 100644
index 0000000000000000000000000000000000000000..bc234e22cd6a4058436c4f34c9e88255a1e89c93
--- /dev/null
+++ b/dataProcessing/processGreenGridSurvey.R
@@ -0,0 +1,277 @@
+# Loads GREEN Grid Household survey data
+# processes to match census 2013 as far as possible
+# saves processed file for future use
+require(spatialec)
+myPackages <- c(
+  "data.table", # data munching
+  "dkUtils", # utils - from https://github.com/dataknut/dkUtils
+  "ggplot2", # plot linear imputation model results
+  "here", # easy path management - https://speakerdeck.com/jennybc/zen-and-the-art-of-workflow-maintenance?slide=49
+  "skimr" # for skim
+)
+
+spatialec::loadLibraries(myPackages) # this will try to install packages it can't load
+spatialec::setup()
+
+# --- Project Settings ----
+source(paste0(p_Params$projRoot, "/_scripts/spatialecSettings.R"))   # loads knitr options & assigns file paths and loads system info
+
+# ---- Local Settings ----
+p_Params$censusDataSource <- "http://archive.stats.govt.nz/Census/2013-census/data-tables/meshblock-dataset.aspx"
+
+p_Params$ggPath <- "~/Data/NZ_GREENGrid" # over-rides default for speed
+
+# Print system information
+outputMessage("Running on", p_Params$sysName, 
+              "from projLoc =", projRoot)   # sysName is set in spatialecSettings.R
+
+## Load data ----
+f <- paste0(p_Params$ggPath, "/safe/survey/ggHouseholdAttributesSafe_2019-04-09.csv") # latest - includes heat source & 
+message("Loading: ", f)
+ggHhDT <- data.table::fread(f)
+message("N rows loaded: ", dkUtils::tidyNum(nrow(ggHhDT)))
+message("N households loaded: ", dkUtils::tidyNum(uniqueN(ggHhDT$linkID)))
+
+
+# We need household & Census files with just the variables (constraints) we're going to use. This will take a bit of creative re-coding.
+
+## Families: Presence of children ----
+
+# Survey: 
+# 
+#  * nChildren0_12: 0,1,2+
+#  * nTeenagers13_18: 0,1,2+
+# 
+# Census:
+# 
+#  * 0,1+ (simplest)
+
+#>> survey: number of kids: categorised as: 1,2,3,4+ ----
+ggHhDT <- ggHhDT[, nKids :=  nChildren0_12 + nTeenagers13_18]
+table(ggHhDT$nKids, useNA = "always")
+message("There are NA - we will need to remove them later")
+ggHhDT <- ggHhDT[, nKids_0 := ifelse(nKids == 0, 1, 0)]
+ggHhDT <- ggHhDT[, nKids_1m := ifelse(nKids > 0 , 1, 0)]
+
+message("Household data: n kids")
+table(ggHhDT$nKids_0)
+table(ggHhDT$nKids_1m)
+
+# create category
+ggHhDT <- ggHhDT[, presenceKids := ifelse(nKids == 0, "None", "1+")]
+
+message("Testing nKids")
+table(ggHhDT$nKids, ggHhDT$presenceKids, useNA = "always")
+
+
+# Households: Number of people ----
+
+#>> survey: number of people: categorised as: 1,2,3,4+ ----
+ggHhDT <- ggHhDT[, nPeople := nAdults + nChildren0_12 + nTeenagers13_18]
+table(ggHhDT$nPeople, useNA = "always")
+message("There are NA - we will need to remove them later")
+ggHhDT <- ggHhDT[, nPeople_1 := ifelse(nPeople == 1, 1, 0)]
+ggHhDT <- ggHhDT[, nPeople_2 := ifelse(nPeople == 2, 1, 0)]
+ggHhDT <- ggHhDT[, nPeople_3 := ifelse(nPeople == 3, 1, 0)]
+ggHhDT <- ggHhDT[, nPeople_4m := ifelse(nPeople > 3, 1, 0)]
+
+message("Household data: n people")
+table(ggHhDT$nPeople_1)
+table(ggHhDT$nPeople_2)
+table(ggHhDT$nPeople_3)
+table(ggHhDT$nPeople_4m)
+
+# create category
+ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_1 == 1, "1", NA)]
+ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_2 == 1, "2", nPeopleCat)]
+ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_3 == 1, "3", nPeopleCat)]
+ggHhDT <- ggHhDT[, nPeopleCat := ifelse(nPeople_4m == 1, "4+", nPeopleCat)]
+
+message("Testing n People")
+table(ggHhDT$nPeopleCat, ggHhDT$nPeople, useNA = "always")
+
+# Dwellings: Number of rooms ----
+
+# We have to do some imputation here
+
+#>> survey: n bedrooms via Q10 ----
+ggHhDT <- ggHhDT[, nRooms := `Q10#1_1_1_TEXT` + # bedrooms
+                   `Q10#1_2_1_TEXT` + # living rooms
+                   `Q10#1_3_1_TEXT` + # dining rooms-kitchens
+                   `Q10#1_4_1_TEXT` + # kitchens
+                   `Q10#1_5_1_TEXT`  # studies
+                 # Q10#1_6_1_TEXT + # bathrooms - excluded from census http://archive.stats.govt.nz/Census/2013-census/info-about-2013-census-data/2013-census-definitions-forms/definitions/dwelling.aspx
+                 # Q10#1_7_1_TEXT + # toilets - excluded from census
+                 # Q10#1_8_1_TEXT + # laundries - excluded from census
+                 ]
+message("Testing original n rooms (Q10)")
+table(ggHhDT$nRooms, useNA = "always")
+message("We're quite short of responses to nRooms - we will try to impute them from the number of adults and children using a regression model...")
+message("Test the model")
+r <- lm(ggHhDT$nRooms ~ ggHhDT$nAdults + ggHhDT$nKids)
+
+summary(r)
+
+# >> impute where missing ----
+message("that looks reasonable...impute where missing")
+ggHhDT <- ggHhDT[, nRoomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
+
+# check what we got
+ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRooms)) + 
+  geom_point() # original shape
+
+ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nRoomsImputed)) + 
+  geom_point()  + # imputed shape
+  labs(caption = "Well, it is a linear model...")
+
+message("So we use the imputed values only where we had NA")
+ggHhDT <- ggHhDT[, nRoomsCorrected := ifelse(is.na(nRooms), 
+                                             round(nRoomsImputed),
+                                             nRooms
+)
+]
+
+message("How bad does that look?")
+ggHhDT <- ggHhDT[, roomsImputed := ifelse(is.na(nRooms), "Imputed",
+                                          "observed")]
+ggplot2::ggplot(ggHhDT, aes(x = nRoomsImputed, y = nRoomsCorrected, colour = roomsImputed)) + 
+  geom_jitter() # 
+
+message("Well, we can live with that for now...")
+
+message("There are NA - we will need to remove them later")
+ggHhDT <- ggHhDT[, nRooms1_4 := ifelse(nRoomsCorrected < 5, 1, 0)]
+ggHhDT <- ggHhDT[, nRooms5_6 := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, 1, 0)]
+ggHhDT <- ggHhDT[, nRooms7m := ifelse(nRoomsCorrected > 6, 1, 0)]
+table(ggHhDT$nRooms1_4)
+table(ggHhDT$nRooms5_6)
+table(ggHhDT$nRooms7m)
+
+# create category
+ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected < 5, "1-4", NA)]
+ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 4 & nRoomsCorrected < 7, "5-6", nRoomsCat)]
+ggHhDT <- ggHhDT[, nRoomsCat := ifelse(nRoomsCorrected > 6, "7+", nRoomsCat)]
+
+message("Check n Rooms coding")
+table(ggHhDT$nRoomsCat, ggHhDT$nRoomsCorrected, useNA = "always")
+
+# Dwellings: Number of bedrooms ----
+
+# Probably collinear with n people/n rooms. Also have to impute.
+
+#> survey: Q10#1_1_1_TEXT ----
+
+ggHhDT <- ggHhDT[, nBedrooms := `Q10#1_1_1_TEXT`]
+table(ggHhDT$nBedrooms,  useNA = "always")
+# best to do 1-2, 3, 4m
+
+message("We're quite short of responses to nBedrooms - we will try to impute them from the number of people using a regression model...")
+message("Test the model")
+r <- lm(ggHhDT$nBedrooms ~ ggHhDT$nAdults + ggHhDT$nKids)
+
+summary(r)
+
+# > impute where missing ----
+message("that looks reasonable...impute where missing")
+ggHhDT <- ggHhDT[, nBedroomsImputed := predict(r, ggHhDT)] # this forces predict to update the NAs with an estimate
+
+# check what we got
+ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedrooms)) + 
+  geom_point() # original shape
+
+ggplot2::ggplot(ggHhDT, aes(x = nPeople, y = nBedroomsImputed)) + 
+  geom_point()  + # imputed shape
+  labs(caption = "Well, it is a linear model...")
+
+message("So we use the imputed values only where we had NA and we round them to whole numbers")
+ggHhDT <- ggHhDT[, nBedroomsCorrected := ifelse(is.na(nBedrooms), 
+                                                round(nBedroomsImputed),
+                                                nBedrooms
+)
+]
+
+message("How bad does that look?")
+ggHhDT <- ggHhDT[, bedroomsImputed := ifelse(is.na(nBedrooms), "Imputed",
+                                             "observed")]
+ggplot2::ggplot(ggHhDT, aes(x = nBedroomsImputed, y = nBedroomsCorrected, colour = bedroomsImputed)) + 
+  geom_jitter() # 
+
+message("Well, we can live with that for now...")
+
+ggHhDT <- ggHhDT[, nBedrooms_1_2 := ifelse(nBedroomsCorrected < 3, 1, 0)]
+ggHhDT <- ggHhDT[, nBedrooms_3 := ifelse(nBedroomsCorrected == 3, 1, 0)]
+ggHhDT <- ggHhDT[, nBedrooms_4m := ifelse(nBedroomsCorrected > 3, 1, 0)]
+message("Household data: n bedrooms")
+table(ggHhDT$nBedrooms_1_2)
+table(ggHhDT$nBedrooms_3)
+table(ggHhDT$nBedrooms_4m)
+
+# create category
+ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_1_2 == 1, "1-4", NA)]
+ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_3 == 1, "5-6", nBedroomsCat)]
+ggHhDT <- ggHhDT[, nBedroomsCat := ifelse(nBedrooms_4m == 1, "7+", nBedroomsCat)]
+
+message("Testing n bedrooms")
+table(ggHhDT$nBedroomsCat, ggHhDT$nBedroomsCorrected, useNA = "always")
+
+
+# Notice that the total counts for bedrooms are lower because they are counts of _dwellings_ instead of households.
+
+# Dwellings: Main heat source for hot water ----
+
+# > Survey - known ----
+# > Census - unknown ----
+
+# Dwellings: Main fuel used for heat ----
+
+#> survey: Q20 ----
+
+table(ggHhDT$Q20_coded,  useNA = "always")
+
+ggHhDT <- ggHhDT[, heatFuel := dplyr::recode(Q20_coded,
+                                             "Enclosed wood burner" = "Wood",
+                                             "Open fire" = "Wood", # could be coal
+                                             "Heat pump"= "Electricity",
+                                             "HRV or other ventilation system" = "Electricity",
+                                             "Portable electric heaters" = "Electricity",
+                                             "Portable gas heater" = "Gas", 
+                                             "Underfloor gas heating" = "Gas",
+                                             "Other" = "Other")]
+
+table(ggHhDT$Q20_coded,  ggHhDT$heatFuel, useNA = "always")
+ggHhDT <- ggHhDT[, heatSourceWood := ifelse(heatFuel == "Wood", 1,0)]
+ggHhDT <- ggHhDT[, heatSourceElectricity := ifelse(heatFuel == "Electricity", 1,0)]
+ggHhDT <- ggHhDT[, heatSourceGas := ifelse(heatFuel == "Gas", 1,0)]
+ggHhDT <- ggHhDT[, heatSourceCoal := ifelse(heatFuel == "Coal", 1,0)]
+ggHhDT <- ggHhDT[, heatSourceOther := ifelse(heatFuel == "Other", 1,0)]
+
+message("Testing main heat source")
+table(ggHhDT$heatFuel,  ggHhDT$Location, useNA = "always")
+
+
+# Dwelling: Dwelling type ----
+
+#> Survey - not known ----
+#> Census - known ----
+
+# Fix final survey data ----
+
+# Filter survey data to include just the variables we want and fix any NAs.
+
+surveyDT <- ggHhDT[, .(linkID, 
+                       nKids_0, nKids_1m,
+                       nPeople_1, nPeople_2, nPeople_3, nPeople_4m,
+                       nRooms1_4, nRooms5_6, nRooms7m,
+                       heatSourceWood, heatSourceElectricity, heatSourceGas, heatSourceCoal, heatSourceOther,
+                       nBedrooms_1_2, nBedrooms_3, nBedrooms_4m)]
+
+skimr::skim(surveyDT)
+
+message("There are still a few NAs so we need to remove these households")
+surveyDT <- na.omit(surveyDT)
+
+skimr::skim(surveyDT)
+message("Fixed it?")
+
+# save data for re-use ----
+data.table::fwrite(surveyDT, paste0(p_Params$ggPath, "/safe/survey/ggIpfInput.csv"))