From b65bbeca374b61c2d4d0cbb1fc3445e9792a869c Mon Sep 17 00:00:00 2001
From: Ben Anderson <dataknut@icloud.com>
Date: Thu, 23 Apr 2020 23:01:24 +0100
Subject: [PATCH] processing NSPL file

---
 dataProcessing/natStatsPostcodeLookup.R | 82 +++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 dataProcessing/natStatsPostcodeLookup.R

diff --git a/dataProcessing/natStatsPostcodeLookup.R b/dataProcessing/natStatsPostcodeLookup.R
new file mode 100644
index 0000000..039168c
--- /dev/null
+++ b/dataProcessing/natStatsPostcodeLookup.R
@@ -0,0 +1,82 @@
+# Process the National Statistics Postcode Lookup (NSPL) into an MSOA-level look-up table (postcode/OA/LSOA counts plus area names)
+library(data.table)
+library(drake)
+
+# Folder holding the NSPL download, and the main csv inside it.
+# nsplPath ends in "/" and fName starts with "/", so f has a doubled separator;
+# the strings are kept as they are because later paths are built the same way.
+nsplPath <- "~/Data/UK_NSPL/NSPL_FEB_2020_UK/"
+fName <- "/Data/NSPL_FEB_2020_UK.csv"
+f <- paste0(nsplPath, fName)
+
+# Stop here with a clear error if the csv is missing, instead of only printing
+# FALSE and failing later when the drake target tries to read it.
+stopifnot(file.exists(f))
+file.info(f) # quite big
+
+# Drake plan ----
+
+# The plan has a single target, nsplData: the NSPL csv at `f` read with
+# data.table::fread(). Declaring the load as a drake target lets make() decide
+# whether it has to be run again.
+
+# drake_plan() only records the command; nothing is read at this point.
+plan <- drake::drake_plan(nsplData = data.table::fread(f))
+
+
+# Code ----
+
+# > run drake plan ----
+plan # print the plan to check it
+make(plan) # run the plan, re-loading data if needed
+
+# fetch the nsplData target built by make() as a data.table
+nsplDT <- drake::readd(nsplData)
+names(nsplDT) # quite a few
+nrow(nsplDT) # was nrows(), which does not exist and stopped the script here
+
+nsplDT[, counter := 1] # constant column; the MSOA summary below averages it as a check (should be 1)
+
+# Restart point: re-run from here if msoaDT gets broken.
+# pcon and park are not grouped on: they split MSOAs, so we would get duplicate MSOAs.
+msoaDT <- nsplDT[,
+  list(
+    meanCounter = mean(counter), # should be 1
+    nPostcodes = .N,
+    nOAs = uniqueN(oa11),
+    nLSOAs = uniqueN(lsoa11)
+  ),
+  keyby = list(msoa11, ctry, rgn, cty) # higher-level areas we still want each MSOA labelled with
+]
+
+# The NSPL data hold area codes only; the human-readable names are in separate lookup csv files under Documents/
+msoaNames <- data.table::fread(paste0(nsplPath, "/Documents/MSOA (2011) names and codes UK as at 12_12.csv"))
+ctryNames <- data.table::fread(paste0(nsplPath, "/Documents/Country names and codes UK as at 08_12.csv"))
+rgnNames <- data.table::fread(paste0(nsplPath, "/Documents/Region names and codes EN as at 12_10 (GOR).csv"))
+
+# The code columns are named differently in each lookup, so copy the code into a column named as in msoaDT, then key both tables on it
+msoaNames[, msoa11 := MSOA11CD]
+setkey(msoaNames, msoa11)
+setkey(msoaDT, msoa11)
+nrow(msoaDT) # row count before the join, for comparison
+head(msoaDT)
+msoaDTl <- msoaDT[msoaNames] # keyed join: msoaDT rows looked up for each row of msoaNames
+nrow(msoaDTl) # one less
+head(msoaDTl) # we dropped the first row: postcodes that aren't allocated to an MSOA
+
+ctryNames[, ctry := CTRY12CD] # copy the lookup's code column to the name used in msoaDTl
+setkey(ctryNames, ctry)
+setkey(msoaDTl, ctry)
+msoaDTl <- msoaDTl[ctryNames] # keyed join on ctry; result follows the rows of ctryNames
+
+rgnNames[, rgn := GOR10CD] # same again for the region lookup
+setkey(rgnNames, rgn)
+setkey(msoaDTl, rgn)
+msoaDTl <- msoaDTl[rgnNames] # NOTE(review): rows whose rgn is not in rgnNames are dropped here - the file name says EN only, confirm
+
+nrow(msoaDTl) # if this row count matches
+uniqueN(msoaDTl$msoa11) # this number of distinct MSOAs, then no MSOA is split across the higher areas we selected
+head(msoaDTl)
+dir.create(paste0(nsplPath, "/processed"), showWarnings = FALSE, recursive = TRUE) # fwrite does not create missing folders
+data.table::fwrite(msoaDTl, file = paste0(nsplPath, "/processed/msoaDT.csv")) # save the file out for use elsewhere
+
-- 
GitLab