# Provenance: this script was introduced by git patch
#   b65bbeca374b61c2d4d0cbb1fc3445e9792a869c
#   From:    Ben Anderson <dataknut@icloud.com>
#   Date:    Thu, 23 Apr 2020 23:01:24 +0100
#   Subject: [PATCH] processing NSPL file
#   File:    dataProcessing/natStatsPostcodeLookup.R (new file)

# Process NSPL to get MSOA level look up table of useful stuff
#
# Steps:
#   1. load the NSPL postcode csv via a drake plan (so the load is cached)
#   2. aggregate postcodes to one row per msoa11/ctry/rgn/cty combination
#   3. attach the MSOA, country and region label tables by keyed joins
#   4. write the labelled table to <nsplPath>/processed/msoaDT.csv
library(data.table)
library(drake)

# Input locations ----

nsplPath <- "~/Data/UK_NSPL/NSPL_FEB_2020_UK/"
fName <- "/Data/NSPL_FEB_2020_UK.csv"
f <- paste0(nsplPath, fName)

# quick diagnostics only: the results are printed, not acted on
file.exists(f)
file.info(f) # quite big

# Drake plan ----

# this is where we use drake if we can
# NOTE(review): the plan refers to the path through the variable `f`; whether
# drake notices changes to the csv itself (rather than to `f`) should be
# confirmed - see drake's file_in() documentation.
plan <- drake::drake_plan(
  nsplData = data.table::fread(f)
)

# Code ----

# > run drake plan ----
plan # test the plan
drake::make(plan) # run the plan, re-loading data if needed

nsplDT <- drake::readd(nsplData)
names(nsplDT) # quite a few

# nrow() is the row-count function; `nrows` is not a function and would stop
# the script with "could not find function"
nrow(nsplDT)

# constant column: its per-group mean is a sanity check in the aggregation
nsplDT[, counter := 1]

# > aggregate to MSOA level ----

# if you broke something start again from here
msoaDT <- nsplDT[
  ,
  .(
    meanCounter = mean(counter), # should be 1
    nPostcodes = .N,
    nOAs = uniqueN(oa11),
    nLSOAs = uniqueN(lsoa11)
  ),
  # all the things we still want labelled into which MSOAs fit
  keyby = .(msoa11, ctry, rgn, cty)
]

# MSOAs are split by:
#   pcon
#   park
# so don't use these here (we get duplicate MSOAs)

# > load label tables ----

# annoyingly the data are non-human readable codes and the 'labels' are
# elsewhere in multiple files
msoaNames <- data.table::fread(
  paste0(nsplPath, "/Documents/MSOA (2011) names and codes UK as at 12_12.csv")
)
ctryNames <- data.table::fread(
  paste0(nsplPath, "/Documents/Country names and codes UK as at 08_12.csv")
)
# the file name must be a single-line string: a line break inside the literal
# would become part of the path and the file would not be found
rgnNames <- data.table::fread(
  paste0(nsplPath, "/Documents/Region names and codes EN as at 12_10 (GOR).csv")
)

# > attach labels ----

# and the linkable columns even have different names. How useful is that?
# Each join below is a keyed data.table join of the form X[Y], which returns
# the rows of Y with the matching columns of X attached.
msoaNames[, msoa11 := MSOA11CD]
setkey(msoaNames, msoa11)
setkey(msoaDT, msoa11)
nrow(msoaDT)
head(msoaDT)
msoaDTl <- msoaDT[msoaNames]
nrow(msoaDTl) # one less
# we dropped the first row of postcodes that aren't allocated to an MSOA
head(msoaDTl)

ctryNames[, ctry := CTRY12CD]
setkey(ctryNames, ctry)
setkey(msoaDTl, ctry)
msoaDTl <- msoaDTl[ctryNames]

# NOTE(review): the region file name says "EN"; confirm it also carries codes
# for the non-English rows, otherwise those MSOAs are lost in this join.
rgnNames[, rgn := GOR10CD]
setkey(rgnNames, rgn)
setkey(msoaDTl, rgn)
msoaDTl <- msoaDTl[rgnNames]

# > check and save ----

nrow(msoaDTl) # if this matches
# this then we have no MSOAs split across the higher areas we selected
uniqueN(msoaDTl$msoa11)
head(msoaDTl)

# make sure the output folder exists, otherwise fwrite() cannot create the file
outDir <- paste0(nsplPath, "/processed")
if (!dir.exists(outDir)) {
  dir.create(outDir, recursive = TRUE)
}

# save the file out for use elsewhere
data.table::fwrite(msoaDTl, file = paste0(nsplPath, "/processed/msoaDT.csv"))