postcode wrangling

77a23743 · Ben Anderson · 56e50b7c · 77a23743
Commit 77a23743 authored 3 years ago by Ben Anderson
--- a/R/postcodeWrangling.R
+++ b/R/postcodeWrangling.R
+# postcodeWrangling.R: doing stuff with postcodes
+library(data.table) # fast
+# root folder holding the postcode data; keep the trailing slash because the
+# file paths below are built with paste0()
+dp <- path.expand("~/Dropbox/data/UK_postcodes/")
+# GOR region names & codes lookup ----
+# workbook path is relative to dp
+df <- "NSPL_AUG_2020_UK/Documents/Region names and codes EN as at 12_10 (GOR).xlsx"
+region_codes <- readxl::read_xlsx(path = paste0(dp, df))
+# data.table copy keyed on the GOR code so it can be joined to the postcode data
+region_codes_dt <- data.table::as.data.table(region_codes)
+data.table::setkeyv(region_codes_dt, "GOR10CD")
+# derive postcode districts & add GOR region names ----
+# > 2016 ----
+# build the path from dp (as for the other files) so that moving the data
+# folder only needs dp to change; note data.table::fread needs the R.utils
+# package installed to read .gz files
+df <- paste0(dp, "NSPL_AUG_2016_UK_V2/Data/NSPL_AUG_2016_UK.csv.gz")
+rawDT <- data.table::fread(df)
+# the characters before the space in pcds are the postcode district
+rawDT[, pcd_district := data.table::tstrsplit(pcds, " ", fixed = TRUE, keep = 1L)]
+# should be 3088 (UK) + 26 = 3114 (UK + IoM & Channel Islands)
+data.table::uniqueN(rawDT$pcd_district)
+# copy gor into a column named like the lookup's key so the two tables can be
+# joined on their keys
+rawDT[, `:=`(GOR10CD = gor)]
+data.table::setkeyv(rawDT, "GOR10CD")
+# keyed join: keeps every postcode row and adds the lookup's columns
+rawDT <- region_codes_dt[rawDT]
+# number of postcodes per region, skipping rows with osgrdind of 9 or more
+rawDT[osgrdind < 9, list(n = .N), keyby = list(gor, GOR10NM)]
+# there are some rows without a GOR10NM (or gor)
+head(rawDT[is.na(GOR10NM), ])
+# count postcodes per district & region, removing terminated postcodes & those
+# without grid references.
+# NOTE(review): per the NSPL user guide doterm (date of termination) is empty
+# for live postcodes, which fread reads as NA. The earlier `doterm < 201608`
+# test is NA for those rows, so data.table dropped every live postcode and
+# kept only terminated ones - confirm against the data.
+pcd_districts_2016_dt <- rawDT[is.na(doterm) & osgrdind < 9,
+                               .(nPostcodes = .N), keyby = .(pcd_district, GOR10CD, GOR10NM)]
+data.table::uniqueN(pcd_districts_2016_dt$pcd_district)
+skimr::skim(pcd_districts_2016_dt)
+# all postcode districts should now match to a GOR
+table(pcd_districts_2016_dt$GOR10CD, pcd_districts_2016_dt$GOR10NM, useNA = "always")
+# save the file for future use
+data.table::fwrite(pcd_districts_2016_dt, file = paste0(dp, "postcode_districts_2016.csv"))