2_baselineModel_v1_3.Rmd

# Model version tag; echoed in the run summary printed below
modelVersion <- "v1_3"

# set model v1 paths for BA laptop ----
if (startsWith(userName, "ben")) { # => BA laptop
  m1iPath <- paste0(iPath, "model_v1/")
  m1oPath <- paste0(dPath, "model_outputs/model_v1/")
}

# set paths for DM laptop ----
if (startsWith(userName, "despina")) { # => DM laptop
  m1iPath <- iPath
  m1oPath <- oPath
}

# Report the run configuration.
# NOTE(review): m1iPath / m1oPath are only assigned here for the two users
# above - for any other userName they must already exist, otherwise the
# print() calls below fail.
print(paste0("User: ", userName))
print(paste0("Platform: ", sysName))
print(paste0("Model version: ", modelVersion))
print(paste0("Input data path: ", m1iPath))
print(paste0("Output data path: ", m1oPath))

# Load the baseline household properties table from the model input folder
hhPropsDT <- data.table(read.csv(paste0(m1iPath, "rdata/hhpropstemp_v1_2.csv")))

# Preview of the raw input (caption previously carried a stray ')')
kable(head(hhPropsDT, 10),
      caption = "First 10 rows")

kable(summary(hhPropsDT),
      caption = "Summary of baseline hh properties")

# Number of households in the modelled area (one row per household)
n_hh <- nrow(hhPropsDT)

# Project helper; presumably assigns the `metered` flag that the check table
# below reads - TODO confirm against the helper's definition
hhPropsDT <- ba_IMPETUSsetMeteredHH(hhPropsDT)

# check: column percentages (margin = 2), i.e. % metered status within each
# occupancy level
kable(100 * round(prop.table(table(hhPropsDT$metered, hhPropsDT$occupancy), 2), 3),
      caption = "Check % metered distribution")
# Load the micro-component summary table (Parker, 2014) from the input folder
mlmDataSummaryDT <- data.table(
  read.csv(
    paste0(iPath, "Parker-2014-Thesis-Micro-Component-Golden-100_mlm_summary.csv")
  )
)

kable(mlmDataSummaryDT,
      caption = "Summary of mean/median values per micro-component (Parker, 2014, all households)")

# Combine the base household DT with the selected l/day for each microcomponent

#hhMcDataDT <- ba_IMPETUSsetBaseline(mlmDataSummaryDT, hhPropsDT)

# Set basic consumption from the Parker thesis summary and merge it onto the
# household table.
# We start from the medians but keep only the rows relevant to the model we
# are running: the rows whose `type` is "Final values <modelVersion>".
# Requires that modelVersion is set.
mcMedianDT <- mlmDataSummaryDT[
  type == paste0("Final values ", modelVersion),
  .(
    metered = var, # `var` is renamed so it can be used as the merge key
    Basin.parkerMedian,
    Bath.parkerMedian,
    Dishwasher.parkerMedian,
    External.parkerMedian,
    KitchenSink.parkerMedian,
    Shower.parkerMedian,
    WC.parkerMedian,
    WashingMachine.parkerMedian
  )
]

# Calculate the modelled consumption using the median data for all households
# from Parker. Ideally we'd want to match occupancy levels but Parker has no
# tables that allow us to do this, so we match on metered / not metered only.
setkey(mcMedianDT, metered)
setkey(hhPropsDT, metered)

# Left join on the shared key: every household is kept and the matching
# median row is repeated across the metered / unmetered households
hhMcDataDT <- merge(hhPropsDT, mcMedianDT, all.x = TRUE)


# At this point the medians are not corrected for occupancy (or anything
# else), so they are constant across occupancy groups - the table below shows
# this for Basin and Bath:
sumt <- hhMcDataDT[
  ,
  .(
    meanBasin = mean(Basin.parkerMedian),
    sdBasin = sd(Basin.parkerMedian),
    meanBath = mean(Bath.parkerMedian),
    sdBath = sd(Bath.parkerMedian)
  ),
  keyby = .(metered, occupancy)
]

kable(sumt, caption = "Test l/day")

# Cut-off values for bath, shower and external use
# NOTE(review): presumably in l/day and applied to the simulated values later
# in the script - confirm against the code that uses them
bathCut <- 30
showerCut <- 5
externalCut <- 10
# To fix this problem we calculate a distribution for each micro-component
# using a skew-normal distribution with mean = the Parker median.

# Use rsnorm to re-distribute the input consumption values.
# We have chosen parameters that produce distributions to match Parker's graphs.
# We have no other information on the s.d. to use but Figure 5.3 (p110) of
# Parker's thesis suggests something close to:
# rsnorm(n, mean = 15, sd = 5, xi = 4)) - Skew Normal Distribution
# https://stackoverflow.com/questions/20254084/plot-normal-left-and-right-skewed-distribution-in-r

nObs <- nrow(hhMcDataDT)

# All eight draws are made in one := call; the expressions are evaluated in
# the order listed, so the random number stream is consumed in the same
# sequence as drawing them one column at a time.
hhMcDataDT <- hhMcDataDT[, `:=`(
  Basin.rsnorm = rsnorm(nObs, mean = Basin.parkerMedian, sd = 3, xi = 3),
  Bath.rsnorm = rsnorm(nObs, mean = Bath.parkerMedian, sd = 10, xi = 2),
  Dishwasher.rsnorm = rsnorm(nObs, mean = Dishwasher.parkerMedian, sd = 2, xi = 3),
  External.rsnorm = rsnorm(nObs, mean = External.parkerMedian, sd = 8, xi = 3),
  KitchenSink.rsnorm = rsnorm(nObs, mean = KitchenSink.parkerMedian, sd = 3, xi = 3),
  Shower.rsnorm = rsnorm(nObs, mean = Shower.parkerMedian, sd = 5, xi = 2),
  WC.rsnorm = rsnorm(nObs, mean = WC.parkerMedian, sd = 15, xi = 2),
  WashingMachine.rsnorm = rsnorm(nObs, mean = WashingMachine.parkerMedian, sd = 14, xi = 3)
)]

# Implement the 'cuts': values below bathCut / externalCut / showerCut
# (l/day) are set to zero, everything else is left unchanged
hhMcDataDT <- hhMcDataDT[, `:=`(
  Bath.rsnorm = ifelse(Bath.rsnorm < bathCut, 0, Bath.rsnorm),
  External.rsnorm = ifelse(External.rsnorm < externalCut, 0, External.rsnorm),
  Shower.rsnorm = ifelse(Shower.rsnorm < showerCut, 0, Shower.rsnorm)
)]

# Visual check: one density plot per simulated micro-component.
# myCaption / myTitle are set as globals before each call (presumably read by
# the plotting helper - TODO confirm). Wrapping the assignment in parentheses
# stores the plot in myPlot and prints it in a single step.
myCaption <- "Baseline consumption (litres/day/household), rsnorm distribution"

myTitle <- "Basin use"
(myPlot <- ba_IMPETUSmakeDensityPlot(hhMcDataDT, "Basin.rsnorm"))

myTitle <- "Bath use"
(myPlot <- ba_IMPETUSmakeDensityPlot(hhMcDataDT, "Bath.rsnorm"))

myTitle <- "Dishwasher use"
(myPlot <- ba_IMPETUSmakeDensityPlot(hhMcDataDT, "Dishwasher.rsnorm"))

myTitle <- "External use"
(myPlot <- ba_IMPETUSmakeDensityPlot(hhMcDataDT, "External.rsnorm"))

myTitle <- "Kitchen sink use"
(myPlot <- ba_IMPETUSmakeDensityPlot(hhMcDataDT, "KitchenSink.rsnorm"))

myTitle <- "Shower use"
(myPlot <- ba_IMPETUSmakeDensityPlot(hhMcDataDT, "Shower.rsnorm"))

myTitle <- "WC use"
(myPlot <- ba_IMPETUSmakeDensityPlot(hhMcDataDT, "WC.rsnorm"))

myTitle <- "Washing machine use"
(myPlot <- ba_IMPETUSmakeDensityPlot(hhMcDataDT, "WashingMachine.rsnorm"))

# Keep just the simulated (rsnorm) columns for a quick distribution summary
testDT <- hhMcDataDT[
  ,
  .(
    Basin.rsnorm, Bath.rsnorm, Dishwasher.rsnorm, External.rsnorm,
    KitchenSink.rsnorm, Shower.rsnorm, WC.rsnorm, WashingMachine.rsnorm
  )
]

kable(summary(testDT),
      caption = "Test Basin & Bath l/day as modelled using rsnorm()")

# Mean of every simulated micro-component by metered status
t <- hhMcDataDT[
  ,
  lapply(.SD, mean),
  by = .(metered),
  .SDcols = c(
    "Basin.rsnorm", "Bath.rsnorm", "Dishwasher.rsnorm", "External.rsnorm",
    "KitchenSink.rsnorm", "Shower.rsnorm", "WC.rsnorm", "WashingMachine.rsnorm"
  )
]
# Drop the ".rsnorm" suffix so the columns are named Basin, Bath, ...
# (`metered` has no such suffix and is left as is)
setnames(t, sub("\\.rsnorm$", "", names(t)))

kable(t, caption = "Metered vs un-metered estimates (mean)")

# NOTE(review): the "wm" prefix suggests washing machine but the values are
# read from the WC column - confirm which micro-component is intended
wmMetered <- t[metered == "Metered", WC]
wmNonMetered <- t[metered == "Not metered", WC]
kable(occupancyAdjCoeffDT,
      caption = "Table of occupancy adjustments (from [@parkerThesis2014] Table A3/A4, p221-222, zeros indicate n/s coefficients)")


# Adjust baseline rsnorm values according to Parker's occupancy coefficients.
# Both tables are keyed on (metered, occupancy) so the merge matches each
# household to its adjustment row.
setkey(occupancyAdjCoeffDT, metered, occupancy) # adjustment table
setkey(hhMcDataDT, metered, occupancy) # household data

# Left join: all household rows are kept and gain the coefficient columns
occAdjValuesDT <- merge(hhMcDataDT, occupancyAdjCoeffDT, all.x = TRUE)

# Compute adjusted values for each household by adding the relevant occupancy
# coefficient to the simulated value. Occupancy is a factor in Parker's model
# (compared to 1 person), so the coefficient is 0 for single person households
# and the addition has no effect for them.
# Multiply by 30 to convert from daily to monthly if desired (we work in
# daily values).
# := adds the columns by reference, so occAdjValuesDT gains them as well.
hhDataFinalDaily_v1_3DT <- occAdjValuesDT[, `:=`(
  Basin.baseline = Basin.rsnorm + Basin,
  Bath.baseline = Bath.rsnorm + Bath,
  Dishwasher.baseline = Dishwasher.rsnorm + Dishwasher,
  External.baseline = External.rsnorm + External,
  KitchenSink.baseline = KitchenSink.rsnorm + KitchenSink,
  Shower.baseline = Shower.rsnorm + Shower,
  WC.baseline = WC.rsnorm + WC,
  WashingMachine.baseline = WashingMachine.rsnorm + WashingMachine
)]

# Keep only the identifiers and the final occupancy-adjusted values to avoid
# confusion with the intermediate columns
hhDataFinalDaily_v1_3DT <- hhDataFinalDaily_v1_3DT[
  ,
  .(
    metered, occupancy, hhid, lsoaarea, dtype,
    Basin.baseline, Bath.baseline, Dishwasher.baseline, External.baseline,
    KitchenSink.baseline, Shower.baseline, WC.baseline, WashingMachine.baseline
  )
]

# Total daily use = sum of the eight micro-components, accumulated left to
# right in the order listed
hhDataFinalDaily_v1_3DT <- hhDataFinalDaily_v1_3DT[
  ,
  sumDaily.baseline := Reduce(`+`, .SD),
  .SDcols = c(
    "Basin.baseline", "Bath.baseline", "Dishwasher.baseline", "External.baseline",
    "KitchenSink.baseline", "Shower.baseline", "WC.baseline", "WashingMachine.baseline"
  )
]


# Check table: Basin and Bath mean / s.d. by metered status and occupancy
sumt <- hhDataFinalDaily_v1_3DT[
  ,
  .(
    meanBasin.baseline = round(mean(Basin.baseline), 2),
    sdBasin.baseline = round(sd(Basin.baseline), 2),
    meanBath.baseline = round(mean(Bath.baseline), 2),
    sdBath.baseline = round(sd(Bath.baseline), 2)
  ),
  keyby = .(metered, occupancy)
]

kable(sumt, caption = "Test monthly fixed and occupancy adjusted l/day")
# Sanity checks: one box plot by occupancy per micro-component, then the total.
# myCaption / myTitle are set as globals before each call (presumably read by
# the plotting helper - TODO confirm). Wrapping the assignment in parentheses
# stores the plot in myPlot and prints it in a single step.
myCaption <- "Baseline household consumption, corrected for occupancy and rsnorm distributed"

myTitle <- "Basin use"
(myPlot <- ba_IMPETUSmakeOccupancyBoxPlot(hhDataFinalDaily_v1_3DT, "Basin.baseline"))

myTitle <- "Bath use"
(myPlot <- ba_IMPETUSmakeOccupancyBoxPlot(hhDataFinalDaily_v1_3DT, "Bath.baseline"))

myTitle <- "Dishwasher use"
(myPlot <- ba_IMPETUSmakeOccupancyBoxPlot(hhDataFinalDaily_v1_3DT, "Dishwasher.baseline"))

myTitle <- "Kitchen sink use"
(myPlot <- ba_IMPETUSmakeOccupancyBoxPlot(hhDataFinalDaily_v1_3DT, "KitchenSink.baseline"))

myTitle <- "External use"
(myPlot <- ba_IMPETUSmakeOccupancyBoxPlot(hhDataFinalDaily_v1_3DT, "External.baseline"))

myTitle <- "Shower use"
(myPlot <- ba_IMPETUSmakeOccupancyBoxPlot(hhDataFinalDaily_v1_3DT, "Shower.baseline"))

myTitle <- "WC use"
(myPlot <- ba_IMPETUSmakeOccupancyBoxPlot(hhDataFinalDaily_v1_3DT, "WC.baseline"))

myTitle <- "Washing machine use"
(myPlot <- ba_IMPETUSmakeOccupancyBoxPlot(hhDataFinalDaily_v1_3DT, "WashingMachine.baseline"))

myTitle <- "Total daily use"
(myPlot <- ba_IMPETUSmakeOccupancyBoxPlot(hhDataFinalDaily_v1_3DT, "sumDaily.baseline"))


kable(est2013dataFig2DT,
      caption = "Summary of mean daily litres per household with different occupancy levels (EST, 2013, imputed data using assumptions and survey of unknown bias)")

# Model means by occupancy: per household and per capita (household total
# divided by the number of occupants)
modelDT <- hhDataFinalDaily_v1_3DT[
  ,
  .(
    modelMean = mean(sumDaily.baseline),
    modelPcc = mean(sumDaily.baseline / occupancy)
  ),
  by = occupancy
]
# Relabel occupancy 6 as "6+" before joining to the EST table on `occupancy`
modelDT <- modelDT[, occupancy := ifelse(occupancy == 6, "6+", occupancy)]
setkey(modelDT, occupancy)

# Give the EST table a matching `occupancy` key column (copied from N.people)
est2013dataFig2DT <- est2013dataFig2DT[, occupancy := N.people]
setkey(est2013dataFig2DT, occupancy)

# Join the EST rows onto the model rows and plot both sources together
# need to fix so colours constant for source
ggplot(est2013dataFig2DT[modelDT], aes(x = factor(occupancy))) +
  geom_point(
    aes(y = AverageLitresPerDayPerHousehold, colour = "Per household (EST, 2013)"),
    shape = 1
  ) +
  geom_point(
    aes(y = AverageLitresPerDayPerCapita, colour = "Per capita (EST, 2013)"),
    shape = 1
  ) +
  geom_point(
    aes(y = modelMean, colour = "Per household (IMPETUS Model)"),
    shape = 7
  ) +
  geom_point(
    aes(y = modelPcc, colour = "Per capita (IMPETUS model)"),
    shape = 7
  ) +
  theme(legend.title = element_blank()) +
  labs(
    title = "Total use (l/day, EST 2013)",
    y = "'Average' litres/day",
    x = "N people"
  )

kable(est2013dataFig1DT,
      caption = "Summary of mean daily litres per micro-component (EST, 2013, imputed data using assumptions and survey of unknown bias)")

# Plot caption shared by both comparison charts below.
# NOTE(review): the household count (n = 1800) is hard-coded in the text -
# confirm it matches the number of rows in the household table.
myCap <- "IMPETUS model: synthetic households (n = 1800)\n Model v1"

# Mean l/day per micro-component across all households, with labels changed
# to match EST. The constant `vol` column is only there to drive the recast.
modelDT <- hhDataFinalDaily_v1_3DT[
  ,
  .(
    vol = "vol",
    Basin = round(mean(Basin.baseline), 2),
    Dishwasher = round(mean(Dishwasher.baseline), 2),
    External = round(mean(External.baseline), 2),
    "Kitchen Sink" = round(mean(KitchenSink.baseline), 2),
    Bath = round(mean(Bath.baseline), 2),
    Shower = round(mean(Shower.baseline), 2),
    Toilet = round(mean(WC.baseline), 2),
    "Washing machine" = round(mean(WashingMachine.baseline), 2),
    Total = round(mean(sumDaily.baseline), 2)
  )
]
# recast: one row per usage, with the mean in a column named `vol`
modelDT <- dcast(melt(modelDT, id.vars = "vol"), variable ~ vol)
modelDT <- modelDT[, Usage := variable]
modelDT <- modelDT[, source := "IMPETUS Model"]
modelTot <- modelDT[Usage == "Total", vol]
modelDT <- modelDT[, pcTot := 100 * (vol / modelTot)] # % of the model total

# Sorts the rows by these columns (it does not remove any variables;
# `variable` is dropped explicitly below)
setkey(modelDT, Usage, variable, source)

# Work on a copy: a plain `estDT <- est2013dataFig1DT` only aliases the table,
# so the := calls below would also add columns to est2013dataFig1DT.
estDT <- copy(est2013dataFig1DT)
estDT <- estDT[, pcTot := percentOfTotalDaily]
estDT <- estDT[, vol := imputed.l.hh.day]
estDT <- estDT[, source := "EST (2013)"]

# Remove the columns that have been superseded by the renamed ones
modelDT$variable <- NULL
estDT$percentOfTotalDaily <- NULL
estDT$imputed.l.hh.day <- NULL

plotDT <- rbind(estDT, modelDT)

plotDT <- plotDT[, Usage := as.factor(Usage)]
# relevel() makes "Total" the first factor level of the ordered usage axis
plotDT <- plotDT[, UsageRo := relevel(Usage, "Total")]

# Mean l/day per usage, model vs EST (printed directly)
ggplot(plotDT, aes(x = UsageRo, fill = source)) +
  geom_col(aes(y = vol), position = "dodge") +
  labs(y = "Mean l/day",
       x = "Usage",
       caption = myCap) +
  coord_flip() # rotate for legibility

# Share of the household total per usage (the "Total" row itself is excluded)
estPlot <- ggplot(plotDT[Usage != "Total"], aes(x = UsageRo, fill = source)) +
  geom_col(aes(y = pcTot), position = "dodge") +
  labs(y = "% household total",
       x = "Usage",
       caption = myCap) +
  coord_flip() # rotate for legibility

estPlot

# Black-and-white theme for the saved figure.
# NOTE(review): theme_bw() restyles the panel and background only; the bar
# fill colours are not converted to grey scale - confirm that is intended.
estPlot <- estPlot + theme_bw()

# Pass the plot explicitly instead of relying on ggsave()'s default of
# last_plot(), so the saved figure does not depend on which plot was built
# or printed most recently.
ggsave("plots_v1/Fig2_CompareModelv1_3withEST2013.png", plot = estPlot, dpi = 400)


# Save out summary stats by occupancy & metered/unmetered for comparison:
# the mean of every baseline micro-component per group
summaryDT <- hhDataFinalDaily_v1_3DT[
  ,
  lapply(.SD, mean),
  by = .(occupancy, metered),
  .SDcols = c(
    "Basin.baseline", "Bath.baseline", "Dishwasher.baseline", "External.baseline",
    "KitchenSink.baseline", "Shower.baseline", "WC.baseline", "WashingMachine.baseline"
  )
]
# Rename X.baseline -> meanX; the grouping columns have no ".baseline" suffix
# and keep their names
setnames(summaryDT, sub("^(.*)\\.baseline$", "mean\\1", names(summaryDT)))

# Output file name carries today's date and the model version
oFile <- paste0(
  m1oPath, "summary_output-hh-baseline-mcm-consumption_",
  format(Sys.time(), "%Y-%m-%d"), "_", modelVersion, ".csv"
)

write.csv(summaryDT, oFile)

# Final household data: save out the baseline micro-component consumption
# for every household
oFile <- paste0(
  m1oPath, "output-hh-baseline-mcm-consumption_",
  format(Sys.time(), "%Y-%m-%d"), "_", modelVersion, ".csv"
)

keepDT <- hhDataFinalDaily_v1_3DT[
  ,
  .(
    hhid, occupancy, metered,
    Basin.baseline, Bath.baseline, Dishwasher.baseline,
    KitchenSink.baseline, Shower.baseline, WC.baseline,
    WashingMachine.baseline, External.baseline, sumDaily.baseline
  )
]
write.csv(keepDT, oFile)

# compress it
# now gzip new one
# print(paste0("gzipping file to: ", oFile, ".gz"))
# cmd <- paste0("gzip -f ", oFile) # forces over-write
# try(system(cmd)) # in case it fails (it will on windows - you will be left with a .csv file)
# path issue - fails