Commit 03d64ee0 authored by Ben Anderson's avatar Ben Anderson
Browse files

filtered out -ve HP power values (assumed to be instrument error); corrected...

filtered out -ve HP power values (assumed to be instrument error); corrected file loading data type error, re-built small & larger sample analysis
parent bce9ea01
Ben Anderson,ben,ou029107.otago.ac.nz,15.11.2018 14:52,file:///Users/ben/Library/Application%20Support/LibreOffice/4;
\ No newline at end of file
......@@ -51,6 +51,7 @@ rmdLibs <- c("data.table", # data munching
"dkUtils", # utilities from devtools::install_github("dataknut/dkUtils")
"forcats", # category manipulation
"pwr", # power stuff
"GREENGridData", # GREEN grid data loading etc from devtools::install_github("cfsOtago/GREENGridData")
"knitr" # for kable
)
# load them
......@@ -73,11 +74,13 @@ labelProfilePlot <- function(plot){
myParams <- list()
myParams$repoLoc <- dkUtils::findParentDirectory("weGotThePower")
myParams$dPath <- "~/Dropbox/Work/Otago_CfS_Ben/data/nzGREENGrid/dataExtracts/"
myParams$dPath <- "~/Dropbox/Work/Otago_CfS_Ben/data/nzGREENGrid/"
#myParams$dPath <- "~/Data/NZ_GREENGrid/safe/gridSpy/1min/dataExtracts/"
# created from https://dx.doi.org/10.5255/UKDA-SN-853334
# using https://github.com/CfSOtago/GREENGridData/blob/master/examples/code/extractCleanGridSpy1minCircuit.R
heatPumpData <- paste0(myParams$dPath, "Heat Pump_2015-04-01_2016-03-31_observations.csv.gz")
heatPumpData <- paste0(myParams$dPath, "dataExtracts/Heat Pump_2015-04-01_2016-03-31_observations.csv.gz")
ggHHData <- paste0(myParams$dPath, "ggHouseholdAttributesSafe.csv")
myParams$GGDataDOI <- "https://dx.doi.org/10.5255/UKDA-SN-853334"
plotCaption <- paste0("Source: ", myParams$GGDataDOI)
......@@ -142,41 +145,93 @@ This report contains the analysis for a paper of the same name. The text is stor
# Load GREEN Grid Heat Pump data
if(file.exists(heatPumpData)){
message("Loading: ", heatPumpData )
dt <- data.table::as.data.table(readr::read_csv(heatPumpData, progress = TRUE))
gsDT <- GREENGridData::getCleanGridSpyFile(heatPumpData) # load Grid Spy data cleanly, throws a warning about missing fields/columns
# dt <- data.table::as.data.table(readr::read_csv(heatPumpData,
# progress = FALSE,
# col_types = list(col_character(), # hhid
# col_character(), # linkID
# col_datetime(format = ""), # dateTime
# col_character(), # circuit
# col_double()) # power
# )
# )
} else {
message("No such file: ", heatPumpData )
stop()
}
dt <- dt[, month := lubridate::month(r_dateTime)]
dt <- dt[, year := lubridate::year(r_dateTime)]
gst <- summary(gsDT)
gsDT <- gsDT[, month := lubridate::month(r_dateTime)]
gsDT <- gsDT[, year := lubridate::year(r_dateTime)]
# add southern hemisphere season
dt <- dt[, tmpM := lubridate::month(r_dateTime)] # sets 1 (Jan) - 12 (Dec). May already exist but we can't rely on it
dt <- dt[, season := "Summer"] # easiest to set the default to be the one that bridges years
dt <- dt[tmpM >= 3 & tmpM <= 5, season := "Autumn"]
dt <- dt[tmpM >= 6 & tmpM <= 8 , season := "Winter"]
dt <- dt[tmpM >= 9 & tmpM <= 11, season := "Spring"]
gsDT <- gsDT[, tmpM := lubridate::month(r_dateTime)] # sets 1 (Jan) - 12 (Dec). May already exist but we can't rely on it
gsDT <- gsDT[, season := "Summer"] # easiest to set the default to be the one that bridges years
gsDT <- gsDT[tmpM >= 3 & tmpM <= 5, season := "Autumn"]
gsDT <- gsDT[tmpM >= 6 & tmpM <= 8 , season := "Winter"]
gsDT <- gsDT[tmpM >= 9 & tmpM <= 11, season := "Spring"]
# re-order to make sense
dt <- dt[, season := factor(season, levels = c("Spring", "Summer", "Autumn", "Winter"))]
gsDT <- gsDT[, season := factor(season, levels = c("Spring", "Summer", "Autumn", "Winter"))]
# Load GREEN Grid household data
if(file.exists(ggHHData)){
message("Loading: ", ggHHData )
hhDT <- data.table::as.data.table(readr::read_csv(ggHHData))
} else {
message("No such file: ", ggHHData )
stop()
}
knitr::kable(caption = "Summary of grid spy data", gst)
# there are negawatts!
gsDT <- gsDT[, negW := "PosW"]
gsDT <- gsDT[ powerW < 0, negW := "NegaW"]
t <- table(gsDT$linkID,gsDT$negW)
t
prop.table(t)
```
```{r dataPrep}
testDT <- dt[lubridate::hour(r_dateTime) > 15 & # 16:00 ->
hhDT <- hhDT[Q57 == 1, nPeople := "1"]
hhDT <- hhDT[Q57 == 2, nPeople := "2"]
hhDT <- hhDT[Q57 == 3, nPeople := "3"]
hhDT <- hhDT[Q57 > 3, nPeople := "4+"]
setkey(hhDT, linkID)
testDT <- gsDT[lubridate::hour(r_dateTime) > 15 & # 16:00 ->
lubridate::hour(r_dateTime) < 20 & # <- 20:00
lubridate::wday(r_dateTime) != 6 & # not Saturday
lubridate::wday(r_dateTime) != 7 & # not Sunday
year == 2015,
year == 2015 & negW == "PosW", # remove the negawatts (see https://github.com/CfSOtago/GREENGridData/issues/6)
.(meanW = mean(powerW, na.rm = TRUE),
sdW = sd(powerW, na.rm = TRUE)), keyby = .(season, linkID)]
setkey(testDT, linkID)
linkedTestDT <- hhDT[testDT]
testTable <- testDT[, .(meanMeanW = mean(meanW),
sdMeanW = sd(meanW)), keyby = .(season)]
testTable <- linkedTestDT[, .(meanMeanW = mean(meanW),
sdMeanW = sd(meanW),
nHouseholds = .N), keyby = .(season, nPeople)]
knitr::kable(caption = "Summary of mean consumption per household by season", testTable)
ggplot2::ggplot(testDT, aes(y = meanW, x = fct_reorder(linkID, meanW, .desc = TRUE), colour = season)) +
geom_point()
testTable <- linkedTestDT[season == "Winter", .(meanMeanW = mean(meanW),
sdMeanW = sd(meanW),
nHouseholds = .N)]
knitr::kable(caption = "Summary of mean consumption per household in winter", testTable)
# plot distn
ggplot2::ggplot(linkedTestDT, aes(y = meanW, x = fct_reorder(linkID, meanW, .desc = TRUE), colour = nPeople)) +
geom_point() +
facet_wrap(. ~ season)
```
Observations are summarised to mean W per household during 16:00 - 20:00 on weekdays for year = 2015.
......@@ -186,8 +241,9 @@ Observations are summarised to mean W per household during 16:00 - 20:00 on week
testSamples <- seq(50,3000,50)
testPower <- 0.8
testMean <- mean(testDT[season == "Winter"]$meanW)
testSD <- sd(testDT[season == "Winter"]$meanW)
# overall mean
testMean <- mean(linkedTestDT[season == "Winter"]$meanW)
testSD <- sd(linkedTestDT[season == "Winter"]$meanW)
# use package function
meansPowerDT <- weGotThePower::estimateMeanEffectSizes(testMean,testSD,testSamples,testPower) # auto-produces range of p values
......@@ -260,7 +316,7 @@ p <- p + labs(caption = myCaption) +
p <- p +
geom_hline(yintercept = y001, colour = "red") +
geom_segment(x = x001, y = y001, xend = x001, yend = 0, alpha = vLineAlpha,
colour = cbPalette[2])
colour = cbPalette[1])
p <- p +
annotate(geom = "text",
......@@ -269,13 +325,6 @@ p <- p +
label = paste0("Effect size = ", round(y001, 2) ,"% with \n p = 0.01, power = 0.8 and n = 1000"),
hjust = 0) # https://stackoverflow.com/questions/26684023/how-to-left-align-text-in-annotate-from-ggplot2
# add vline at 0.01 effect size for p = 0.01, n = 1000
p001Ref <- meansPowerDT[pValue == "p = 0.01" &
effectSize < ceiling(p001Ref$effectSize) &
effectSize > floor(p001Ref$effectSize)] # for reference line
x001 <- mean(p001Ref$sampleN)
p <- p + geom_segment(x = x001, y = y001, xend = x001, yend = 0, alpha = vLineAlpha,
colour = cbPalette[1])
# add vline at 0.05 effect size for p = 0.01, n = 1000
p005Ref <- meansPowerDT[pValue == "p = 0.05" &
......@@ -295,8 +344,8 @@ p <- p + geom_segment(x = x01, y = y001, xend = x01, yend = 0, alpha = vLineAlp
# add vline at 0.2 effect size for p = 0.01, n = 1000
p02Ref <- meansPowerDT[pValue == "p = 0.2" &
effectSize < ceiling(p001Ref$effectSize+2) &
effectSize > floor(p001Ref$effectSize-2)] # for reference line
effectSize < ceiling(p001Ref$effectSize) &
effectSize > floor(p001Ref$effectSize)] # for reference line
x02 <- mean(p02Ref$sampleN)
p <- p + geom_segment(x = x02, y = y001, xend = x02, yend = 0, alpha = vLineAlpha,
colour = cbPalette[4])
......@@ -305,6 +354,13 @@ p
ggplot2::ggsave("figs/statPowerEsts80means_All.png", p)
```
At same effect size (`r y001`%, n = 1000, p = 0.01):
* p = 0.05, n = `r x005`
* p = 0.1, n = `r x01`
* p = 0.2, n = `r x02`
Full table of results:
```{r meansPowerTable}
......@@ -390,44 +446,50 @@ This may be far too wide an error margin for our purposes so we may instead have
## Getting it 'wrong'
Use base GREENGrid and number of people but re-sample slightly.
> NB: we create a small sample roughly 2 * the size of the GREEN Grid data. Due to small numbers and the random re-sampling with replacement process, there will be random fluctuations in the results with each run. As a result the results in this section will probably not match the results in the paper...
```{r smallNTable}
# mimic an intervention using seasons
testDT <- testDT[season == "Winter", group := "Control"]
testDT <- testDT[season == "Spring", group := "Intervention 1"]
testDT <- testDT[season == "Summer", group := "Intervention 2"]
testDT <- testDT[season == "Autumn", group := "Intervention 3"]
t <- testDT[, .("mean W" = mean(meanW),
# group will be NA where we have no survey data
# select just wnter
linkedTestDT <- linkedTestDT[season == "Winter" & !is.na(nPeople)]
# create small sample - be warned, this is a random process so you will get different results each time you run it
smallTestDT <- sample_frac(linkedTestDT, 2, replace = TRUE)
t <- smallTestDT[, .("mean W" = mean(meanW),
"sd W" = sd(meanW),
"n households" = .N),
keyby = .(group)]
keyby = .(nPeople)]
knitr::kable(t, caption = "Number of households and summary statistics per group")
knitr::kable(t, caption = "Number of households and summary statistics per group (winter heat pump use)")
```
So a sample of `r nrow(smallTestDT)`.
```{r ggMeanDiffs, fig.caption = "Mean W demand per group (Error bars = 95% confidence intervals for the sample mean)"}
plotDT <- testDT[, .(meanW = mean(meanW),
plotDT <- smallTestDT[, .(meanW = mean(meanW),
sdW = sd(meanW),
nObs = .N),
keyby = .(group)
keyby = .(nPeople)
]
myCaption <- paste0("Hypothetical small n sample",
myCaption <- paste0("Hypothetical (very) small n sample",
"\nStatistic: mean W, weekdays 16:00 - 20:00")
makeMeanCIPlot <- function(dt){
# makes the plot, assumes meanW & sdW - not flexible but it does the job
dt <- dt[, ci_upper := meanW + (qnorm(0.975) * sdW/sqrt(nObs))]
dt <- dt[, ci_lower := meanW - (qnorm(0.975) * sdW/sqrt(nObs))]
p <- ggplot2::ggplot(dt, aes(x = group, y = meanW, fill = group)) +
p <- ggplot2::ggplot(dt, aes(x = nPeople, y = meanW, fill = nPeople)) +
geom_col() +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.25) +
guides(fill = guide_legend(title = "Group")) +
labs(y = "Mean W",
x = "Trial group",
x = "N people in household",
caption = myCaption)
return(p)
}
......@@ -436,21 +498,21 @@ makeMeanCIPlot(plotDT)
```
T test group 1
T test 1 <-> 3
```{r tTestTabG1}
# fix
# we are going to compare winter with summer to get a large effect. This is not what we would really do as it is a repeat measures dataset but this is irrelevant for our current purposes.
tTest <- t.test(testDT[group == "Intervention 1"]$meanW, testDT[group == "Control"]$meanW)
tTest <- t.test(smallTestDT[nPeople == "1"]$meanW, smallTestDT[nPeople == "3"]$meanW)
tTestTidy <- broom::tidy(tTest)
tTestTidy$`Control mean` <- tTestTidy$estimate2
tTestTidy$`Intervention 1 mean` <- tTestTidy$estimate1
tTestTidy$`1 person mean` <- tTestTidy$estimate1
tTestTidy$`3 persons mean` <- tTestTidy$estimate2
tTestTidy$`Mean difference` <- tTestTidy$estimate
knitr::kable(tTestTidy[c("Control mean", "Intervention 1 mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (Group 1 vs Control)")
knitr::kable(tTestTidy[c("1 person mean", "3 persons mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (1 vs 3)")
controlW <- tTest$estimate[[2]]
intW <- tTest$estimate[[1]]
......@@ -465,24 +527,25 @@ The results show that the mean power demand for the control group was `r round(c
* 95% confidence interval for the test = `r round(cil,2)` to `r round(ciu,2)` representing _considerable_ uncertainty/variation;
* p value of `r round(tTest$p.value,3)` representing a _relatively low_ risk of a false positive result but which (just) fails the conventional p < 0.05 threshold.
T test Group 2
T test 1 <-> 4+
```{r tTestTabG2}
# fix
# now compare winter & spring for a smaller effect
tTest <- t.test(testDT[group == "Intervention 2"]$meanW, testDT[group == "Control"]$meanW)
tTest <- t.test(smallTestDT[nPeople == "1"]$meanW, smallTestDT[nPeople == "4+"]$meanW)
tTestTidy <- broom::tidy(tTest)
tTestTidy$`Control mean` <- tTestTidy$estimate2
tTestTidy$`Intervention 2 mean` <- tTestTidy$estimate1
tTestTidy$`1 person mean` <- tTestTidy$estimate1
tTestTidy$`4+ persons mean` <- tTestTidy$estimate2
tTestTidy$`Mean difference` <- tTestTidy$estimate
knitr::kable(tTestTidy[c("Control mean", "Intervention 2 mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (Group 2 vs Control)")
knitr::kable(tTestTidy[c("1 person mean", "4+ persons mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (1 vs 4+)")
controlW <- tTest$estimate[[2]]
intW <- tTest$estimate[[1]]
cil <- tTest$conf.int[[1]]
ciu <- tTest$conf.int[[2]]
```
......@@ -494,43 +557,30 @@ Now:
* p value of `r round(tTest$p.value,3)` representing a _higher_ risk of a false positive result which fails the conventional p < 0.05 threshold and also the less conservative p < 0.1.
```{r getN}
# get sample size required for Int Group 2
sd <- testDT[, sd(meanW)]
result <- power.t.test(
n = NULL,
delta = controlW - intW,
sd = sd,
sig.level = 0.05,
power = 0.8,
alternative = c("one.sided")
)
```
To detect Intervention Group 2's effect size of `r round(100 * (1-(intW/controlW)),2)`% would have required control and trial group sizes of `r round(result$n)` respectively.
## Getting it 'right'
> NB: we create a larger sample roughly 40 * the size of the GREEN Grid data. Due to the random re-sampling with replacement process, there will be random fluctuations in the results with each run. As a result the results in this section will probably not exactly match the results in the paper but as the sample is large they should be quite close...
```{r creatLargeN}
# fix.
# we just randomly re-sample the GREEN Grid data
largeTestDT <- sample_frac(testDT, 40, replace = TRUE)
largeTestDT <- sample_frac(linkedTestDT, 40, replace = TRUE)
t <- largeTestDT[, .("mean W" = mean(meanW),
"sd W" = sd(meanW),
"n households" = .N),
keyby = .(group)]
keyby = .(nPeople)]
knitr::kable(t, caption = "Number of households and summary statistics per group")
```
So n = `r nrow(largeTestDT)`
```{r largeNmeanDiffs, fig.cap="Mean W demand per group for large sample (Error bars = 95% confidence intervals for the sample mean)"}
plotDT <- largeTestDT[, .(meanW = mean(meanW),
sdW = sd(meanW),
nObs = .N),
keyby = .(group)
keyby = .(nPeople)
]
myCaption <- paste0("Hypothetical large n sample",
......@@ -538,23 +588,24 @@ myCaption <- paste0("Hypothetical large n sample",
makeMeanCIPlot(plotDT)
```
re-run T tests Control vs Group 1
re-run T tests 1 vs 3
```{r largeNtTestControl-1}
# now compare winter & spring for a smaller effect
tTest <- t.test(largeTestDT[group == "Control"]$meanW, largeTestDT[group == "Intervention 1"]$meanW)
tTest <- t.test(largeTestDT[nPeople == "1"]$meanW, largeTestDT[nPeople == "3"]$meanW)
tTestTidy <- broom::tidy(tTest)
tTestTidy$`Control mean` <- tTestTidy$estimate1
tTestTidy$`Intervention 1 mean` <- tTestTidy$estimate2
tTestTidy$`1 person mean` <- tTestTidy$estimate1
tTestTidy$`3 persons mean` <- tTestTidy$estimate2
tTestTidy$`Mean difference` <- tTestTidy$estimate
knitr::kable(tTestTidy[c("Control mean", "Intervention 1 mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (Intervention 2 vs Control)")
knitr::kable(tTestTidy[c("1 person mean", "3 persons mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (1 vs 3)")
controlW <- tTest$estimate[[2]]
intW <- tTest$estimate[[1]]
controlW <- tTest$estimate[[1]]
intW <- tTest$estimate[[2]]
cil <- tTest$conf.int[[1]]
ciu <- tTest$conf.int[[2]]
```
......@@ -565,23 +616,22 @@ In this case:
* 95% confidence interval for the test = `r round(cil,2)` to `r round(ciu,2)` representing _much less_ uncertainty/variation;
* p value of `r round(tTest$p.value,4)` representing a _very low_ risk of a false positive result as it passes all conventional thresholds.
re-run T tests Control vs Group 2
re-run T tests 1 person vs 4+
```{r largeNtTestControl-2}
# now compare winter & spring for a smaller effect
tTest <- t.test(largeTestDT[group == "Control"]$meanW, largeTestDT[group == "Intervention 2"]$meanW)
tTest <- t.test(largeTestDT[nPeople == "1"]$meanW, largeTestDT[nPeople == "4+"]$meanW)
tTestTidy <- broom::tidy(tTest)
tTestTidy$`Control mean` <- tTestTidy$estimate1
tTestTidy$`Intervention 2 mean` <- tTestTidy$estimate2
tTestTidy$`1 person mean` <- tTestTidy$estimate1
tTestTidy$`4+ persons mean` <- tTestTidy$estimate2
tTestTidy$`Mean difference` <- tTestTidy$estimate
knitr::kable(tTestTidy[c("Control mean", "Intervention 2 mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (Intervention 2 vs Control)")
knitr::kable(tTestTidy[c("1 person mean", "4+ persons mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (1 vs 4+)")
controlW <- tTest$estimate[[2]]
intW <- tTest$estimate[[1]]
controlW <- tTest$estimate[[1]]
intW <- tTest$estimate[[2]]
cil <- tTest$conf.int[[1]]
ciu <- tTest$conf.int[[2]]
```
......
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment