Administrator approval is now required for registering new accounts. If you are registering a new account, and are external to the University, please ask the repository owner to contact ServiceLine to request your account be approved. Repository owners must include the newly registered email address, and specific repository in the request for approval.

Commit 03d64ee0 authored by Ben Anderson's avatar Ben Anderson
Browse files

filtered out -ve HP power values (assumed to be instrument error); corrected...

filtered out -ve HP power values (assumed to be instrument error); corrected file loading data type error, re-built small & larger sample analysis
parent bce9ea01
Ben Anderson,ben,ou029107.otago.ac.nz,15.11.2018 14:52,file:///Users/ben/Library/Application%20Support/LibreOffice/4;
\ No newline at end of file
......@@ -51,6 +51,7 @@ rmdLibs <- c("data.table", # data munching
"dkUtils", # utilities from devtools::install_github("dataknut/dkUtils")
"forcats", # category manipulation
"pwr", # power stuff
"GREENGridData", # GREEN grid data loading etc from devtools::install_github("cfsOtago/GREENGridData")
"knitr" # for kable
)
# load them
......@@ -73,11 +74,13 @@ labelProfilePlot <- function(plot){
myParams <- list()
myParams$repoLoc <- dkUtils::findParentDirectory("weGotThePower")
myParams$dPath <- "~/Dropbox/Work/Otago_CfS_Ben/data/nzGREENGrid/dataExtracts/"
myParams$dPath <- "~/Dropbox/Work/Otago_CfS_Ben/data/nzGREENGrid/"
#myParams$dPath <- "~/Data/NZ_GREENGrid/safe/gridSpy/1min/dataExtracts/"
# created from https://dx.doi.org/10.5255/UKDA-SN-853334
# using https://github.com/CfSOtago/GREENGridData/blob/master/examples/code/extractCleanGridSpy1minCircuit.R
heatPumpData <- paste0(myParams$dPath, "Heat Pump_2015-04-01_2016-03-31_observations.csv.gz")
heatPumpData <- paste0(myParams$dPath, "dataExtracts/Heat Pump_2015-04-01_2016-03-31_observations.csv.gz")
ggHHData <- paste0(myParams$dPath, "ggHouseholdAttributesSafe.csv")
myParams$GGDataDOI <- "https://dx.doi.org/10.5255/UKDA-SN-853334"
plotCaption <- paste0("Source: ", myParams$GGDataDOI)
......@@ -142,41 +145,93 @@ This report contains the analysis for a paper of the same name. The text is stor
# Load GREEN Grid Heat Pump data
if(file.exists(heatPumpData)){
message("Loading: ", heatPumpData )
dt <- data.table::as.data.table(readr::read_csv(heatPumpData, progress = TRUE))
gsDT <- GREENGridData::getCleanGridSpyFile(heatPumpData) # load Grid Spy data cleanly, throws a warning about missing fields/columns
# dt <- data.table::as.data.table(readr::read_csv(heatPumpData,
# progress = FALSE,
# col_types = list(col_character(), # hhid
# col_character(), # linkID
# col_datetime(format = ""), # dateTime
# col_character(), # circuit
# col_double()) # power
# )
# )
} else {
message("No such file: ", heatPumpData )
stop()
}
dt <- dt[, month := lubridate::month(r_dateTime)]
dt <- dt[, year := lubridate::year(r_dateTime)]
gst <- summary(gsDT)
gsDT <- gsDT[, month := lubridate::month(r_dateTime)]
gsDT <- gsDT[, year := lubridate::year(r_dateTime)]
# add southern hemisphere season
dt <- dt[, tmpM := lubridate::month(r_dateTime)] # sets 1 (Jan) - 12 (Dec). May already exist but we can't rely on it
dt <- dt[, season := "Summer"] # easiest to set the default to be the one that bridges years
dt <- dt[tmpM >= 3 & tmpM <= 5, season := "Autumn"]
dt <- dt[tmpM >= 6 & tmpM <= 8 , season := "Winter"]
dt <- dt[tmpM >= 9 & tmpM <= 11, season := "Spring"]
gsDT <- gsDT[, tmpM := lubridate::month(r_dateTime)] # sets 1 (Jan) - 12 (Dec). May already exist but we can't rely on it
gsDT <- gsDT[, season := "Summer"] # easiest to set the default to be the one that bridges years
gsDT <- gsDT[tmpM >= 3 & tmpM <= 5, season := "Autumn"]
gsDT <- gsDT[tmpM >= 6 & tmpM <= 8 , season := "Winter"]
gsDT <- gsDT[tmpM >= 9 & tmpM <= 11, season := "Spring"]
# re-order to make sense
dt <- dt[, season := factor(season, levels = c("Spring", "Summer", "Autumn", "Winter"))]
gsDT <- gsDT[, season := factor(season, levels = c("Spring", "Summer", "Autumn", "Winter"))]
# Load GREEN Grid household data
if(file.exists(ggHHData)){
message("Loading: ", ggHHData )
hhDT <- data.table::as.data.table(readr::read_csv(ggHHData))
} else {
message("No such file: ", ggHHData )
stop()
}
knitr::kable(caption = "Summary of grid spy data", gst)
# there are negawatts!
gsDT <- gsDT[, negW := "PosW"]
gsDT <- gsDT[ powerW < 0, negW := "NegaW"]
t <- table(gsDT$linkID,gsDT$negW)
t
prop.table(t)
```
```{r dataPrep}
testDT <- dt[lubridate::hour(r_dateTime) > 15 & # 16:00 ->
hhDT <- hhDT[Q57 == 1, nPeople := "1"]
hhDT <- hhDT[Q57 == 2, nPeople := "2"]
hhDT <- hhDT[Q57 == 3, nPeople := "3"]
hhDT <- hhDT[Q57 > 3, nPeople := "4+"]
setkey(hhDT, linkID)
testDT <- gsDT[lubridate::hour(r_dateTime) > 15 & # 16:00 ->
lubridate::hour(r_dateTime) < 20 & # <- 20:00
lubridate::wday(r_dateTime) != 6 & # not Saturday
lubridate::wday(r_dateTime) != 7 & # not Sunday
year == 2015,
year == 2015 & negW == "PosW", # remove the negawatts (see https://github.com/CfSOtago/GREENGridData/issues/6)
.(meanW = mean(powerW, na.rm = TRUE),
sdW = sd(powerW, na.rm = TRUE)), keyby = .(season, linkID)]
setkey(testDT, linkID)
linkedTestDT <- hhDT[testDT]
testTable <- testDT[, .(meanMeanW = mean(meanW),
sdMeanW = sd(meanW)), keyby = .(season)]
testTable <- linkedTestDT[, .(meanMeanW = mean(meanW),
sdMeanW = sd(meanW),
nHouseholds = .N), keyby = .(season, nPeople)]
knitr::kable(caption = "Summary of mean consumption per household by season", testTable)
ggplot2::ggplot(testDT, aes(y = meanW, x = fct_reorder(linkID, meanW, .desc = TRUE), colour = season)) +
geom_point()
testTable <- linkedTestDT[season == "Winter", .(meanMeanW = mean(meanW),
sdMeanW = sd(meanW),
nHouseholds = .N)]
knitr::kable(caption = "Summary of mean consumption per household in winter", testTable)
# plot distn
ggplot2::ggplot(linkedTestDT, aes(y = meanW, x = fct_reorder(linkID, meanW, .desc = TRUE), colour = nPeople)) +
geom_point() +
facet_wrap(. ~ season)
```
Observations are summarised to mean W per household during 16:00 - 20:00 on weekdays for year = 2015.
......@@ -186,8 +241,9 @@ Observations are summarised to mean W per household during 16:00 - 20:00 on week
testSamples <- seq(50,3000,50)
testPower <- 0.8
testMean <- mean(testDT[season == "Winter"]$meanW)
testSD <- sd(testDT[season == "Winter"]$meanW)
# overall mean
testMean <- mean(linkedTestDT[season == "Winter"]$meanW)
testSD <- sd(linkedTestDT[season == "Winter"]$meanW)
# use package function
meansPowerDT <- weGotThePower::estimateMeanEffectSizes(testMean,testSD,testSamples,testPower) # auto-produces range of p values
......@@ -260,7 +316,7 @@ p <- p + labs(caption = myCaption) +
p <- p +
geom_hline(yintercept = y001, colour = "red") +
geom_segment(x = x001, y = y001, xend = x001, yend = 0, alpha = vLineAlpha,
colour = cbPalette[2])
colour = cbPalette[1])
p <- p +
annotate(geom = "text",
......@@ -269,13 +325,6 @@ p <- p +
label = paste0("Effect size = ", round(y001, 2) ,"% with \n p = 0.01, power = 0.8 and n = 1000"),
hjust = 0) # https://stackoverflow.com/questions/26684023/how-to-left-align-text-in-annotate-from-ggplot2
# add vline at 0.01 effect size for p = 0.01, n = 1000
p001Ref <- meansPowerDT[pValue == "p = 0.01" &
effectSize < ceiling(p001Ref$effectSize) &
effectSize > floor(p001Ref$effectSize)] # for reference line
x001 <- mean(p001Ref$sampleN)
p <- p + geom_segment(x = x001, y = y001, xend = x001, yend = 0, alpha = vLineAlpha,
colour = cbPalette[1])
# add vline at 0.05 effect size for p = 0.01, n = 1000
p005Ref <- meansPowerDT[pValue == "p = 0.05" &
......@@ -295,8 +344,8 @@ p <- p + geom_segment(x = x01, y = y001, xend = x01, yend = 0, alpha = vLineAlp
# add vline at 0.2 effect size for p = 0.01, n = 1000
p02Ref <- meansPowerDT[pValue == "p = 0.2" &
effectSize < ceiling(p001Ref$effectSize+2) &
effectSize > floor(p001Ref$effectSize-2)] # for reference line
effectSize < ceiling(p001Ref$effectSize) &
effectSize > floor(p001Ref$effectSize)] # for reference line
x02 <- mean(p02Ref$sampleN)
p <- p + geom_segment(x = x02, y = y001, xend = x02, yend = 0, alpha = vLineAlpha,
colour = cbPalette[4])
......@@ -305,6 +354,13 @@ p
ggplot2::ggsave("figs/statPowerEsts80means_All.png", p)
```
At same effect size (`r y001`%, n = 1000, p = 0.01):
* p = 0.05, n = `r x005`
* p = 0.1, n = `r x01`
* p = 0.2, n = `r x02`
Full table of results:
```{r meansPowerTable}
......@@ -390,44 +446,50 @@ This may be far too wide an error margin for our purposes so we may instead have
## Getting it 'wrong'
Use base GREENGrid and number of people but re-sample slightly.
> NB: we create a small sample roughly 2 * the size of the GREEN Grid data. Due to small numbers and the random re-sampling with replacement process, there will be random fluctuations in the results with each run. As a result the results in this section will probably not match the results in the paper...
```{r smallNTable}
# mimic an intervention using seasons
testDT <- testDT[season == "Winter", group := "Control"]
testDT <- testDT[season == "Spring", group := "Intervention 1"]
testDT <- testDT[season == "Summer", group := "Intervention 2"]
testDT <- testDT[season == "Autumn", group := "Intervention 3"]
t <- testDT[, .("mean W" = mean(meanW),
# group will be NA where we have no survey data
# select just wnter
linkedTestDT <- linkedTestDT[season == "Winter" & !is.na(nPeople)]
# create small sample - be warned, this is a random process so you will get different results each time you run it
smallTestDT <- sample_frac(linkedTestDT, 2, replace = TRUE)
t <- smallTestDT[, .("mean W" = mean(meanW),
"sd W" = sd(meanW),
"n households" = .N),
keyby = .(group)]
keyby = .(nPeople)]
knitr::kable(t, caption = "Number of households and summary statistics per group")
knitr::kable(t, caption = "Number of households and summary statistics per group (winter heat pump use)")
```
So a sample of `r nrow(smallTestDT)`.
```{r ggMeanDiffs, fig.caption = "Mean W demand per group (Error bars = 95% confidence intervals for the sample mean)"}
plotDT <- testDT[, .(meanW = mean(meanW),
plotDT <- smallTestDT[, .(meanW = mean(meanW),
sdW = sd(meanW),
nObs = .N),
keyby = .(group)
keyby = .(nPeople)
]
myCaption <- paste0("Hypothetical small n sample",
myCaption <- paste0("Hypothetical (very) small n sample",
"\nStatistic: mean W, weekdays 16:00 - 20:00")
makeMeanCIPlot <- function(dt){
# makes the plot, assumes meanW & sdW - not flexible but it does the job
dt <- dt[, ci_upper := meanW + (qnorm(0.975) * sdW/sqrt(nObs))]
dt <- dt[, ci_lower := meanW - (qnorm(0.975) * sdW/sqrt(nObs))]
p <- ggplot2::ggplot(dt, aes(x = group, y = meanW, fill = group)) +
p <- ggplot2::ggplot(dt, aes(x = nPeople, y = meanW, fill = nPeople)) +
geom_col() +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.25) +
guides(fill = guide_legend(title = "Group")) +
labs(y = "Mean W",
x = "Trial group",
x = "N people in household",
caption = myCaption)
return(p)
}
......@@ -436,21 +498,21 @@ makeMeanCIPlot(plotDT)
```
T test group 1
T test 1 <-> 3
```{r tTestTabG1}
# fix
# we are going to compare winter with summer to get a large effect. This is not what we would really do as it is a repeat measures dataset but this is irrelevant for our current purposes.
tTest <- t.test(testDT[group == "Intervention 1"]$meanW, testDT[group == "Control"]$meanW)
tTest <- t.test(smallTestDT[nPeople == "1"]$meanW, smallTestDT[nPeople == "3"]$meanW)
tTestTidy <- broom::tidy(tTest)
tTestTidy$`Control mean` <- tTestTidy$estimate2
tTestTidy$`Intervention 1 mean` <- tTestTidy$estimate1
tTestTidy$`1 person mean` <- tTestTidy$estimate1
tTestTidy$`3 persons mean` <- tTestTidy$estimate2
tTestTidy$`Mean difference` <- tTestTidy$estimate
knitr::kable(tTestTidy[c("Control mean", "Intervention 1 mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (Group 1 vs Control)")
knitr::kable(tTestTidy[c("1 person mean", "3 persons mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (1 vs 3)")
controlW <- tTest$estimate[[2]]
intW <- tTest$estimate[[1]]
......@@ -465,24 +527,25 @@ The results show that the mean power demand for the control group was `r round(c
* 95% confidence interval for the test = `r round(cil,2)` to `r round(ciu,2)` representing _considerable_ uncertainty/variation;
* p value of `r round(tTest$p.value,3)` representing a _relatively low_ risk of a false positive result but which (just) fails the conventional p < 0.05 threshold.
T test Group 2
T test 1 <-> 4+
```{r tTestTabG2}
# fix
# now compare winter & spring for a smaller effect
tTest <- t.test(testDT[group == "Intervention 2"]$meanW, testDT[group == "Control"]$meanW)
tTest <- t.test(smallTestDT[nPeople == "1"]$meanW, smallTestDT[nPeople == "4+"]$meanW)
tTestTidy <- broom::tidy(tTest)
tTestTidy$`Control mean` <- tTestTidy$estimate2
tTestTidy$`Intervention 2 mean` <- tTestTidy$estimate1
tTestTidy$`1 person mean` <- tTestTidy$estimate1
tTestTidy$`4+ persons mean` <- tTestTidy$estimate2
tTestTidy$`Mean difference` <- tTestTidy$estimate
knitr::kable(tTestTidy[c("Control mean", "Intervention 2 mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (Group 2 vs Control)")
knitr::kable(tTestTidy[c("1 person mean", "4+ persons mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (1 vs 4+)")
controlW <- tTest$estimate[[2]]
intW <- tTest$estimate[[1]]
cil <- tTest$conf.int[[1]]
ciu <- tTest$conf.int[[2]]
```
......@@ -494,43 +557,30 @@ Now:
* p value of `r round(tTest$p.value,3)` representing a _higher_ risk of a false positive result which fails the conventional p < 0.05 threshold and also the less conservative p < 0.1.
```{r getN}
# get sample size required for Int Group 2
sd <- testDT[, sd(meanW)]
result <- power.t.test(
n = NULL,
delta = controlW - intW,
sd = sd,
sig.level = 0.05,
power = 0.8,
alternative = c("one.sided")
)
```
To detect Intervention Group 2's effect size of `r round(100 * (1-(intW/controlW)),2)`% would have required control and trial group sizes of `r round(result$n)` respectively.
## Getting it 'right'
> NB: we create a larger sample roughly 40 * the size of the GREEN Grid data. Due to the random re-sampling with replacement process, there will be random fluctuations in the results with each run. As a result the results in this section will probably not exactly match the results in the paper but as the sample is large they should be quite close...
```{r creatLargeN}
# fix.
# we just randomly re-sample the GREEN Grid data
largeTestDT <- sample_frac(testDT, 40, replace = TRUE)
largeTestDT <- sample_frac(linkedTestDT, 40, replace = TRUE)
t <- largeTestDT[, .("mean W" = mean(meanW),
"sd W" = sd(meanW),
"n households" = .N),
keyby = .(group)]
keyby = .(nPeople)]
knitr::kable(t, caption = "Number of households and summary statistics per group")
```
So n = `r nrow(largeTestDT)`
```{r largeNmeanDiffs, fig.cap="Mean W demand per group for large sample (Error bars = 95% confidence intervals for the sample mean)"}
plotDT <- largeTestDT[, .(meanW = mean(meanW),
sdW = sd(meanW),
nObs = .N),
keyby = .(group)
keyby = .(nPeople)
]
myCaption <- paste0("Hypothetical large n sample",
......@@ -538,23 +588,24 @@ myCaption <- paste0("Hypothetical large n sample",
makeMeanCIPlot(plotDT)
```
re-run T tests Control vs Group 1
re-run T tests 1 vs 3
```{r largeNtTestControl-1}
# now compare winter & spring for a smaller effect
tTest <- t.test(largeTestDT[group == "Control"]$meanW, largeTestDT[group == "Intervention 1"]$meanW)
tTest <- t.test(largeTestDT[nPeople == "1"]$meanW, largeTestDT[nPeople == "3"]$meanW)
tTestTidy <- broom::tidy(tTest)
tTestTidy$`Control mean` <- tTestTidy$estimate1
tTestTidy$`Intervention 1 mean` <- tTestTidy$estimate2
tTestTidy$`1 person mean` <- tTestTidy$estimate1
tTestTidy$`3 persons mean` <- tTestTidy$estimate2
tTestTidy$`Mean difference` <- tTestTidy$estimate
knitr::kable(tTestTidy[c("Control mean", "Intervention 1 mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (Intervention 2 vs Control)")
knitr::kable(tTestTidy[c("1 person mean", "3 persons mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (1 vs 3)")
controlW <- tTest$estimate[[2]]
intW <- tTest$estimate[[1]]
controlW <- tTest$estimate[[1]]
intW <- tTest$estimate[[2]]
cil <- tTest$conf.int[[1]]
ciu <- tTest$conf.int[[2]]
```
......@@ -565,23 +616,22 @@ In this case:
* 95% confidence interval for the test = `r round(cil,2)` to `r round(ciu,2)` representing _much less_ uncertainty/variation;
* p value of `r round(tTest$p.value,4)` representing a _very low_ risk of a false positive result as it passes all conventional thresholds.
re-run T tests Control vs Group 2
re-run T tests 1 person vs 4+
```{r largeNtTestControl-2}
# now compare winter & spring for a smaller effect
tTest <- t.test(largeTestDT[group == "Control"]$meanW, largeTestDT[group == "Intervention 2"]$meanW)
tTest <- t.test(largeTestDT[nPeople == "1"]$meanW, largeTestDT[nPeople == "4+"]$meanW)
tTestTidy <- broom::tidy(tTest)
tTestTidy$`Control mean` <- tTestTidy$estimate1
tTestTidy$`Intervention 2 mean` <- tTestTidy$estimate2
tTestTidy$`1 person mean` <- tTestTidy$estimate1
tTestTidy$`4+ persons mean` <- tTestTidy$estimate2
tTestTidy$`Mean difference` <- tTestTidy$estimate
knitr::kable(tTestTidy[c("Control mean", "Intervention 2 mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (Intervention 2 vs Control)")
knitr::kable(tTestTidy[c("1 person mean", "4+ persons mean", "Mean difference",
"statistic", "p.value", "conf.low", "conf.high")], caption = "T test results (1 vs 4+)")
controlW <- tTest$estimate[[2]]
intW <- tTest$estimate[[1]]
controlW <- tTest$estimate[[1]]
intW <- tTest$estimate[[2]]
cil <- tTest$conf.int[[1]]
ciu <- tTest$conf.int[[2]]
```
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment