diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..fc0704843f39ac5296408f7517a2d3efa64805da
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,3 @@
+# Analysis for a short paper on statistical power & statistical significance
+
+And the various confusions that arise...
\ No newline at end of file
diff --git a/elsarticle-template-1-num.tex b/elsarticle-template-1-num.tex
deleted file mode 100644
index 0309c9eafaa1f874f972c21c200161d7da63d2ae..0000000000000000000000000000000000000000
--- a/elsarticle-template-1-num.tex
+++ /dev/null
@@ -1,231 +0,0 @@
-%% This is file `elsarticle-template-1-num.tex',
-%%
-%% Copyright 2009 Elsevier Ltd
-%%
-%% This file is part of the 'Elsarticle Bundle'.
-%% ---------------------------------------------
-%%
-%% It may be distributed under the conditions of the LaTeX Project Public
-%% License, either version 1.2 of this license or (at your option) any
-%% later version.  The latest version of this license is in
-%%    http://www.latex-project.org/lppl.txt
-%% and version 1.2 or later is part of all distributions of LaTeX
-%% version 1999/12/01 or later.
-%%
-%% Template article for Elsevier's document class `elsarticle'
-%% with numbered style bibliographic references
-%%
-%% $Id: elsarticle-template-1-num.tex 149 2009-10-08 05:01:15Z rishi $
-%% $URL: http://lenova.river-valley.com/svn/elsbst/trunk/elsarticle-template-1-num.tex $
-%%
-\documentclass[preprint,12pt]{elsarticle}
-
-%% Use the option review to obtain double line spacing
-%% \documentclass[preprint,review,12pt]{elsarticle}
-
-%% Use the options 1p,twocolumn; 3p; 3p,twocolumn; 5p; or 5p,twocolumn
-%% for a journal layout:
-%% \documentclass[final,1p,times]{elsarticle}
-%% \documentclass[final,1p,times,twocolumn]{elsarticle}
-%% \documentclass[final,3p,times]{elsarticle}
-%% \documentclass[final,3p,times,twocolumn]{elsarticle}
-%% \documentclass[final,5p,times]{elsarticle}
-%% \documentclass[final,5p,times,twocolumn]{elsarticle}
-
-%% The graphicx package provides the includegraphics command.
-\usepackage{graphicx}
-%% The amssymb package provides various useful mathematical symbols
-\usepackage{amssymb}
-%% The amsthm package provides extended theorem environments
-%% \usepackage{amsthm}
-
-%% The lineno packages adds line numbers. Start line numbering with
-%% \begin{linenumbers}, end it with \end{linenumbers}. Or switch it on
-%% for the whole article with \linenumbers after \end{frontmatter}.
-\usepackage{lineno}
-
-%% natbib.sty is loaded by default. However, natbib options can be
-%% provided with \biboptions{...} command. Following options are
-%% valid:
-
-%%   round  -  round parentheses are used (default)
-%%   square -  square brackets are used   [option]
-%%   curly  -  curly braces are used      {option}
-%%   angle  -  angle brackets are used    <option>
-%%   semicolon  -  multiple citations separated by semi-colon
-%%   colon  - same as semicolon, an earlier confusion
-%%   comma  -  separated by comma
-%%   numbers-  selects numerical citations
-%%   super  -  numerical citations as superscripts
-%%   sort   -  sorts multiple citations according to order in ref. list
-%%   sort&compress   -  like sort, but also compresses numerical citations
-%%   compress - compresses without sorting
-%%
-%% \biboptions{comma,round}
-
-% \biboptions{}
-
-\journal{Journal Name}
-
-\begin{document}
-
-\begin{frontmatter}
-
-%% Title, authors and addresses
-
-\title{Unnecessarily Complicated Research Title}
-
-%% use the tnoteref command within \title for footnotes;
-%% use the tnotetext command for the associated footnote;
-%% use the fnref command within \author or \address for footnotes;
-%% use the fntext command for the associated footnote;
-%% use the corref command within \author for corresponding author footnotes;
-%% use the cortext command for the associated footnote;
-%% use the ead command for the email address,
-%% and the form \ead[url] for the home page:
-%%
-%% \title{Title\tnoteref{label1}}
-%% \tnotetext[label1]{}
-%% \author{Name\corref{cor1}\fnref{label2}}
-%% \ead{email address}
-%% \ead[url]{home page}
-%% \fntext[label2]{}
-%% \cortext[cor1]{}
-%% \address{Address\fnref{label3}}
-%% \fntext[label3]{}
-
-
-%% use optional labels to link authors explicitly to addresses:
-%% \author[label1,label2]{<author name>}
-%% \address[label1]{<address>}
-%% \address[label2]{<address>}
-
-\author{John Smith}
-
-\address{California, United States}
-
-\begin{abstract}
-%% Text of abstract
-Suspendisse potenti. Suspendisse quis sem elit, et mattis nisl. Phasellus consequat erat eu velit rhoncus non pharetra neque auctor. Phasellus eu lacus quam. Ut ipsum dolor, euismod aliquam congue sed, lobortis et orci. Mauris eget velit id arcu ultricies auctor in eget dolor. Pellentesque suscipit adipiscing sem, imperdiet laoreet dolor elementum ut. Mauris condimentum est sed velit lacinia placerat. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Nullam diam metus, pharetra vitae euismod sed, placerat ultrices eros. Aliquam tincidunt dapibus venenatis. In interdum tellus nec justo accumsan aliquam. Nulla sit amet massa augue.
-\end{abstract}
-
-\begin{keyword}
-Science \sep Publication \sep Complicated
-%% keywords here, in the form: keyword \sep keyword
-
-%% MSC codes here, in the form: \MSC code \sep code
-%% or \MSC[2008] code \sep code (2000 is the default)
-
-\end{keyword}
-
-\end{frontmatter}
-
-%%
-%% Start line numbering here if you want
-%%
-\linenumbers
-
-%% main text
-\section{The First Section}
-\label{S:1}
-
-Maecenas \cite{Smith:2012qr} fermentum \cite{Smith:2013jd} urna ac sapien tincidunt lobortis. Nunc feugiat faucibus varius. Ut sed purus nunc. Ut eget eros quis lectus mollis pharetra ut in tellus. Pellentesque ultricies velit sed orci pharetra et fermentum lacus imperdiet. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Suspendisse commodo ultrices mauris, condimentum hendrerit lorem condimentum et. Pellentesque urna augue, semper et rutrum ac, consequat id quam. Proin lacinia aliquet justo, ut suscipit massa commodo sit amet. Proin vehicula nibh nec mauris tempor interdum. Donec orci ante, tempor a viverra vel, volutpat sed orci.
-
-Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Pellentesque quis interdum velit. Nulla tincidunt sem quis nisi molestie nec hendrerit nulla interdum. Nunc at lectus at neque dapibus dapibus sit amet in massa. Nam ut nisl in diam consectetur dignissim. Sed lacinia diam id nunc suscipit vitae semper lorem semper. In vehicula velit at tortor fringilla elementum aliquam erat blandit. Donec pretium libero et neque vehicula blandit. Curabitur consequat interdum sem at ultrices. Sed at tincidunt metus. Etiam vulputate, lacus eget fermentum posuere, ante mi dignissim augue, et ultrices felis tortor sed nisl.
-
-\begin{itemize}
-\item Bullet point one
-\item Bullet point two
-\end{itemize}
-
-\begin{enumerate}
-\item Numbered list item one
-\item Numbered list item two
-\end{enumerate}
-
-\subsection{Subsection One}
-
-Quisque elit ipsum, porttitor et imperdiet in, facilisis ac diam. Nunc facilisis interdum felis eget tincidunt. In condimentum fermentum leo, non consequat leo imperdiet pharetra. Fusce ac massa ipsum, vel convallis diam. Quisque eget turpis felis. Curabitur posuere, risus eu placerat porttitor, magna metus mollis ipsum, eu volutpat nisl erat ac justo. Nullam semper, mi at iaculis viverra, nunc velit iaculis nunc, eu tempor ligula eros in nulla. Aenean dapibus eleifend convallis. Cras ut libero tellus. Integer mollis eros eget risus malesuada fringilla mattis leo facilisis. Etiam interdum turpis eget odio ultricies sed convallis magna accumsan. Morbi in leo a mauris sollicitudin molestie at non nisl.
-
-\begin{table}[h]
-\centering
-\begin{tabular}{l l l}
-\hline
-\textbf{Treatments} & \textbf{Response 1} & \textbf{Response 2}\\
-\hline
-Treatment 1 & 0.0003262 & 0.562 \\
-Treatment 2 & 0.0015681 & 0.910 \\
-Treatment 3 & 0.0009271 & 0.296 \\
-\hline
-\end{tabular}
-\caption{Table caption}
-\end{table}
-
-\subsection{Subsection Two}
-
-Donec eget ligula venenatis est posuere eleifend in sit amet diam. Vestibulum sollicitudin mauris ac augue blandit ultricies. Nulla facilisi. Etiam ut turpis nunc. Praesent leo orci, tincidunt vitae feugiat eu, feugiat a massa. Duis mauris ipsum, tempor vel condimentum nec, suscipit non mi. Fusce quis urna dictum felis posuere sagittis ac sit amet erat. In in ultrices lectus. Nulla vitae ipsum lectus, a gravida erat. Etiam quam nisl, blandit ut porta in, accumsan a nibh. Phasellus sodales euismod dolor sit amet elementum. Phasellus varius placerat erat, nec gravida libero pellentesque id. Fusce nisi ante, euismod nec cursus at, suscipit a enim. Nulla facilisi.
-
-\begin{figure}[h]
-\centering\includegraphics[width=0.4\linewidth]{placeholder}
-\caption{Figure caption}
-\end{figure}
-
-Integer risus dui, condimentum et gravida vitae, adipiscing et enim. Aliquam erat volutpat. Pellentesque diam sapien, egestas eget gravida ut, tempor eu nulla. Vestibulum mollis pretium lacus eget venenatis. Fusce gravida nisl quis est molestie eu luctus ipsum pretium. Maecenas non eros lorem, vel adipiscing odio. Etiam dolor risus, mattis in pellentesque id, pellentesque eu nibh. Mauris nec ante at orci ultricies placerat ac non massa. Aenean imperdiet, ante eu sollicitudin vestibulum, dolor felis dapibus arcu, sit amet fermentum urna nibh sit amet mauris. Suspendisse adipiscing mollis dolor quis lobortis.
-
-\begin{equation}
-\label{eq:emc}
-e = mc^2
-\end{equation}
-
-\section{The Second Section}
-\label{S:2}
-
-Reference to Section \ref{S:1}. Etiam congue sollicitudin diam non porttitor. Etiam turpis nulla, auctor a pretium non, luctus quis ipsum. Fusce pretium gravida libero non accumsan. Donec eget augue ut nulla placerat hendrerit ac ut mi. Phasellus euismod ornare mollis. Proin tempus fringilla ultricies. Donec pretium feugiat libero quis convallis. Nam interdum ante sed magna congue eu semper tellus sagittis. Curabitur eu augue elit.
-
-Aenean eleifend purus et massa consequat facilisis. Etiam volutpat placerat dignissim. Ut nec nibh nulla. Aliquam erat volutpat. Nam at massa velit, eu malesuada augue. Maecenas sit amet nunc mauris. Maecenas eu ligula quis turpis molestie elementum nec at est. Sed adipiscing neque ac sapien viverra sit amet vestibulum arcu rhoncus.
-
-Vivamus pharetra nibh in orci euismod congue. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Quisque lacus diam, congue vel laoreet id, iaculis eu sapien. In id risus ac leo pellentesque pellentesque et in dui. Etiam tincidunt quam ut ante vestibulum ultricies. Nam at rutrum lectus. Aenean non justo tortor, nec mattis justo. Aliquam erat volutpat. Nullam ac viverra augue. In tempus venenatis nibh quis semper. Maecenas ac nisl eu ligula dictum lobortis. Sed lacus ante, tempor eu dictum eu, accumsan in velit. Integer accumsan convallis porttitor. Maecenas pretium tincidunt metus sit amet gravida. Maecenas pretium blandit felis, ac interdum ante semper sed.
-
-In auctor ultrices elit, vel feugiat ligula aliquam sed. Curabitur aliquam elit sed dui rhoncus consectetur. Cras elit ipsum, lobortis a tempor at, viverra vitae mi. Cras sed urna sed eros bibendum faucibus. Morbi vel leo orci, vel faucibus orci. Vivamus urna nisl, sodales vitae posuere in, tempus vel tellus. Donec magna est, luctus non commodo sit amet, placerat et enim.
-
-%% The Appendices part is started with the command \appendix;
-%% appendix sections are then done as normal sections
-%% \appendix
-
-%% \section{}
-%% \label{}
-
-%% References
-%%
-%% Following citation commands can be used in the body text:
-%% Usage of \cite is as follows:
-%%   \cite{key}          ==>>  [#]
-%%   \cite[chap. 2]{key} ==>>  [#, chap. 2]
-%%   \citet{key}         ==>>  Author [#]
-
-%% References with bibTeX database:
-
-\bibliographystyle{model1-num-names}
-\bibliography{sample.bib}
-
-%% Authors are advised to submit their bibtex database files. They are
-%% requested to list a bibtex style file in the manuscript if they do
-%% not want to use model1-num-names.bst.
-
-%% References without bibTeX database:
-
-% \begin{thebibliography}{00}
-
-%% \bibitem must have the following form:
-%%   \bibitem{key}...
-%%
-
-% \bibitem{}
-
-% \end{thebibliography}
-
-
-\end{document}
-
-%%
-%% End of file `elsarticle-template-1-num.tex'.
\ No newline at end of file
diff --git a/figs/fig1_statPowerEsts80.png b/figs/fig1_statPowerEsts80.png
index 33cc85a2925937c430dc095060ea9109a367dd42..45e8f305a539bb9fa2085c49b20ec5237cb53fbc 100644
Binary files a/figs/fig1_statPowerEsts80.png and b/figs/fig1_statPowerEsts80.png differ
diff --git a/model1-num-names.bst b/model1-num-names.bst
deleted file mode 100644
index 91511cc362c1496d82f8d6ea1e5c0966e8a44712..0000000000000000000000000000000000000000
--- a/model1-num-names.bst
+++ /dev/null
@@ -1,1359 +0,0 @@
-%%
-%% This is file `model1-num-names.bst',
-%% 
-%% Copyright 2009 Elsevier Ltd
-%% 
-%% This file is part of the 'Elsarticle Bundle'.
-%% ---------------------------------------------
-%% 
-%% It may be distributed under the conditions of the LaTeX Project Public
-%% License, either version 1.2 of this license or (at your option) any
-%% later version.  The latest version of this license is in
-%%    http://www.latex-project.org/lppl.txt
-%% and version 1.2 or later is part of all distributions of LaTeX
-%% version 1999/12/01 or later.
-%%
-%% $Id: model1-num-names.bst 125 2009-10-07 11:47:47Z rishi $
-%%
-%% $URL: http://lenova.river-valley.com/svn/elsbst/trunk/New-Model-1/model1-num-names.bst $
-%%
-%%
-
-ENTRY
-  { address
-    author
-    booktitle
-    chapter
-    edition
-    editor
-    howpublished
-    institution
-    journal
-    key
-    month
-    note
-    number
-    organization
-    pages
-    publisher
-    school
-    series
-    title
-    type
-    volume
-    year
-  }
-  {}
-  { label extra.label sort.label short.list }
-INTEGERS { output.state before.all mid.sentence after.sentence after.block }
-FUNCTION {init.state.consts}
-{ #0 'before.all :=
-  #1 'mid.sentence :=
-  #2 'after.sentence :=
-  #3 'after.block :=
-}
-STRINGS { s t}
-FUNCTION {output.nonnull}
-{ 's :=
-  output.state mid.sentence =
-    { ", " * write$ }
-    { output.state after.block =
-%        { add.period$ write$
-        { ", " * write$
-          newline$
-          "\newblock " write$
-        }
-        { output.state before.all =
-            'write$
-            { add.period$ " " * write$ }
-          if$
-        }
-      if$
-      mid.sentence 'output.state :=
-    }
-  if$
-  s
-}
-FUNCTION {output}
-{ duplicate$ empty$
-    'pop$
-    'output.nonnull
-  if$
-}
-FUNCTION {output.check}
-{ 't :=
-  duplicate$ empty$
-    { pop$ "empty " t * " in " * cite$ * warning$ }
-    'output.nonnull
-  if$
-}
-FUNCTION {fin.entry}
-{ add.period$
-  write$
-  newline$
-}
-
-FUNCTION {new.block}
-{ output.state before.all =
-    'skip$
-    { after.block 'output.state := }
-  if$
-}
-FUNCTION {new.sentence}
-{ output.state after.block =
-    'skip$
-    { output.state before.all =
-        'skip$
-        { after.sentence 'output.state := }
-      if$
-    }
-  if$
-}
-FUNCTION {add.blank}
-{  " " * before.all 'output.state :=
-}
-
-FUNCTION {date.block}
-{
-  skip$
-}
-
-FUNCTION {not}
-{   { #0 }
-    { #1 }
-  if$
-}
-FUNCTION {and}
-{   'skip$
-    { pop$ #0 }
-  if$
-}
-FUNCTION {or}
-{   { pop$ #1 }
-    'skip$
-  if$
-}
-FUNCTION {new.block.checkb}
-{ empty$
-  swap$ empty$
-  and
-    'skip$
-    'new.block
-  if$
-}
-FUNCTION {field.or.null}
-{ duplicate$ empty$
-    { pop$ "" }
-    'skip$
-  if$
-}
-FUNCTION {emphasize}
-{ duplicate$ empty$
-    { pop$ "" }
-    { "\textit{" swap$ * "}" * }
-  if$
-}
-FUNCTION {tie.or.space.prefix}
-{ duplicate$ text.length$ #3 <
-    { "~" }
-    { " " }
-  if$
-  swap$
-}
-
-FUNCTION {capitalize}
-{ "u" change.case$ "t" change.case$ }
-
-FUNCTION {space.word}
-{ " " swap$ * " " * }
- % Here are the language-specific definitions for explicit words.
- % Each function has a name bbl.xxx where xxx is the English word.
- % The language selected here is ENGLISH
-FUNCTION {bbl.and}
-{ "and"}
-
-FUNCTION {bbl.etal}
-{ "et~al." }
-
-FUNCTION {bbl.editors}
-{ "eds." }
-
-FUNCTION {bbl.editor}
-{ "ed." }
-
-FUNCTION {bbl.edby}
-{ "edited by" }
-
-FUNCTION {bbl.edition}
-{ "edition" }
-
-FUNCTION {bbl.volume}
-{ "volume" }
-
-FUNCTION {bbl.of}
-{ "of" }
-
-FUNCTION {bbl.number}
-{ "number" }
-
-FUNCTION {bbl.nr}
-{ "no." }
-
-FUNCTION {bbl.in}
-{ "in" }
-
-FUNCTION {bbl.pages}
-{ "pp." }
-
-FUNCTION {bbl.page}
-{ "p." }
-
-FUNCTION {bbl.chapter}
-{ "chapter" }
-
-FUNCTION {bbl.techrep}
-{ "Technical Report" }
-
-FUNCTION {bbl.mthesis}
-{ "Master's thesis" }
-
-FUNCTION {bbl.phdthesis}
-{ "Ph.D. thesis" }
-
-MACRO {jan} {"January"}
-
-MACRO {feb} {"February"}
-
-MACRO {mar} {"March"}
-
-MACRO {apr} {"April"}
-
-MACRO {may} {"May"}
-
-MACRO {jun} {"June"}
-
-MACRO {jul} {"July"}
-
-MACRO {aug} {"August"}
-
-MACRO {sep} {"September"}
-
-MACRO {oct} {"October"}
-
-MACRO {nov} {"November"}
-
-MACRO {dec} {"December"}
-
-MACRO {acmcs} {"ACM Comput. Surv."}
-
-MACRO {acta} {"Acta Inf."}
-
-MACRO {cacm} {"Commun. ACM"}
-
-MACRO {ibmjrd} {"IBM J. Res. Dev."}
-
-MACRO {ibmsj} {"IBM Syst.~J."}
-
-MACRO {ieeese} {"IEEE Trans. Software Eng."}
-
-MACRO {ieeetc} {"IEEE Trans. Comput."}
-
-MACRO {ieeetcad}
- {"IEEE Trans. Comput. Aid. Des."}
-
-MACRO {ipl} {"Inf. Process. Lett."}
-
-MACRO {jacm} {"J.~ACM"}
-
-MACRO {jcss} {"J.~Comput. Syst. Sci."}
-
-MACRO {scp} {"Sci. Comput. Program."}
-
-MACRO {sicomp} {"SIAM J. Comput."}
-
-MACRO {tocs} {"ACM Trans. Comput. Syst."}
-
-MACRO {tods} {"ACM Trans. Database Syst."}
-
-MACRO {tog} {"ACM Trans. Graphic."}
-
-MACRO {toms} {"ACM Trans. Math. Software"}
-
-MACRO {toois} {"ACM Trans. Office Inf. Syst."}
-
-MACRO {toplas} {"ACM Trans. Progr. Lang. Syst."}
-
-MACRO {tcs} {"Theor. Comput. Sci."}
-
-FUNCTION {bibinfo.check}
-{ swap$
-  duplicate$ missing$
-    {
-      pop$ pop$
-      ""
-    }
-    { duplicate$ empty$
-        {
-          swap$ pop$
-        }
-        { swap$
-          "\bibinfo{" swap$ * "}{" * swap$ * "}" *
-        }
-      if$
-    }
-  if$
-}
-FUNCTION {bibinfo.warn}
-{ swap$
-  duplicate$ missing$
-    {
-      swap$ "missing " swap$ * " in " * cite$ * warning$ pop$
-      ""
-    }
-    { duplicate$ empty$
-        {
-          swap$ "empty " swap$ * " in " * cite$ * warning$
-        }
-        { swap$
-          pop$
-        }
-      if$
-    }
-  if$
-}
-STRINGS  { bibinfo}
-INTEGERS { nameptr namesleft numnames }
-
-FUNCTION {format.names}
-{ 'bibinfo :=
-  duplicate$ empty$ 'skip$ {
-  's :=
-  "" 't :=
-  #1 'nameptr :=
-  s num.names$ 'numnames :=
-  numnames 'namesleft :=
-    { namesleft #0 > }
-    { s nameptr
-      "{f.~}{vv~}{ll}{, jj}"
-      format.name$
-      bibinfo bibinfo.check
-      't :=
-      nameptr #1 >
-        {
-          namesleft #1 >
-            { ", " * t * }
-            {
-              "," *
-              s nameptr "{ll}" format.name$ duplicate$ "others" =
-                { 't := }
-                { pop$ }
-              if$
-              t "others" =
-                {
-                  " " * bbl.etal *
-                }
-                { " " * t * }
-              if$
-            }
-          if$
-        }
-        't
-      if$
-      nameptr #1 + 'nameptr :=
-      namesleft #1 - 'namesleft :=
-    }
-  while$
-  } if$
-}
-FUNCTION {format.names.ed}
-{
-  format.names
-}
-FUNCTION {format.key}
-{ empty$
-    { key field.or.null }
-    { "" }
-  if$
-}
-
-FUNCTION {format.authors}
-{ author "author" format.names
-}
-FUNCTION {get.bbl.editor}
-{ editor num.names$ #1 > 'bbl.editors 'bbl.editor if$ }
-
-FUNCTION {format.editors}
-{ editor "editor" format.names duplicate$ empty$ 'skip$
-    {
-      " " *
-      get.bbl.editor
-      capitalize
-   "(" swap$ * ")" *
-      *
-    }
-  if$
-}
-FUNCTION {format.note}
-{
- note empty$
-    { "" }
-    { note #1 #1 substring$
-      duplicate$ "{" =
-        'skip$
-        { output.state mid.sentence =
-          { "l" }
-          { "u" }
-        if$
-        change.case$
-        }
-      if$
-      note #2 global.max$ substring$ * "note" bibinfo.check
-    }
-  if$
-}
-
-FUNCTION {format.title}
-{ title
-  duplicate$ empty$ 'skip$
-    { "t" change.case$ }
-  if$
-  "title" bibinfo.check
-}
-
-FUNCTION {format.full.names}
-{'s :=
- "" 't :=
- #1 'nameptr :=
- s num.names$ 'numnames :=
- numnames 'namesleft :=
-   { namesleft #0 > }
-   { s nameptr
-     "{vv~}{ll}" format.name$
-     't :=
-     nameptr #1 >
-       {
-         namesleft #1 >
-           { ", " * t * }
-           {
-             s nameptr "{ll}" format.name$ duplicate$ "others" =
-               { 't := }
-               { pop$ }
-             if$
-             t "others" =
-               {
-                 " " * bbl.etal *
-               }
-               {
-                 numnames #2 >
-                   { "," * }
-                   'skip$
-                 if$
-                 bbl.and
-                 space.word * t *
-               }
-             if$
-           }
-         if$
-       }
-       't
-     if$
-     nameptr #1 + 'nameptr :=
-     namesleft #1 - 'namesleft :=
-   }
- while$
-}
-
-FUNCTION {author.editor.key.full}
-{ author empty$
-   { editor empty$
-       { key empty$
-           { cite$ #1 #3 substring$ }
-           'key
-         if$
-       }
-       { editor format.full.names }
-     if$
-   }
-   { author format.full.names }
- if$
-}
-
-FUNCTION {author.key.full}
-{ author empty$
-   { key empty$
-        { cite$ #1 #3 substring$ }
-         'key
-     if$
-   }
-   { author format.full.names }
- if$
-}
-
-FUNCTION {editor.key.full}
-{ editor empty$
-   { key empty$
-        { cite$ #1 #3 substring$ }
-         'key
-     if$
-   }
-   { editor format.full.names }
- if$
-}
-
-FUNCTION {make.full.names}
-{ type$ "book" =
- type$ "inbook" =
- or
-   'author.editor.key.full
-   { type$ "proceedings" =
-       'editor.key.full
-       'author.key.full
-     if$
-   }
- if$
-}
-
-FUNCTION {output.bibitem}
-{ newline$
- "\bibitem[{" write$
- label write$
- ")" make.full.names duplicate$ short.list =
-    { pop$ }
-    { * }
-  if$
- "}]{" * write$
- cite$ write$
- "}" write$
- newline$
- ""
- before.all 'output.state :=
-}
-
-FUNCTION {n.dashify}
-{
-  't :=
-  ""
-    { t empty$ not }
-    { t #1 #1 substring$ "-" =
-        { t #1 #2 substring$ "--" = not
-            { "--" *
-              t #2 global.max$ substring$ 't :=
-            }
-            {   { t #1 #1 substring$ "-" = }
-                { "-" *
-                  t #2 global.max$ substring$ 't :=
-                }
-              while$
-            }
-          if$
-        }
-        { t #1 #1 substring$ *
-          t #2 global.max$ substring$ 't :=
-        }
-      if$
-    }
-  while$
-}
-
-FUNCTION {word.in}
-{ bbl.in
-  ":" *
-  " " * }
-
-FUNCTION {format.date}
-{ year "year" bibinfo.check duplicate$ empty$
-    {
-      "empty year in " cite$ * "; set to ????" * warning$
-       pop$ "????"
-    }
-    'skip$
-  if$
-  extra.label *
-}
-FUNCTION{format.year}
-{ year "year" bibinfo.check duplicate$ empty$
-    {  "empty year in " cite$ *
-       "; set to ????" *
-       warning$
-       pop$ "????"
-    }
-    {
-    }
-  if$
-  extra.label *
-  " (" swap$ * ")" *
-}
-FUNCTION {format.btitle}
-{ title "title" bibinfo.check
-  duplicate$ empty$ 'skip$
-    {
-    }
-  if$
-}
-FUNCTION {either.or.check}
-{ empty$
-    'pop$
-    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
-  if$
-}
-FUNCTION {format.bvolume}
-{ volume empty$
-    { "" }
-    { bbl.volume volume tie.or.space.prefix
-      "volume" bibinfo.check * *
-      series "series" bibinfo.check
-      duplicate$ empty$ 'pop$
-        { swap$ bbl.of space.word * swap$
-          emphasize * }
-      if$
-      "volume and number" number either.or.check
-    }
-  if$
-}
-FUNCTION {format.number.series}
-{ volume empty$
-    { number empty$
-        { series field.or.null }
-        { series empty$
-            { number "number" bibinfo.check }
-        { output.state mid.sentence =
-            { bbl.number }
-            { bbl.number capitalize }
-          if$
-          number tie.or.space.prefix "number" bibinfo.check * *
-          bbl.in space.word *
-          series "series" bibinfo.check *
-        }
-      if$
-    }
-      if$
-    }
-    { "" }
-  if$
-}
-
-FUNCTION {format.edition}
-{ edition duplicate$ empty$ 'skip$
-    {
-      output.state mid.sentence =
-        { "l" }
-        { "t" }
-      if$ change.case$
-      "edition" bibinfo.check
-      " " * bbl.edition *
-    }
-  if$
-}
-INTEGERS { multiresult }
-FUNCTION {multi.page.check}
-{ 't :=
-  #0 'multiresult :=
-    { multiresult not
-      t empty$ not
-      and
-    }
-    { t #1 #1 substring$
-      duplicate$ "-" =
-      swap$ duplicate$ "," =
-      swap$ "+" =
-      or or
-        { #1 'multiresult := }
-        { t #2 global.max$ substring$ 't := }
-      if$
-    }
-  while$
-  multiresult
-}
-FUNCTION {format.pages}
-{ pages duplicate$ empty$ 'skip$
-    { duplicate$ multi.page.check
-        {
-          bbl.pages swap$
-          n.dashify
-        }
-        {
-          bbl.page swap$
-        }
-      if$
-      tie.or.space.prefix
-      "pages" bibinfo.check
-      * *
-    }
-  if$
-}
-FUNCTION {format.journal.pages}
-{ pages duplicate$ empty$ 'pop$
-    { swap$ duplicate$ empty$
-        { pop$ pop$ format.pages }
-        {
-          " " *
-          swap$
-          n.dashify
-          "pages" bibinfo.check
-          *
-        }
-      if$
-    }
-  if$
-}
-FUNCTION {format.vol.num.pages}
-{ volume field.or.null
-  duplicate$ empty$ 'skip$
-    {
-      "volume" bibinfo.check
-    }
-  if$
-  format.year *
-}
-
-FUNCTION {format.chapter.pages}
-{ chapter empty$
-    { "" }
-    { type empty$
-        { bbl.chapter }
-        { type "l" change.case$
-          "type" bibinfo.check
-        }
-      if$
-      chapter tie.or.space.prefix
-      "chapter" bibinfo.check
-      * *
-    }
-  if$
-}
-
-FUNCTION {format.booktitle}
-{
-  booktitle "booktitle" bibinfo.check
-}
-FUNCTION {format.in.ed.booktitle}
-{ format.booktitle duplicate$ empty$ 'skip$
-    {
-      editor "editor" format.names.ed duplicate$ empty$ 'pop$
-        {
-          " " *
-          get.bbl.editor
-          capitalize
-          "(" swap$ * "), " *
-          * swap$
-          * }
-      if$
-      word.in swap$ *
-    }
-  if$
-}
-FUNCTION {format.thesis.type}
-{ type duplicate$ empty$
-    'pop$
-    { swap$ pop$
-      "t" change.case$ "type" bibinfo.check
-    }
-  if$
-}
-FUNCTION {format.tr.number}
-{ number "number" bibinfo.check
-  type duplicate$ empty$
-    { pop$ bbl.techrep }
-    'skip$
-  if$
-  "type" bibinfo.check
-  swap$ duplicate$ empty$
-    { pop$ "t" change.case$ }
-    { tie.or.space.prefix * * }
-  if$
-}
-FUNCTION {format.article.crossref}
-{
-  word.in
-  " \cite{" * crossref * "}" *
-}
-FUNCTION {format.book.crossref}
-{ volume duplicate$ empty$
-    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
-      pop$ word.in
-    }
-    { bbl.volume
-      swap$ tie.or.space.prefix "volume" bibinfo.check * * bbl.of space.word *
-    }
-  if$
-  " \cite{" * crossref * "}" *
-}
-FUNCTION {format.incoll.inproc.crossref}
-{
-  word.in
-  " \cite{" * crossref * "}" *
-}
-FUNCTION {format.org.or.pub}
-{ 't :=
-  ""
-  address empty$ t empty$ and
-    'skip$
-    {
-      t empty$
-        { address "address" bibinfo.check *
-        }
-        { t *
-          address empty$
-            'skip$
-            { ", " * address "address" bibinfo.check * }
-          if$
-        }
-      if$
-    }
-  if$
-}
-FUNCTION {format.publisher.address}
-{ publisher "publisher" bibinfo.check format.org.or.pub
-}
-
-FUNCTION {format.organization.address}
-{ organization "organization" bibinfo.check format.org.or.pub
-}
-
-FUNCTION {article}
-{ "%Type = Article" write$
-  output.bibitem
-  format.authors "author" output.check
-  author format.key output
-  new.block
-  format.title "title" output.check
-  new.block
-  crossref missing$
-    {
-      journal
-      "journal" bibinfo.check
-      "journal" output.check
-      add.blank
-      format.vol.num.pages output
-    }
-    { format.article.crossref output.nonnull
-    }
-  if$
-  format.journal.pages
-  new.sentence
-  format.note output
-  fin.entry
-}
-FUNCTION {book}
-{ "%Type = Book" write$
-  output.bibitem
-  author empty$
-    { format.editors "author and editor" output.check
-      editor format.key output
-    }
-    { format.authors output.nonnull
-      crossref missing$
-        { "author and editor" editor either.or.check }
-        'skip$
-      if$
-    }
-  if$
-  format.btitle "title" output.check
-  crossref missing$
-    { format.bvolume output
-      format.number.series output
-      format.publisher.address output
-    }
-    {
-      format.book.crossref output.nonnull
-    }
-  if$
-  format.edition output
-  format.date "year" output.check
-  new.sentence
-  format.note output
-  fin.entry
-}
-FUNCTION {booklet}
-{ "%Type = Booklet" write$
-  output.bibitem
-  format.authors output
-  author format.key output
-  format.title "title" output.check
-  howpublished "howpublished" bibinfo.check output
-  address "address" bibinfo.check output
-  format.date "year" output.check
-  new.sentence
-  format.note output
-  fin.entry
-}
-
-FUNCTION {inbook}
-{ "%Type = Inbook" write$
-  output.bibitem
-  author empty$
-    { format.editors "author and editor" output.check
-      editor format.key output
-    }
-    { format.authors output.nonnull
-      crossref missing$
-        { "author and editor" editor either.or.check }
-        'skip$
-      if$
-    }
-  if$
-  format.btitle "title" output.check
-  crossref missing$
-    {
-      format.bvolume output
-      format.number.series output
-      format.publisher.address output
-    }
-    {
-      format.book.crossref output.nonnull
-    }
-  if$
-  format.edition output
-  format.pages "pages" output.check
-  new.sentence
-  format.note output
-  fin.entry
-}
-
-FUNCTION {incollection}
-{ "%Type = Incollection" write$
-  output.bibitem
-  format.authors "author" output.check
-  author format.key output
-  new.block
-  format.title "title" output.check
-  new.block
-  crossref missing$
-    { format.in.ed.booktitle "booktitle" output.check
-      format.bvolume output
-      format.number.series output
-      format.publisher.address output
-      format.edition output
-      format.date output
-    }
-    { format.incoll.inproc.crossref output.nonnull
-    }
-  if$
-  format.pages "pages" output.check
-  new.sentence
-  format.note output
-  fin.entry
-}
-FUNCTION {inproceedings}
-{ "%Type = Inproceedings" write$
-  output.bibitem
-  format.authors "author" output.check
-  author format.key output
-  new.block
-  format.title "title" output.check
-  new.block
-  crossref missing$
-    { format.in.ed.booktitle "booktitle" output.check
-      format.bvolume output
-      format.number.series output
-      publisher empty$
-        { format.organization.address output }
-        { organization "organization" bibinfo.check output
-          format.publisher.address output
-          format.date output
-        }
-      if$
-    }
-    { format.incoll.inproc.crossref output.nonnull
-    }
-  if$
-  format.pages "pages" output.check
-  new.sentence
-  format.note output
-  fin.entry
-}
-FUNCTION {conference} { inproceedings }
-FUNCTION {manual}
-{ "%Type = Manual" write$
-  output.bibitem
-  format.authors output
-  author format.key output
-  format.btitle "title" output.check
-  organization "organization" bibinfo.check output
-  address "address" bibinfo.check output
-  format.edition output
-  format.date "year" output.check
-  new.sentence
-  format.note output
-  fin.entry
-}
-
-FUNCTION {mastersthesis}
-{ "%Type = Masterthesis" write$
-  output.bibitem
-  format.authors "author" output.check
-  author format.key output
-  format.btitle
-  "title" output.check
-  bbl.mthesis format.thesis.type output.nonnull
-  school "school" bibinfo.warn output
-  address "address" bibinfo.check output
-  format.date "year" output.check
-  new.sentence
-  format.note output
-  fin.entry
-}
-
-FUNCTION {misc}
-{ "%Type = Misc" write$
-  output.bibitem
-  format.authors output
-  author format.key output
-  format.title output
-  howpublished "howpublished" bibinfo.check output
-  format.date "year" output.check
-  new.sentence
-  format.note output
-  fin.entry
-}
-FUNCTION {phdthesis}
-{ "%Type = Phdthesis" write$
-  output.bibitem
-  format.authors "author" output.check
-  author format.key output
-  format.btitle
-  "title" output.check
-  bbl.phdthesis format.thesis.type output.nonnull
-  school "school" bibinfo.warn output
-  address "address" bibinfo.check output
-  format.date "year" output.check
-  new.sentence
-  format.note output
-  fin.entry
-}
-
-FUNCTION {proceedings}
-{ "%Type = Proceedings" write$
-  output.bibitem
-  format.editors output
-  editor format.key output
-  format.btitle "title" output.check
-  format.bvolume output
-  format.number.series output
-  publisher empty$
-    { format.organization.address output }
-    { organization "organization" bibinfo.check output
-      format.publisher.address output
-    }
-  if$
-  format.date "year" output.check
-  new.sentence
-  format.note output
-  fin.entry
-}
-
-FUNCTION {techreport}
-{ "%Type = Techreport" write$
-  output.bibitem
-  format.authors "author" output.check
-  author format.key output
-  format.btitle
-  "title" output.check
-  format.tr.number output.nonnull
-  institution "institution" bibinfo.warn output
-  address "address" bibinfo.check output
-  format.date "year" output.check
-  new.sentence
-  format.note output
-  fin.entry
-}
-
-FUNCTION {unpublished}
-{ "%Type = Unpublished" write$
-  output.bibitem
-  format.authors "author" output.check
-  author format.key output
-  format.title "title" output.check
-  format.date "year" output.check
-  new.sentence
-  format.note "note" output.check
-  fin.entry
-}
-
-FUNCTION {default.type} { misc }
-READ
-FUNCTION {sortify}
-{ purify$
-  "l" change.case$
-}
-INTEGERS { len }
-FUNCTION {chop.word}
-{ 's :=
-  'len :=
-  s #1 len substring$ =
-    { s len #1 + global.max$ substring$ }
-    's
-  if$
-}
-FUNCTION {format.lab.names}
-{ 's :=
-  "" 't :=
-  s #1 "{vv~}{ll}" format.name$
-  s num.names$ duplicate$
-  #2 >
-    { pop$
-      " " * bbl.etal *
-    }
-    { #2 <
-        'skip$
-        { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
-            {
-              " " * bbl.etal *
-            }
-            { bbl.and space.word * s #2 "{vv~}{ll}" format.name$
-              * }
-          if$
-        }
-      if$
-    }
-  if$
-}
-
-FUNCTION {author.key.label}
-{ author empty$
-    { key empty$
-        { cite$ #1 #3 substring$ }
-        'key
-      if$
-    }
-    { author format.lab.names }
-  if$
-}
-
-FUNCTION {author.editor.key.label}
-{ author empty$
-    { editor empty$
-        { key empty$
-            { cite$ #1 #3 substring$ }
-            'key
-          if$
-        }
-        { editor format.lab.names }
-      if$
-    }
-    { author format.lab.names }
-  if$
-}
-
-FUNCTION {editor.key.label}
-{ editor empty$
-    { key empty$
-        { cite$ #1 #3 substring$ }
-        'key
-      if$
-    }
-    { editor format.lab.names }
-  if$
-}
-
-FUNCTION {calc.short.authors}
-{ type$ "book" =
-  type$ "inbook" =
-  or
-    'author.editor.key.label
-    { type$ "proceedings" =
-        'editor.key.label
-        'author.key.label
-      if$
-    }
-  if$
-  'short.list :=
-}
-
-FUNCTION {calc.label}
-{ calc.short.authors
-  short.list
-  "("
-  *
-  year duplicate$ empty$
-     { pop$ "????" }
-     { purify$ #-1 #4 substring$ }
-  if$
-  *
-  'label :=
-}
-
-FUNCTION {sort.format.names}
-{ 's :=
-  #1 'nameptr :=
-  ""
-  s num.names$ 'numnames :=
-  numnames 'namesleft :=
-    { namesleft #0 > }
-    { s nameptr
-      "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}"
-      format.name$ 't :=
-      nameptr #1 >
-        {
-          "   "  *
-          namesleft #1 = t "others" = and
-            { "zzzzz" * }
-            { t sortify * }
-          if$
-        }
-        { t sortify * }
-      if$
-      nameptr #1 + 'nameptr :=
-      namesleft #1 - 'namesleft :=
-    }
-  while$
-}
-
-FUNCTION {sort.format.title}
-{ 't :=
-  "A " #2
-    "An " #3
-      "The " #4 t chop.word
-    chop.word
-  chop.word
-  sortify
-  #1 global.max$ substring$
-}
-FUNCTION {author.sort}
-{ author empty$
-    { key empty$
-        { "to sort, need author or key in " cite$ * warning$
-          ""
-        }
-        { key sortify }
-      if$
-    }
-    { author sort.format.names }
-  if$
-}
-FUNCTION {author.editor.sort}
-{ author empty$
-    { editor empty$
-        { key empty$
-            { "to sort, need author, editor, or key in " cite$ * warning$
-              ""
-            }
-            { key sortify }
-          if$
-        }
-        { editor sort.format.names }
-      if$
-    }
-    { author sort.format.names }
-  if$
-}
-FUNCTION {editor.sort}
-{ editor empty$
-    { key empty$
-        { "to sort, need editor or key in " cite$ * warning$
-          ""
-        }
-        { key sortify }
-      if$
-    }
-    { editor sort.format.names }
-  if$
-}
-FUNCTION {presort}
-{ calc.label
-  label sortify
-  "    "
-  *
-  type$ "book" =
-  type$ "inbook" =
-  or
-    'author.editor.sort
-    { type$ "proceedings" =
-        'editor.sort
-        'author.sort
-      if$
-    }
-  if$
-  #1 entry.max$ substring$
-  'sort.label :=
-  sort.label
-  *
-  "    "
-  *
-  title field.or.null
-  sort.format.title
-  *
-  #1 entry.max$ substring$
-  'sort.key$ :=
-}
-
-ITERATE {presort}
-%SORT
-STRINGS { last.label next.extra }
-INTEGERS { last.extra.num number.label }
-FUNCTION {initialize.extra.label.stuff}
-{ #0 int.to.chr$ 'last.label :=
-  "" 'next.extra :=
-  #0 'last.extra.num :=
-  #0 'number.label :=
-}
-FUNCTION {forward.pass}
-{ last.label label =
-    { last.extra.num #1 + 'last.extra.num :=
-      last.extra.num int.to.chr$ 'extra.label :=
-    }
-    { "a" chr.to.int$ 'last.extra.num :=
-      "" 'extra.label :=
-      label 'last.label :=
-    }
-  if$
-  number.label #1 + 'number.label :=
-}
-FUNCTION {reverse.pass}
-{ next.extra "b" =
-    { "a" 'extra.label := }
-    'skip$
-  if$
-  extra.label 'next.extra :=
-  extra.label
-  duplicate$ empty$
-    'skip$
-    { "{\natexlab{" swap$ * "}}" * }
-  if$
-  'extra.label :=
-  label extra.label * 'label :=
-}
-EXECUTE {initialize.extra.label.stuff}
-ITERATE {forward.pass}
-REVERSE {reverse.pass}
-FUNCTION {bib.sort.order}
-{ sort.label
-  "    "
-  *
-  year field.or.null sortify
-  *
-  "    "
-  *
-  title field.or.null
-  sort.format.title
-  *
-  #1 entry.max$ substring$
-  'sort.key$ :=
-}
-ITERATE {bib.sort.order}
-%SORT
-FUNCTION {begin.bib}
-{ preamble$ empty$
-    'skip$
-    { preamble$ write$ newline$ }
-  if$
-  "\begin{thebibliography}{" number.label int.to.str$ * "}" *
-  write$ newline$
-  "\expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi"
-  write$ newline$
-  "\providecommand{\bibinfo}[2]{#2}"
-  write$ newline$
-	"\ifx\xfnm\relax \def\xfnm[#1]{\unskip,\space#1}\fi"
-  write$ newline$
-}
-EXECUTE {begin.bib}
-EXECUTE {init.state.consts}
-ITERATE {call.type$}
-FUNCTION {end.bib}
-{ newline$
-  "\end{thebibliography}" write$ newline$
-}
-EXECUTE {end.bib}
-%% End of customized bst file
-%%
-%% End of file `model1-num-names.bst'.
\ No newline at end of file
diff --git a/sample.bib b/sample.bib
deleted file mode 100644
index 63d7ea744d1921dad7d25f960f80dc0f90d6cf69..0000000000000000000000000000000000000000
--- a/sample.bib
+++ /dev/null
@@ -1,19 +0,0 @@
-@BOOK{Smith:2012qr,
-	title = {{B}ook {T}itle},
-	publisher = {Publisher},
-	author = {Smith, J.~M. and Jones, A.~B.},
-	year = {2012},
-	edition = {7th},
-}
-
-@ARTICLE{Smith:2013jd,
-	author = {Jones, A.~B. and Smith, J.~M.},
-	title = {{A}rticle {T}itle},
-	journal = {{J}ournal {T}itle},
-	year = {2013},
-	volume = {13},
-	pages = {123-456},
-	number = {52},
-	month = {March},
-	publisher = {Publisher}
-}
\ No newline at end of file
diff --git a/sizingDemandResponseTrialsNZ.Rmd b/sizingDemandResponseTrialsNZ.Rmd
index 89ffdf8a68322313f195f7bcbdd66bbbaf4fa0cc..8d2a607efcee9351f83150f8e3d3b4b83b39609d 100644
--- a/sizingDemandResponseTrialsNZ.Rmd
+++ b/sizingDemandResponseTrialsNZ.Rmd
@@ -39,8 +39,8 @@ knitr::opts_chunk$set(echo = FALSE) # by default turn off code echo
 # Set start time ----
 startTime <- proc.time()
 
-library(GREENGridData) # local utilities
-library(GREENGrid) # local utilities
+library(myUtils) # local utilities
 
 # Packages needed in this .Rmd file ----
 rmdLibs <- c("data.table", # data munching
@@ -48,11 +48,12 @@ rmdLibs <- c("data.table", # data munching
              "ggplot2", # for fancy graphs
              "readr", # writing to files
              "lubridate", # for today
-             "SAVEr", # power stats
-             "kableExtra" # for extra kable
+             "SAVEr", # power stats functions
+             "GREENGridData",
+             "knitr" # for kable
 )
 # load them
-loadLibraries(rmdLibs)
+myUtils::loadLibraries(rmdLibs)
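+# loadLibraries() is a local helper from myUtils whose internals are not
+# shown in this repo; a sketch of the usual pattern it presumably follows
+# (an assumption) is:
+#   loadLibraries <- function(libs) {
+#     for (l in libs) {
+#       if (!requireNamespace(l, quietly = TRUE)) install.packages(l)
+#       library(l, character.only = TRUE)
+#     }
+#   }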
 
 # Local functions ---
 labelProfilePlot <- function(plot){
@@ -74,9 +75,6 @@ myParams$dPath <- "~/Data/NZ_GREENGrid/safe/gridSpy/1min/dataExtracts/"
 # created from https://dx.doi.org/10.5255/UKDA-SN-853334
 # using https://github.com/CfSOtago/GREENGridData/blob/master/examples/code/extractCleanGridSpy1minCircuit.R
 heatPumpData <- paste0(myParams$dPath, "Heat Pump_2015-04-01_2016-03-31_observations.csv.gz")
-hotWaterData <- paste0(myParams$dPath, "Hot Water_2015-04-01_2016-03-31_observations.csv.gz")
-lightingData <- paste0(myParams$dPath, "Lighting_2015-04-01_2016-03-31_observations.csv.gz")
-
 myParams$GGDataDOI <- "https://dx.doi.org/10.5255/UKDA-SN-853334"
 plotCaption <- paste0("Source: ", myParams$GGDataDOI)
 
@@ -122,7 +120,7 @@ Code history is generally tracked via the paper [repo](https://github.com/datakn
  
 ## Data:
 
-This paper uses circuit level extracts for 'Heat Pumps', 'Lighting' and 'Hot Water' for the NZ GREEN Grid Household Electricity Demand Data (`r myParams$GGDataDOI` [@anderson_new_2018]). These have been extracted using the code found in https://github.com/CfSOtago/GREENGridData/blob/master/examples/code/extractCleanGridSpy1minCircuit.R
+This report uses circuit-level extracts for 'Heat Pumps' from the NZ GREEN Grid Household Electricity Demand Data (`r myParams$GGDataDOI` [@anderson_new_2018]). They were extracted using the code at https://github.com/CfSOtago/GREENGridData/blob/master/examples/code/extractCleanGridSpy1minCircuit.R
 
 ## Acknowledgements
 
@@ -132,43 +130,13 @@ This paper uses circuit level extracts for 'Heat Pumps', 'Lighting' and 'Hot Wat
 \newpage
 
 # Introduction
-In our experiennce of designing and running empirical studies, whether experimental or naturalistic, there is ongoing confusion over the meaning and role of two key statistical terms _statistical power_ and _statistical significance_. This is compounded by confusion over how these are used in designing studies and in deciding what can be infered from the results and thus what course of action is best.
-
-We have found this to be the case both in academic research where the objective is to establish 'the most likely explanation' under academic conventions and in applied research where the objective is to 'make a robust decision' based on the balance of evidence and probability.
 
-In this brief paper we respond to these confusions using a worked example: the design of a hypothetical household electricity demand response trial in New Zealand which seeks to shift the use of Heat Pumps out of the evening winter peak demand period. We use this example to explain and demonstrate the role of statistical signficance in testing for differences and of both statistical signficance and statistical power in sample design and decision making.
+This report contains the analysis for a paper of the same name. The paper text is maintained elsewhere for ease of editing.
 
 # Error, power, significance and decision making
 
-Two types of error are of concern in both purely academic research where the efficacy of an intervention is to be tested and also in applied research where a decision may then be taken based on the results:
-
- * Type I: a false positive - an effect is inferred when in fact there is none. From a commercial or policy perspective this could lead to the implementation of a costly intervention which would be unlikely to have the effect expected;
- * Type II: a false negative - an effect is not inferred when in fact there is one. From a commercial or policy perspective this could lead to inaction when an intervention would have been likely to have the effect expected.
- 
-_Type I error_: The significance level (p value) of the statistical test to be used to test the efficacy of an intervention represents not only the extent to which the observed data matches the null model to be tested [@wasserstein2016], but also the risk of a Type I error. In most trials the null model will be a measure of 'no difference' between control and intervenion groups. By convention, the p value _threshold_ for rejecting the null model (the risk of a Type I error) is generally set to 0.05 (5%) although this choice is entirely subjective and reflects human perceptions of what constitutes an unlikely event. In this instance, 5% (or 1 in 20) is considered to represent an unlikely event... In commercial or policy terms an action taken on a larger p value (e.g. setting the p value threshold to 10%) would increase the risk of making a Type I error and thus implementing a potentially costly intervention that is unlikely to have the effect desired. However, as we discuss in more detail below, this is not necessarily _bad practice_ as it may reflect the potential magnitude of an effect, the decision-maker's tolerance of Type I error risk and the urgency of action.
-
-_Type II error_: Statistical power is normally set to 0.8 (80%) by convention and represents the pre-study risk of making a Type II error [@Greenland2016]. From a commercial or policy perspective reducing power (e.g. to 0.7 or 70%) will therefore increase the risk of taking no action when in fact the intervention would probably have had the effect desired. Statistical power calculations enable the investigator to estimate the sample size that would be needed to robustly detect an experimental effect with a given risk of a false positive (Type I error) or false negative (Type II error) result. This prevents a study from recruiting too few participants to be able to robustly detect the hypothesised intervention effect [@Delmas2013Information] or wasting resources by recruiting a larger sample than needed. 
-
-Previous work has suggested that sample sizes in most energy efficiency studies may be too low to provide adequate power and so statistically robust conclusions cannot be drawn at conventional thresholds [@Frederiks2016Evaluating] while a more recent review focusing on demand response studies reached a similar conclusion [@Srivastava2018Assessing]. It is therefore hardly surprising that a number of studies report effect sizes which are not statistically significant at conventional thresholds [@Srivastava2018Assessing], choose to use lower statistical significance thresholds [@RockyMountainInstitute2006Automated, @AECOM2011Energy, @CER2012Smart, @Schofield2015Experimental] or lower both statistical power values _and_ statistical significance thresholds [@energyWiseT1,@energyWiseT2].
-
-However it would be wrong to conclude that this is _necessarily_ bad practice. Recent discussions of the role of p values in inference [@Greenland2016, @wasserstein2016] remind us that decisions should never be based only on statistical significance thresholds set purely by convention. Rather, inference and thus decision making should be based on:
-
- * statistic effect size - is it 2% or 22% (i.e. is the result _important_ or _useful_, "What is the estimated _bang for buck_?");
- * statistic confidence intervals - (i.e. is there _uncertainty_ or _variation_ in response, "How uncertain is the estimated bang?");
- * statistic p values - (i.e. what is the risk of a Type I error / _false positive_, “What is the risk the bang observed isn’t real?”);
-
-Only then can a contextually appropriate decision be taken as to whether the effect is large enough, certain enough and has a low enough risk of being a false positive result to warrant action.
-
-In the following sections we apply these principles to the design and analysis of a hypothetical New Zealand household electricity demand response trial and to the use of a simple statistical test of difference between trial groups to demonstrate and clarify these points.
-
 # Sample design: statistical power
 
-To return to the discussion of statistical power, we need to establish the size of the control and intervention groups we will require. This is crucial to resource budgeting (_"How many households and thus `$` do I need?"_) and ensuring good study design practice ("_Will I be able to answer my research question?_") [@Frederiks2016Evaluating]. In both cases the answer is not absolute since it will depend on our tolerance of Type I and Type II error risks.
-
-Calculation of the required sample size for a control and intervention group requires the estimation of the probable intervention effect size, agreement on the significance level (p value threshold or Type I error risk) of the statistical test to be used and agreement on the level of statistical power (Type II error risk). Given any three of these values the fourth can be calculated if an estimate of the mean and standard deviation of the outcome to be measured is known. In the case of DSR interventions the effect size comprises a given % reduction in energy demand or consumption in a given time period and estimates of the expected reduction can be derived from previous studies or data. 
-
-As we have noted the choice of significance level (p value threshold) and statistical power are subjective and normative. Most academic researchers will struggle to justify relaxing from the conventional p = 0.05 and power = 0.8. However there may be good reason in applied research to take action on results of studies that use less conservative thresholds. Nevertheless there is a strong argument for designing such studies using the more conservative conventional levels but acknowledging that making inferences from the results may require a more relaxed approach to Type I or Type II error risks than is considered 'normal' in academic research.
-
 
 ```{r loadGgData, include =FALSE}
 dt <- data.table::as.data.table(readr::read_csv(heatPumpData, progress = FALSE))
@@ -187,29 +155,20 @@ testDT <- dt[lubridate::hour(r_dateTime) > 15 & # 16:00 ->
                         lubridate::wday(r_dateTime) != 7 & # not Sunday
                         year == 2015,
                         .(meanW = mean(powerW, na.rm = TRUE)), keyby = .(season, linkID)]
+
+testMean <- mean(testDT[season == "Winter"]$meanW)
+testSD <- sd(testDT[season == "Winter"]$meanW) # standard deviation across household means
+testSamples <- seq(50,3000,50)
+testPower <- 0.8
+
+powerRes80DT <- SAVEr::estimateEffectSizes(testMean,testSD,testSamples,testPower) # auto-produces range of p values
 ```
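+
+The figure captions state that the underlying test is R's `power.t.test`. As a minimal sketch of what `SAVEr::estimateEffectSizes()` presumably does (an assumption - the package internals and exact column names are not shown in this repo), we can solve for the minimum detectable difference in group means at each sample size and p value, expressed as a % of the mean:
+
+```{r estimateEffectSizesSketch, eval=FALSE}
+# Illustration only, not the SAVEr implementation.
+sketchEffectSizes <- function(meanW, sdW, ns, power,
+                              pValues = c(0.01, 0.05, 0.1, 0.2)) {
+  data.table::rbindlist(lapply(pValues, function(pv) {
+    # for each n, solve power.t.test for the detectable difference (delta)
+    deltas <- sapply(ns, function(n) {
+      stats::power.t.test(n = n, sd = sdW, sig.level = pv,
+                          power = power, type = "two.sample")$delta
+    })
+    data.table::data.table(sampleN = ns,
+                           pValue = paste0("p = ", pv),
+                           effectSize = 100 * deltas / meanW) # as % of mean W
+  }))
+}
+# powerRes80DT above would then correspond to:
+# sketchEffectSizes(testMean, testSD, testSamples, testPower)
+```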
 
-```{r ggHPSampleSizeFig80, fig.cap="Power analysis results (power = 0.8)"}
+Figure \@ref(fig:ggHPSampleSizeFig80) shows the detectable effect size by sample size for p = 0.05 only.
 
-makePowerPlot <- function(dt, myCaption){
-  vLineAlpha <- 0.5
-  vLineCol <- "#0072B2" # http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/#a-colorblind-friendly-palette
-  
-  yp001Ref <<- dt[pValue == "p = 0.01" & sampleN == 1000] # for reference line
-  x001 <- mean(yp001Ref$sampleN)
-  yp005Ref <- dt[pValue == "p = 0.05" & 
-                       effectSize < ceiling(yp001Ref$effectSize) &
-                       effectSize > floor(yp001Ref$effectSize)] # for reference line
-  x005 <- mean(yp005Ref$sampleN)
-  yp01Ref <- dt[pValue == "p = 0.1" & 
-                      effectSize < ceiling(yp001Ref$effectSize) &
-                      effectSize > floor(yp001Ref$effectSize)] # for reference line
-  x01 <<- mean(yp01Ref$sampleN)
-  yp02Ref <- dt[pValue == "p = 0.2" & 
-                      effectSize < ceiling(yp001Ref$effectSize+1) & # fix this to get a value!
-                      effectSize > floor(yp001Ref$effectSize-1)] # for reference line
-  x02 <- mean(yp02Ref$sampleN)
-  
+```{r ggHPSampleSizeFig80, fig.cap="Power analysis results (p = 0.05, power = 0.8)"}
+
+makePowerPlot <- function(dt){
   p <- ggplot2::ggplot(dt, aes(x = sampleN, y = effectSize, colour = pValue)) +
     geom_line() + 
     ylim(0,NA) +
@@ -220,76 +179,119 @@ makePowerPlot <- function(dt, myCaption){
     scale_colour_manual(values=cbPalette) + # use colour-blind friendly palette
     scale_y_continuous(breaks = seq(0,max(dt$effectSize),5)) + # add detail to y scale
     scale_x_continuous(breaks = seq(0,3000,200)) # add detail to x scale
-  
-  p <- p + 
-    geom_hline(yintercept = yp001Ref$effectSize, alpha = vLineAlpha, colour = "black") +
-    geom_segment(x = x001, y = yp001Ref$effectSize, xend = 1000, yend = 0, alpha = vLineAlpha,
-                 colour = cbPalette[1]) +
-    geom_segment(x = x005, y = yp001Ref$effectSize, xend = x005, yend = 0, alpha = vLineAlpha,
-                 colour = cbPalette[2]) +
-    geom_segment(x = x01, y = yp001Ref$effectSize, xend = x01, yend = 0, alpha = vLineAlpha,
-                 colour = cbPalette[3]) + 
-    geom_segment(x = x02, y = yp001Ref$effectSize, xend = x02, yend = 0, alpha = vLineAlpha,
-                 colour = cbPalette[4])
-  
-  p <- p +
-    annotate(geom = "text", 
-             x = 1200, 
-             y = yp001Ref$effectSize + 3, 
-             label = paste0("Effect size = ", round(yp001Ref$effectSize, 2) ,"%"), 
-             colour = cbPalette[1], 
-             hjust = 0) # https://stackoverflow.com/questions/26684023/how-to-left-align-text-in-annotate-from-ggplot2
   return(p)
 }
 
-testMean <- mean(testDT[season == "Winter"]$meanW) 
-testSD <- mean(testDT[season == "Winter"]$meanW) 
-testSamples <- seq(50,3000,50)
-testPower <- 0.8
-
-powerRes80DT <- SAVEr::estimateEffectSizes(testMean,testSD,testSamples,testPower) # auto-produces range of p values
-
-
-
 myCaption <- paste0(plotCaption, ", Winter 2015",
                     "\nStatistic: mean W, weekdays 16:00 - 20:00",
                     "\nTest: R function power.t.test, statistical power = 0.8")
 
-p <- makePowerPlot(powerRes80DT, myCaption)
+p <- makePowerPlot(powerRes80DT[pValue == "p = 0.05"])
+
+p <- p + labs(caption = myCaption) +
+  theme(legend.position="bottom")
+  
+# add annotations
+vLineAlpha <- 0.5
+
+# add hline at effect size for p = 0.05, n = 1000
+p005Ref <- powerRes80DT[pValue == "p = 0.05" & sampleN == 1000] # for reference line
+y005 <- p005Ref$effectSize # effect size for p = 0.05, n = 1000
+x005 <- p005Ref$sampleN
+p <- p + 
+    geom_hline(yintercept = y005, colour = "red") +
+    geom_segment(x = x005, y = y005, xend = x005, yend = 0, alpha = vLineAlpha,
+                 colour = cbPalette[1])
+
+p <- p +
+    annotate(geom = "text", 
+             x = 1800, 
+             y = y005 + 5, 
+             label = paste0("Effect size = ", round(y005, 2) ,"% with \n p = 0.05, power = 0.8 and n = 1000"), 
+             hjust = 0) # https://stackoverflow.com/questions/26684023/how-to-left-align-text-in-annotate-from-ggplot2
+
 p
 
 ggplot2::ggsave("figs/fig1_statPowerEsts80.png", p)
 
 ```
 
-As an illustration, \ref(fig:ggHPSampleSizeFig80) shows sample size calculations with power = 0.8 (80%) using 'Heat Pump' electricity demand extracted from the publicly available New Zealand Green Grid household electricity demand data [@anderson_new_2018] for winter 2015 for the peak demand period (16:00 - 20:00) on weekdays.
 
-These results show that a trial comprising a control and intervention sample of 1000 households (each) would be able to detect an effect size of `r round(yp001Ref$effectSize,2)`% with p = 0.01 and power = 0.8. Were a study to be less risk averse in it's decision making then p = 0.1 may be acceptable in which case only ~ `r x01` households would be needed in each group (see \ref(fig:ggHPSampleSizeFig80)) but of course in this case, the risk of a Type I error would increase. 
+The detectable effect size at n = 1000 (p = 0.05, power = 0.8) is `r round(y005, 2)`%.
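+
+As a rough sanity check (a sketch only, re-using `testMean` and `y005` from the chunks above), this % effect converts back to Watts as:
+
+```{r effectSizeWatts, eval=FALSE}
+# detectable mean demand reduction in W (winter weekdays, 16:00 - 20:00)
+round(testMean * y005 / 100, 0)
+```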
 
-```{r ggHPSampleSizeFig70, include = FALSE, fig.cap="Power analysis results (power = 0.7)"}
+Figure \@ref(fig:ggHPSampleSizeFig80all) shows the detectable effect sizes for all four p value thresholds.
 
-testPower <- 0.7
+```{r ggHPSampleSizeFig80all, fig.cap="Power analysis results (power = 0.8)"}
+# rebuild for all p values
+p <- makePowerPlot(powerRes80DT)
 
-powerRes70DT <- SAVEr::estimateEffectSizes(testMean,testSD,testSamples,testPower) # auto-produces range of p values
+p <- p + labs(caption = myCaption) +
+  theme(legend.position="bottom")
+  
+# add annotations
+vLineAlpha <- 0.5
+
+# add hline at effect size for p = 0.05, n = 1000
+p005Ref <- powerRes80DT[pValue == "p = 0.05" & sampleN == 1000] # for reference line
+y005 <- p005Ref$effectSize # effect size for p = 0.05, n = 1000
+x005 <- p005Ref$sampleN
+p <- p + 
+    geom_hline(yintercept = y005, colour = "red") +
+    geom_segment(x = x005, y = y005, xend = x005, yend = 0, alpha = vLineAlpha,
+                 colour = cbPalette[2])
+
+p <- p +
+    annotate(geom = "text", 
+             x = 1800, 
+             y = y005 + 5, 
+             label = paste0("Effect size = ", round(y005, 2), "% with\np = 0.05, power = 0.8 and n = 1000"), 
+             hjust = 0) # https://stackoverflow.com/questions/26684023/how-to-left-align-text-in-annotate-from-ggplot2
 
-myCaption <- paste0(plotCaption, ", Winter 2015",
-                    "\nStatistic: mean W, weekdays 16:00 - 20:00",
-                    "\nTest: R function power.t.test, statistical power = 0.7")
+p
 
-p <- makePowerPlot(powerRes70DT, myCaption)
+# add vline where the p = 0.01 curve reaches the reference effect size
+# (p = 0.05, n = 1000); the plot is saved once fully annotated, below
+p001Ref <- powerRes80DT[pValue == "p = 0.01" & 
+                       effectSize < ceiling(p005Ref$effectSize) &
+                       effectSize > floor(p005Ref$effectSize)] # for reference line
+x001 <- mean(p001Ref$sampleN)
+p <- p +  geom_segment(x = x001, y = y005, xend = x001, yend = 0, alpha = vLineAlpha,
+                 colour = cbPalette[1])
+
+# add vline where the p = 0.1 curve reaches the same reference effect size
+p01Ref <- powerRes80DT[pValue == "p = 0.1" & 
+                       effectSize < ceiling(p005Ref$effectSize) &
+                       effectSize > floor(p005Ref$effectSize)] # for reference line
+x01 <- mean(p01Ref$sampleN)
+p <- p +  geom_segment(x = x01, y = y005, xend = x01, yend = 0, alpha = vLineAlpha,
+                 colour = cbPalette[3])
+
+# add vline where the p = 0.2 curve reaches the same reference effect size
+p02Ref <- powerRes80DT[pValue == "p = 0.2" & 
+                       effectSize < ceiling(p005Ref$effectSize) &
+                       effectSize > floor(p005Ref$effectSize)] # for reference line
+x02 <- mean(p02Ref$sampleN)
+p <- p +  geom_segment(x = x02, y = y005, xend = x02, yend = 0, alpha = vLineAlpha,
+                 colour = cbPalette[4])
 p
+
+# save the fully annotated plot; file name assumed, so fig 1 is not overwritten
+ggplot2::ggsave("figs/fig2_statPowerEsts80all.png", p)
+```
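+
+The three reference-segment lookups above repeat the same approximate
+inverse lookup on a discrete curve. A hypothetical helper (not in the
+original code) makes the approximation explicit:
+
+```{r crossingHelperSketch, eval=FALSE}
+# find the sample size at which a given p-value curve reaches a target
+# effect size, averaging sampleN over the unit-wide window around the target
+findCrossingN <- function(dt, pLabel, target) {
+  mean(dt[pValue == pLabel &
+            effectSize < ceiling(target) &
+            effectSize > floor(target)]$sampleN)
+}
+x001 <- findCrossingN(powerRes80DT, "p = 0.01", y005)
+x01  <- findCrossingN(powerRes80DT, "p = 0.1", y005)
+x02  <- findCrossingN(powerRes80DT, "p = 0.2", y005)
+```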
 
-ggplot2::ggsave("figs/fig1_statPowerEsts70.png", p)
+The full results table (detectable effect size, %, by sample size and p value):
 
+```{r powerTable}
+dt <- dcast.data.table(powerRes80DT, sampleN ~ pValue,
+                       value.var = "effectSize") # wide: one column per p value
+
+knitr::kable(dt[sampleN <= 1000],
+             caption = "Full results table (part)",
+             digits = 2)
 ```
 
-Were we to reduce the statistical power to 0.7 then we would obtain the results shown in \ref(fig:ggHPSampleSizeFig70). In this case a trial comprising a control and intervention sample of 1000 households (each) would be able to detect an effect size of `r round(yp001Ref$effectSize,2)`% with p = 0.01 and power = 0.7. Were a study to be less risk averse in it's decision making then p = 0.1 may be acceptable in which case only ~ `r x01` households would be needed in each group (see \ref(fig:ggHPSampleSizeFig70)) but as before the risk of a Type I error would increase. Similarly, reducing the statistical power used would also reduce the sample required for a given effect size tested at a given p value. However, as before the risk of a Type II error would increase.
 
 # Testing for differences: effect sizes, confidence intervals and p values
 
 ## Getting it 'wrong'
 
-Let us imagine that we have not designed and implemented our sample recruitment according to \ref(fig:ggHPSampleSizeFig80) and instead decided, perhaps for cost reasons to recruit ~ 30 households per group. Now we wish to test for differences between the control and intervention groups.
+
 
 
 ```{r smallNTable}
@@ -303,9 +305,9 @@ t <- testDT[, .("mean W" = mean(meanW),
                      "n households" = .N),
                  keyby = .(group)]
 
-kableExtra::kable(t, caption = "Number of households and summary statistics per group") %>%
-  kable_styling()
+knitr::kable(t, caption = "Number of households and summary statistics per group")
 ```
+
 ```{r ggMeanDiffs, fig.cap = "Mean W demand per group (Error bars = 95% confidence intervals for the sample mean)"}
 plotDT <- testDT[, .(meanW = mean(meanW),
                      sdW = sd(meanW),
@@ -334,9 +336,7 @@ makeMeanCIPlot(plotDT)
   
 ```
 
-As a first step we plot the differences using the mean and 95% confidence intervals as shown in \ref(fig:ggMeanDiffs). As we can see the interventions appear to have reduced demand quite substantially and the error bars indicate the uncertainty (variation) around the mean within each group. Based on this, we suspect that we are unlikely to see low p values when we use statistical tests of the differences as the error bars overlap substantially.
-
-Suppose a t-test of the difference between the Control and Intervention 1 group produces the result shown below.
+T test: Control vs Intervention 1
  
 ```{r tTestTabG1}
 # fix
@@ -352,15 +352,13 @@ cil <- tTest$conf.int[[1]]
 ciu <- tTest$conf.int[[2]]
 ```
 
-The data shows that the mean power demand for the control group was `r round(controlW,2)`W and for Intervention 1 was `r round(intW,2)`W. This is a (very) large difference in the mean of `r round(controlW - intW, 2)`. The results of the t test are:
+The results show that the mean power demand for the control group was `r round(controlW,2)`W and for Intervention 1 was `r round(intW,2)`W: a (very) large difference in means of `r round(controlW - intW, 2)`W. The results of the t test, extracted as sketched below, are:
 
  * effect size = `r round(controlW - intW)`W or `r round(100 * (1-(intW/controlW)))`%  representing a _substantial bang for buck_ for whatever caused the difference;
  * 95% confidence interval for the test = `r round(cil,2)` to `r round(ciu,2)` representing _considerable_ uncertainty/variation;
  * p value of `r round(tTest$p.value,3)` representing a _relatively low_ risk of a false positive result but which (just) fails the conventional p < 0.05 threshold.
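+
+These quantities are read directly off the `htest` object returned by
+`t.test()`. A minimal sketch (group labels and column names assumed from
+the surrounding chunks):
+
+```{r tTestSketch, eval=FALSE}
+tTest <- t.test(testDT[group == "Intervention 1"]$meanW,
+                testDT[group == "Control"]$meanW)
+intW <- tTest$estimate[[1]]     # mean of x: Intervention 1
+controlW <- tTest$estimate[[2]] # mean of y: Control
+cil <- tTest$conf.int[[1]]      # lower bound of the 95% CI for the difference
+ciu <- tTest$conf.int[[2]]      # upper bound of the 95% CI for the difference
+tTest$p.value                   # the Type I (false positive) error risk
+```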
  
-What would we have concluded? We have a large effect size, substantial uncertainty and a slightly raised risk of a false positive or Type I error when compared to conventional p value levels. From a narrow and conventional 'p value testing' perspective we would have concluded that there was no statistically signficant difference between the groups. However this misses the crucial point that an organisation with a higher risk tolerance might conclude that the large effect size justifies implementing the intervention even though the risk of a false positive is slightly higher. If the p value had been 0.25 then this would have still been the case but would have warranted even further caution.
-
-But what about Intervention Group 2? In this case the t.test results are slightly different:
+T test: Control vs Intervention 2
 
 ```{r tTestTabG2}
 # fix
@@ -380,7 +378,6 @@ Now:
  * 95% confidence interval for the test = `r round(cil,2)` to `r round(ciu,2)` representing _even greater_ uncertainty/variation;
  * p value of `r round(tTest$p.value,3)` representing a _higher_ risk of a false positive result which fails the conventional p < 0.05 threshold and also the less conservative p < 0.1.
 
-As before, the subsequent action we take depends on our tolerance of Type I (falso positive) risk. We still have a reasonably large effect size but we are less certain about it and we have a higher risk of it not being real. What do you think we should do?
 
 ```{r getN}
 # get sample size required for Int Group 2
@@ -395,13 +392,11 @@ result <- power.t.test(
       )
 ```
 
-In both cases our decision-making is rather hampered by the small sample size even though we have extremely large effect sizes. As we can see from \ref(fig:ggHPSampleSizeFig80), to detect Intervention Group 2's effect size of `r round(100 * (1-(intW/controlW)),2)`% would have required control and trial group sizes of `r round(result$n)` respectively.
+To detect Intervention Group 2's effect size of `r round(100 * (1-(intW/controlW)),2)`% would have required control and trial groups of `r round(result$n)` households each.
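+
+That figure comes from running `power.t.test` the other way round: fix the
+effect (delta) and solve for n. A minimal sketch (argument values are
+assumptions, not the study's exact call):
+
+```{r getNSketch, eval=FALSE}
+result <- power.t.test(delta = controlW - intW, # observed difference (W)
+                       sd = sd(testDT[group == "Control"]$meanW),
+                       sig.level = 0.05,
+                       power = 0.8)
+round(result$n) # households required in each group
+```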
 
-However, as the recent discussions of the role of the p value in decision making have made clear [@wasserstein2016] statistical analysis needs to report all of the result elements to enable contextually appropriate and defensible evidence-based decisions to be taken. Simply dismissing results on the basis of a failure to meet conventional statistical levels of significance risks levitating babies and bathwater...
 
 ## Getting it 'right'
 
-Suppose instead that we had designed and implemented our sample recruitment according to \ref(fig:ggHPSampleSizeFig80) so that we have a reasonable chance of detecting a difference of ~ 14% with power = 0.8 and at a significance level (p) of 0.05. This means we should have a sample of around 4000 households split equally (and randomly) between our trial and two intervention groups.
 
 ```{r creatLargeN}
 # fix.
@@ -413,8 +408,7 @@ t <- largeTestDT[, .("mean W" = mean(meanW),
                      "n households" = .N),
                  keyby = .(group)]
 
-kableExtra::kable(t, caption = "Number of households and summary statistics per group") %>%
-  kable_styling()
+knitr::kable(t, caption = "Number of households and summary statistics per group")
 ```
 
 ```{r largeNmeanDiffs, fig.cap="Mean W demand per group for large sample (Error bars = 95% confidence intervals for the sample mean)"}
@@ -429,7 +423,7 @@ myCaption <- paste0("Hypothetical large n sample",
 makeMeanCIPlot(plotDT)
 ```
 
-In comparison to \ref(fig:ggMeanDiffs) we can now see (\ref(fig:largeNmeanDiffs)) that the 95% confidence intervals for the group means are much narrower. This is almost entirely due to the larger sample sizes. Re-running our previous test for differences now produces:
+T test re-run on the large sample: Control vs Intervention 2
 
 ```{r largeNtTest1}
 # now compare winter & spring for a smaller effect
@@ -448,37 +442,16 @@ In this case:
  * 95% confidence interval for the test = `r round(cil,2)` to `r round(ciu,2)` representing _much less_ uncertainty/variation;
  * p value of `r round(tTest$p.value,4)` representing a _very low_ risk of a false positive result as it passes all conventional thresholds.
  
-So now we are able to be much more confident in our decision to implement Intervention 2 since the average effect is reasonably large, the expected variation in the effect size is reasonably narrow and the risk of a Type I (false positive) error is extremely small. 
 
-# Summary and recomendations
+# Summary and recommendations
 
 ## Statistical power and sample design
 
-Get it right _first time_: we should do the statistical power analysis before we start to make sure the study is even worth trying. If we don't have previous data to use, we _justify_ our choices through power analysis based on defensible assumptions.
-
 ## Reporting statistical tests of difference (effects)
 
-Report all three elements _always_:
-
- * average effect size
- * effect size confidence intervals
- * the p value (risk of Type I errors)
-
-We should also report the statistical power used just to be clear on the risk of Type II errors.
-
 ## Making inferences and taking decisions
 
-Pay attention to all three elements _always_:
-
- * average effect size: what is the _average bang for buck_?
- * effect size confidence intervals: _how uncertain is the bang_?
- * the p value: _what is the risk of a false positive_?
-
-If we have ticked all the boxes so far then we have combined good study design based on statistical power analysis, with a nuanced understanding of what test statistic effect sizes, confidence intervals and p values can tell us. As a result we now have a robust, evidence-based, contextually meaningful and _defensible_ strategy.
-
-# Ackowledgements
-
-We would like to thank collaborators and partners on a number of applied research projects for prodding us into thinking about these issues more deeply and clearly than we othweise would have done. We hope this paper helps to bring some clarity.
+# Acknowledgments
 
 # Runtime
 
@@ -502,7 +475,7 @@ R packages used:
  * readr - for csv reading/writing [@readr]
  * dplyr - for select and contains [@dplyr]
  * progress - for progress bars [@progress]
- * kableExtra - to create this document & neat tables [@knitr]
+ * knitr - to create this document & neat tables [@knitr]
  * GREENGrid - for local NZ GREEN Grid project utilities
 
 Session info:
diff --git a/sizingDemandResponseTrialsNZ.docx b/sizingDemandResponseTrialsNZ.docx
index 359b593a58b3c1be9e88173fcc9ba530209f103c..e0314cf509ee2dfcec001f5b9c5f6da54cb61e2f 100644
Binary files a/sizingDemandResponseTrialsNZ.docx and b/sizingDemandResponseTrialsNZ.docx differ
diff --git a/sizingDemandResponseTrialsNZ.html b/sizingDemandResponseTrialsNZ.html
index b260d2ad16dfff52d2c4159ed11fb03da45b067d..c488989bc0d64b171fd342821f9d50c51de43f52 100644
--- a/sizingDemandResponseTrialsNZ.html
+++ b/sizingDemandResponseTrialsNZ.html
@@ -240,7 +240,7 @@ div.tocify {
 <h1 class="title toc-ignore">Statistical Power, Statistical Significance, Study Design and Decision Making: A Worked Example</h1>
 <h3 class="subtitle"><em>Sizing Demand Response Trials in New Zealand</em></h3>
 <h4 class="author"><em>Ben Anderson and Tom Rushby (Contact: <a href="mailto:b.anderson@soton.ac.uk">b.anderson@soton.ac.uk</a>, <code>@dataknut</code>)</em></h4>
-<h4 class="date"><em>Last run at: 2018-09-27 15:34:32</em></h4>
+<h4 class="date"><em>Last run at: 2018-10-26 16:48:10</em></h4>
 
 </div>
 
@@ -292,7 +292,7 @@ div.tocify {
 </div>
 <div id="data" class="section level2">
 <h2><span class="header-section-number">1.5</span> Data:</h2>
-<p>This paper uses circuit level extracts for ‘Heat Pumps’, ‘Lighting’ and ‘Hot Water’ for the NZ GREEN Grid Household Electricity Demand Data (<a href="https://dx.doi.org/10.5255/UKDA-SN-853334" class="uri">https://dx.doi.org/10.5255/UKDA-SN-853334</a> <span class="citation">(Anderson et al. 2018)</span>). These have been extracted using the code found in <a href="https://github.com/CfSOtago/GREENGridData/blob/master/examples/code/extractCleanGridSpy1minCircuit.R" class="uri">https://github.com/CfSOtago/GREENGridData/blob/master/examples/code/extractCleanGridSpy1minCircuit.R</a></p>
+<p>This report uses circuit level extracts for ‘Heat Pumps’ from the NZ GREEN Grid Household Electricity Demand Data (<a href="https://dx.doi.org/10.5255/UKDA-SN-853334" class="uri">https://dx.doi.org/10.5255/UKDA-SN-853334</a> <span class="citation">(Anderson et al. 2018)</span>). These have been extracted using the code found in <a href="https://github.com/CfSOtago/GREENGridData/blob/master/examples/code/extractCleanGridSpy1minCircuit.R" class="uri">https://github.com/CfSOtago/GREENGridData/blob/master/examples/code/extractCleanGridSpy1minCircuit.R</a></p>
 </div>
 <div id="acknowledgements" class="section level2">
 <h2><span class="header-section-number">1.6</span> Acknowledgements</h2>
@@ -309,52 +309,413 @@ div.tocify {
 </div>
 <div id="introduction" class="section level1">
 <h1><span class="header-section-number">2</span> Introduction</h1>
-<p>In our experiennce of designing and running empirical studies, whether experimental or naturalistic, there is ongoing confusion over the meaning and role of two key statistical terms <em>statistical power</em> and <em>statistical significance</em>. This is compounded by confusion over how these are used in designing studies and in deciding what can be infered from the results and thus what course of action is best.</p>
-<p>We have found this to be the case both in academic research where the objective is to establish ‘the most likely explanation’ under academic conventions and in applied research where the objective is to ‘make a robust decision’ based on the balance of evidence and probability.</p>
-<p>In this brief paper we respond to these confusions using a worked example: the design of a hypothetical household electricity demand response trial in New Zealand which seeks to shift the use of Heat Pumps out of the evening winter peak demand period. We use this example to explain and demonstrate the role of statistical signficance in testing for differences and of both statistical signficance and statistical power in sample design and decision making.</p>
+<p>This report contains the analysis for a paper of the same name. The text is stored elsewhere for ease of editing.</p>
 </div>
 <div id="error-power-significance-and-decision-making" class="section level1">
 <h1><span class="header-section-number">3</span> Error, power, significance and decision making</h1>
-<p>Two types of error are of concern in both purely academic research where the efficacy of an intervention is to be tested and also in applied research where a decision may then be taken based on the results:</p>
-<ul>
-<li>Type I: a false positive - an effect is inferred when in fact there is none. From a commercial or policy perspective this could lead to the implementation of a costly intervention which would be unlikely to have the effect expected;</li>
-<li>Type II: a false negative - an effect is not inferred when in fact there is one. From a commercial or policy perspective this could lead to inaction when an intervention would have been likely to have the effect expected.</li>
-</ul>
-<p><em>Type I error</em>: The significance level (p value) of the statistical test to be used to test the efficacy of an intervention represents not only the extent to which the observed data matches the null model to be tested <span class="citation">(Wasserstein and Lazar 2016)</span>, but also the risk of a Type I error. In most trials the null model will be a measure of ‘no difference’ between control and intervenion groups. By convention, the p value <em>threshold</em> for rejecting the null model (the risk of a Type I error) is generally set to 0.05 (5%) although this choice is entirely subjective and reflects human perceptions of what constitutes an unlikely event. In this instance, 5% (or 1 in 20) is considered to represent an unlikely event… In commercial or policy terms an action taken on a larger p value (e.g. setting the p value threshold to 10%) would increase the risk of making a Type I error and thus implementing a potentially costly intervention that is unlikely to have the effect desired. However, as we discuss in more detail below, this is not necessarily <em>bad practice</em> as it may reflect the potential magnitude of an effect, the decision-maker’s tolerance of Type I error risk and the urgency of action.</p>
-<p><em>Type II error</em>: Statistical power is normally set to 0.8 (80%) by convention and represents the pre-study risk of making a Type II error <span class="citation">(Greenland et al. 2016)</span>. From a commercial or policy perspective reducing power (e.g. to 0.7 or 70%) will therefore increase the risk of taking no action when in fact the intervention would probably have had the effect desired. Statistical power calculations enable the investigator to estimate the sample size that would be needed to robustly detect an experimental effect with a given risk of a false positive (Type I error) or false negative (Type II error) result. This prevents a study from recruiting too few participants to be able to robustly detect the hypothesised intervention effect <span class="citation">(Delmas, Fischlein, and Asensio 2013)</span> or wasting resources by recruiting a larger sample than needed.</p>
-<p>Previous work has suggested that sample sizes in most energy efficiency studies may be too low to provide adequate power and so statistically robust conclusions cannot be drawn at conventional thresholds <span class="citation">(Frederiks et al. 2016)</span> while a more recent review focusing on demand response studies reached a similar conclusion <span class="citation">(Srivastava, Van Passel, and Laes 2018)</span>. It is therefore hardly surprising that a number of studies report effect sizes which are not statistically significant at conventional thresholds <span class="citation">(Srivastava, Van Passel, and Laes 2018)</span>, choose to use lower statistical significance thresholds <span class="citation">(Rocky Mountain Institute 2006, <span class="citation">AECOM (2011)</span>, <span class="citation">CER (2012)</span>, <span class="citation">Schofield et al. (2015)</span>)</span> or lower both statistical power values <em>and</em> statistical significance thresholds <span class="citation">(UKPN 2017,<span class="citation">UKPN (2018)</span>)</span>.</p>
-<p>However it would be wrong to conclude that this is <em>necessarily</em> bad practice. Recent discussions of the role of p values in inference <span class="citation">(Greenland et al. 2016, <span class="citation">Wasserstein and Lazar (2016)</span>)</span> remind us that decisions should never be based only on statistical significance thresholds set purely by convention. Rather, inference and thus decision making should be based on:</p>
-<ul>
-<li>statistic effect size - is it 2% or 22% (i.e. is the result <em>important</em> or <em>useful</em>, “What is the estimated <em>bang for buck</em>?”);</li>
-<li>statistic confidence intervals - (i.e. is there <em>uncertainty</em> or <em>variation</em> in response, “How uncertain is the estimated bang?”);</li>
-<li>statistic p values - (i.e. what is the risk of a Type I error / <em>false positive</em>, “What is the risk the bang observed isn’t real?”);</li>
-</ul>
-<p>Only then can a contextually appropriate decision be taken as to whether the effect is large enough, certain enough and has a low enough risk of being a false positive result to warrant action.</p>
-<p>In the following sections we apply these principles to the design and analysis of a hypothetical New Zealand household electricity demand response trial and to the use of a simple statistical test of difference between trial groups to demonstrate and clarify these points.</p>
 </div>
 <div id="sample-design-statistical-power" class="section level1">
 <h1><span class="header-section-number">4</span> Sample design: statistical power</h1>
-<p>To return to the discussion of statistical power, we need to establish the size of the control and intervention groups we will require. This is crucial to resource budgeting (<em>“How many households and thus <code>$</code> do I need?”</em>) and ensuring good study design practice (“<em>Will I be able to answer my research question?</em>”) <span class="citation">(Frederiks et al. 2016)</span>. In both cases the answer is not absolute since it will depend on our tolerance of Type I and Type II error risks.</p>
-<p>Calculation of the required sample size for a control and intervention group requires the estimation of the probable intervention effect size, agreement on the significance level (p value threshold or Type I error risk) of the statistical test to be used and agreement on the level of statistical power (Type II error risk). Given any three of these values the fourth can be calculated if an estimate of the mean and standard deviation of the outcome to be measured is known. In the case of DSR interventions the effect size comprises a given % reduction in energy demand or consumption in a given time period and estimates of the expected reduction can be derived from previous studies or data.</p>
-<p>As we have noted the choice of significance level (p value threshold) and statistical power are subjective and normative. Most academic researchers will struggle to justify relaxing from the conventional p = 0.05 and power = 0.8. However there may be good reason in applied research to take action on results of studies that use less conservative thresholds. Nevertheless there is a strong argument for designing such studies using the more conservative conventional levels but acknowledging that making inferences from the results may require a more relaxed approach to Type I or Type II error risks than is considered ‘normal’ in academic research.</p>
+<p>Figure <a href="#fig:ggHPSampleSizeFig80">4.1</a> shows the initial p = 0.05 plot.</p>
 <pre><code>## Scale for &#39;y&#39; is already present. Adding another scale for &#39;y&#39;, which
 ## will replace the existing scale.</code></pre>
 <div class="figure"><span id="fig:ggHPSampleSizeFig80"></span>
-<img src="sizingDemandResponseTrialsNZ_files/figure-html/ggHPSampleSizeFig80-1.png" alt="Power analysis results (power = 0.8)"  />
+<img src="sizingDemandResponseTrialsNZ_files/figure-html/ggHPSampleSizeFig80-1.png" alt="Power analysis results (p = 0.05, power = 0.8)"  />
 <p class="caption">
-Figure 4.1: Power analysis results (power = 0.8)
+Figure 4.1: Power analysis results (p = 0.05, power = 0.8)
 </p>
 </div>
 <pre><code>## Saving 7 x 5 in image</code></pre>
-<p>As an illustration, (fig:ggHPSampleSizeFig80) shows sample size calculations with power = 0.8 (80%) using ‘Heat Pump’ electricity demand extracted from the publicly available New Zealand Green Grid household electricity demand data <span class="citation">(Anderson et al. 2018)</span> for winter 2015 for the peak demand period (16:00 - 20:00) on weekdays.</p>
-<p>These results show that a trial comprising a control and intervention sample of 1000 households (each) would be able to detect an effect size of 14.18% with p = 0.01 and power = 0.8. Were a study to be less risk averse in it’s decision making then p = 0.1 may be acceptable in which case only ~ 450 households would be needed in each group (see (fig:ggHPSampleSizeFig80)) but of course in this case, the risk of a Type I error would increase.</p>
-<p>Were we to reduce the statistical power to 0.7 then we would obtain the results shown in (fig:ggHPSampleSizeFig70). In this case a trial comprising a control and intervention sample of 1000 households (each) would be able to detect an effect size of 12.76% with p = 0.01 and power = 0.7. Were a study to be less risk averse in it’s decision making then p = 0.1 may be acceptable in which case only ~ 425 households would be needed in each group (see (fig:ggHPSampleSizeFig70)) but as before the risk of a Type I error would increase. Similarly, reducing the statistical power used would also reduce the sample required for a given effect size tested at a given p value. However, as before the risk of a Type II error would increase.</p>
+<p>Detectable effect size at n = 1000 (p = 0.05, power = 0.8): 11.12%.</p>
+<p>Figure <a href="#fig:ggHPSampleSizeFig80all">4.2</a> shows the detectable effect sizes for all four p value thresholds.</p>
+<pre><code>## Scale for &#39;y&#39; is already present. Adding another scale for &#39;y&#39;, which
+## will replace the existing scale.</code></pre>
+<div class="figure"><span id="fig:ggHPSampleSizeFig80all"></span>
+<img src="sizingDemandResponseTrialsNZ_files/figure-html/ggHPSampleSizeFig80all-1.png" alt="Power analysis results (power = 0.8)"  />
+<p class="caption">
+Figure 4.2: Power analysis results (power = 0.8)
+</p>
+</div>
+<pre><code>## Saving 7 x 5 in image</code></pre>
+<div class="figure"><span id="fig:ggHPSampleSizeFig80all"></span>
+<img src="sizingDemandResponseTrialsNZ_files/figure-html/ggHPSampleSizeFig80all-2.png" alt="Power analysis results (power = 0.8)"  />
+<p class="caption">
+Figure 4.2: Power analysis results (power = 0.8)
+</p>
+</div>
+<p>The full results table (detectable effect size, %, by sample size and p value):</p>
+<table>
+<caption>
+<span id="tab:powerTable">Table 4.1: </span>Full results table (part)
+</caption>
+<thead>
+<tr>
+<th style="text-align:right;">
+sampleN
+</th>
+<th style="text-align:right;">
+p = 0.01
+</th>
+<th style="text-align:right;">
+p = 0.05
+</th>
+<th style="text-align:right;">
+p = 0.1
+</th>
+<th style="text-align:right;">
+p = 0.2
+</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:right;">
+50
+</td>
+<td style="text-align:right;">
+64.25
+</td>
+<td style="text-align:right;">
+50.08
+</td>
+<td style="text-align:right;">
+42.64
+</td>
+<td style="text-align:right;">
+33.73
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+100
+</td>
+<td style="text-align:right;">
+45.11
+</td>
+<td style="text-align:right;">
+35.28
+</td>
+<td style="text-align:right;">
+30.09
+</td>
+<td style="text-align:right;">
+23.83
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+150
+</td>
+<td style="text-align:right;">
+36.75
+</td>
+<td style="text-align:right;">
+28.78
+</td>
+<td style="text-align:right;">
+24.55
+</td>
+<td style="text-align:right;">
+19.45
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+200
+</td>
+<td style="text-align:right;">
+31.79
+</td>
+<td style="text-align:right;">
+24.91
+</td>
+<td style="text-align:right;">
+21.25
+</td>
+<td style="text-align:right;">
+16.84
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+250
+</td>
+<td style="text-align:right;">
+28.41
+</td>
+<td style="text-align:right;">
+22.27
+</td>
+<td style="text-align:right;">
+19.01
+</td>
+<td style="text-align:right;">
+15.06
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+300
+</td>
+<td style="text-align:right;">
+25.93
+</td>
+<td style="text-align:right;">
+20.32
+</td>
+<td style="text-align:right;">
+17.35
+</td>
+<td style="text-align:right;">
+13.75
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+350
+</td>
+<td style="text-align:right;">
+23.99
+</td>
+<td style="text-align:right;">
+18.81
+</td>
+<td style="text-align:right;">
+16.06
+</td>
+<td style="text-align:right;">
+12.73
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+400
+</td>
+<td style="text-align:right;">
+22.44
+</td>
+<td style="text-align:right;">
+17.60
+</td>
+<td style="text-align:right;">
+15.02
+</td>
+<td style="text-align:right;">
+11.90
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+450
+</td>
+<td style="text-align:right;">
+21.15
+</td>
+<td style="text-align:right;">
+16.59
+</td>
+<td style="text-align:right;">
+14.16
+</td>
+<td style="text-align:right;">
+11.22
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+500
+</td>
+<td style="text-align:right;">
+20.06
+</td>
+<td style="text-align:right;">
+15.74
+</td>
+<td style="text-align:right;">
+13.43
+</td>
+<td style="text-align:right;">
+10.65
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+550
+</td>
+<td style="text-align:right;">
+19.13
+</td>
+<td style="text-align:right;">
+15.00
+</td>
+<td style="text-align:right;">
+12.81
+</td>
+<td style="text-align:right;">
+10.15
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+600
+</td>
+<td style="text-align:right;">
+18.31
+</td>
+<td style="text-align:right;">
+14.36
+</td>
+<td style="text-align:right;">
+12.26
+</td>
+<td style="text-align:right;">
+9.72
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+650
+</td>
+<td style="text-align:right;">
+17.59
+</td>
+<td style="text-align:right;">
+13.80
+</td>
+<td style="text-align:right;">
+11.78
+</td>
+<td style="text-align:right;">
+9.34
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+700
+</td>
+<td style="text-align:right;">
+16.95
+</td>
+<td style="text-align:right;">
+13.30
+</td>
+<td style="text-align:right;">
+11.35
+</td>
+<td style="text-align:right;">
+9.00
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+750
+</td>
+<td style="text-align:right;">
+16.37
+</td>
+<td style="text-align:right;">
+12.85
+</td>
+<td style="text-align:right;">
+10.97
+</td>
+<td style="text-align:right;">
+8.69
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+800
+</td>
+<td style="text-align:right;">
+15.85
+</td>
+<td style="text-align:right;">
+12.44
+</td>
+<td style="text-align:right;">
+10.62
+</td>
+<td style="text-align:right;">
+8.42
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+850
+</td>
+<td style="text-align:right;">
+15.38
+</td>
+<td style="text-align:right;">
+12.07
+</td>
+<td style="text-align:right;">
+10.30
+</td>
+<td style="text-align:right;">
+8.17
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+900
+</td>
+<td style="text-align:right;">
+14.95
+</td>
+<td style="text-align:right;">
+11.73
+</td>
+<td style="text-align:right;">
+10.01
+</td>
+<td style="text-align:right;">
+7.94
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+950
+</td>
+<td style="text-align:right;">
+14.55
+</td>
+<td style="text-align:right;">
+11.41
+</td>
+<td style="text-align:right;">
+9.74
+</td>
+<td style="text-align:right;">
+7.72
+</td>
+</tr>
+<tr>
+<td style="text-align:right;">
+1000
+</td>
+<td style="text-align:right;">
+14.18
+</td>
+<td style="text-align:right;">
+11.12
+</td>
+<td style="text-align:right;">
+9.50
+</td>
+<td style="text-align:right;">
+7.53
+</td>
+</tr>
+</tbody>
+</table>
 </div>
 <div id="testing-for-differences-effect-sizes-confidence-intervals-and-p-values" class="section level1">
 <h1><span class="header-section-number">5</span> Testing for differences: effect sizes, confidence intervals and p values</h1>
 <div id="getting-it-wrong" class="section level2">
 <h2><span class="header-section-number">5.1</span> Getting it ‘wrong’</h2>
-<p>Let us imagine that we have not designed and implemented our sample recruitment according to (fig:ggHPSampleSizeFig80) and instead decided, perhaps for cost reasons to recruit ~ 30 households per group. Now we wish to test for differences between the control and intervention groups.</p>
 <table class="table" style="margin-left: auto; margin-right: auto;">
 <caption>
 <span id="tab:smallNTable">Table 5.1: </span>Number of households and summary statistics per group
@@ -435,8 +796,7 @@ Intervention 3
 </tbody>
 </table>
 <p><img src="sizingDemandResponseTrialsNZ_files/figure-html/ggMeanDiffs-1.png" /><!-- --></p>
-<p>As a first step we plot the differences using the mean and 95% confidence intervals as shown in (fig:ggMeanDiffs). As we can see the interventions appear to have reduced demand quite substantially and the error bars indicate the uncertainty (variation) around the mean within each group. Based on this, we suspect that we are unlikely to see low p values when we use statistical tests of the differences as the error bars overlap substantially.</p>
-<p>Suppose a t-test of the difference between the Control and Intervention 1 group produces the result shown below.</p>
+<p>T test: Control vs Intervention 1</p>
 <pre><code>## 
 ##  Welch Two Sample t-test
 ## 
@@ -448,14 +808,13 @@ Intervention 3
 ## sample estimates:
 ## mean of x mean of y 
 ##  35.13947 162.66915</code></pre>
-<p>The data shows that the mean power demand for the control group was 162.67W and for Intervention 1 was 35.14W. This is a (very) large difference in the mean of 127.53. The results of the t test are:</p>
+<p>The results show that the mean power demand for the control group was 162.67W and for Intervention 1 was 35.14W: a (very) large difference in means of 127.53W. The results of the t test are:</p>
 <ul>
 <li>effect size = 128W or 78% representing a <em>substantial bang for buck</em> for whatever caused the difference;</li>
 <li>95% confidence interval for the test = -258.11 to 3.05 representing <em>considerable</em> uncertainty/variation;</li>
 <li>p value of 0.055 representing a <em>relatively low</em> risk of a false positive result but which (just) fails the conventional p &lt; 0.05 threshold.</li>
 </ul>
-<p>What would we have concluded? We have a large effect size, substantial uncertainty and a slightly raised risk of a false positive or Type I error when compared to conventional p value levels. From a narrow and conventional ‘p value testing’ perspective we would have concluded that there was no statistically signficant difference between the groups. However this misses the crucial point that an organisation with a higher risk tolerance might conclude that the large effect size justifies implementing the intervention even though the risk of a false positive is slightly higher. If the p value had been 0.25 then this would have still been the case but would have warranted even further caution.</p>
-<p>But what about Intervention Group 2? In this case the t.test results are slightly different:</p>
+<p>T test: Control vs Intervention 2</p>
 <pre><code>## 
 ##  Welch Two Sample t-test
 ## 
@@ -473,13 +832,10 @@ Intervention 3
 <li>95% confidence interval for the test = -236.83 to 29.1 representing <em>even greater</em> uncertainty/variation;</li>
 <li>p value of 0.122 representing a <em>higher</em> risk of a false positive result which fails the conventional p &lt; 0.05 threshold and also the less conservative p &lt; 0.1.</li>
 </ul>
-<p>As before, the subsequent action we take depends on our tolerance of Type I (falso positive) risk. We still have a reasonably large effect size but we are less certain about it and we have a higher risk of it not being real. What do you think we should do?</p>
-<p>In both cases our decision-making is rather hampered by the small sample size even though we have extremely large effect sizes. As we can see from (fig:ggHPSampleSizeFig80), to detect Intervention Group 2’s effect size of 63.85% would have required control and trial group sizes of 47 respectively.</p>
-<p>However, as the recent discussions of the role of the p value in decision making have made clear <span class="citation">(Wasserstein and Lazar 2016)</span> statistical analysis needs to report all of the result elements to enable contextually appropriate and defensible evidence-based decisions to be taken. Simply dismissing results on the basis of a failure to meet conventional statistical levels of significance risks levitating babies and bathwater…</p>
+<p>To detect Intervention Group 2’s effect size of 63.85% would have required control and trial groups of 47 households each.</p>
 </div>
 <div id="getting-it-right" class="section level2">
 <h2><span class="header-section-number">5.2</span> Getting it ‘right’</h2>
-<p>Suppose instead that we had designed and implemented our sample recruitment according to (fig:ggHPSampleSizeFig80) so that we have a reasonable chance of detecting a difference of ~ 14% with power = 0.8 and at a significance level (p) of 0.05. This means we should have a sample of around 4000 households split equally (and randomly) between our trial and two intervention groups.</p>
 <table class="table" style="margin-left: auto; margin-right: auto;">
 <caption>
 <span id="tab:creatLargeN">Table 5.2: </span>Number of households and summary statistics per group
@@ -506,13 +862,13 @@ n households
 Control
 </td>
 <td style="text-align:right;">
-160.06236
+156.60137
 </td>
 <td style="text-align:right;">
-320.32863
+314.53385
 </td>
 <td style="text-align:right;">
-1070
+1056
 </td>
 </tr>
 <tr>
@@ -520,13 +876,13 @@ Control
 Intervention 1
 </td>
 <td style="text-align:right;">
-38.38931
+36.48668
 </td>
 <td style="text-align:right;">
-84.81464
+83.07123
 </td>
 <td style="text-align:right;">
-899
+859
 </td>
 </tr>
 <tr>
@@ -534,13 +890,13 @@ Intervention 1
 Intervention 2
 </td>
 <td style="text-align:right;">
-62.96979
+65.60424
 </td>
 <td style="text-align:right;">
-116.10543
+116.56908
 </td>
 <td style="text-align:right;">
-1056
+1076
 </td>
 </tr>
 <tr>
@@ -548,13 +904,13 @@ Intervention 2
 Intervention 3
 </td>
 <td style="text-align:right;">
-66.80930
+63.70874
 </td>
 <td style="text-align:right;">
-145.74274
+140.13172
 </td>
 <td style="text-align:right;">
-1175
+1209
 </td>
 </tr>
 </tbody>
@@ -565,61 +921,44 @@ Intervention 3
 Figure 5.1: Mean W demand per group for large sample (Error bars = 95% confidence intervals for the sample mean)
 </p>
 </div>
-<p>In comparison to (fig:ggMeanDiffs) we can now see ((fig:largeNmeanDiffs)) that the 95% confidence intervals for the group means are much narrower. This is almost entirely due to the larger sample sizes. Re-running our previous test for differences now produces:</p>
+<p>T test re-run on the large sample: Control vs Intervention 2</p>
 <pre><code>## 
 ##  Welch Two Sample t-test
 ## 
 ## data:  largeTestDT[group == &quot;Intervention 2&quot;]$meanW and largeTestDT[group == &quot;Control&quot;]$meanW
-## t = -9.3142, df = 1348.3, p-value &lt; 2.2e-16
+## t = -8.8254, df = 1334.8, p-value &lt; 2.2e-16
 ## alternative hypothesis: true difference in means is not equal to 0
 ## 95 percent confidence interval:
-##  -117.54191  -76.64322
+##  -111.22438  -70.76988
 ## sample estimates:
 ## mean of x mean of y 
-##  62.96979 160.06236</code></pre>
+##  65.60424 156.60137</code></pre>
 <p>In this case:</p>
 <ul>
-<li>effect size = 97.0925661W or 60.66% representing a still <em>reasonable bang for buck</em> for whatever caused the difference;</li>
-<li>95% confidence interval for the test = -117.54 to -76.64 representing <em>much less</em> uncertainty/variation;</li>
+<li>effect size = 90.9971321W or 58.11% representing a still <em>reasonable bang for buck</em> for whatever caused the difference;</li>
+<li>95% confidence interval for the test = -111.22 to -70.77 representing <em>much less</em> uncertainty/variation;</li>
 <li>p value of 0 representing a <em>very low</em> risk of a false positive result as it passes all conventional thresholds.</li>
 </ul>
-<p>So now we are able to be much more confident in our decision to implement Intervention 2 since the average effect is reasonably large, the expected variation in the effect size is reasonably narrow and the risk of a Type I (false positive) error is extremely small.</p>
 </div>
 </div>
-<div id="summary-and-recomendations" class="section level1">
-<h1><span class="header-section-number">6</span> Summary and recomendations</h1>
+<div id="summary-and-recommendations" class="section level1">
+<h1><span class="header-section-number">6</span> Summary and recommendations</h1>
 <div id="statistical-power-and-sample-design" class="section level2">
 <h2><span class="header-section-number">6.1</span> Statistical power and sample design</h2>
-<p>Get it right <em>first time</em>: we should do the statistical power analysis before we start to make sure the study is even worth trying. If we don’t have previous data to use, we <em>justify</em> our choices through power analysis based on defensible assumptions.</p>
 </div>
 <div id="reporting-statistical-tests-of-difference-effects" class="section level2">
 <h2><span class="header-section-number">6.2</span> Reporting statistical tests of difference (effects)</h2>
-<p>Report all three elements <em>always</em>:</p>
-<ul>
-<li>average effect size</li>
-<li>effect size confidence intervals</li>
-<li>the p value (risk of Type I errors)</li>
-</ul>
-<p>We should also report the statistical power used just to be clear on the risk of Type II errors.</p>
 </div>
 <div id="making-inferences-and-taking-decisions" class="section level2">
 <h2><span class="header-section-number">6.3</span> Making inferences and taking decisions</h2>
-<p>Pay attention to all three elements <em>always</em>:</p>
-<ul>
-<li>average effect size: what is the <em>average bang for buck</em>?</li>
-<li>effect size confidence intervals: <em>how uncertain is the bang</em>?</li>
-<li>the p value: <em>what is the risk of a false positive</em>?</li>
-</ul>
-<p>If we have ticked all the boxes so far then we have combined good study design based on statistical power analysis, with a nuanced understanding of what test statistic effect sizes, confidence intervals and p values can tell us. As a result we now have a robust, evidence-based, contextually meaningful and <em>defensible</em> strategy.</p>
 </div>
 </div>
-<div id="ackowledgements" class="section level1">
-<h1><span class="header-section-number">7</span> Ackowledgements</h1>
-<p>We would like to thank collaborators and partners on a number of applied research projects for prodding us into thinking about these issues more deeply and clearly than we othweise would have done. We hope this paper helps to bring some clarity.</p>
+<div id="acknowledgments" class="section level1">
+<h1><span class="header-section-number">7</span> Acknowledgments</h1>
 </div>
 <div id="runtime" class="section level1">
 <h1><span class="header-section-number">8</span> Runtime</h1>
-<p>Analysis completed in 71.73 seconds ( 1.2 minutes) using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a> with R version 3.5.1 (2018-07-02) running on x86_64-apple-darwin15.6.0.</p>
+<p>Analysis completed in 50.19 seconds ( 0.84 minutes) using <a href="https://cran.r-project.org/package=knitr">knitr</a> in <a href="http://www.rstudio.com">RStudio</a> with R version 3.5.1 (2018-07-02) running on x86_64-apple-darwin15.6.0.</p>
 </div>
 <div id="r-environment" class="section level1">
 <h1><span class="header-section-number">9</span> R environment</h1>
@@ -645,93 +984,57 @@ Figure 5.1: Mean W demand per group for large sample (Error bars = 95% confidenc
 ## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
 ## 
 ## locale:
-## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
+## [1] en_NZ.UTF-8/en_NZ.UTF-8/en_NZ.UTF-8/C/en_NZ.UTF-8/en_NZ.UTF-8
 ## 
 ## attached base packages:
 ## [1] stats     graphics  grDevices utils     datasets  methods   base     
 ## 
 ## other attached packages:
-## [1] kableExtra_0.9.0  SAVEr_0.0.1.9000  lubridate_1.7.4   readr_1.1.1      
-## [5] ggplot2_3.0.0     dplyr_0.7.6       data.table_1.11.4 GREENGrid_0.1.0  
-## [9] GREENGridData_1.0
+## [1] kableExtra_0.9.0   GREENGridData_1.0  SAVEr_0.0.1.9000  
+## [4] lubridate_1.7.4    readr_1.1.1        ggplot2_3.1.0     
+## [7] dplyr_0.7.7        data.table_1.11.8  myUtils_0.0.0.9000
 ## 
 ## loaded via a namespace (and not attached):
-##  [1] Rcpp_0.12.18      lattice_0.20-35   tidyr_0.8.1      
-##  [4] prettyunits_1.0.2 png_0.1-7         utf8_1.1.4       
-##  [7] assertthat_0.2.0  rprojroot_1.3-2   digest_0.6.15    
-## [10] R6_2.2.2          cellranger_1.1.0  plyr_1.8.4       
-## [13] backports_1.1.2   evaluate_0.11     highr_0.7        
-## [16] httr_1.3.1        pillar_1.3.0      RgoogleMaps_1.4.2
-## [19] rlang_0.2.2       progress_1.2.0    lazyeval_0.2.1   
-## [22] readxl_1.1.0      rstudioapi_0.7    geosphere_1.5-7  
-## [25] rmarkdown_1.10    labeling_0.3      proto_1.0.0      
-## [28] stringr_1.3.1     munsell_0.5.0     broom_0.5.0      
-## [31] compiler_3.5.1    modelr_0.1.2      xfun_0.3         
-## [34] pkgconfig_2.0.2   htmltools_0.3.6   openssl_1.0.2    
-## [37] tidyselect_0.2.4  tibble_1.4.2      bookdown_0.7     
-## [40] fansi_0.3.0       viridisLite_0.3.0 crayon_1.3.4     
-## [43] withr_2.1.2       grid_3.5.1        nlme_3.1-137     
-## [46] jsonlite_1.5      gtable_0.2.0      magrittr_1.5     
-## [49] scales_1.0.0      cli_1.0.0         stringi_1.2.4    
-## [52] mapproj_1.2.6     reshape2_1.4.3    bindrcpp_0.2.2   
-## [55] sp_1.3-1          tidyverse_1.2.1   xml2_1.2.0       
-## [58] rjson_0.2.20      tools_3.5.1       forcats_0.3.0    
-## [61] ggmap_2.6.1       glue_1.3.0        purrr_0.2.5      
-## [64] maps_3.3.0        hms_0.4.2         jpeg_0.1-8       
-## [67] yaml_2.2.0        colorspace_1.3-2  rvest_0.3.2      
-## [70] knitr_1.20.13     bindr_0.1.1       haven_1.1.2</code></pre>
+##  [1] progress_1.2.0    tidyselect_0.2.5  xfun_0.4         
+##  [4] purrr_0.2.5       reshape2_1.4.3    haven_1.1.2      
+##  [7] lattice_0.20-35   colorspace_1.3-2  viridisLite_0.3.0
+## [10] htmltools_0.3.6   yaml_2.2.0        utf8_1.1.4       
+## [13] rlang_0.3.0.1     pillar_1.3.0      glue_1.3.0       
+## [16] withr_2.1.2       tidyverse_1.2.1   modelr_0.1.2     
+## [19] readxl_1.1.0      bindrcpp_0.2.2    bindr_0.1.1      
+## [22] plyr_1.8.4        stringr_1.3.1     munsell_0.5.0    
+## [25] gtable_0.2.0      cellranger_1.1.0  rvest_0.3.2      
+## [28] evaluate_0.12     labeling_0.3      knitr_1.20       
+## [31] forcats_0.3.0     fansi_0.4.0       highr_0.7        
+## [34] broom_0.5.0       Rcpp_0.12.19      scales_1.0.0     
+## [37] backports_1.1.2   jsonlite_1.5      hms_0.4.2        
+## [40] digest_0.6.18     stringi_1.2.4     bookdown_0.7     
+## [43] grid_3.5.1        rprojroot_1.3-2   cli_1.0.1        
+## [46] tools_3.5.1       magrittr_1.5      lazyeval_0.2.1   
+## [49] tibble_1.4.2      crayon_1.3.4      tidyr_0.8.1      
+## [52] pkgconfig_2.0.2   xml2_1.2.0        prettyunits_1.0.2
+## [55] rstudioapi_0.8    assertthat_0.2.0  rmarkdown_1.10   
+## [58] httr_1.3.1        R6_2.3.0          nlme_3.1-137     
+## [61] compiler_3.5.1</code></pre>
 </div>
 <div id="references" class="section level1 unnumbered">
 <h1>References</h1>
 <div id="refs" class="references">
-<div id="ref-AECOM2011Energy">
-<p>AECOM. 2011. “Energy Demand Research Project: Final Analysis.” St Albans: AECOM.</p>
-</div>
 <div id="ref-anderson_new_2018">
 <p>Anderson, Ben, David Eyers, Rebecca Ford, Diana Giraldo Ocampo, Rana Peniamina, Janet Stephenson, Kiti Suomalainen, Lara Wilcocks, and Michael Jack. 2018. “New Zealand GREEN Grid Household Electricity Demand Study 2014-2018,” September. doi:<a href="https://doi.org/10.5255/UKDA-SN-853334">10.5255/UKDA-SN-853334</a>.</p>
 </div>
-<div id="ref-CER2012Smart">
-<p>CER. 2012. “Smart Meter Electricity Consumer Behaviour Trial data.” Dublin: Irish Social Science Data Archive. <a href="http://innovation.ukpowernetworks.co.uk/innovation/en/Projects/tier-2-projects/Energywise/" class="uri">http://innovation.ukpowernetworks.co.uk/innovation/en/Projects/tier-2-projects/Energywise/</a>.</p>
-</div>
 <div id="ref-progress">
 <p>Csárdi, Gábor, and Rich FitzJohn. 2016. <em>Progress: Terminal Progress Bars</em>. <a href="https://CRAN.R-project.org/package=progress" class="uri">https://CRAN.R-project.org/package=progress</a>.</p>
 </div>
-<div id="ref-Delmas2013Information">
-<p>Delmas, Magali A., Miriam Fischlein, and Omar I. Asensio. 2013. “Information strategies and energy conservation behavior: A meta-analysis of experimental studies from 1975 to 2012.” <em>Energy Policy</em> 61 (October): 729–39. doi:<a href="https://doi.org/10.1016/j.enpol.2013.05.109">10.1016/j.enpol.2013.05.109</a>.</p>
-</div>
 <div id="ref-data.table">
 <p>Dowle, M, A Srinivasan, T Short, S Lianoglou with contributions from R Saporta, and E Antonyan. 2015. <em>Data.table: Extension of Data.frame</em>. <a href="https://CRAN.R-project.org/package=data.table" class="uri">https://CRAN.R-project.org/package=data.table</a>.</p>
 </div>
-<div id="ref-Frederiks2016Evaluating">
-<p>Frederiks, Elisha R., Karen Stenner, Elizabeth V. Hobman, and Mark Fischle. 2016. “Evaluating energy behavior change programs using randomized controlled trials: Best practice guidelines for policymakers.” <em>Energy Research &amp; Social Science</em> 22 (December): 147–64. doi:<a href="https://doi.org/10.1016/j.erss.2016.08.020">10.1016/j.erss.2016.08.020</a>.</p>
-</div>
-<div id="ref-Greenland2016">
-<p>Greenland, Sander, Stephen J. Senn, Kenneth J. Rothman, John B. Carlin, Charles Poole, Steven N. Goodman, and Douglas G. Altman. 2016. “Statistical Tests, P Values, Confidence Intervals, and Power: A Guide to Misinterpretations.” <em>European Journal of Epidemiology</em> 31 (4): 337–50. doi:<a href="https://doi.org/10.1007/s10654-016-0149-3">10.1007/s10654-016-0149-3</a>.</p>
-</div>
 <div id="ref-lubridate">
 <p>Grolemund, Garrett, and Hadley Wickham. 2011. “Dates and Times Made Easy with lubridate.” <em>Journal of Statistical Software</em> 40 (3): 1–25. <a href="http://www.jstatsoft.org/v40/i03/" class="uri">http://www.jstatsoft.org/v40/i03/</a>.</p>
 </div>
 <div id="ref-baseR">
 <p>R Core Team. 2016. <em>R: A Language and Environment for Statistical Computing</em>. Vienna, Austria: R Foundation for Statistical Computing. <a href="https://www.R-project.org/" class="uri">https://www.R-project.org/</a>.</p>
 </div>
-<div id="ref-RockyMountainInstitute2006Automated">
-<p>Rocky Mountain Institute. 2006. “Automated demand response system pilot: Final report.” <a href="https://www.smartgrid.gov/files/Aumated_Demd_Response_System_Pilot_Volume_1_Intro_Exec_Summa.pdf" class="uri">https://www.smartgrid.gov/files/Aumated_Demd_Response_System_Pilot_Volume_1_Intro_Exec_Summa.pdf</a>.</p>
-</div>
-<div id="ref-Schofield2015Experimental">
-<p>Schofield, James, Richard Carmichael, Simon Tindemans, Matt Woolf, Mark Bilton, and Goran Strbac. 2015. “Experimental validation of residential consumer responsiveness to dynamic time-of-use pricing.” In <em>23 International Conference on Electricity Distribution</em>.</p>
-</div>
-<div id="ref-Srivastava2018Assessing">
-<p>Srivastava, Aman, Steven Van Passel, and Erik Laes. 2018. “Assessing the Success of Electricity Demand Response Programs: A Meta-Analysis.” <em>Energy Research &amp; Social Science</em> 40 (June): 110–17. doi:<a href="https://doi.org/10.1016/j.erss.2017.12.005">10.1016/j.erss.2017.12.005</a>.</p>
-</div>
-<div id="ref-energyWiseT1">
-<p>UKPN. 2017. “The Final Energy Saving Trial Report.” London: UK Power Networks. <a href="http://innovation.ukpowernetworks.co.uk/innovation/en/Projects/tier-2-projects/Energywise/" class="uri">http://innovation.ukpowernetworks.co.uk/innovation/en/Projects/tier-2-projects/Energywise/</a>.</p>
-</div>
-<div id="ref-energyWiseT2">
-<p>———. 2018. “The Energy Shifting Trial Report.” London: UK Power Networks. <a href="http://innovation.ukpowernetworks.co.uk/innovation/en/Projects/tier-2-projects/Energywise/" class="uri">http://innovation.ukpowernetworks.co.uk/innovation/en/Projects/tier-2-projects/Energywise/</a>.</p>
-</div>
-<div id="ref-wasserstein2016">
-<p>Wasserstein, Ronald L., and Nicole A. Lazar. 2016. “The Asa’s Statement on P-Values: Context, Process, and Purpose.” <em>The American Statistician</em> 70 (2). Taylor &amp; Francis: 129–33. doi:<a href="https://doi.org/10.1080/00031305.2016.1154108">10.1080/00031305.2016.1154108</a>.</p>
-</div>
 <div id="ref-ggplot2">
 <p>Wickham, Hadley. 2009. <em>Ggplot2: Elegant Graphics for Data Analysis</em>. Springer-Verlag New York. <a href="http://ggplot2.org" class="uri">http://ggplot2.org</a>.</p>
 </div>
diff --git a/sizingDemandResponseTrialsNZ.md b/sizingDemandResponseTrialsNZ.md
deleted file mode 100644
index a4e47b206d3dd7055950493f87acdf1428ae474a..0000000000000000000000000000000000000000
--- a/sizingDemandResponseTrialsNZ.md
+++ /dev/null
@@ -1,444 +0,0 @@
----
-params:
-  author: 'Ben Anderson and Tom Rushby'
-  title: 'Statistical Power, Statistical Significance, Study Design and Decision Making: A Worked Example'
-  subtitle: 'Sizing Demand Response Trials in New Zealand'
-title: 'Statistical Power, Statistical Significance, Study Design and Decision Making: A Worked Example'
-subtitle: 'Sizing Demand Response Trials in New Zealand'
-author: 'Ben Anderson and Tom Rushby (Contact: b.anderson@soton.ac.uk, `@dataknut`)'
-date: 'Last run at: 2018-09-27 15:34:32'
-always_allow_html: yes
-output:
-  bookdown::html_document2:
-    code_folding: hide
-    fig_caption: yes
-    keep_md: yes
-    number_sections: yes
-    self_contained: no
-    toc: yes
-    toc_depth: 2
-    toc_float: yes
-  bookdown::word_document2:
-    fig_caption: yes
-    toc: yes
-    toc_depth: 2
-  bookdown::pdf_document2:
-    fig_caption: yes
-    keep_tex: yes
-    number_sections: yes
-    toc: yes
-    toc_depth: 2
-bibliography: '/Users/ben/bibliography.bib'
----
-
-
-
-
-
-\newpage
-
-# About
-
-## Paper circulation:
-
- * Public
-
-## License
-
-
-This work is made available under the Creative Commons [Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) License](https://creativecommons.org/licenses/by-sa/4.0/).
-
-This means you are free to:
-
- * _Share_ — copy and redistribute the material in any medium or format
- * _Adapt_ — remix, transform, and build upon the material for any purpose, even commercially.
- 
-Under the following terms:
-
- * _Attribution_ — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
- * _ShareAlike_ — If you remix, transform, or build upon the material, you must distribute your contributions under the same license as the original.
- * _No additional restrictions_ — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
-
-**Notices:**
-
- * You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation.
- * No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material. #YMMV
-
-For the avoidance of doubt and explanation of terms please refer to the full [license notice](https://creativecommons.org/licenses/by-sa/4.0/) and [legal code](https://creativecommons.org/licenses/by-sa/4.0/legalcode).
- 
-## Citation
-
-If you wish to use any of the material from this paper please cite as:
-
- * Ben Anderson and Tom Rushby. (2018) Statistical Power, Statistical Significance, Study Design and Decision Making: A Worked Example (Sizing Demand Response Trials in New Zealand), Southampton: University of Southampton.
-
-This work is (c) 2018 the authors.
-
-## History
-
-Code history is generally tracked via the paper [repo](https://github.com/dataknut/powerSignificanceDesignAndDecisionMaking):
-
- * [Paper history](https://github.com/dataknut/powerSignificanceDesignAndDecisionMaking/commits/master)
- 
-## Data:
-
-This paper uses circuit level extracts for 'Heat Pumps', 'Lighting' and 'Hot Water' for the NZ GREEN Grid Household Electricity Demand Data (https://dx.doi.org/10.5255/UKDA-SN-853334 [@anderson_new_2018]). These have been extracted using the code found in https://github.com/CfSOtago/GREENGridData/blob/master/examples/code/extractCleanGridSpy1minCircuit.R
-
-## Acknowledgements
-
-
-This work was supported by:
-
- * The [University of Otago](https://www.otago.ac.nz/);
- * The [University of Southampton](https://www.southampton.ac.uk/);
- * The New Zealand [Ministry of Business, Innovation and Employment (MBIE)](http://www.mbie.govt.nz/) through the [NZ GREEN Grid](https://www.otago.ac.nz/centre-sustainability/research/energy/otago050285.html) grant (Contract ID: UOCX1203);
- * The UK Office of Gas and Electricity Markets through the [Low Carbon Network Fund](https://www.ofgem.gov.uk/press-releases/ofgem-announces-%C2%A359.4-million-funding-10-innovation-projects)-funded ‘[Solent Achieving Value from Efficiency](http://www.energy.soton.ac.uk/tag/save/)’ (SAVE) project; 
- * [SPATIALEC](http://www.energy.soton.ac.uk/tag/spatialec/) - a [Marie Skłodowska-Curie Global Fellowship](http://ec.europa.eu/research/mariecurieactions/about-msca/actions/if/index_en.htm) based at the University of Otago’s [Centre for Sustainability](http://www.otago.ac.nz/centre-sustainability/staff/otago673896.html) (2017-2019) & the University of Southampton's Sustainable Energy Research Group (2019-2020).
- 
-\newpage
-
-# Introduction
-In our experience of designing and running empirical studies, whether experimental or naturalistic, there is ongoing confusion over the meaning and role of two key statistical terms: _statistical power_ and _statistical significance_. This is compounded by confusion over how these terms are used in designing studies and in deciding what can be inferred from the results, and thus what course of action is best.
-
-We have found this to be the case both in academic research where the objective is to establish 'the most likely explanation' under academic conventions and in applied research where the objective is to 'make a robust decision' based on the balance of evidence and probability.
-
-In this brief paper we respond to these confusions using a worked example: the design of a hypothetical household electricity demand response trial in New Zealand which seeks to shift the use of Heat Pumps out of the winter evening peak demand period. We use this example to explain and demonstrate the role of statistical significance in testing for differences, and of both statistical significance and statistical power in sample design and decision making.
-
-# Error, power, significance and decision making
-
-Two types of error are of concern in both purely academic research where the efficacy of an intervention is to be tested and also in applied research where a decision may then be taken based on the results:
-
- * Type I: a false positive - an effect is inferred when in fact there is none. From a commercial or policy perspective this could lead to the implementation of a costly intervention which would be unlikely to have the effect expected;
- * Type II: a false negative - an effect is not inferred when in fact there is one. From a commercial or policy perspective this could lead to inaction when an intervention would have been likely to have the effect expected.
- 
-_Type I error_: The significance level (p value) of the statistical test used to assess the efficacy of an intervention represents not only the extent to which the observed data match the null model to be tested [@wasserstein2016], but also the risk of a Type I error. In most trials the null model will be a measure of 'no difference' between control and intervention groups. By convention, the p value _threshold_ for rejecting the null model (the accepted risk of a Type I error) is generally set to 0.05 (5%), although this choice is entirely subjective: it simply encodes a human judgement that a 1 in 20 event is 'unlikely'. In commercial or policy terms, acting on a larger p value (e.g. setting the p value threshold to 10%) would increase the risk of making a Type I error and thus of implementing a potentially costly intervention that is unlikely to have the effect desired. However, as we discuss in more detail below, this is not necessarily _bad practice_ as it may reflect the potential magnitude of an effect, the decision-maker's tolerance of Type I error risk and the urgency of action.
-
-_Type II error_: Statistical power is normally set to 0.8 (80%) by convention and represents the complement of the pre-study risk of making a Type II error: a power of 0.8 accepts a 20% risk of a false negative [@Greenland2016]. From a commercial or policy perspective reducing power (e.g. to 0.7 or 70%) will therefore increase the risk of taking no action when in fact the intervention would probably have had the effect desired. Statistical power calculations enable the investigator to estimate the sample size that would be needed to robustly detect an experimental effect with a given risk of a false positive (Type I error) or false negative (Type II error) result. This prevents a study from recruiting too few participants to be able to robustly detect the hypothesised intervention effect [@Delmas2013Information], or from wasting resources by recruiting a larger sample than needed. 
-
-Previous work has suggested that sample sizes in most energy efficiency studies may be too low to provide adequate power, and so statistically robust conclusions cannot be drawn at conventional thresholds [@Frederiks2016Evaluating], while a more recent review focusing on demand response studies reached a similar conclusion [@Srivastava2018Assessing]. It is therefore hardly surprising that a number of studies report effect sizes which are not statistically significant at conventional thresholds [@Srivastava2018Assessing], choose to use less conservative statistical significance thresholds [@RockyMountainInstitute2006Automated; @AECOM2011Energy; @CER2012Smart; @Schofield2015Experimental] or relax both statistical power values _and_ statistical significance thresholds [@energyWiseT1; @energyWiseT2].
-
-However it would be wrong to conclude that this is _necessarily_ bad practice. Recent discussions of the role of p values in inference [@Greenland2016; @wasserstein2016] remind us that decisions should never be based only on statistical significance thresholds set purely by convention. Rather, inference and thus decision making should be based on:
-
- * the effect size - is it 2% or 22%? (i.e. is the result _important_ or _useful_: "What is the estimated _bang for buck_?");
- * the confidence intervals - how much _uncertainty_ or _variation_ is there in the response? ("How uncertain is the estimated bang?");
- * the p value - what is the risk of a Type I error / _false positive_? ("What is the risk the bang observed isn't real?").
-
-Only then can a contextually appropriate decision be taken as to whether the effect is large enough, certain enough and has a low enough risk of being a false positive result to warrant action.
-
-In the following sections we apply these principles to the design and analysis of a hypothetical New Zealand household electricity demand response trial and to the use of a simple statistical test of difference between trial groups to demonstrate and clarify these points.
-
-# Sample design: statistical power
-
-To return to the discussion of statistical power, we need to establish the size of the control and intervention groups we will require. This is crucial to resource budgeting (_"How many households and thus `$` do I need?"_) and ensuring good study design practice ("_Will I be able to answer my research question?_") [@Frederiks2016Evaluating]. In both cases the answer is not absolute since it will depend on our tolerance of Type I and Type II error risks.
-
-Calculation of the required sample size for a control and intervention group requires the estimation of the probable intervention effect size, agreement on the significance level (p value threshold or Type I error risk) of the statistical test to be used and agreement on the level of statistical power (Type II error risk). Given any three of these values the fourth can be calculated if an estimate of the mean and standard deviation of the outcome to be measured is known. In the case of DSR interventions the effect size comprises a given % reduction in energy demand or consumption in a given time period and estimates of the expected reduction can be derived from previous studies or data. 
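-
-As an illustration of this 'given any three, solve for the fourth' relationship, base R's `power.t.test()` will solve for whichever of sample size, effect size, significance level or power is left unspecified. The values below are hypothetical placeholders rather than the estimates used later in this paper:
-
-```r
-# a minimal sketch: solve for the per-group sample size needed to detect
-# a hypothetical 15% reduction in mean peak-period demand
-baselineMeanW <- 160  # hypothetical mean W in the peak period
-baselineSdW   <- 320  # hypothetical standard deviation
-
-power.t.test(delta = 0.15 * baselineMeanW, # absolute reduction in W
-             sd = baselineSdW,
-             sig.level = 0.05,             # p value threshold (Type I error risk)
-             power = 0.8,                  # 1 - Type II error risk
-             type = "two.sample")
-# leaving n unspecified asks power.t.test() to solve for it; leaving
-# delta unspecified instead would return the minimum detectable effect
-```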
-
-As we have noted the choice of significance level (p value threshold) and statistical power are subjective and normative. Most academic researchers will struggle to justify relaxing from the conventional p = 0.05 and power = 0.8. However there may be good reason in applied research to take action on results of studies that use less conservative thresholds. Nevertheless there is a strong argument for designing such studies using the more conservative conventional levels but acknowledging that making inferences from the results may require a more relaxed approach to Type I or Type II error risks than is considered 'normal' in academic research.
-
-
-
-
-
-
-
-
-```
-## Scale for 'y' is already present. Adding another scale for 'y', which
-## will replace the existing scale.
-```
-
-<div class="figure">
-<img src="sizingDemandResponseTrialsNZ_files/figure-html/ggHPSampleSizeFig80-1.png" alt="Power analysis results (power = 0.8)"  />
-<p class="caption">(\#fig:ggHPSampleSizeFig80)Power analysis results (power = 0.8)</p>
-</div>
-
-```
-## Saving 7 x 5 in image
-```
-
-As an illustration, Figure \@ref(fig:ggHPSampleSizeFig80) shows sample size calculations with power = 0.8 (80%) using 'Heat Pump' electricity demand extracted from the publicly available New Zealand GREEN Grid household electricity demand data [@anderson_new_2018] for winter 2015 for the weekday peak demand period (16:00 - 20:00).
-
-These results show that a trial comprising a control and an intervention sample of 1000 households (each) would be able to detect an effect size of 14.18% with p = 0.01 and power = 0.8. Were a study to be less risk averse in its decision making then p = 0.1 may be acceptable, in which case only ~ 450 households would be needed in each group (see Figure \@ref(fig:ggHPSampleSizeFig80)), but of course in this case the risk of a Type I error would increase. 
-
-
-
-Were we to reduce the statistical power to 0.7 then we would obtain the results shown in Figure \@ref(fig:ggHPSampleSizeFig70). In this case a trial comprising a control and an intervention sample of 1000 households (each) would be able to detect an effect size of 12.76% with p = 0.01 and power = 0.7. Were a study to be less risk averse in its decision making then p = 0.1 may be acceptable, in which case only ~ 425 households would be needed in each group (see Figure \@ref(fig:ggHPSampleSizeFig70)), but as before the risk of a Type I error would increase. Similarly, reducing the statistical power used would also reduce the sample required for a given effect size tested at a given p value, although as before the risk of a Type II error would increase.
-
-# Testing for differences: effect sizes, confidence intervals and p values
-
-## Getting it 'wrong'
-
-Let us imagine that we have not designed and implemented our sample recruitment according to Figure \@ref(fig:ggHPSampleSizeFig80) and have instead decided, perhaps for cost reasons, to recruit ~ 30 households per group. Now we wish to test for differences between the control and intervention groups.
-
-
-<table class="table" style="margin-left: auto; margin-right: auto;">
-<caption>(\#tab:smallNTable)Number of households and summary statistics per group</caption>
- <thead>
-  <tr>
-   <th style="text-align:left;"> group </th>
-   <th style="text-align:right;"> mean W </th>
-   <th style="text-align:right;"> sd W </th>
-   <th style="text-align:right;"> n households </th>
-  </tr>
- </thead>
-<tbody>
-  <tr>
-   <td style="text-align:left;"> Control </td>
-   <td style="text-align:right;"> 162.66915 </td>
-   <td style="text-align:right;"> 325.51171 </td>
-   <td style="text-align:right;"> 28 </td>
-  </tr>
-  <tr>
-   <td style="text-align:left;"> Intervention 1 </td>
-   <td style="text-align:right;"> 35.13947 </td>
-   <td style="text-align:right;"> 83.90258 </td>
-   <td style="text-align:right;"> 22 </td>
-  </tr>
-  <tr>
-   <td style="text-align:left;"> Intervention 2 </td>
-   <td style="text-align:right;"> 58.80597 </td>
-   <td style="text-align:right;"> 113.53102 </td>
-   <td style="text-align:right;"> 26 </td>
-  </tr>
-  <tr>
-   <td style="text-align:left;"> Intervention 3 </td>
-   <td style="text-align:right;"> 68.37439 </td>
-   <td style="text-align:right;"> 147.37279 </td>
-   <td style="text-align:right;"> 29 </td>
-  </tr>
-</tbody>
-</table>
-<div class="figure">
-<img src="sizingDemandResponseTrialsNZ_files/figure-html/ggMeanDiffs-1.png" alt="Mean W demand per group for small sample (Error bars = 95% confidence intervals for the sample mean)"  />
-<p class="caption">(\#fig:ggMeanDiffs)Mean W demand per group for small sample (Error bars = 95% confidence intervals for the sample mean)</p>
-</div>
-
-As a first step we plot the differences using the group means and their 95% confidence intervals, as shown in Figure \@ref(fig:ggMeanDiffs). As we can see, the interventions appear to have reduced demand quite substantially, and the error bars indicate the uncertainty (variation) around the mean within each group. Based on this, we suspect that we are unlikely to see low p values when we use statistical tests of the differences, as the error bars overlap substantially.
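-
-A sketch of how such a plot might be produced (assuming, as in the test output below, that the per-household peak-period means are held in a data.table called `testDT` with columns `group` and `meanW`):
-
-```r
-library(data.table)
-library(ggplot2)
-
-# summarise the mean and 95% confidence interval for the sample mean per group
-plotDT <- testDT[, .(groupMeanW = mean(meanW),
-                     ciW = qt(0.975, .N - 1) * sd(meanW)/sqrt(.N)),
-                 by = group]
-
-ggplot(plotDT, aes(x = group, y = groupMeanW)) +
-  geom_col() +
-  geom_errorbar(aes(ymin = groupMeanW - ciW, ymax = groupMeanW + ciW),
-                width = 0.2) +
-  labs(x = "Group", y = "Mean W (weekdays, 16:00 - 20:00)")
-```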
-
-Suppose a t-test of the difference between the Control and Intervention 1 group produces the result shown below.
- 
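-
-A call of the following form (a sketch; `testDT` is assumed to hold the per-household peak-period means, as the output itself suggests) would generate this kind of result:
-
-```r
-# Welch two sample t-test: Intervention 1 vs Control mean W
-t.test(testDT[group == "Intervention 1"]$meanW,
-       testDT[group == "Control"]$meanW)
-```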
-
-```
-## 
-## 	Welch Two Sample t-test
-## 
-## data:  testDT[group == "Intervention 1"]$meanW and testDT[group == "Control"]$meanW
-## t = -1.9907, df = 31.47, p-value = 0.05526
-## alternative hypothesis: true difference in means is not equal to 0
-## 95 percent confidence interval:
-##  -258.110005    3.050644
-## sample estimates:
-## mean of x mean of y 
-##  35.13947 162.66915
-```
-
-The data show that the mean power demand for the control group was 162.67W while for Intervention 1 it was 35.14W: a (very) large difference in means of 127.53W. The results of the t test are:
-
- * effect size = 128W or 78% (computed below), representing a _substantial bang for buck_ for whatever caused the difference;
- * 95% confidence interval for the test = -258.11 to 3.05 representing _considerable_ uncertainty/variation;
- * p value of 0.055 representing a _relatively low_ risk of a false positive result but which (just) fails the conventional p < 0.05 threshold.
- 
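-For clarity, the percentage effect size is simply the difference in means expressed relative to the control group mean:
-
-```r
-# effect size as a percentage of the control group mean
-100 * (162.66915 - 35.13947) / 162.66915  # ~ 78%
-```
-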
-What would we have concluded? We have a large effect size, substantial uncertainty and a slightly raised risk of a false positive or Type I error when compared to conventional p value levels. From a narrow and conventional 'p value testing' perspective we would have concluded that there was no statistically significant difference between the groups. However this misses the crucial point that an organisation with a higher risk tolerance might conclude that the large effect size justifies implementing the intervention even though the risk of a false positive is slightly higher. If the p value had been 0.25 then this would still have been the case but would have warranted even greater caution.
-
-But what about Intervention Group 2? In this case the t.test results are slightly different:
-
-
-```
-## 
-## 	Welch Two Sample t-test
-## 
-## data:  testDT[group == "Intervention 2"]$meanW and testDT[group == "Control"]$meanW
-## t = -1.5876, df = 33.909, p-value = 0.1217
-## alternative hypothesis: true difference in means is not equal to 0
-## 95 percent confidence interval:
-##  -236.82848   29.10212
-## sample estimates:
-## mean of x mean of y 
-##  58.80597 162.66915
-```
-
-Now:
- 
- * effect size = 104W or 63.85%  representing a still _reasonable bang for buck_ for whatever caused the difference;
- * 95% confidence interval for the test = -236.83 to 29.1 representing _even greater_ uncertainty/variation;
- * p value of 0.122 representing a _higher_ risk of a false positive result which fails the conventional p < 0.05 threshold and also the less conservative p < 0.1.
-
-As before, the subsequent action we take depends on our tolerance of Type I (false positive) risk. We still have a reasonably large effect size but we are less certain about it and we have a higher risk of it not being real. What do you think we should do?
-
-
-
-In both cases our decision-making is rather hampered by the small sample size even though we have extremely large effect sizes. As we can see from Figure \@ref(fig:ggHPSampleSizeFig80), to detect Intervention Group 2's effect size of 63.85% would have required control and trial groups of just 47 households each.
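-
-That requirement can be sketched with the same `power.t.test()` approach used in sample design; note that the n returned depends heavily on the standard deviation assumed (the value below is a purely hypothetical placeholder, not the estimate used for the figure):
-
-```r
-# minimum per-group n to detect the observed Control vs Intervention 2 difference
-power.t.test(delta = 162.66915 - 58.80597, # observed difference in mean W
-             sd = 180,                     # hypothetical pooled sd
-             sig.level = 0.05,
-             power = 0.8)
-```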
-
-However, as the recent discussions of the role of the p value in decision making have made clear [@wasserstein2016], statistical analysis needs to report all of the result elements to enable contextually appropriate and defensible evidence-based decisions to be taken. Simply dismissing results on the basis of a failure to meet conventional statistical significance thresholds risks throwing out babies with bathwater...
-
-## Getting it 'right'
-
-Suppose instead that we had designed and implemented our sample recruitment according to Figure \@ref(fig:ggHPSampleSizeFig80) so that we have a reasonable chance of detecting a difference of ~ 14% with power = 0.8 and at a significance level (p) of 0.05. This means we should have a sample of around 4000 households split equally (and randomly) between our control group and three intervention groups.
-
-<table class="table" style="margin-left: auto; margin-right: auto;">
-<caption>(\#tab:creatLargeN)Number of households and summary statistics per group</caption>
- <thead>
-  <tr>
-   <th style="text-align:left;"> group </th>
-   <th style="text-align:right;"> mean W </th>
-   <th style="text-align:right;"> sd W </th>
-   <th style="text-align:right;"> n households </th>
-  </tr>
- </thead>
-<tbody>
-  <tr>
-   <td style="text-align:left;"> Control </td>
-   <td style="text-align:right;"> 160.06236 </td>
-   <td style="text-align:right;"> 320.32863 </td>
-   <td style="text-align:right;"> 1070 </td>
-  </tr>
-  <tr>
-   <td style="text-align:left;"> Intervention 1 </td>
-   <td style="text-align:right;"> 38.38931 </td>
-   <td style="text-align:right;"> 84.81464 </td>
-   <td style="text-align:right;"> 899 </td>
-  </tr>
-  <tr>
-   <td style="text-align:left;"> Intervention 2 </td>
-   <td style="text-align:right;"> 62.96979 </td>
-   <td style="text-align:right;"> 116.10543 </td>
-   <td style="text-align:right;"> 1056 </td>
-  </tr>
-  <tr>
-   <td style="text-align:left;"> Intervention 3 </td>
-   <td style="text-align:right;"> 66.80930 </td>
-   <td style="text-align:right;"> 145.74274 </td>
-   <td style="text-align:right;"> 1175 </td>
-  </tr>
-</tbody>
-</table>
-
-<div class="figure">
-<img src="sizingDemandResponseTrialsNZ_files/figure-html/largeNmeanDiffs-1.png" alt="Mean W demand per group for large sample (Error bars = 95% confidence intervals for the sample mean)"  />
-<p class="caption">(\#fig:largeNmeanDiffs)Mean W demand per group for large sample (Error bars = 95% confidence intervals for the sample mean)</p>
-</div>
-
-In comparison to Figure \@ref(fig:ggMeanDiffs) we can now see (Figure \@ref(fig:largeNmeanDiffs)) that the 95% confidence intervals for the group means are much narrower. This is almost entirely due to the larger sample sizes. Re-running our previous test for differences now produces:
-
-
-```
-## 
-## 	Welch Two Sample t-test
-## 
-## data:  largeTestDT[group == "Intervention 2"]$meanW and largeTestDT[group == "Control"]$meanW
-## t = -9.3142, df = 1348.3, p-value < 2.2e-16
-## alternative hypothesis: true difference in means is not equal to 0
-## 95 percent confidence interval:
-##  -117.54191  -76.64322
-## sample estimates:
-## mean of x mean of y 
-##  62.96979 160.06236
-```
-
-In this case:
-
- * effect size = 97.09W or 60.66%, representing a still _reasonable bang for buck_ for whatever caused the difference;
- * 95% confidence interval for the test = -117.54 to -76.64, representing _much less_ uncertainty/variation;
- * p value < 2.2e-16 (effectively zero), representing a _very low_ risk of a false positive result as it passes all conventional thresholds.
- 
-So now we are able to be much more confident in our decision to implement Intervention 2 since the average effect is reasonably large, the expected variation in the effect size is reasonably narrow and the risk of a Type I (false positive) error is extremely small. 
-
-# Summary and recommendations
-
-## Statistical power and sample design
-
-Get it right _first time_: we should do the statistical power analysis before we start, to make sure the study is even worth attempting. If we don't have previous data to use, we should _justify_ our choices through power analysis based on defensible assumptions.
-
-## Reporting statistical tests of difference (effects)
-
-Report all three elements _always_:
-
- * average effect size
- * effect size confidence intervals
- * the p value (risk of Type I errors)
-
-We should also report the statistical power used just to be clear on the risk of Type II errors.
-
-## Making inferences and taking decisions
-
-Pay attention to all three elements _always_:
-
- * average effect size: what is the _average bang for buck_?
- * effect size confidence intervals: _how uncertain is the bang_?
- * the p value: _what is the risk of a false positive_?
-
-If we have ticked all the boxes so far then we have combined good study design based on statistical power analysis, with a nuanced understanding of what test statistic effect sizes, confidence intervals and p values can tell us. As a result we now have a robust, evidence-based, contextually meaningful and _defensible_ strategy.
-
-# Acknowledgements
-
-We would like to thank collaborators and partners on a number of applied research projects for prodding us into thinking about these issues more deeply and clearly than we otherwise would have done. We hope this paper helps to bring some clarity.
-
-# Runtime
-
-
-
-
-Analysis completed in 71.73 seconds (1.2 minutes) using [knitr](https://cran.r-project.org/package=knitr) in [RStudio](http://www.rstudio.com) with R version 3.5.1 (2018-07-02) running on x86_64-apple-darwin15.6.0.
-
-# R environment
-
-R packages used:
-
- * base R - for the basics [@baseR]
- * data.table - for fast (big) data handling [@data.table]
- * lubridate - date manipulation [@lubridate]
- * ggplot2 - for slick graphics [@ggplot2]
- * readr - for csv reading/writing [@readr]
- * dplyr - for select and contains [@dplyr]
- * progress - for progress bars [@progress]
- * kableExtra - to create this document & neat tables [@knitr]
- * GREENGrid - for local NZ GREEN Grid project utilities
-
-Session info:
-
-
-```
-## R version 3.5.1 (2018-07-02)
-## Platform: x86_64-apple-darwin15.6.0 (64-bit)
-## Running under: macOS High Sierra 10.13.6
-## 
-## Matrix products: default
-## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
-## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
-## 
-## locale:
-## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
-## 
-## attached base packages:
-## [1] stats     graphics  grDevices utils     datasets  methods   base     
-## 
-## other attached packages:
-## [1] kableExtra_0.9.0  SAVEr_0.0.1.9000  lubridate_1.7.4   readr_1.1.1      
-## [5] ggplot2_3.0.0     dplyr_0.7.6       data.table_1.11.4 GREENGrid_0.1.0  
-## [9] GREENGridData_1.0
-## 
-## loaded via a namespace (and not attached):
-##  [1] Rcpp_0.12.18      lattice_0.20-35   tidyr_0.8.1      
-##  [4] prettyunits_1.0.2 png_0.1-7         utf8_1.1.4       
-##  [7] assertthat_0.2.0  rprojroot_1.3-2   digest_0.6.15    
-## [10] R6_2.2.2          cellranger_1.1.0  plyr_1.8.4       
-## [13] backports_1.1.2   evaluate_0.11     highr_0.7        
-## [16] httr_1.3.1        pillar_1.3.0      RgoogleMaps_1.4.2
-## [19] rlang_0.2.2       progress_1.2.0    lazyeval_0.2.1   
-## [22] readxl_1.1.0      rstudioapi_0.7    geosphere_1.5-7  
-## [25] rmarkdown_1.10    labeling_0.3      proto_1.0.0      
-## [28] stringr_1.3.1     munsell_0.5.0     broom_0.5.0      
-## [31] compiler_3.5.1    modelr_0.1.2      xfun_0.3         
-## [34] pkgconfig_2.0.2   htmltools_0.3.6   openssl_1.0.2    
-## [37] tidyselect_0.2.4  tibble_1.4.2      bookdown_0.7     
-## [40] fansi_0.3.0       viridisLite_0.3.0 crayon_1.3.4     
-## [43] withr_2.1.2       grid_3.5.1        nlme_3.1-137     
-## [46] jsonlite_1.5      gtable_0.2.0      magrittr_1.5     
-## [49] scales_1.0.0      cli_1.0.0         stringi_1.2.4    
-## [52] mapproj_1.2.6     reshape2_1.4.3    bindrcpp_0.2.2   
-## [55] sp_1.3-1          tidyverse_1.2.1   xml2_1.2.0       
-## [58] rjson_0.2.20      tools_3.5.1       forcats_0.3.0    
-## [61] ggmap_2.6.1       glue_1.3.0        purrr_0.2.5      
-## [64] maps_3.3.0        hms_0.4.2         jpeg_0.1-8       
-## [67] yaml_2.2.0        colorspace_1.3-2  rvest_0.3.2      
-## [70] knitr_1.20.13     bindr_0.1.1       haven_1.1.2
-```
-
-# References
diff --git a/sizingDemandResponseTrialsNZ.tex b/sizingDemandResponseTrialsNZ.tex
deleted file mode 100644
index 3c9211f9d01b76cc409c521862c7b79663c30581..0000000000000000000000000000000000000000
--- a/sizingDemandResponseTrialsNZ.tex
+++ /dev/null
@@ -1,887 +0,0 @@
-\documentclass[]{article}
-\usepackage{lmodern}
-\usepackage{amssymb,amsmath}
-\usepackage{ifxetex,ifluatex}
-\usepackage{fixltx2e} % provides \textsubscript
-\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
-  \usepackage[T1]{fontenc}
-  \usepackage[utf8]{inputenc}
-\else % if luatex or xelatex
-  \ifxetex
-    \usepackage{mathspec}
-  \else
-    \usepackage{fontspec}
-  \fi
-  \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}
-\fi
-% use upquote if available, for straight quotes in verbatim environments
-\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
-% use microtype if available
-\IfFileExists{microtype.sty}{%
-\usepackage{microtype}
-\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
-}{}
-\usepackage[margin=1in]{geometry}
-\usepackage{hyperref}
-\hypersetup{unicode=true,
-            pdftitle={Statistical Power, Statistical Significance, Study Design and Decision Making: A Worked Example},
-            pdfauthor={Ben Anderson and Tom Rushby (Contact: b.anderson@soton.ac.uk, @dataknut)},
-            pdfborder={0 0 0},
-            breaklinks=true}
-\urlstyle{same}  % don't use monospace font for urls
-\usepackage{longtable,booktabs}
-\usepackage{graphicx,grffile}
-\makeatletter
-\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
-\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
-\makeatother
-% Scale images if necessary, so that they will not overflow the page
-% margins by default, and it is still possible to overwrite the defaults
-% using explicit options in \includegraphics[width, height, ...]{}
-\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
-\IfFileExists{parskip.sty}{%
-\usepackage{parskip}
-}{% else
-\setlength{\parindent}{0pt}
-\setlength{\parskip}{6pt plus 2pt minus 1pt}
-}
-\setlength{\emergencystretch}{3em}  % prevent overfull lines
-\providecommand{\tightlist}{%
-  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
-\setcounter{secnumdepth}{5}
-% Redefines (sub)paragraphs to behave more like sections
-\ifx\paragraph\undefined\else
-\let\oldparagraph\paragraph
-\renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}}
-\fi
-\ifx\subparagraph\undefined\else
-\let\oldsubparagraph\subparagraph
-\renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}}
-\fi
-
-%%% Use protect on footnotes to avoid problems with footnotes in titles
-\let\rmarkdownfootnote\footnote%
-\def\footnote{\protect\rmarkdownfootnote}
-
-%%% Change title format to be more compact
-\usepackage{titling}
-
-% Create subtitle command for use in maketitle
-\newcommand{\subtitle}[1]{
-  \posttitle{
-    \begin{center}\large#1\end{center}
-    }
-}
-
-\setlength{\droptitle}{-2em}
-
-  \title{Statistical Power, Statistical Significance, Study Design and Decision
-Making: A Worked Example}
-    \pretitle{\vspace{\droptitle}\centering\huge}
-  \posttitle{\par}
-  \subtitle{Sizing Demand Response Trials in New Zealand}
-  \author{Ben Anderson and Tom Rushby (Contact:
-\href{mailto:b.anderson@soton.ac.uk}{\nolinkurl{b.anderson@soton.ac.uk}},
-\texttt{@dataknut})}
-    \preauthor{\centering\large\emph}
-  \postauthor{\par}
-      \predate{\centering\large\emph}
-  \postdate{\par}
-    \date{Last run at: 2018-09-20 18:00:10}
-
-\usepackage{booktabs}
-\usepackage{longtable}
-\usepackage{array}
-\usepackage{multirow}
-\usepackage[table]{xcolor}
-\usepackage{wrapfig}
-\usepackage{float}
-\usepackage{colortbl}
-\usepackage{pdflscape}
-\usepackage{tabu}
-\usepackage{threeparttable}
-\usepackage{threeparttablex}
-\usepackage[normalem]{ulem}
-\usepackage{makecell}
-
-\usepackage{amsthm}
-\newtheorem{theorem}{Theorem}[section]
-\newtheorem{lemma}{Lemma}[section]
-\theoremstyle{definition}
-\newtheorem{definition}{Definition}[section]
-\newtheorem{corollary}{Corollary}[section]
-\newtheorem{proposition}{Proposition}[section]
-\theoremstyle{definition}
-\newtheorem{example}{Example}[section]
-\theoremstyle{definition}
-\newtheorem{exercise}{Exercise}[section]
-\theoremstyle{remark}
-\newtheorem*{remark}{Remark}
-\newtheorem*{solution}{Solution}
-\begin{document}
-\maketitle
-
-{
-\setcounter{tocdepth}{2}
-\tableofcontents
-}
-\newpage
-
-\section{About}\label{about}
-
-\subsection{Paper circulation:}\label{paper-circulation}
-
-\begin{itemize}
-\tightlist
-\item
-  Public
-\end{itemize}
-
-\subsection{License}\label{license}
-
-\subsection{Citation}\label{citation}
-
-If you wish to use any of the material from this report please cite as:
-
-\begin{itemize}
-\tightlist
-\item
-  Ben Anderson and Tom Rushby. (2018) Statistical Power, Statistical
-  Significance, Study Design and Decision Making: A Worked Example
-  (Sizing Demand Response Trials in New Zealand),
-  \href{http://www.otago.ac.nz/centre-sustainability/}{Centre for
-  Sustainability}, University of Otago: Dunedin, New Zealand.
-\end{itemize}
-
-This work is (c) 2018 the authors.
-
-\subsection{History}\label{history}
-
-Code history is generally tracked via our
-\href{https://github.com/CfSOtago/GREENGrid}{repo}:
-
-\begin{itemize}
-\tightlist
-\item
-  \href{https://github.com/CfSOtago/GREENGrid/commits/master/analysis/powerAnalysis}{Report
-  history}
-\end{itemize}
-
-\subsection{Data:}\label{data}
-
-This paper uses circuit level extracts for `Heat Pumps', `Lighting' and
-`Hot Water' for the NZ GREEN Grid Household Electricity Demand Data
-(\url{https://dx.doi.org/10.5255/UKDA-SN-853334} (Anderson et al.
-2018)). These have been extracted using the code found in
-
-\subsection{Support}\label{support}
-
-This work was supported by:
-
-\begin{itemize}
-\tightlist
-\item
-  The \href{https://www.otago.ac.nz/}{University of Otago};
-\item
-  The \href{https://www.southampton.ac.uk/}{University of Southampton};
-\item
-  The New Zealand \href{http://www.mbie.govt.nz/}{Ministry of Business,
-  Innovation and Employment (MBIE)} through the
-  \href{https://www.otago.ac.nz/centre-sustainability/research/energy/otago050285.html}{NZ
-  GREEN Grid} project;
-\item
-  \href{http://www.energy.soton.ac.uk/tag/spatialec/}{SPATIALEC} - a
-  \href{http://ec.europa.eu/research/mariecurieactions/about-msca/actions/if/index_en.htm}{Marie
-  Skłodowska-Curie Global Fellowship} based at the University of Otago's
-  \href{http://www.otago.ac.nz/centre-sustainability/staff/otago673896.html}{Centre
-  for Sustainability} (2017-2019) \& the University of Southampton's
-  Sustainable Energy Research Group (2019-2020).
-\end{itemize}
-
-We do not `support' the code but if you notice a problem please check
-the \href{https://github.com/CfSOtago/GREENGrid/issues}{issues} on our
-\href{https://github.com/CfSOtago/GREENGrid}{repo} and if it doesn't
-already exist, please open a new one.
-
-\newpage
-
-\section{Introduction}\label{introduction}
-
-In our experience of designing and running empirical studies, whether
-experimental or naturalistic, there is ongoing confusion over the
-meaning and role of two key statistical terms:
-
-\begin{itemize}
-\tightlist
-\item
-  statistical power
-\item
-  statistical significance
-\end{itemize}
-
-We have found this to be the case both in academic research where the
-objective is to establish `the most likely explanation' under academic
-conventions and in applied research where the objective is to `make a
-robust decision' based on the balance of evidence and probability.
-
-In this brief paper we respond to these confusions using a worked
-example: the design of a hypothetical household electricity demand
-response trial in New Zealand which seeks to shift the use of Heat Pumps
-out of the evening winter peak demand period. We use this example to
-explain and demonstrate the role of statistical significance in testing
-for differences and of both statistical significance and statistical
-power in sample design and decision making.
-
-\section{Error, power, significance and decision
-making}\label{error-power-significance-and-decision-making}
-
-Two types of error are of concern in both purely academic research where
-the efficacy of an intervention is to be tested and also in applied
-research where a decision may then be taken based on the results:
-
-\begin{itemize}
-\tightlist
-\item
-  Type I: a false positive - an effect is inferred when in fact there is
-  none. From a commercial or policy perspective this could lead to the
-  implementation of a costly intervention which would be unlikely to
-  have the effect expected;
-\item
-  Type II: a false negative - an effect is not inferred when in fact
-  there is one. From a commercial or policy perspective this could lead
-  to inaction when an intervention would have been likely to have the
-  effect expected.
-\end{itemize}
-
-\emph{Type I error}: The significance level (p value) of the statistical
-test to be used to test the efficacy of an intervention represents the
-extent to which the observed data matches the null model to be tested
-(Wasserstein and Lazar 2016). In most trials the null model will be a
-measure of `no difference' between control and intervention groups. By
-convention, the p value \emph{threshold} for rejecting the null model
-(the risk of a Type I error) is generally set to 0.05 (5\%) although
-this choice is entirely subjective. In commercial or policy terms an
-action taken on a larger p value (e.g.~setting the p value threshold to
-10\%) would increase the risk of making a Type I error and thus
-implementing a potentially costly intervention that is unlikely to have
-the effect desired. However, as we discuss in more detail below, this is
-not necessarily \emph{bad practice} as it may reflect the potential
-magnitude of an effect, the decision-maker's tolerance of Type I error
-risk and the urgency of action.
-
-\emph{Type II error}: Statistical power is normally set to 0.8 (80\%) by
-convention and represents the complement of the pre-study risk of making
-a Type II error: a power of 0.8 accepts a 20\% risk of a false negative
-(Greenland et al. 2016). From a commercial or policy perspective
-reducing power (e.g.~to 0.7 or 70\%) will therefore increase the risk of
-taking no action when in fact the intervention would probably have had
-the effect desired. Statistical power calculations enable the
-investigator to estimate the sample size that would be needed to
-robustly detect an experimental effect with a given risk of a false
-positive (Type I error) or false negative (Type II error) result. This
-prevents a study from recruiting too few participants to be able to
-robustly detect the hypothesised intervention effect (Delmas, Fischlein,
-and Asensio 2013) or wasting resources by recruiting a larger sample
-than needed.
-
-Previous work has suggested that sample sizes in most energy efficiency
-studies may be too low to provide adequate power and so statistically
-robust conclusions cannot be drawn at conventional thresholds (Frederiks
-et al. 2016) while a more recent review focusing on demand response
-studies reached a similar conclusion (Srivastava, Van Passel, and Laes
-2018). It is therefore hardly surprising that a number of studies report
-effect sizes which are not statistically significant at conventional
-thresholds (Srivastava, Van Passel, and Laes 2018), choose to use less
-conservative statistical significance thresholds (Rocky Mountain
-Institute 2006; AECOM 2011; CER 2012; Schofield et al. 2015) or relax
-both statistical power values \emph{and} statistical significance
-thresholds (UKPN 2017; UKPN 2018).
-
-However it would be wrong to conclude that this is \emph{necessarily}
-bad practice. Recent discussions of the role of p values in inference
-(Greenland et al. 2016; Wasserstein and Lazar 2016) remind us
-that decisions should never be based only on statistical significance
-thresholds set purely by convention. Rather, inference and thus decision
-making should be based on:
-
-\begin{itemize}
-\tightlist
-\item
-  the effect size - is it 2\% or 22\%? (i.e.~is the result
-  \emph{important} or \emph{useful}, ``What is the estimated \emph{bang
-  for buck}?'');
-\item
-  the confidence intervals - (i.e.~is there \emph{uncertainty} or
-  \emph{variation} in response, ``How uncertain is the estimated
-  bang?'');
-\item
-  the p value - (i.e.~what is the risk of a Type I error /
-  \emph{false positive}, ``What is the risk the bang observed isn't
-  real?'');
-\end{itemize}
-
-Only then can a contextually appropriate decision be taken as to whether
-the effect is large enough, certain enough and has a low enough risk of
-being a false positive result to warrant action.
-
-In the following sections we apply these principles to the design and
-analysis of a hypothetical New Zealand household electricity demand
-response trial and to the use of a simple statistical test of difference
-between trial groups to demonstrate and clarify these points.
-
-\section{Sample design: statistical
-power}\label{sample-design-statistical-power}
-
-To return to the discussion of statistical power, we need to establish
-the probable size of the control and intervention groups we will
-require. This is an aid to resource budgeting (\emph{``How many
-households and thus \texttt{\$} do I need?''}) and to ensure good study
-design practice (``\emph{Will I be able to answer my research
-question?}'') (Frederiks et al. 2016).
-
-Calculation of the required sample size for a control and intervention
-group requires the estimation of the probable intervention effect size,
-agreement on the significance level (p value threshold or Type I error
-risk) of the statistical test to be used and agreement on the level of
-statistical power (Type II error risk). Given any three of these values
-the fourth can be calculated if an estimate of the mean and standard
-deviation of the outcome to be measured is known. In the case of DSR
-interventions the effect size comprises a given \% reduction in energy
-demand or consumption in a given time period and estimates of the likely
-reduction can be derived from previous studies or data.
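-
-As an illustration of this `given any three, solve for the fourth'
-relationship, base R's \texttt{power.t.test()} will solve for whichever
-of sample size, effect size, significance level or power is left
-unspecified (the values below are hypothetical placeholders rather than
-the estimates used later in this paper):
-
-\begin{verbatim}
-power.t.test(delta = 0.15 * 160,  # hypothetical 15% cut in a 160W mean
-             sd = 320,            # hypothetical standard deviation
-             sig.level = 0.05,    # p value threshold (Type I error risk)
-             power = 0.8,         # 1 - Type II error risk
-             type = "two.sample") # solves for the per-group n
-\end{verbatim}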
-
-As we have noted the choice of significance level (p value threshold)
-and statistical power are subjective and normative. Most academic
-researchers will struggle to justify relaxing from the conventional p =
-0.05 and power = 0.8. However as we have discussed there may be good
-reason in applied research to take action on results of studies that use
-less conservative thresholds. Nevertheless there is a strong argument
-for designing such studies using the more conservative conventional
-levels but acknowledging that making inferences from the results may
-require a more relaxed approach to Type I or Type II error risks than is
-considered `normal' in academic research.
-
-\begin{verbatim}
-## Scale for 'y' is already present. Adding another scale for 'y', which
-## will replace the existing scale.
-\end{verbatim}
-
-\begin{figure}
-\centering
-\includegraphics{sizingDemandResponseTrialsNZ_files/figure-latex/ggHPSampleSizeFig80-1.pdf}
-\caption{\label{fig:ggHPSampleSizeFig80}Power analysis results (power =
-0.8)}
-\end{figure}
-
-\begin{verbatim}
-## Saving 6.5 x 4.5 in image
-\end{verbatim}
-
-As an illustration, Figure \ref{fig:ggHPSampleSizeFig80} shows sample size
-calculations for power = 0.8 (80\%) using `Heat Pump' electricity demand
-extracted from the publicly available New Zealand Green Grid household
-electricity demand data (Anderson et al. 2018) for winter 2015 for the
-peak demand period (16:00 - 20:00) on weekdays.
-
-These results show that a trial comprising a control and intervention
-sample of 1000 households (each) would be able to detect an effect size
-of 14.18\% with p = 0.01 and power = 0.8. Were a study to be less risk
-averse in its decision making then p = 0.1 may be acceptable in which
-case only \textasciitilde{} 450 households would be needed in each group
-(see Figure \ref{fig:ggHPSampleSizeFig80}) but the risk of a Type I error would
-increase.
-
-\begin{verbatim}
-## Scale for 'y' is already present. Adding another scale for 'y', which
-## will replace the existing scale.
-\end{verbatim}
-
-\begin{figure}
-\centering
-\includegraphics{sizingDemandResponseTrialsNZ_files/figure-latex/ggHPSampleSizeFig70-1.pdf}
-\caption{\label{fig:ggHPSampleSizeFig70}Power analysis results (power =
-0.7)}
-\end{figure}
-
-\begin{verbatim}
-## Saving 6.5 x 4.5 in image
-\end{verbatim}
-
-Were we to reduce the statistical power to 0.7 then we would obtain the
-results shown in Figure \ref{fig:ggHPSampleSizeFig70}. In this case a trial
-comprising a control and intervention sample of 1000 households (each)
-would be able to detect an effect size of 12.76\% with p = 0.01 and
-power = 0.7. Were a study to be less risk averse in its decision making
-then p = 0.1 may be acceptable in which case only \textasciitilde{} 425
-households would be needed in each group (see
-Figure \ref{fig:ggHPSampleSizeFig70}) but as before the risk of a Type I error
-would increase. Similarly, reducing the statistical power used would
-also reduce the sample required for a given effect size tested at a
-given p value. However, as before the risk of a Type II error would
-increase.
-
-\section{Testing for differences: effect sizes, confidence intervals and
-p
-values}\label{testing-for-differences-effect-sizes-confidence-intervals-and-p-values}
-
-\subsection{\texorpdfstring{Getting it
-`wrong'}{Getting it wrong}}\label{getting-it-wrong}
-
-Let us imagine that we have not designed and implemented our sample
-recruitment according to Figure \ref{fig:ggHPSampleSizeFig80} and instead
-decided, perhaps for cost reasons, to recruit \textasciitilde{} 30
-households per group. Now we wish to test for differences between the
-control and intervention groups.
-
-\begin{table}[t]
-
-\caption{\label{tab:smallNTable}Number of households and summary statistics per group}
-\centering
-\begin{tabular}{l|r|r|r}
-\hline
-group & mean W & sd W & n households\\
-\hline
-Control & 162.66915 & 325.51171 & 28\\
-\hline
-Intervention 1 & 35.13947 & 83.90258 & 22\\
-\hline
-Intervention 2 & 58.80597 & 113.53102 & 26\\
-\hline
-Intervention 3 & 68.37439 & 147.37279 & 29\\
-\hline
-\end{tabular}
-\end{table}
-
-\includegraphics{sizingDemandResponseTrialsNZ_files/figure-latex/ggMeanDiffs-1.pdf}
-
-As a first step we plot the differences using the mean and 95\%
-confidence intervals as shown in Figure \ref{fig:ggMeanDiffs}. As we can see
-the interventions appear to have reduced demand quite substantially and
-the error bars indicate the uncertainty (variation) around the mean
-within each group. Based on this, we suspect that we are unlikely to see
-low p values when we use statistical tests of the differences as the
-error bars overlap substantially.
-
-Suppose a t-test of the difference between the Control and Intervention
-1 group produces the result shown below.
-
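-A call of the following form (a sketch; the per-household means are
-assumed to be held in a data.table \texttt{testDT}, as the output itself
-suggests) would generate this kind of result:
-
-\begin{verbatim}
-t.test(testDT[group == "Intervention 1"]$meanW,
-       testDT[group == "Control"]$meanW)
-\end{verbatim}
-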
-\begin{verbatim}
-## 
-##  Welch Two Sample t-test
-## 
-## data:  testDT[group == "Intervention 1"]$meanW and testDT[group == "Control"]$meanW
-## t = -1.9907, df = 31.47, p-value = 0.05526
-## alternative hypothesis: true difference in means is not equal to 0
-## 95 percent confidence interval:
-##  -258.110005    3.050644
-## sample estimates:
-## mean of x mean of y 
-##  35.13947 162.66915
-\end{verbatim}
-
-The data show that the mean power demand for the control group was
-162.67W while for Intervention 1 it was 35.14W: a (very) large
-difference in means of 127.53W. The results of the t test are:
-
-\begin{itemize}
-\tightlist
-\item
-  effect size = 128W or 78\% representing a \emph{substantial bang for
-  buck} for whatever caused the difference;
-\item
-  95\% confidence interval for the test = -258.11 to 3.05 representing
-  \emph{considerable} uncertainty/variation;
-\item
-  p value of 0.055 representing a \emph{relatively low} risk of a false
-  positive result but which (just) fails the conventional p \textless{}
-  0.05 threshold.
-\end{itemize}
-
-What would we have concluded? We have a large effect size, substantial
-uncertainty and a slightly raised risk of a false positive or Type I
-error when compared to conventional p value levels. From a narrow and
-conventional `p value testing' perspective we would have concluded that
-there was no statistically significant difference between the groups.
-However this misses the crucial point that an organisation with a higher
-risk tolerance might conclude that the large effect size justifies
-implementing the intervention even though the risk of a false positive
-is slightly higher. If the p value had been 0.25 then this would have
-still been the case but would have warranted even greater caution.
-
-But what about Intervention Group 2? In this case the t.test results are
-slightly different:
-
-\begin{verbatim}
-## 
-##  Welch Two Sample t-test
-## 
-## data:  testDT[group == "Intervention 2"]$meanW and testDT[group == "Control"]$meanW
-## t = -1.5876, df = 33.909, p-value = 0.1217
-## alternative hypothesis: true difference in means is not equal to 0
-## 95 percent confidence interval:
-##  -236.82848   29.10212
-## sample estimates:
-## mean of x mean of y 
-##  58.80597 162.66915
-\end{verbatim}
-
-Now:
-
-\begin{itemize}
-\tightlist
-\item
-  effect size = 104W or 63.85\% representing a still \emph{reasonable
-  bang for buck} for whatever caused the difference;
-\item
-  95\% confidence interval for the test = -236.83 to 29.1 representing
-  \emph{even greater} uncertainty/variation;
-\item
-  p value of 0.122 representing a \emph{higher} risk of a false positive
-  result which fails the conventional p \textless{} 0.05 threshold and
-  also the less conservative p \textless{} 0.1.
-\end{itemize}
-
-As before, the subsequent action we take depends on our tolerance of
-Type I (false positive) risk. We still have a reasonably large effect
-size but we are less certain about it and we have a higher risk of it
-not being real. What do you think we should do?
-
-In both cases our decision-making is rather hampered by the small sample
-size even though we have extremely large effect sizes. As we can see
-from Figure \ref{fig:ggHPSampleSizeFig80}, to detect Intervention Group 2's
-effect size of 63.85\% would have required control and trial groups of
-just 47 households each.
-
-However, as the recent discussions of the role of the p value in
-decision making have made clear (Wasserstein and Lazar 2016) statistical
-analysis needs to report all of the result elements to enable
-contextually appropriate and defensible evidence-based decisions to be
-taken. Simply dismissing results on the basis of a failure to meet
-conventional statistical significance thresholds risks throwing out
-babies with bathwater\ldots{}
-
-\subsection{\texorpdfstring{Getting it
-`right'}{Getting it right}}\label{getting-it-right}
-
-Suppose instead that we had designed and implemented our sample
-recruitment according to Figure \ref{fig:ggHPSampleSizeFig80} so that we have a
-reasonable chance of detecting a difference of \textasciitilde{} 14\%
-with power = 0.8 and at a significance level (p) of 0.05. This means we
-should have a sample of around 4000 households split equally (and
-randomly) between our control group and three intervention groups.
-
-\begin{table}[t]
-
-\caption{\label{tab:creatLargeN}Number of households and summary statistics per group}
-\centering
-\begin{tabular}{l|r|r|r}
-\hline
-group & mean W & sd W & n households\\
-\hline
-Control & 139.28889 & 292.78141 & 1137\\
-\hline
-Intervention 1 & 36.47888 & 84.60407 & 835\\
-\hline
-Intervention 2 & 60.40882 & 113.37202 & 1054\\
-\hline
-Intervention 3 & 73.64247 & 147.39290 & 1174\\
-\hline
-\end{tabular}
-\end{table}
-
-\begin{figure}
-\centering
-\includegraphics{sizingDemandResponseTrialsNZ_files/figure-latex/largeNmeanDiffs-1.pdf}
-\caption{\label{fig:largeNmeanDiffs}Mean W demand per group for large sample
-(Error bars = 95\% confidence intervals for the sample mean)}
-\end{figure}
-
-In comparison to Figure \ref{fig:ggMeanDiffs} we can now see
-(Figure \ref{fig:largeNmeanDiffs}) that the 95\% confidence intervals for the
-group means are much narrower. This is almost entirely due to the larger
-sample sizes. Re-running our previous test for differences now produces:
-
-\begin{verbatim}
-## 
-##  Welch Two Sample t-test
-## 
-## data:  largeTestDT[group == "Intervention 2"]$meanW and largeTestDT[group == "Control"]$meanW
-## t = -8.4284, df = 1491.1, p-value < 2.2e-16
-## alternative hypothesis: true difference in means is not equal to 0
-## 95 percent confidence interval:
-##  -97.23786 -60.52228
-## sample estimates:
-## mean of x mean of y 
-##  60.40882 139.28889
-\end{verbatim}
-
-In this case:
-
-\begin{itemize}
-\tightlist
-\item
-  effect size = 78.88W or 56.63\% representing a still
-  \emph{reasonable bang for buck} for whatever caused the difference;
-\item
-  95\% confidence interval for the test = -97.24 to -60.52 representing
-  \emph{much less} uncertainty/variation;
-\item
-  p value \textless{} 2.2e-16 (effectively zero) representing a \emph{very low} risk of a false positive
-  result as it passes all conventional thresholds.
-\end{itemize}
-
-So now we are able to be much more confident in our decision to
-implement Intervention 2 since the average effect is reasonably large,
-the expected variation in the effect size is reasonably narrow and the
-risk of a Type I (false positive) error is extremely small.
-
-\section{Summary and recommendations}\label{summary-and-recomendations}
-
-\subsection{Statistical power and sample
-design}\label{statistical-power-and-sample-design}
-
-Get it right \emph{first time}: we should do the statistical power
-analysis before we start to make sure the study is even worth trying. If
-we don't have previous data to use, we \emph{justify} our choices
-through power analysis based on defensible assumptions.
-
-\subsection{Reporting statistical tests of difference
-(effects)}\label{reporting-statistical-tests-of-difference-effects}
-
-Report all three elements \emph{always}:
-
-\begin{itemize}
-\tightlist
-\item
-  average effect size
-\item
-  effect size confidence intervals
-\item
-  the p value (risk of Type I errors)
-\end{itemize}
-
-We should also report the statistical power used just to be clear on the
-risk of Type II errors.
-
-\subsection{Making inferences and taking
-decisions}\label{making-inferences-and-taking-decisions}
-
-Pay attention to all three elements \emph{always}:
-
-\begin{itemize}
-\tightlist
-\item
-  average effect size: what is the \emph{average bang for buck}?
-\item
-  effect size confidence intervals: \emph{how uncertain is the bang}?
-\item
-  the p value: \emph{what is the risk of a false positive}?
-\end{itemize}
-
-If we have ticked all the boxes so far then we have combined good study
-design based on statistical power analysis, with a nuanced understanding
-of what test statistic effect sizes, confidence intervals and p values
-can tell us. As a result we now have a robust, evidence-based,
-contextually meaningful and \emph{defensible} strategy.
-
-\section{Acknowledgements}\label{ackowledgements}
-
-We would like to thank collaborators and partners on a number of applied
-research projects for prodding us into thinking about these issues more
-deeply and clearly than we otherwise would have done. We hope this paper
-helps to bring some clarity.
-
-\section{Runtime}\label{runtime}
-
-Analysis completed in 67.65 seconds (1.13 minutes) using
-\href{https://cran.r-project.org/package=knitr}{knitr} in
-\href{http://www.rstudio.com}{RStudio} with R version 3.5.1 (2018-07-02)
-running on x86\_64-apple-darwin15.6.0.
-
-\section{R environment}\label{r-environment}
-
-R packages used:
-
-\begin{itemize}
-\tightlist
-\item
-  base R - for the basics (R Core Team 2016)
-\item
-  data.table - for fast (big) data handling (Dowle et al. 2015)
-\item
-  lubridate - date manipulation (Grolemund and Wickham 2011)
-\item
-  ggplot2 - for slick graphics (Wickham 2009)
-\item
-  readr - for csv reading/writing (Wickham, Hester, and Francois 2016)
-\item
-  dplyr - for select and contains (Wickham and Francois 2016)
-\item
-  progress - for progress bars (Csárdi and FitzJohn 2016)
-\item
-  kableExtra - to create this document \& neat tables (Xie 2016)
-\item
-  GREENGrid - for local NZ GREEN Grid project utilities
-\end{itemize}
-
-Session info:
-
-\begin{verbatim}
-## R version 3.5.1 (2018-07-02)
-## Platform: x86_64-apple-darwin15.6.0 (64-bit)
-## Running under: macOS High Sierra 10.13.6
-## 
-## Matrix products: default
-## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
-## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
-## 
-## locale:
-## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
-## 
-## attached base packages:
-## [1] stats     graphics  grDevices utils     datasets  methods   base     
-## 
-## other attached packages:
-## [1] kableExtra_0.9.0  SAVEr_0.0.1.9000  lubridate_1.7.4   readr_1.1.1      
-## [5] ggplot2_3.0.0     dplyr_0.7.6       data.table_1.11.4 GREENGrid_0.1.0  
-## [9] GREENGridData_1.0
-## 
-## loaded via a namespace (and not attached):
-##  [1] Rcpp_0.12.18      lattice_0.20-35   tidyr_0.8.1      
-##  [4] prettyunits_1.0.2 png_0.1-7         utf8_1.1.4       
-##  [7] assertthat_0.2.0  rprojroot_1.3-2   digest_0.6.15    
-## [10] R6_2.2.2          cellranger_1.1.0  plyr_1.8.4       
-## [13] backports_1.1.2   evaluate_0.11     highr_0.7        
-## [16] httr_1.3.1        pillar_1.3.0      RgoogleMaps_1.4.2
-## [19] rlang_0.2.2       progress_1.2.0    lazyeval_0.2.1   
-## [22] readxl_1.1.0      rstudioapi_0.7    geosphere_1.5-7  
-## [25] rmarkdown_1.10    labeling_0.3      proto_1.0.0      
-## [28] stringr_1.3.1     munsell_0.5.0     broom_0.5.0      
-## [31] compiler_3.5.1    modelr_0.1.2      xfun_0.3         
-## [34] pkgconfig_2.0.2   htmltools_0.3.6   openssl_1.0.2    
-## [37] tidyselect_0.2.4  tibble_1.4.2      bookdown_0.7     
-## [40] fansi_0.3.0       viridisLite_0.3.0 crayon_1.3.4     
-## [43] withr_2.1.2       grid_3.5.1        nlme_3.1-137     
-## [46] jsonlite_1.5      gtable_0.2.0      magrittr_1.5     
-## [49] scales_1.0.0      cli_1.0.0         stringi_1.2.4    
-## [52] mapproj_1.2.6     reshape2_1.4.3    bindrcpp_0.2.2   
-## [55] sp_1.3-1          tidyverse_1.2.1   xml2_1.2.0       
-## [58] rjson_0.2.20      tools_3.5.1       forcats_0.3.0    
-## [61] ggmap_2.6.1       glue_1.3.0        purrr_0.2.5      
-## [64] maps_3.3.0        hms_0.4.2         jpeg_0.1-8       
-## [67] yaml_2.2.0        colorspace_1.3-2  rvest_0.3.2      
-## [70] knitr_1.20.13     bindr_0.1.1       haven_1.1.2
-\end{verbatim}
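-
-The session details above can be regenerated at the end of any knitr run
-with base R's \texttt{sessionInfo()}:
-
-\begin{verbatim}
-# print R version, platform, locale and attached packages
-sessionInfo()
-\end{verbatim}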
-
-\section*{References}\label{references}
-\addcontentsline{toc}{section}{References}
-
-\hypertarget{refs}{}
-\hypertarget{ref-AECOM2011Energy}{}
-AECOM. 2011. ``Energy Demand Research Project: Final Analysis.'' St
-Albans: AECOM.
-
-\hypertarget{ref-anderson_new_2018}{}
-Anderson, Ben, David Eyers, Rebecca Ford, Diana Giraldo Ocampo, Rana
-Peniamina, Janet Stephenson, Kiti Suomalainen, Lara Wilcocks, and
-Michael Jack. 2018. ``New Zealand GREEN Grid Household Electricity
-Demand Study 2014-2018,'' September.
-doi:\href{https://doi.org/10.5255/UKDA-SN-853334}{10.5255/UKDA-SN-853334}.
-
-\hypertarget{ref-CER2012Smart}{}
-CER. 2012. ``Smart Meter Electricity Consumer Behaviour Trial data.''
-Dublin: Irish Social Science Data Archive.
-
-\hypertarget{ref-progress}{}
-Csárdi, Gábor, and Rich FitzJohn. 2016. \emph{progress: Terminal
-Progress Bars}. \url{https://CRAN.R-project.org/package=progress}.
-
-\hypertarget{ref-Delmas2013Information}{}
-Delmas, Magali A., Miriam Fischlein, and Omar I. Asensio. 2013.
-``Information strategies and energy conservation behavior: A
-meta-analysis of experimental studies from 1975 to 2012.'' \emph{Energy
-Policy} 61 (October): 729--39.
-doi:\href{https://doi.org/10.1016/j.enpol.2013.05.109}{10.1016/j.enpol.2013.05.109}.
-
-\hypertarget{ref-data.table}{}
-Dowle, M, A Srinivasan, T Short, and S Lianoglou, with contributions
-from R Saporta and E Antonyan. 2015. \emph{data.table: Extension of
-data.frame}. \url{https://CRAN.R-project.org/package=data.table}.
-
-\hypertarget{ref-Frederiks2016Evaluating}{}
-Frederiks, Elisha R., Karen Stenner, Elizabeth V. Hobman, and Mark
-Fischle. 2016. ``Evaluating energy behavior change programs using
-randomized controlled trials: Best practice guidelines for
-policymakers.'' \emph{Energy Research \& Social Science} 22 (December):
-147--64.
-doi:\href{https://doi.org/10.1016/j.erss.2016.08.020}{10.1016/j.erss.2016.08.020}.
-
-\hypertarget{ref-Greenland2016}{}
-Greenland, Sander, Stephen J. Senn, Kenneth J. Rothman, John B. Carlin,
-Charles Poole, Steven N. Goodman, and Douglas G. Altman. 2016.
-``Statistical Tests, P Values, Confidence Intervals, and Power: A Guide
-to Misinterpretations.'' \emph{European Journal of Epidemiology} 31 (4):
-337--50.
-doi:\href{https://doi.org/10.1007/s10654-016-0149-3}{10.1007/s10654-016-0149-3}.
-
-\hypertarget{ref-lubridate}{}
-Grolemund, Garrett, and Hadley Wickham. 2011. ``Dates and Times Made
-Easy with lubridate.'' \emph{Journal of Statistical Software} 40 (3):
-1--25. \url{http://www.jstatsoft.org/v40/i03/}.
-
-\hypertarget{ref-baseR}{}
-R Core Team. 2016. \emph{R: A Language and Environment for Statistical
-Computing}. Vienna, Austria: R Foundation for Statistical Computing.
-\url{https://www.R-project.org/}.
-
-\hypertarget{ref-RockyMountainInstitute2006Automated}{}
-Rocky Mountain Institute. 2006. ``Automated demand response system
-pilot: Final report.''
-\url{https://www.smartgrid.gov/files/Aumated_Demd_Response_System_Pilot_Volume_1_Intro_Exec_Summa.pdf}.
-
-\hypertarget{ref-Schofield2015Experimental}{}
-Schofield, James, Richard Carmichael, Simon Tindemans, Matt Woolf, Mark
-Bilton, and Goran Strbac. 2015. ``Experimental validation of residential
-consumer responsiveness to dynamic time-of-use pricing.'' In \emph{23rd
-International Conference on Electricity Distribution}.
-
-\hypertarget{ref-Srivastava2018Assessing}{}
-Srivastava, Aman, Steven Van Passel, and Erik Laes. 2018. ``Assessing
-the Success of Electricity Demand Response Programs: A Meta-Analysis.''
-\emph{Energy Research \& Social Science} 40 (June): 110--17.
-doi:\href{https://doi.org/10.1016/j.erss.2017.12.005}{10.1016/j.erss.2017.12.005}.
-
-\hypertarget{ref-energyWiseT1}{}
-UKPN. 2017. ``The Final Energy Saving Trial Report.'' London: UK Power
-Networks.
-\url{http://innovation.ukpowernetworks.co.uk/innovation/en/Projects/tier-2-projects/Energywise/}.
-
-\hypertarget{ref-energyWiseT2}{}
----------. 2018. ``The Energy Shifting Trial Report.'' London: UK Power
-Networks.
-\url{http://innovation.ukpowernetworks.co.uk/innovation/en/Projects/tier-2-projects/Energywise/}.
-
-\hypertarget{ref-wasserstein2016}{}
-Wasserstein, Ronald L., and Nicole A. Lazar. 2016. ``The ASA's Statement
-on P-Values: Context, Process, and Purpose.'' \emph{The American
-Statistician} 70 (2). Taylor \& Francis: 129--33.
-doi:\href{https://doi.org/10.1080/00031305.2016.1154108}{10.1080/00031305.2016.1154108}.
-
-\hypertarget{ref-ggplot2}{}
-Wickham, Hadley. 2009. \emph{ggplot2: Elegant Graphics for Data
-Analysis}. Springer-Verlag New York. \url{http://ggplot2.org}.
-
-\hypertarget{ref-dplyr}{}
-Wickham, Hadley, and Romain Francois. 2016. \emph{dplyr: A Grammar of
-Data Manipulation}. \url{https://CRAN.R-project.org/package=dplyr}.
-
-\hypertarget{ref-readr}{}
-Wickham, Hadley, Jim Hester, and Romain Francois. 2016. \emph{readr:
-Read Tabular Data}. \url{https://CRAN.R-project.org/package=readr}.
-
-\hypertarget{ref-knitr}{}
-Xie, Yihui. 2016. \emph{knitr: A General-Purpose Package for Dynamic
-Report Generation in R}. \url{https://CRAN.R-project.org/package=knitr}.
-
-
-\end{document}