diff --git a/myFirstRScript.R b/myFirstRScript.R index 3422f02f796a0e8b27a1f350a914ff753be64857..4e4dac6062b142cdaede5fb9c681480f4e6f21f3 100644 --- a/myFirstRScript.R +++ b/myFirstRScript.R @@ -1,6 +1,7 @@ ### Title: myFirstRScript ---- # (c) 2018 Ben Anderson (@dataknut) -# Original code: +# Original code: https://git.soton.ac.uk/ba1e12/intro2R/blob/master/myFirstRScript.R +#Â Re-use: https://git.soton.ac.uk/ba1e12/intro2R/blob/master/LICENSE (MIT License) # Helpful documentation: # https://www.rdocumentation.org/ @@ -10,11 +11,18 @@ # http://r4ds.had.co.nz/ # Preliminaries ---- -# Print out the date - always useful to know -Sys.Date() -# Print out the time - also useful to know -Sys.time() +# R is a programing language (do not be afraid!) +# It has a lot of built-in commands/functions + +# Print out the date & time - always useful to know +Sys.time() # type the first few characters of this function in the console - what happens? + +# Take some time to explore the RStudio UI ---- +# > Console ---- +# > Files/Packages/Help (very useful!) ---- +# > Script editor ---- +# > Environment ---- # Play with some numbers ---- # Do some simple maths @@ -35,10 +43,15 @@ sqrt(result) # Before we create a data table we have to load the data.table library as it is not a core part of R ('base') library(data.table) +# if that fails we need to use: +# install.packages("data.table") # this will install the package from the default package archive (probably CRAN) + # OK, now we're ready # Create a new data table with one column called 'index' and n rows defined by.... nRows +# First define the number of rows nRows <- 100 -dt <- data.table(index = rep(1:nRows)) +# Now create the data table which we'll call 'dt' (we could call it (almost) anything) +dt <- data.table(index = rep(1:nRows)) # say hello to the really useful rep() function # check what it looks like dt @@ -65,28 +78,67 @@ summary(dt) # But we'd quite like to visualise the data - draw a simple plot plot(dt) -# Or maybe we'd like to test a correlation -cor.test(dt$index, dt$sqrtIndex) - -# Did you get bored typing dt twice? -with(dt, cor.test(index, sqrtIndex)) # use with() to tell R which object (usually a data.table) you want to work with. In this case it made for more typing but if your object name is long or you need to type a lot of variables it will save on finger-wear - # Now let's try a very simple linear regression model and catch the results in a model result object # Bonus points for telling me why a linear model is the _wrong_ choice! -linearModelResult <- lm(index ~ sqrtIndex, dt) +linearModelResult <- lm(index ~ sqrtIndex, dt) # Note the lm function forces us to specify the data to use summary(linearModelResult) # note we use 'summary' again but it knows what to with a model result object -# Now the fun part: loading data ---- +# Play with ggplot ---- # > Load an internal R dataset ---- -mtcarsDT <- data.table(mtcars) - +mtcarsDT <- data.table(mtcars) # Note I use 'DT' to remind me this object is a data table +# to find out which other datasets come with R run data() head(mtcarsDT) -# Create a simple (!) visualisation of all the variables -pairs(mtcarsDT) - -# > Load one from a local file ---- +# So far we've used R's base plot() function. +# This is OK but ggplot is a package that creates very fancy graphs - http://ggplot2.tidyverse.org/reference/ + +# Load ggplot +library(ggplot2) + +# if that fails we need to use: +# install.packages("ggplot2") # this will install the package from the default package archive (probably CRAN) + +#Â > Now let's create a basic ggplot object +myPlot <- ggplot(mtcarsDT) +#Â test it +myPlot # what happens? + +# We need to tell ggplot what the axes are +myPlot <- ggplot(mtcarsDT, aes(x = disp, y = mpg)) +myPlot # what happens? + +# We need to tell ggplot what to plot +myPlot <- ggplot(mtcarsDT, aes(x = disp, y = mpg)) + + geom_point() # a point plot - here are lots more http://ggplot2.tidyverse.org/reference/index.html#section-layer-geoms +myPlot + +# We want capitals on our axis labels and a caption. I want to use them again and again so we use variables to store them +myCaption <- paste0("R mtcars data: n obs = ", nrow(mtcarsDT)) #Â Note the use of paste0() to construct a caption from text and R +myXlabel <- "Displacement" +myYlabel <- "Miles per gallon" + +# Now add the axes labels & caption +myPlot <- myPlot + labs(x = myXlabel, y = myYlabel, caption = myCaption) # Note how we just add to myPlot +# see http://ggplot2.tidyverse.org/index.html for other things we can do +myPlot + +# Let's have some colour +myPlot <- myPlot + geom_point(colour = "blue") # see how we're building up layers? +myPlot + +# Let's use colour to tell us something +myPlot <- myPlot + geom_point(aes(colour = gear)) +myPlot + +#Â Now we're happy we can do all that in one call and add an outlier label: +# Lots more examples at http://r4ds.had.co.nz/data-visualisation.html +ggplot(mtcarsDT, aes(x = disp, y = mpg)) + + geom_point(aes(colour = gear)) + + labs(x = myXlabel, y = myYlabel, caption = myCaption) + + geom_label(x = 250, y = 23, label = "Outlier?") + +# > Load a dataset from a local file ---- # This will only work if you have this folder & file in the same place as the R script pressureLocalDT <- fread("data/pressure.csv") # fread comes with data.table, is VERY fast & automatically creates a data.table summary(pressureLocalDT) @@ -94,10 +146,58 @@ plot(pressureLocalDT) # > Load one from the internet ---- # This will only work if you have internet access! -pressureNetDT <- fread("https://git.soton.ac.uk/ba1e12/intro2R/raw/master/data/pressure.csv") # fread comes with data.table, is VERY fast & automatically creates a data.table -summary(pressureNetDT) +generationDTorig <- fread("https://www.emi.ea.govt.nz/Wholesale/Datasets/Generation/Generation_MD/201712_Generation_MD.csv") + +# reshape the data as it comes in a rather unhelpful form +generationDT <- melt(generationDTorig, + id.vars=c("Site_Code","POC_Code","Nwk_Code", "Gen_Code", "Fuel_Code", "Tech_Code","Trading_date"), + variable.name = "Time_Period", # converts TP1-48 + value.name = "MW" # Megawatts +) + +generationDT <- generationDT[, rDate := as.Date(Trading_date)] # fix the dates so R knows what they are # You will notice that these were .csv files. # R likes .csv # But if you must you can import from just about anything else - SAS, STATA, Matlab, SPSS, Excel (I recommend https://cran.r-project.org/web/packages/readxl/index.html) -# https://cran.r-project.org/doc/manuals/r-release/R-data.html \ No newline at end of file +# https://cran.r-project.org/doc/manuals/r-release/R-data.html + +# Power generation ---- +# > Have a look at the data ---- +summary(generationDT) + +#Â create an aggregated tabe summing generation by fuel, date and time period. This is what data.table is really good at +plotDT <- generationDT[, + .(totalMW = sum(MW), + nObs = .N), + keyby = .(Time_Period, rDate, Fuel_Code)] + +# > Use the new data to draw a chart of all generation in the data ---- +myCaption <- "Source: January 2018 wholesale generation data via EMI (NZ Electricity Authority)" +ggplot(plotDT, + aes(x = Time_Period, y = totalMW, colour = as.factor(rDate), group = rDate)) + + geom_point() + + facet_grid(Fuel_Code ~ .) + + labs(caption = myCaption) + +# > Use the new data to draw a chart of all generation during Christmas week ---- +myCaption <- paste0(myCaption, "\nChristmas week only") +ggplot(plotDT[rDate > "2017-12-20" & rDate < "2017-12-28"], + aes(x = Time_Period, y = totalMW, colour = as.factor(rDate), group = rDate)) + + geom_point() + + facet_grid(Fuel_Code ~ .) + + labs(caption = myCaption) + +# > hydro as a % of all ---- +plotDT <- plotDT[, sumAllMW := sum(totalMW), keyby = .(Time_Period, rDate)] # calculate total +plotDT <- plotDT[, pcTotalMW := 100*(totalMW/sumAllMW)] # calculate % + +# > Use the new data to draw a chart of hydro as a % of all during Christmas week ---- +ggplot(plotDT[rDate > "2017-12-20" & rDate < "2017-12-28" & Fuel_Code == "Hydro"], + aes(x = Time_Period, y = pcTotalMW, colour = as.factor(rDate), group = rDate)) + + geom_line() + + +# Now what about RMarkdown? + +# File -> New -> RMarkdown - what do you see?