Skip to content
Snippets Groups Projects
Commit 39784117 authored by Ben Anderson's avatar Ben Anderson
Browse files

amended r code

parent ae657ff5
No related branches found
No related tags found
No related merge requests found
### Title: myFirstRScript ----
# (c) 2018 Ben Anderson (@dataknut)
# Original code:
# Original code: https://git.soton.ac.uk/ba1e12/intro2R/blob/master/myFirstRScript.R
# Re-use: https://git.soton.ac.uk/ba1e12/intro2R/blob/master/LICENSE (MIT License)
# Helpful documentation:
# https://www.rdocumentation.org/
......@@ -10,11 +11,18 @@
# http://r4ds.had.co.nz/
# Preliminaries ----
# Print out the date - always useful to know
Sys.Date()
# Print out the time - also useful to know
Sys.time()
# R is a programing language (do not be afraid!)
# It has a lot of built-in commands/functions
# Print out the date & time - always useful to know
Sys.time() # type the first few characters of this function in the console - what happens?
# Take some time to explore the RStudio UI ----
# > Console ----
# > Files/Packages/Help (very useful!) ----
# > Script editor ----
# > Environment ----
# Play with some numbers ----
# Do some simple maths
......@@ -35,10 +43,15 @@ sqrt(result)
# Before we create a data table we have to load the data.table library as it is not a core part of R ('base')
library(data.table)
# if that fails we need to use:
# install.packages("data.table") # this will install the package from the default package archive (probably CRAN)
# OK, now we're ready
# Create a new data table with one column called 'index' and n rows defined by.... nRows
# First define the number of rows
nRows <- 100
dt <- data.table(index = rep(1:nRows))
# Now create the data table which we'll call 'dt' (we could call it (almost) anything)
dt <- data.table(index = rep(1:nRows)) # say hello to the really useful rep() function
# check what it looks like
dt
......@@ -65,28 +78,67 @@ summary(dt)
# But we'd quite like to visualise the data - draw a simple plot
plot(dt)
# Or maybe we'd like to test a correlation
cor.test(dt$index, dt$sqrtIndex)
# Did you get bored typing dt twice?
with(dt, cor.test(index, sqrtIndex)) # use with() to tell R which object (usually a data.table) you want to work with. In this case it made for more typing but if your object name is long or you need to type a lot of variables it will save on finger-wear
# Now let's try a very simple linear regression model and catch the results in a model result object
# Bonus points for telling me why a linear model is the _wrong_ choice!
linearModelResult <- lm(index ~ sqrtIndex, dt)
linearModelResult <- lm(index ~ sqrtIndex, dt) # Note the lm function forces us to specify the data to use
summary(linearModelResult) # note we use 'summary' again but it knows what to with a model result object
# Now the fun part: loading data ----
# Play with ggplot ----
# > Load an internal R dataset ----
mtcarsDT <- data.table(mtcars)
mtcarsDT <- data.table(mtcars) # Note I use 'DT' to remind me this object is a data table
# to find out which other datasets come with R run data()
head(mtcarsDT)
# Create a simple (!) visualisation of all the variables
pairs(mtcarsDT)
# > Load one from a local file ----
# So far we've used R's base plot() function.
# This is OK but ggplot is a package that creates very fancy graphs - http://ggplot2.tidyverse.org/reference/
# Load ggplot
library(ggplot2)
# if that fails we need to use:
# install.packages("ggplot2") # this will install the package from the default package archive (probably CRAN)
# > Now let's create a basic ggplot object
myPlot <- ggplot(mtcarsDT)
# test it
myPlot # what happens?
# We need to tell ggplot what the axes are
myPlot <- ggplot(mtcarsDT, aes(x = disp, y = mpg))
myPlot # what happens?
# We need to tell ggplot what to plot
myPlot <- ggplot(mtcarsDT, aes(x = disp, y = mpg)) +
geom_point() # a point plot - here are lots more http://ggplot2.tidyverse.org/reference/index.html#section-layer-geoms
myPlot
# We want capitals on our axis labels and a caption. I want to use them again and again so we use variables to store them
myCaption <- paste0("R mtcars data: n obs = ", nrow(mtcarsDT)) # Note the use of paste0() to construct a caption from text and R
myXlabel <- "Displacement"
myYlabel <- "Miles per gallon"
# Now add the axes labels & caption
myPlot <- myPlot + labs(x = myXlabel, y = myYlabel, caption = myCaption) # Note how we just add to myPlot
# see http://ggplot2.tidyverse.org/index.html for other things we can do
myPlot
# Let's have some colour
myPlot <- myPlot + geom_point(colour = "blue") # see how we're building up layers?
myPlot
# Let's use colour to tell us something
myPlot <- myPlot + geom_point(aes(colour = gear))
myPlot
# Now we're happy we can do all that in one call and add an outlier label:
# Lots more examples at http://r4ds.had.co.nz/data-visualisation.html
ggplot(mtcarsDT, aes(x = disp, y = mpg)) +
geom_point(aes(colour = gear)) +
labs(x = myXlabel, y = myYlabel, caption = myCaption) +
geom_label(x = 250, y = 23, label = "Outlier?")
# > Load a dataset from a local file ----
# This will only work if you have this folder & file in the same place as the R script
pressureLocalDT <- fread("data/pressure.csv") # fread comes with data.table, is VERY fast & automatically creates a data.table
summary(pressureLocalDT)
......@@ -94,10 +146,58 @@ plot(pressureLocalDT)
# > Load one from the internet ----
# This will only work if you have internet access!
pressureNetDT <- fread("https://git.soton.ac.uk/ba1e12/intro2R/raw/master/data/pressure.csv") # fread comes with data.table, is VERY fast & automatically creates a data.table
summary(pressureNetDT)
generationDTorig <- fread("https://www.emi.ea.govt.nz/Wholesale/Datasets/Generation/Generation_MD/201712_Generation_MD.csv")
# reshape the data as it comes in a rather unhelpful form
generationDT <- melt(generationDTorig,
id.vars=c("Site_Code","POC_Code","Nwk_Code", "Gen_Code", "Fuel_Code", "Tech_Code","Trading_date"),
variable.name = "Time_Period", # converts TP1-48
value.name = "MW" # Megawatts
)
generationDT <- generationDT[, rDate := as.Date(Trading_date)] # fix the dates so R knows what they are
# You will notice that these were .csv files.
# R likes .csv
# But if you must you can import from just about anything else - SAS, STATA, Matlab, SPSS, Excel (I recommend https://cran.r-project.org/web/packages/readxl/index.html)
# https://cran.r-project.org/doc/manuals/r-release/R-data.html
\ No newline at end of file
# https://cran.r-project.org/doc/manuals/r-release/R-data.html
# Power generation ----
# > Have a look at the data ----
summary(generationDT)
# create an aggregated tabe summing generation by fuel, date and time period. This is what data.table is really good at
plotDT <- generationDT[,
.(totalMW = sum(MW),
nObs = .N),
keyby = .(Time_Period, rDate, Fuel_Code)]
# > Use the new data to draw a chart of all generation in the data ----
myCaption <- "Source: January 2018 wholesale generation data via EMI (NZ Electricity Authority)"
ggplot(plotDT,
aes(x = Time_Period, y = totalMW, colour = as.factor(rDate), group = rDate)) +
geom_point() +
facet_grid(Fuel_Code ~ .) +
labs(caption = myCaption)
# > Use the new data to draw a chart of all generation during Christmas week ----
myCaption <- paste0(myCaption, "\nChristmas week only")
ggplot(plotDT[rDate > "2017-12-20" & rDate < "2017-12-28"],
aes(x = Time_Period, y = totalMW, colour = as.factor(rDate), group = rDate)) +
geom_point() +
facet_grid(Fuel_Code ~ .) +
labs(caption = myCaption)
# > hydro as a % of all ----
plotDT <- plotDT[, sumAllMW := sum(totalMW), keyby = .(Time_Period, rDate)] # calculate total
plotDT <- plotDT[, pcTotalMW := 100*(totalMW/sumAllMW)] # calculate %
# > Use the new data to draw a chart of hydro as a % of all during Christmas week ----
ggplot(plotDT[rDate > "2017-12-20" & rDate < "2017-12-28" & Fuel_Code == "Hydro"],
aes(x = Time_Period, y = pcTotalMW, colour = as.factor(rDate), group = rDate)) +
geom_line()
# Now what about RMarkdown?
# File -> New -> RMarkdown - what do you see?
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment