Commit 8a1fee1b, authored 9 years ago by Ben Anderson
Commit message: testing data analysis
Parent: 32d24efe

1 changed file: analyseCERdata.R (new file, mode 100644), 205 additions, 0 deletions
# Meta -----
# Analyse Irish CER Smart Meter Trial data
# See: http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
# code by: b.anderson@soton.ac.uk (@dataknut) with additions by lsb1@soton.ac.uk

# Purpose: ----
# introductory code for use in teaching using this dataset
# introduce large scale time series data
# introduce data tables
# introduce sub-setting
# introduce the zoo package for manipulating time series
# introduce autocorrelation

# Housekeeping ----
# clear the workspace
rm(list = ls())

# install a couple of packages that we will need
# if you have them already, you can comment out these lines with a "#"
#install.packages("data.table")
library(data.table)
#install.packages("zoo")
library(zoo)

print(paste0("Working directory is: ", getwd()))

# change the working directory to where you put the data
# setwd("<where you put the data>")
# In my case:
setwd("~/Documents/Work/Data/CER Smart Metering Project/data/processed")
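
# Added sketch (not in the original script): a quick check that the residential
# CSV files read in below are actually present in this directory.
list.files(pattern = "residential\\.csv$")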

# Load data & describe it ----
# Load the data
# loop over months and years
months <- c("October", "December")
years <- c("2009", "2010")

for (m in months) {
  for (y in years) {
    print(paste0("Reading in CER_", m, "_", y, "_residential.csv"))
    # need to set this table name according to values of m and y
    # why is this soooo hard in R? It must be a common need
    # (an alternative is sketched just after this loop)
    name <- paste0("resCER_", m, "_", y, "_DT")
    assign(name, as.data.table(read.csv(paste0("CER_", m, "_", y, "_residential.csv"))))
  }
}

# what's here?
tables()
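
# Added sketch, not part of the original flow: the same files could be collected
# in a named list instead of using assign(), which avoids building object names
# as strings. resCER_list and "source" are illustrative names only.
# resCER_list <- list()
# for (m in months) {
#   for (y in years) {
#     resCER_list[[paste0(m, "_", y)]] <-
#       as.data.table(read.csv(paste0("CER_", m, "_", y, "_residential.csv")))
#   }
# }
# rbindlist(resCER_list, idcol = "source") would then give one long table directly.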

# combine the data into one big table in long format
# more useful for summaries etc
l = list(resCER_October_2009_DT,
         resCER_December_2009_DT,
         resCER_October_2010_DT,
         resCER_December_2010_DT)
resCER_kWh_DT <- as.data.table(rbindlist(l))

# delete old tables to save memory
resCER_October_2009_DT <- NULL
resCER_December_2009_DT <- NULL
resCER_October_2010_DT <- NULL
resCER_December_2010_DT <- NULL
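
# Added sketch: an equivalent clean-up would be to remove the objects entirely
# and ask R to release the memory (names as above).
# rm(resCER_October_2009_DT, resCER_December_2009_DT,
#    resCER_October_2010_DT, resCER_December_2010_DT, l)
# gc()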

#L: We can create a date vector using R's date conversion functions
resCER_kWh_DT$l_datetime <- as.POSIXct(resCER_kWh_DT$datetime_start,
                                       tz = "", "%Y-%m-%d %H:%M:%S")

# now remove old datetime to save memory
resCER_kWh_DT$datetime_start <- NULL
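
# Added check (sketch): the conversion above should not have produced NAs and the
# first few values should look like half-hourly timestamps.
summary(is.na(resCER_kWh_DT$l_datetime))
head(resCER_kWh_DT$l_datetime)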

# extract:
# year
# month
# date
# hour of the day variable (24 hour format)
# relies on the POSIXct time having a regular structure so that the hour is ALWAYS character 12 & 13
resCER_kWh_DT$year  <- as.numeric(substr(resCER_kWh_DT$l_datetime, 1, 4))
resCER_kWh_DT$month <- as.numeric(substr(resCER_kWh_DT$l_datetime, 6, 7))
resCER_kWh_DT$date  <- as.POSIXct(substr(resCER_kWh_DT$l_datetime, 1, 10))
resCER_kWh_DT$hour  <- as.numeric(substr(resCER_kWh_DT$l_datetime, 12, 13))
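
# Added sketch: format() gives the same values without relying on fixed character
# positions, which is safer if the timestamp layout ever changes.
# resCER_kWh_DT$year  <- as.numeric(format(resCER_kWh_DT$l_datetime, "%Y"))
# resCER_kWh_DT$month <- as.numeric(format(resCER_kWh_DT$l_datetime, "%m"))
# resCER_kWh_DT$hour  <- as.numeric(format(resCER_kWh_DT$l_datetime, "%H"))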

# Plots ----
# . is a synonym for list
# we can quote functions and then eval them
toRun = quote(.(MeankWh = mean(kWh)))
hourlykWh <- resCER_kWh_DT[, eval(toRun), by = .(hour, month, year)]
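
# For reference (added sketch): the quoted-expression version above is equivalent
# to calling data.table directly with the same column names:
# hourlykWh <- resCER_kWh_DT[, .(MeankWh = mean(kWh)), by = .(hour, month, year)]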

# mean half-hourly kWh by hour of day, month & year
# use min/max to set the limits of the plot correctly
minkwh <- min(hourlykWh$MeankWh)
maxkwh <- max(hourlykWh$MeankWh)

plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 10],
     ylim = c(minkwh, maxkwh),
     col = "1",
     ylab = "Mean kWh",
     lab = c(12, 8, 2),
     xlab = "Hour",
     type = "l")
# familiar daily profile?
par(new = TRUE)
plot(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 10],
     ylim = c(minkwh, maxkwh),
     col = "2",
     lab = c(12, 8, 2),
     xlab = "",
     ylab = "",
     type = "l")
par(new = TRUE)
plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 12],
     ylim = c(minkwh, maxkwh),
     col = "3",
     lab = c(12, 8, 2),
     xlab = "",
     ylab = "",
     type = "l")
par(new = TRUE)
plot(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 12],
     ylim = c(minkwh, maxkwh),
     col = "4",
     lab = c(12, 8, 2),
     xlab = "",
     ylab = "",
     type = "l")

legend("topleft",
       col = c(1:4),
       lty = 1,
       legend = c("October 2009", "October 2010", "December 2009", "December 2010"))
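
# Added sketch: the same figure can be built with lines(), which draws onto the
# existing plot and avoids the repeated par(new = TRUE) calls (colours as above).
# plot(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 10],
#      ylim = c(minkwh, maxkwh), col = 1, type = "l",
#      xlab = "Hour", ylab = "Mean kWh", lab = c(12, 8, 2))
# lines(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 10], col = 2)
# lines(hourlykWh$MeankWh[hourlykWh$year == 2009 & hourlykWh$month == 12], col = 3)
# lines(hourlykWh$MeankWh[hourlykWh$year == 2010 & hourlykWh$month == 12], col = 4)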

# calculate mean kWh consumption for evening peak 16:00 - 19:00
# October 2009 (pre trial)
base_Oct2009kWhDT <- as.data.table(resCER_kWh_DT[resCER_kWh_DT$hour > 15 &
                                                   resCER_kWh_DT$hour < 20 &
                                                   resCER_kWh_DT$year == 2009 &
                                                   resCER_kWh_DT$month == 10])
m <- mean(base_Oct2009kWhDT$kWh)

# how skewed is that?
summary(base_Oct2009kWhDT$kWh)
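
# Added sketch: a histogram makes the skew visible directly (same column as above).
hist(base_Oct2009kWhDT$kWh, main = "Evening peak half-hourly kWh, October 2009", xlab = "kWh")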

# sd
s <- sd(base_Oct2009kWhDT$kWh)

# sample size calculation
# see ?power.t.test
# let us assume we see a 10% change in the mean due to higher evening prices
# we want to test for a reduction (1 sided)
# for now assume this is a two sample test
power.t.test(delta = 0.1 * m,
             sd = s,
             power = 0.8,
             type = "two.sample",
             alternative = "one.sided")
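
# Added sketch: storing the result lets us pull out the implied sample size;
# pt_result is an illustrative name only.
pt_result <- power.t.test(delta = 0.1 * m, sd = s, power = 0.8,
                          type = "two.sample", alternative = "one.sided")
ceiling(pt_result$n)  # approximate number of observations needed per group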

# Time series analysis ----
# create a subset for the first household only
hh_1002 <- subset(resCER_kWh_DT, resCER_kWh_DT$ID == 1002)

# select just October - you'll see why in a minute
##hh_1024oct <- subset(hh_1024, hh_1024$oct == 1)
#L: we can select data from the dataframe by date range
date_start <- as.POSIXct("2009-10-01", tz = "")
date_end <- as.POSIXct("2009-10-31", tz = "")
hh_1002oct <- hh_1002[hh_1002$l_datetime %in% date_start:date_end, ]
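
# Added sketch: a direct range comparison gives the same subset without building
# the very long one-second sequence that date_start:date_end expands to.
# hh_1002oct <- hh_1002[hh_1002$l_datetime >= date_start &
#                         hh_1002$l_datetime <= date_end, ]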

# need to check it is sorted by datetime (always increasing) and evenly spaced
# create a zoo (time series) object to do this
hh_1002oct_z = zoo(hh_1002oct, order.by = hh_1002oct$l_datetime)

# is it a regular time series? (using a zoo function)
is.regular(hh_1002oct_z)
# if it wasn't, how could we get round this problem?
# hint: look at na.approx() and approx()
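# Added sketch for the irregular case (hypothetical here, kWh column only):
# merge onto a regular half-hourly index and interpolate the gaps with na.approx().
# kwh_z <- zoo(hh_1002oct$kWh, order.by = hh_1002oct$l_datetime)
# reg_index <- seq(min(index(kwh_z)), max(index(kwh_z)), by = 30 * 60)
# kwh_z_reg <- na.approx(merge(kwh_z, zoo(, reg_index)))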

# plot data
plot(hh_1002oct$kWh)

# run acf with the first household only up to just over 48 hours (96 half hours)
acf(hh_1002oct$kWh, lag.max = 100)
# what do we conclude from that?

# let's find the *partial autocorrelation function* (pacf)
# this is the effect of successive lags with the effect of previous lags removed
# It shows more clearly how the random variation depends on the previous lags
# see https://www.youtube.com/watch?v=R-oWTWdS1Jg
pacf(hh_1002oct$kWh, lag.max = 100)
# how many lags are significant in this case?
# what might happen if we excluded sleep time (00:00 - 06:00)?
# hint:
# hh_1002oct$my_hod <- (as.double(hh_1002oct$l_datetime) %% 86400) / 3600
# ...gives decimal hour of day
# then we can use logical indexing to remove this data:
# (hh_1002oct$my_hod > 6)
# Should we also separate weekdays from weekends? Why?
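
# Added sketch applying the hint above (exercise, not part of the original analysis):
# drop the 00:00 - 06:00 readings before re-running pacf, and build a weekend flag.
# Note the filtered series is no longer contiguous, so interpret the result with care.
# hh_1002oct$my_hod <- (as.double(hh_1002oct$l_datetime) %% 86400) / 3600
# pacf(hh_1002oct$kWh[hh_1002oct$my_hod > 6], lag.max = 100)
# hh_1002oct$weekend <- weekdays(hh_1002oct$l_datetime) %in% c("Saturday", "Sunday")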

# what kind of household is this?
table(hh_1002oct$ba_nadults)   # how many adults?
table(hh_1002oct$ba_nchildren) # how many children?
table(hh_1002oct$ba_empl)      # Employed?