# Week_10_practical_data_mining_solution.R

# Optional Lab - clustering households by electricity consumption

### Housekeeping  ------------------------

# clear the workspace
# NOTE: this removes every object in the environment the script is run in,
# so anything already loaded in the session is lost when this line executes
rm(list=ls())

#libraries
library(data.table)

# may need to install these
library(nFactors)
library(psych)
library(corrgram)
library(scatterplot3d)

# report the directory R is currently working in
getwd()

# data: location of the pre-processed CSV file
ifile <- "http://www.soton.ac.uk/~ba1e12/CER_wHourlyMeanByIDOct09j.csv"

# Load pre-processed data ----
# This contains mean kWh electricity consumption per hour per household ID in October 2009
# from a trial of smart meters
# Merged to a few household survey attributes

# read with base read.csv() first, then convert the data.frame to a data.table
meanKwhDT <- read.csv(ifile)
meanKwhDT <- as.data.table(meanKwhDT)

# quick look at what was loaded: number of rows/columns and the column names
dim(meanKwhDT)
names(meanKwhDT)

# Start cluster analysis here ----
# ref http://www.statmethods.net/advstats/cluster.html

# Create scaled data table of just the hourly values
# (presumably a reminder of data.table column-selection syntax: DT[,.(V2,V3)])
# NOTE(review): columns are selected by position (the first 26) - confirm that
# these are only the hourly columns and do not include an ID/index column
subsetDT <- subset(meanKwhDT, select = seq_len(26))
# centre every column on its mean and divide by its standard deviation;
# scale() returns a numeric matrix
subsetDT_s <- scale(subsetDT, center = TRUE, scale = TRUE)

# Step 1 - calculate sum of squares to determine number of clusters
# For k = 1..maxClusters, record the total within-group sum of squares (WSS)
# and plot it against k; look for the "elbow" where adding clusters stops
# giving a large reduction in WSS.

# kmeans() starts from randomly chosen centres, so fix the seed to make the
# curve reproducible from run to run
set.seed(42)
maxClusters <- 15

# preallocate the result vector rather than growing it inside the loop
subsetDT_s_wss <- numeric(maxClusters)

# k = 1: WSS is the total sum of squares about the column means
subsetDT_s_wss[1] <- (nrow(subsetDT_s) - 1) * sum(apply(subsetDT_s, 2, var))

# k = 2..maxClusters: nstart > 1 keeps the best of several random starts,
# which reduces the chance of a poor local optimum distorting the curve
for (i in 2:maxClusters) {
  subsetDT_s_wss[i] <- kmeans(subsetDT_s, centers = i, nstart = 10)$tot.withinss
}

plot(seq_len(maxClusters), subsetDT_s_wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares",
     main = "Elec: n clusters vs within group sum of squares")

# K-Means Cluster Analysis
# kmeans() starts from randomly chosen centres: fix the seed so the cluster
# labels (and anything written out from them) are the same on every run, and
# use several random starts so the best of them is kept
set.seed(42)
kmeanssubsetDT_s <- kmeans(subsetDT_s, centers = 10, nstart = 10) # 10 cluster solution?
# get cluster means - probably best to write out as csv for visualisation?
aggregate(subsetDT_s, by = list(kmeanssubsetDT_s$cluster), FUN = mean)

# append cluster assignment
# to scaled data (the new column is auto-named kmeanssubsetDT_s.cluster;
# subsetDT_s becomes a data.frame from here on)
subsetDT_s <- data.frame(subsetDT_s, kmeanssubsetDT_s$cluster)
# to original data (new column is named cluster; the result is a data.frame)
meanKwhDT <- data.frame(meanKwhDT, cluster = kmeanssubsetDT_s$cluster)

# results path edit for your location
rpath <- "~/OneDrive - University of Southampton/PG/Southampton/FEEG6025 Data Analysis & Experimental Methods for Engineers/FEEG6025_github/results"

# grouping variable: the k-means cluster id of each row
# (unnamed list, so aggregate() labels the grouping column Group.1)
clusterIdList <- list(kmeanssubsetDT_s$cluster)

# per-cluster column means of the scaled data -> csv
scaledClusterMeans <- aggregate(subsetDT_s, by = clusterIdList, FUN = mean)
scaledOutFile <- paste0(rpath, "/cer_clusterHourlyMeans_scaled.csv")
write.csv(scaledClusterMeans, file = scaledOutFile)

# per-cluster column means of the original (unscaled) data -> csv
rawClusterMeans <- aggregate(meanKwhDT, by = clusterIdList, FUN = mean)
rawOutFile <- paste0(rpath, "/cer_clusterHourlyMeans_raw.csv")
write.csv(rawClusterMeans, file = rawOutFile)
# try a logit model to predict cluster membership based on household attributes
# you will need to recode the clusters to binary variables first!

# End