Select Git revision
mapping-only.rst
-
James Graham authoredJames Graham authored
Week_10_practical_data_mining_solution.R 2.37 KiB
# Optional Lab - clustering households by electricity consumption
### Housekeeping ------------------------
# NOTE: the workspace is deliberately not wiped with rm(list = ls()).
# That call deletes whatever objects the user already had in their global
# environment when this script is sourced, but it does not detach packages
# or reset options, so it does not give a genuinely clean session.
# Restart R (fresh session) before running if you want a clean slate.
# Every object this script uses is created by the script itself.

# libraries
library(data.table)
# may need to install these first, e.g. install.packages("nFactors")
# (they are loaded for the lab; the visible part of this script does not
# call them directly)
library(nFactors)
library(psych)
library(corrgram)
library(scatterplot3d)
# just to check where R thinks we are
getwd()
# Data source: location of the pre-processed CSV file
ifile <- "http://www.soton.ac.uk/~ba1e12/CER_wHourlyMeanByIDOct09j.csv"
# Load pre-processed data ----
# According to the course notes this file holds the mean kWh electricity
# consumption per hour per household ID in October 2009, taken from a trial
# of smart meters and merged to a few household survey attributes.
# The CSV is read with read.csv() and then converted to a data.table.
meanKwhDT <- data.table::as.data.table(utils::read.csv(file = ifile))
# Quick look at what was loaded: number of rows/columns, then column names
dim(meanKwhDT)
names(meanKwhDT)
# Start cluster analysis here ----
# ref http://www.statmethods.net/advstats/cluster.html
# Create a scaled copy (scale() defaults: each column centred and divided by
# its standard deviation) of columns 1 to 26 of the loaded data.
# NOTE(review): the intent is "just the hourly values" - confirm against
# names(meanKwhDT) that columns 1:26 hold only hourly kWh means and no
# ID / row-index columns, which would otherwise take part in the clustering.
subsetDT <- subset(meanKwhDT, select = 1:26)
subsetDT_s <- scale(subsetDT)

# Step 1 - calculate sum of squares to determine number of clusters
# kmeans() starts from randomly chosen centres, so fix the seed; otherwise
# the curve below changes every time the script is run.
set.seed(1234)
max_k <- 15
subsetDT_s_wss <- c(
  # k = 1: total sum of squares about the column means
  (nrow(subsetDT_s) - 1) * sum(apply(subsetDT_s, 2, var)),
  # k = 2..max_k: total within-cluster sum of squares of each k-means fit
  # (tot.withinss is sum(withinss))
  vapply(
    2:max_k,
    function(k) kmeans(subsetDT_s, centers = k)$tot.withinss,
    numeric(1)
  )
)
# Look for the "elbow" where adding clusters stops reducing the within-group
# sum of squares by much
plot(seq_len(max_k), subsetDT_s_wss,
  type = "b", xlab = "Number of Clusters",
  ylab = "Within groups sum of squares",
  main = "Elec: n clusters vs within group sum of squares"
)
# K-Means Cluster Analysis
# kmeans() picks its starting centres at random: without a fixed seed the
# cluster numbering and membership (and so the exported cluster means) differ
# from run to run.
set.seed(1234)
n_clusters <- 10 # 10 cluster solution?
kmeanssubsetDT_s <- kmeans(subsetDT_s, centers = n_clusters)
# get cluster means - probably best to write out as CSV for visualisation?
# (printed here for a quick look)
aggregate(subsetDT_s, by = list(kmeanssubsetDT_s$cluster), FUN = mean)
# append cluster assignment
# to scaled data: subsetDT_s becomes a data.frame from here on; the new
# column is left unnamed so data.frame() names it from the expression
subsetDT_s <- data.frame(subsetDT_s, kmeanssubsetDT_s$cluster)
# to original data: adds a column called "cluster"; the result is a plain
# data.frame
meanKwhDT <- data.frame(meanKwhDT, cluster = kmeanssubsetDT_s$cluster)
# Results path - edit this for your own location
rpath <- "~/OneDrive - University of Southampton/PG/Southampton/FEEG6025 Data Analysis & Experimental Methods for Engineers/FEEG6025_github/results"
# Fail early with a readable message instead of write.csv()'s
# "cannot open file" error when the folder is missing
if (!dir.exists(rpath)) {
  stop(
    "Results folder does not exist: ", rpath,
    " - edit 'rpath' to point at an existing folder.",
    call. = FALSE
  )
}

# Per-cluster means of the scaled data
scaledClusterMeans <- aggregate(
  subsetDT_s,
  by = list(kmeanssubsetDT_s$cluster),
  FUN = mean
)
write.csv(
  scaledClusterMeans,
  file = file.path(rpath, "cer_clusterHourlyMeans_scaled.csv")
)

# Per-cluster means of the original data.
# mean() is only defined for numeric/logical input; for any other column
# (e.g. character or factor) it warns and returns NA, so those columns are
# left out rather than exported as columns of NA.
rawDF <- as.data.frame(meanKwhDT)
canAverage <- vapply(
  rawDF,
  function(col) is.numeric(col) || is.logical(col),
  logical(1)
)
rawClusterMeans <- aggregate(
  rawDF[, canAverage, drop = FALSE],
  by = list(kmeanssubsetDT_s$cluster),
  FUN = mean
)
write.csv(
  rawClusterMeans,
  file = file.path(rpath, "cer_clusterHourlyMeans_raw.csv")
)
# try a logit model to predict cluster membership based on household attributes
# you will need to recode the clusters to binary variables first!
# End