Skip to content
Snippets Groups Projects
Select Git revision
  • 84384d1cad32d2f97b5787e762bf89fdb4b924b7
  • master default
2 results

EDF.java

Blame
  • Week_10_practical_data_mining_solution.R 2.37 KiB
    # Optional Lab - clustering households by electricity consumption
    
    ### Housekeeping  ------------------------
    
    # Remove every object from the current workspace before the lab starts.
    # Caution: this also deletes any unsaved objects you created earlier in the
    # session; it does not unload packages or reset options.
    rm(list = ls())
    
    #libraries
    library(data.table)
    
    # may need to install these
    library(nFactors)
    library(psych)
    library(corrgram)
    library(scatterplot3d)
    
    # print the current working directory
    getwd()

    # location of the input data (read straight from the web)
    ifile <- "http://www.soton.ac.uk/~ba1e12/CER_wHourlyMeanByIDOct09j.csv"

    # Load pre-processed data ----
    # Contents, as described for this lab: mean kWh electricity consumption
    # per hour per household ID in October 2009, from a trial of smart meters,
    # merged to a few household survey attributes.
    # read.csv() parses the file; the result is converted to a data.table.
    meanKwhDT <- as.data.table(
      read.csv(file = ifile)
    )

    # quick checks on what was loaded: dimensions, then column names
    dim(meanKwhDT)
    names(meanKwhDT)
    
    # Start cluster analysis here ----
    # ref http://www.statmethods.net/advstats/cluster.html
    
    # Build the clustering input: keep the first 26 columns of the loaded
    # table, then centre and scale each column.
    # NOTE(review): 26 columns is wider than 24 hourly values - confirm against
    # names(meanKwhDT) that no ID/index column is being clustered on.
    subsetDT <- subset(meanKwhDT, select = 1:26)
    subsetDT_s <- scale(subsetDT, center = TRUE, scale = TRUE)
    
    # Step 1 - elbow plot: within-group sum of squares for k = 1..max_clusters
    # kmeans() draws its starting centres at random, so fix the seed; without
    # it the curve (and any k read off it) changes from run to run.
    set.seed(6025)
    max_clusters <- 15
    # k = 1 needs no fit: the within-group SS is the total SS of the data
    subsetDT_s_wss <- (nrow(subsetDT_s) - 1) * sum(apply(subsetDT_s, 2, var))
    # k >= 2: one fit per k, keeping the total within-cluster SS
    # (tot.withinss is sum(withinss)). vapply() builds the vector in one go
    # instead of growing it element by element inside a for loop.
    subsetDT_s_wss <- c(
      subsetDT_s_wss,
      vapply(
        seq_len(max_clusters)[-1],
        function(k) kmeans(subsetDT_s, centers = k)$tot.withinss,
        numeric(1)
      )
    )
    plot(seq_len(max_clusters), subsetDT_s_wss, type = "b",
         xlab = "Number of Clusters",
         ylab = "Within groups sum of squares",
         main = "Elec: n clusters vs within group sum of squares")
    
    # K-Means Cluster Analysis
    # Fix the seed so the cluster labels are identical on every run: kmeans()
    # starts from random centres, and these labels are appended to the data
    # and written to the results files below.
    set.seed(6025)
    n_clusters <- 10 # 10 cluster solution?
    kmeanssubsetDT_s <- kmeans(subsetDT_s, centers = n_clusters)
    # get cluster means - probably best to write out as csv for visualisation?
    aggregate(subsetDT_s, by = list(kmeanssubsetDT_s$cluster), FUN = mean)

    # append cluster assignment
    # to scaled data: scale() returned a matrix, so this turns it into a
    # data.frame whose last column is named kmeanssubsetDT_s.cluster
    subsetDT_s <- data.frame(subsetDT_s, kmeanssubsetDT_s$cluster)
    # to original data: the result is a plain data.frame (no longer a
    # data.table) with the labels in a column named cluster
    meanKwhDT <- data.frame(meanKwhDT, cluster = kmeanssubsetDT_s$cluster)
    
    # Output folder - change this to a directory that exists on your machine
    rpath <- "~/OneDrive - University of Southampton/PG/Southampton/FEEG6025 Data Analysis & Experimental Methods for Engineers/FEEG6025_github/results"

    # Per-cluster column means of the scaled data, saved as CSV
    write.csv(
      x = aggregate(subsetDT_s,
                    by = list(kmeanssubsetDT_s$cluster),
                    FUN = mean),
      file = paste0(rpath, "/cer_clusterHourlyMeans_scaled.csv")
    )

    # Per-cluster column means of the original (unscaled) data, saved as CSV
    # NOTE(review): mean() is applied to every column of meanKwhDT; any
    # non-numeric survey attribute would come out as NA - confirm column types.
    write.csv(
      x = aggregate(meanKwhDT,
                    by = list(kmeanssubsetDT_s$cluster),
                    FUN = mean),
      file = paste0(rpath, "/cer_clusterHourlyMeans_raw.csv")
    )
    
    # try a logit model to predict cluster membership based on household attributes
    # you will need to recode the clusters to binary variables first!
    
    # End