# R in Action (2nd ed): Chapter 16
# Cluster analysis
# requires packaged NbClust, flexclust, rattle
# install.packages(c("NbClust", "flexclust", "rattle"))
#-----------------------------------------------------

par(ask=TRUE)
opar <- par(no.readonly=FALSE)

## Calculating Distances
data(nutrient, package="flexclust")
head(nutrient, 2)
d <- dist(nutrient)
as.matrix(d)[1:4,1:4]

## Hierarchical Cluster Analysis

# Average linkage clustering of nutrient data
data(nutrient, package="flexclust")
row.names(nutrient) <- tolower(row.names(nutrient))
nutrient.scaled <- scale(nutrient)                                  
d <- dist(nutrient.scaled)                                          
fit.average <- hclust(d, method="average")                          
plot(fit.average, hang=-1, cex=.8, main="Average Linkage Clustering")

# Selecting the number of clusters
library(NbClust)
nc <- NbClust(nutrient.scaled, distance="euclidean", 
              min.nc=2, max.nc=15, method="average")
par(opar)
table(nc$Best.n[1,])
barplot(table(nc$Best.n[1,]), 
        xlab="Numer of Clusters", ylab="Number of Criteria",
        main="Number of Clusters Chosen by 26 Criteria") 
        
# Obtaining the final cluster solution
clusters <- cutree(fit.average, k=5) 
table(clusters)
aggregate(nutrient, by=list(cluster=clusters), median) 
aggregate(as.data.frame(nutrient.scaled), by=list(cluster=clusters),
          median)
plot(fit.average, hang=-1, cex=.8,  
     main="Average Linkage Clustering\n5 Cluster Solution")
rect.hclust(fit.average, k=5)

## Patitioning Cluster Analysis

wssplot <- function(data, nc=15, seed=1234){
                    wss <- (nrow(data)-1)*sum(apply(data,2,var))
                    for (i in 2:nc){
                         set.seed(seed)
                         wss[i] <- sum(kmeans(data, centers=i)$withinss)}
                    plot(1:nc, wss, type="b", xlab="Number of Clusters",
                         ylab="Within groups sum of squares")}

# K-means clustering of wine data
data(wine, package="rattle")
head(wine)
df <- scale(wine[-1])  
wssplot(df)      
library(NbClust)
set.seed(1234)
nc <- NbClust(df, min.nc=2, max.nc=15, method="kmeans")
par(opar)
table(nc$Best.n[1,])
barplot(table(nc$Best.n[1,]), 
        xlab="Numer of Clusters", ylab="Number of Criteria",
        main="Number of Clusters Chosen by 26 Criteria") 
set.seed(1234)
fit.km <- kmeans(df, 3, nstart=25) 
fit.km$size
fit.km$centers                                               
aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean)

# evaluate clustering
ct.km <- table(wine$Type, fit.km$cluster)
ct.km   
library(flexclust)
indexRand(ct.km)

# Partitioning around mediods for the wine data
library(cluster)
set.seed(1234)
fit.pam <- pam(wine[-1], k=3, stand=TRUE)       
fit.pam$medoids                                 
clusplot(fit.pam, main="Bivariate Cluster Plot")

# evaluate clustering
ct.pam <- table(wine$Type, fit.pam$clustering)
ct.pam
randIndex(ct.pam)

## Avoiding non-existent clusters
library(fMultivar)
set.seed(1234)
df <- rnorm2d(1000, rho=.5)
df <- as.data.frame(df)
plot(df, main="Bivariate Normal Distribution with rho=0.5")

wssplot(df)
library(NbClust)
nc <- NbClust(df, min.nc=2, max.nc=15, method="kmeans")
par(opar)
barplot(table(nc$Best.n[1,]), 
        xlab="Numer of Clusters", ylab="Number of Criteria",
        main  ="Number of Clusters Chosen by 26 Criteria")

fit <- pam(df, k=2)
clusplot(fit)
df$clustering <- factor(fit$clustering)
qplot(x=V1, y=V2, data=df, color=clustering, 
      shape=clustering, 
      main="Clustering of Bivariate Normal Data")

plot(nc$All.index[,5], type="o", ylab="CCC",
     xlab="Number of clusters", col="blue")