# Chapter 22: Silhouette Analysis of the IRIS dataset
#
# Workflow:
#   1. Min-max normalize the four numeric iris measurements.
#   2. Run k-means with k = 3 and k = 2 on the normalized columns.
#   3. Draw silhouette plots for both solutions.
#   4. Scatter-plot petal length vs. petal width, coloured by whether each
#      point's silhouette value exceeds 0.33.
#   5. Compute the pseudo-F statistic (clusterSim::index.G1) for both
#      solutions and evaluate it against the F distribution.
#
# Bare expressions (e.g. `head(i.data)`, `km1`) are kept for interactive
# inspection; they auto-print when the script is run line by line.

# Read in and prepare the data ----

# Rescale a numeric vector to [0, 1] via min-max normalization.
min_max <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

i.data <- iris # iris is a built-in dataset

# Names of the normalized columns; they are appended to `iris` in this order,
# so they occupy columns 6 to 9 of `i.data`.
norm_cols <- c("SL", "SW", "PL", "PW")

# Min-max normalization
i.data$SL <- min_max(i.data$Sepal.Length)
head(i.data)
tail(i.data)
i.data$SL
sort(i.data$SL)

i.data$SW <- min_max(i.data$Sepal.Width)
i.data$PL <- min_max(i.data$Petal.Length)
i.data$PW <- min_max(i.data$Petal.Width)
sort(i.data$SW)
sort(i.data$PL)
sort(i.data$PW)

# Silhouette values ----

# Install 'cluster' only when it is missing, rather than on every run.
if (!requireNamespace("cluster", quietly = TRUE)) {
  install.packages("cluster")
}
library("cluster")

# k-means (k = 3). kmeans() uses random starting centers, so cluster labels
# (and occasionally the partition itself) can differ between runs.
km1 <- kmeans(i.data[, norm_cols], 3)
km1
km1$cluster
km1$centers
km1$size

dist1 <- dist(i.data[, norm_cols], method = "euclidean")
sil1 <- silhouette(km1$cluster, dist1)
plot(
  sil1,
  col = c("black", "red", "green"),
  main = "Silhouette Plot: 3-Cluster K-Means Clustering of Iris Data"
)

# k-means (k = 2)
km2 <- kmeans(i.data[, norm_cols], 2)
# Same data and metric as dist1, so reuse it instead of recomputing.
dist2 <- dist1
sil2 <- silhouette(km2$cluster, dist2)
plot(
  sil2,
  col = c("black", "red"),
  main = "Silhouette Plot: 2-Cluster K-Means Clustering of Iris Data"
)

# Plot silhouette values ----

# Column 3 of a silhouette object is the silhouette width; flag each point
# as 0 (<= 0.33) or 1 (> 0.33). Adding 1 below maps the flags to colours 1/2.
silval1 <- ifelse(sil1[, 3] <= 0.33, 0, 1)
silval1
plot(
  i.data$PL, i.data$PW,
  col = silval1 + 1,
  pch = 16,
  main = "Silhouette Values, K=3",
  xlab = "Petal Length (min-max)",
  ylab = "Petal Width (min-max)"
)
legend("topleft", col = c(1, 2), pch = 16, legend = c("<=0.33", ">0.33"))

silval2 <- ifelse(sil2[, 3] <= 0.33, 0, 1)
plot(
  i.data$PL, i.data$PW,
  col = silval2 + 1,
  pch = 16,
  main = "Silhouette Values, K=2",
  xlab = "Petal Length (min-max)",
  ylab = "Petal Width (min-max)"
)
legend("topleft", col = c(1, 2), pch = 16, legend = c("<=0.33", ">0.33"))

# Pseudo-F ----

# Requires package 'clusterSim' (provides index.G1)
library("clusterSim")

n <- nrow(i.data)

# The pseudo-F statistic for k clusters on n observations is compared with an
# F distribution on (k - 1, n - k) degrees of freedom.
# NOTE(review): pf() returns the lower-tail probability P(F <= x), as in the
# original script; if a p-value is wanted, pass `lower.tail = FALSE`.
psF1 <- index.G1(i.data[, norm_cols], cl = km1$cluster)
pf(psF1, 3 - 1, n - 3)

psF2 <- index.G1(i.data[, norm_cols], cl = km2$cluster)
pf(psF2, 2 - 1, n - 2)