options(repos=c(CRAN="http://mirrors.ustc.edu.cn/CRAN/"))

###########################################################
# Use histogram and scatterplot to detect clusters
###########################################################

## put histograms on the diagonal panel
panel.hist <- function(x, ...) {
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5))
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks; nB <- length(breaks)
  y <- h$counts; y <- y/max(y)
  rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}

## put bivariate kernel density estimates on the off-diagonal panels
panel.kernel.density <- function(x, y, col = par("col"), bg = NA,
                                 pch = par("pch"), cex = 1, ...) {
  points(x, y, pch = pch, col = col, bg = bg, cex = cex)
  ok <- is.finite(x) & is.finite(y)
  if (any(ok)) {
    xl <- list(x1 = x, x2 = y)
    xx <- cbind(x, y)
    d <- bkde2D(xx, bandwidth = sapply(xl, dpik))   # 2-D kernel density estimate
    contour(x = d$x1, y = d$x2, d$fhat, add = TRUE, ...)
  }
}

# install.packages("KernSmooth")   # run once if not installed
library(KernSmooth)

#################################################################
###########
###########   Hierarchical Clustering
###########
#################################################################

###########################################
### Cars example
###########################################

# The mtcars data set is built into R:
help(mtcars)

# We will focus on the variables that are continuous in nature rather than discrete:
cars.data <- mtcars[, c(1, 3, 4, 5, 6, 7)]

# Standardize by dividing each variable by its sample range:
samp.range <- function(x) {
  myrange <- diff(range(x))
  return(myrange)
}
my.ranges <- apply(cars.data, 2, samp.range)
cars.std <- sweep(cars.data, 2, my.ranges, FUN = "/")

# Visualize the data and check for possible clusters:
pairs(cars.std, lower.panel = panel.kernel.density,
      upper.panel = panel.kernel.density, pch = 24, bg = "light blue",
      diag.panel = panel.hist, cex.labels = 2, font.labels = 2)
# There is clear evidence of several separate groups.

# Getting the distance matrix:
dist.cars <- dist(cars.std)

# Single linkage:
cars.single.link <- hclust(dist.cars, method = 'single')
# Plotting the single linkage dendrogram:
plot(cars.single.link, labels = row.names(cars.data), ylab = "Distance")

windows()   # open a new graphics window while keeping the previous one open

# Complete linkage:
cars.complete.link <- hclust(dist.cars, method = 'complete')
# Plotting the complete linkage dendrogram:
plot(cars.complete.link, labels = row.names(cars.data), ylab = "Distance")

windows()   # open a new graphics window while keeping the previous one open

# Average linkage:
cars.avg.link <- hclust(dist.cars, method = 'average')
# Plotting the average linkage dendrogram:
plot(cars.avg.link, labels = row.names(cars.data), ylab = "Distance")

# Centroid linkage:
cars.ct.link <- hclust(dist.cars, method = 'centroid')
# Plotting the centroid linkage dendrogram:
plot(cars.ct.link, labels = row.names(cars.data), ylab = "Distance")
# Centroid linkage is not monotone, so it can cause inversions (crossovers)
# in the dendrogram.

# The average linkage dendrogram seems to indicate two major clusters;
# the single linkage dendrogram may indicate three.
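## An optional sanity check of the claim above: cut the average linkage dendrogram
## at k = 2 and list the two groups (this mirrors the cutree() usage below).
cut.avg.2 <- cutree(cars.avg.link, k=2)
table(cut.avg.2)   # sizes of the two clusters
lapply(1:2, function(nc) row.names(cars.data)[cut.avg.2==nc])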
# Cut the dendrogram
# Single linkage solution with 3 clusters:
cut.3 <- cutree(cars.single.link, k=3)
cut.3   # printing the "clustering vector"

# Alternatively, print the clusters in terms of the car names:
cars.3.clust <- lapply(1:3, function(nc) row.names(cars.data)[cut.3==nc])
cars.3.clust
# Cluster 1 seems to be mostly compact cars, Cluster 2 is sports cars,
# Cluster 3 is large luxury sedans.

############# Visualization of Clusters:

### Using dendrogram objects:
cars.com.d <- as.dendrogram(cars.complete.link)
plot(cars.com.d)
plot(cars.com.d, type="triangle")

### Via the scatterplot matrix:
pairs(cars.data, panel=function(x,y) text(x,y,cut.3))
# Cluster 1 cars tend to have high mileage, low displacement, low horsepower, low weight.
# Cluster 3 cars tend to have low mileage, high weight.

### Load package ape; remember to install it first:
# install.packages('ape')
library(ape)
# Plot a basic tree:
plot(as.phylo(cars.complete.link), cex = 0.9)
# Cladogram:
plot(as.phylo(cars.complete.link), type = "cladogram", cex = 0.9)
# Unrooted tree:
plot(as.phylo(cars.complete.link), type = "unrooted")
# See help(plot.phylo) for details.

### Via a plot of the scores on the first 2 principal components,
### with the clusters separated by color:
cars.pc <- princomp(cars.data, cor=TRUE)
carnames <- abbreviate(row.names(cars.data))

# Setting up the colors for the 3 clusters on the plot:
my.color.vector <- rep("green", times=nrow(cars.data))
my.color.vector[cut.3==2] <- "blue"
my.color.vector[cut.3==3] <- "red"

# Plotting the PC scores:
par(pty="s")
plot(cars.pc$scores[,1], cars.pc$scores[,2], ylim=range(cars.pc$scores[,1]),
     xlab="PC 1", ylab="PC 2", type='n', lwd=2)
text(cars.pc$scores[,1], cars.pc$scores[,2], labels=carnames, cex=0.7, lwd=2,
     col=my.color.vector)
# It's clear that PC1 is more important in separating the clusters than PC2.

# Alternatively, use clusplot() from the "cluster" package:
library(cluster)
clusplot(cars.data, cut.3, labels=1, lines=0)

# Or draw rectangles around the clusters on the dendrogram:
plot(cars.single.link, labels=row.names(cars.data), ylab="Distance",
     main="Single linkage")
rect.hclust(cars.single.link, k=3)

#################################################################
#################################################################
###########
###########   Partitioning Clustering
###########
#################################################################
#################################################################

##########################
##
##  K-means clustering
##
##########################

# A K-means clustering with k = 3:
# Note that the stability of the result can be improved by increasing the maximum
# number of iterations and using multiple random starts:
cars.k3 <- kmeans(cars.std, centers=3, iter.max=100, nstart=25)
cars.k3

cars.k3.clust <- lapply(1:3, function(nc) row.names(cars.data)[cars.k3$cluster==nc])
cars.3.clust
cars.k3.clust
# Note that the cluster labels are arbitrary, so they differ from the results
# before (a cross-tabulation follows below).

############# Visualization of Clusters:

### Via the scatterplot matrix:
pairs(cars.data, panel=function(x,y) text(x,y,cars.k3$cluster))
# Cluster 1 cars tend to have high mileage, low displacement, low horsepower, low weight.
# Cluster 2 cars tend to have low mileage, high weight.
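## As an illustration of the label switching mentioned above, cross-tabulate the
## single linkage 3-cluster solution against the k-means 3-cluster solution;
## the cluster numbers are arbitrary, so agreement shows up as one dominant count
## in each row rather than along the diagonal.
table(single.linkage = cut.3, k.means = cars.k3$cluster)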
##########################
##
##  K-medoids clustering
##
##########################

# Consider the cars.data and cars.std data frames we created above.
# Let's cluster the cars into k groups using the K-medoids approach.
# The function pam() is in the "cluster" package.
library(cluster)

# K-medoids directly on the (standardized) data matrix:
cars.kmed.3 <- pam(cars.std, k=3, diss=FALSE)
# Or you can do K-medoids by inputting the distance matrix:
# cars.kmed.3 <- pam(dist.cars, k=3, diss=TRUE)

cars.kmed.3$clustering           # printing the "clustering vector"
cars.kmed.3$silinfo$avg.width    # printing the average silhouette width

### A little loop to calculate the average silhouette width
### for a variety of choices of k:
my.k.choices <- 2:8
avg.sil.width <- rep(0, times=length(my.k.choices))
for (ii in 1:length(my.k.choices)) {
  avg.sil.width[ii] <- pam(cars.std, k=my.k.choices[ii])$silinfo$avg.width
}
print(cbind(my.k.choices, avg.sil.width))
# A LARGE average silhouette width indicates that the observations are well clustered.
# Maybe k=2 is the best choice of k here?

cars.3.clust <- lapply(1:3, function(nc) row.names(cars.data)[cars.kmed.3$clustering==nc])
cars.3.clust   # printing the clusters in terms of the car names
# Cluster 1 seems to be mostly compact cars, Cluster 2 is sports cars,
# Cluster 3 is large luxury sedans.

############# Visualization of Clusters:

## Built-in plots available for pam objects:

# The "clusplot":
plot(cars.kmed.3, which.plots=1)
# The clusplot (in the "cluster" package) can actually be used with any clustering
# partition by entering the data set and the clustering vector (see above).

# The "silhouette plot":
plot(cars.kmed.3, which.plots=2)
# This shows which observations are "best clustered."

#################################################################
#################################################################
###########
###########   Spectral Clustering
###########
#################################################################
#################################################################

# install.packages("kernlab")   # run once if not installed
library(kernlab)

## Simulated data: two noisy concentric circles
x1 <- seq(from=0.5, to=4.5, by=0.05)
y1 <- c(2.5 + sqrt(4 - (x1-2.5)^2), 2.5 - sqrt(4 - (x1-2.5)^2))
y1 <- y1 + rnorm(length(y1), sd=0.1)
plot(c(x1,x1), y1, xlab="x", ylab="y")

x2 <- seq(from=1.5, to=3.5, by=0.05)
y2 <- c(2.5 + sqrt(1 - (x2-2.5)^2), 2.5 - sqrt(1 - (x2-2.5)^2))
y2 <- y2 + rnorm(length(y2), sd=0.1)
points(c(x2,x2), y2, col=2)

dd <- cbind(c(x1,x1,x2,x2), c(y1,y2))

# k-means fails on the concentric circles:
dd.k <- kmeans(dd, 2)
plot(dd, col=dd.k$cluster)

# Spectral clustering recovers the two rings:
sc <- specc(dd, centers=2)
plot(dd, col=sc)

## Spirals data
data(spirals)
sc <- specc(spirals, centers=2)
plot(spirals, col=sc)

# k-means fails here too:
dd.k <- kmeans(spirals, 2)
plot(spirals, col=dd.k$cluster)
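## Optionally, the width of the Gaussian (RBF) kernel used by specc() can be fixed
## through the kpar argument instead of the default automatic estimate; the sigma
## value used here is purely illustrative and would normally be tuned.
sc.fixed <- specc(spirals, centers=2, kernel="rbfdot", kpar=list(sigma=100))
plot(spirals, col=sc.fixed)
size(sc.fixed)   # cluster sizes, via the kernlab accessor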
####################################################
####################################################
#
#  Choosing the number of clusters k using the
#  average silhouette width criterion
#
####################################################
####################################################

# When using pam(), the output gives the average silhouette width directly
# (see the code above).
# We can also get the average silhouette width when using other algorithms:

### With a hierarchical method (complete linkage here):
summary(silhouette(cutree(cars.complete.link, k=2), dist.cars))$avg.width
summary(silhouette(cutree(cars.complete.link, k=3), dist.cars))$avg.width
plot(cars.complete.link, labels=row.names(cars.data), ylab="Distance")
rect.hclust(cars.complete.link, 3)

### With k-means:
summary(silhouette(kmeans(cars.std, centers=2, iter.max=100, nstart=25)$cluster,
                   dist.cars))$avg.width
summary(silhouette(kmeans(cars.std, centers=3, iter.max=100, nstart=25)$cluster,
                   dist.cars))$avg.width
# In each case, we might choose the value of k associated with the LARGEST
# average silhouette width.

####################################################
####################################################
#
#  Cluster Validation
#
####################################################
####################################################

## Gap statistic (clusGap() is in the "cluster" package):
gap.cars <- clusGap(cars.std, kmeans, B=500, K.max=10)
plot(gap.cars)
out.gap <- gap.cars$Tab
maxSE(out.gap[,"gap"], out.gap[,"SE.sim"], method="globalmax")
maxSE(out.gap[,"gap"], out.gap[,"SE.sim"], method="Tibs2001SEmax")

cars.k2 <- kmeans(cars.std, centers=2, iter.max=100, nstart=25)
cars.k2

####################################################
####################################################
##
##  Plotting the WSS for several choices of k
##
####################################################
####################################################

# This is a commonly recommended method for choosing k in K-means clustering.
# For the cars data, let's consider letting k vary from 2 to 8.

# Enter the name of the data matrix to be clustered here:
my.data.matrix <- cars.std
my.k.choices <- 2:8

n <- nrow(my.data.matrix)
wss1 <- (n-1)*sum(apply(my.data.matrix, 2, var))   # total SS, i.e. WSS for k = 1
wss <- numeric(0)
for (i in my.k.choices) {
  W <- sum(kmeans(my.data.matrix, i)$withinss)
  wss <- c(wss, W)
}
wss <- c(wss1, wss)
plot(c(1, my.k.choices), wss, type='l', xlab='Number of clusters',
     ylab='Within-groups sum-of-squares', lwd=2)
# For what value of k does the elbow of the plot occur?

## CH (Calinski-Harabasz) index
# http://cran.r-project.org/web/packages/clusterCrit/vignettes/clusterCrit.pdf
# install.packages("clusterCrit")   # run once if not installed
library(clusterCrit)

ch <- numeric(0)
for (k in my.k.choices) {
  cl <- kmeans(cars.std, centers=k, iter.max=100, nstart=25)
  ch <- c(ch, intCriteria(as.matrix(cars.std), cl$cluster,
                          "Calinski_Harabasz")$calinski_harabasz)
}
plot(my.k.choices, ch, type="l")
# k=2 is the best choice of k by this criterion.

## Rand index
# install.packages("fossil")   # run once if not installed
library(fossil)
rand.index(cars.k3$cluster, cars.k2$cluster)
adj.rand.index(cars.k3$cluster, cars.k2$cluster)

# Or use the clusterCrit package:
# Czekanowski-Dice (F measure) and Rand index
extCriteria(cars.k3$cluster, cars.k2$cluster, c("Rand", "Czekanowski_Dice"))
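## A further optional comparison, assuming the objects created above are still in
## the workspace: how closely does the k-means 3-cluster solution agree with the
## single linkage 3-cluster solution cut.3?
adj.rand.index(cut.3, cars.k3$cluster)
extCriteria(as.integer(cut.3), cars.k3$cluster, c("Rand", "Czekanowski_Dice"))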