\name{cluster.stats}
\alias{cluster.stats}
%- Also NEED an `\alias' for EACH other topic documented here.
\title{Cluster validation statistics}
\description{
  Computes a number of distance-based statistics which can be used for
  cluster validation, for comparing clusterings and for deciding on
  the number of clusters: cluster sizes, cluster diameters,
  average distances within and between clusters, cluster separation,
  average silhouette widths, the distance-based statistics that
  performed best for deciding on the number of clusters in the study of
  Milligan and Cooper (1985), Hubert's gamma coefficient, the Dunn index
  and, to assess the similarity of two clusterings, the corrected Rand
  index.
}
\usage{
cluster.stats(d,clustering,alt.clustering=NULL,
                          silhouette=TRUE,G2=FALSE,G3=FALSE)
}
%- maybe also `usage' for other objects documented here.
\arguments{
  \item{d}{a distance object (as generated by \code{dist}) or a distance
    matrix between cases.}
  \item{clustering}{an integer vector whose length is the number of cases,
    indicating a clustering. The clusters have to be numbered
    from 1 to the number of clusters.}
  \item{alt.clustering}{an integer vector such as for
    \code{clustering}, indicating an alternative clustering. If provided, the
    corrected Rand index for \code{clustering}
    vs. \code{alt.clustering} is computed.}
  \item{silhouette}{logical. If \code{TRUE}, the silhouette statistics
    are computed, which requires package \code{cluster}.}
  \item{G2}{logical. If \code{TRUE}, Goodman and Kruskal's index G2
    (cf. Gordon (1999), p. 62) is computed. This executes lots of
    sorting algorithms and can be very slow (it has been improved
    by R. Francois - thanks!).}
  \item{G3}{logical. If \code{TRUE}, the index G3
    (cf. Gordon (1999), p. 62) is computed. This executes \code{sort}
    on all distances and can be extremely slow.}    
}

\value{
  \code{cluster.stats} returns a list containing the components
  \code{n, cluster.number, cluster.size, diameter,
    average.distance, median.distance, separation, average.toother,
    separation.matrix, average.between, average.within,
    n.between, n.within, clus.avg.silwidths, avg.silwidth,
    g2, g3, hubertgamma, dunn, wb.ratio, corrected.rand}.

  \item{n}{number of cases.}
  \item{cluster.number}{number of clusters.}
  \item{cluster.size}{vector of cluster sizes (number of points).}
  \item{diameter}{vector of cluster diameters (maximum within cluster
    distances).}
  \item{average.distance}{vector of clusterwise
    within cluster average distances.}
  \item{median.distance}{vector of clusterwise
    within cluster distance medians.}
  \item{separation}{vector of clusterwise minimum distances of a point
    in the cluster to a point of another cluster.}
  \item{average.toother}{vector of clusterwise average distances of a point
    in the cluster to the points of other clusters.}
  \item{separation.matrix}{matrix of separation values between all pairs
    of clusters.}
  \item{average.between}{average distance between clusters.}
  \item{average.within}{average distance within clusters.}
  \item{n.between}{number of distances between clusters.}
  \item{n.within}{number of distances within clusters.}
  \item{clus.avg.silwidths}{vector of cluster average silhouette
    widths. See
    \code{\link{silhouette}}.}
  \item{avg.silwidth}{average silhouette
    width. See
    \code{\link{silhouette}}.}
  \item{g2}{Goodman and Kruskal's Gamma coefficient. See Milligan and
    Cooper (1985), Gordon (1999, p. 62).}
  \item{g3}{G3 coefficient. See Gordon (1999, p. 62).}
  \item{hubertgamma}{correlation between distances and a
    0-1-vector where 0 means same cluster, 1 means different clusters.
    See Halkidi et al. (2002).}
  \item{dunn}{Dunn index: minimum separation / maximum diameter. See
    Halkidi et al. (2002).}
  \item{wb.ratio}{\code{average.within/average.between}.}
  \item{corrected.rand}{corrected Rand index (if \code{alt.clustering}
    has been specified), see Gordon (1999, p. 198).}
}
\references{
  Gordon, A. D. (1999) \emph{Classification}, 2nd ed. Chapman and Hall.

  Halkidi, M., Batistakis, Y., Vazirgiannis, M. (2002) Cluster validity
  methods: part I, \emph{SIGMOD Record}, 31, 40-45.
  
  Milligan, G. W. and Cooper, M. C. (1985) An examination of procedures
  for determining the number of clusters. \emph{Psychometrika}, 50, 159-179.
}
\author{Christian Hennig
  \email{chrish@stats.ucl.ac.uk}
  \url{http://www.homepages.ucl.ac.uk/~ucakche/}
}
\seealso{
  \code{\link{silhouette}}, \code{\link{dist}}.

  \code{\link{clusterboot}} computes clusterwise stability statistics by
  resampling.
}
\examples{  
  set.seed(20000)
  face <- rFace(200,dMoNo=2,dNoEy=0,p=2)
  dface <- dist(face)
  complete3 <- cutree(hclust(dface),3)
  cluster.stats(dface,complete3,
                alt.clustering=as.integer(attr(face,"grouping")))
  
}
\keyword{cluster}% at least one, from doc/KEYWORDS
\keyword{multivariate}



