% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/clustTrend.R
\name{clustTrend}
\alias{clustTrend}
\alias{ClusteredTrends}
\alias{print.ClusteredTrends}
\alias{plot.ClusteredTrends}
\title{Cluster the genes dynamics into different dominant trends.}
\usage{
clustTrend(tcgs, expr, Subject_ID, TimePoint, threshold = 0.05,
  myproc = "BY", nbsimu_pval = 1e+06, baseline = NULL,
  only.signif = TRUE, group.var = NULL, Group_ID_paired = NULL,
  ref = NULL, group_of_interest = NULL, FUNcluster = NULL,
  clustering_metric = "euclidian", clustering_method = "ward", B = 100,
  max_trends = 4, aggreg.fun = "median", trend.fun = "median",
  methodOptiClust = "firstSEmax", indiv = "genes", verbose = TRUE)

\method{print}{ClusteredTrends}(x, ...)

\method{plot}{ClusteredTrends}(x, ...)
}
\arguments{
\item{tcgs}{a \bold{tcgsa} object for \code{clustTrend}, or a
\bold{\link{ClusteredTrends}} object for \code{print.ClusteredTrends} and
\code{plot.ClusteredTrends}.}

\item{expr}{either a matrix or dataframe of gene expression upon which
dynamics are to be calculated, or a list of gene sets estimation of gene
expression.  In the case of a matrix or dataframe, its dimensions are \eqn{n}
x \eqn{p}, with the \eqn{p} samples in columns and the \eqn{n} genes in rows.
In the case of a list, its length should correspond to the number of gene
sets under scrutiny and each element should be a 3-dimensional array of
estimated gene expression, such as for the list returned in the
\code{'Estimations'} element of \code{\link{TcGSA.LR}}.  See details.}

\item{Subject_ID}{a factor of length \eqn{p} that is in the same order as the
columns of \code{expr} (when it is a dataframe) and that contains the patient
identifier of each sample.}

\item{TimePoint}{a numeric vector or a factor of length \eqn{p} that is in
the same order as \code{Subject_ID} and the columns of \code{expr} (when it
is a dataframe), and that contains the time points at which gene expression
was measured.}

\item{threshold}{the threshold at which the FDR or the FWER should be
controlled.}

\item{myproc}{a vector of character strings containing the names of the
multiple testing procedures for which adjusted p-values are to be computed.
This vector should include any of the following: "\code{Bonferroni}",
"\code{Holm}", "\code{Hochberg}", "\code{SidakSS}", "\code{SidakSD}",
"\code{BH}", "\code{BY}", "\code{ABH}", "\code{TSBH}". See
\code{\link[multtest:mt.rawp2adjp]{mt.rawp2adjp}} for details.  Default is
"\code{BY}", the Benjamini & Yekutieli (2001) step-up FDR-controlling
procedure (general dependency structures).  In order to control the FWER (in
case of an analysis that is more a hypothesis confirmation than an
exploration of the expression data), we recommend using "\code{Holm}", the
Holm (1979) step-down adjusted p-values for strong control of the FWER.}

\item{nbsimu_pval}{the number of observations under the null distribution to
be generated in order to compute the p-values.  Default is \code{1e+06}.}

\item{baseline}{a character string which is the value of \code{TimePoint}
that can be used as a baseline.  Default is \code{NULL}, in which case no
time point is used as a baseline value for gene expression.  Has to be
\code{NULL} when comparing two treatment groups.}

\item{only.signif}{logical flag for analyzing the trends in only the
significant gene sets.  If \code{FALSE}, all the gene sets from the
\bold{gmt} object contained in \code{tcgs} are clustered.  Default is
\code{TRUE}.}

\item{group.var}{in the case of several treatment groups, this is a factor of
length \eqn{p} that is in the same order as \code{TimePoint},
\code{Subject_ID} and the columns of \code{expr}.  It indicates to which
treatment group each sample belongs.  Default is \code{NULL}, which means
that there is only one treatment group.}

\item{Group_ID_paired}{a character vector of length \eqn{p} that is in the
same order as \code{TimePoint}, \code{Subject_ID}, \code{group.var} and the
columns of \code{expr}.  This argument must not be \code{NULL} in the case of
a paired analysis, and must be \code{NULL} otherwise.  Default is
\code{NULL}.}

\item{ref}{the group which is used as reference in the case of several
treatment groups.  Default is \code{NULL}, which means that reference is the
first group in alphabetical order of the labels of \code{group.var}.}

\item{group_of_interest}{the group of interest, for which dynamics are to be
computed in the case of several treatment groups.  Default is \code{NULL},
which means that group of interest is the second group in alphabetical order
of the labels of \code{group.var}.}

\item{FUNcluster}{the clustering function used to agglomerate genes in
trends.  Default is \code{NULL}, in which case a hierarchical clustering is
performed via the function \code{\link[cluster:agnes]{agnes}}, using the
metric \code{clustering_metric} and the method \code{clustering_method}.  See
\code{\link[cluster:clusGap]{clusGap}}.}

\item{clustering_metric}{character string specifying the metric to be used
for calculating dissimilarities between observations in the hierarchical
clustering when \code{FUNcluster} is \code{NULL}.  The currently available
options are \code{"euclidean"} and \code{"manhattan"}.  Default is
\code{"euclidean"}.  See \code{\link[cluster:agnes]{agnes}}.  Also, a \code{"sts"} option 
is available in TcGSA.  It implements the 'Short Time Series' distance 
[Moller-Levet et al., Fuzzy Clustering of short time series and unevenly distributed 
sampling points, \emph{Advances in Intelligent Data Analysis V}:330-340 Springer, 2003]
designed specifically for clustering time series.}

\item{clustering_method}{character string defining the agglomerative method
to be used in the hierarchical clustering when \code{FUNcluster} is
\code{NULL}.  The methods implemented include \code{"average"} ([unweighted
pair-]group average method, UPGMA), \code{"single"} (single linkage),
\code{"complete"} (complete linkage), \code{"ward"} (Ward's method),
\code{"weighted"} (weighted average linkage).  Default is \code{"ward"}.  See
\code{\link[cluster:agnes]{agnes}}.}

\item{B}{integer specifying the number of Monte Carlo ("bootstrap") samples
used to compute the gap statistics.  Default is \code{100}.  See
\code{\link[cluster:clusGap]{clusGap}}.}

\item{max_trends}{integer specifying the maximum number of different clusters
to be tested.  Default is \code{4}.}

\item{aggreg.fun}{a character string such as \code{"mean"}, \code{"median"}
or the name of any other defined statistics function that returns a single
numeric value.  It specifies the function used to aggregate the observations
before the clustering.  Default is \code{"median"}.}

\item{trend.fun}{a character string such as \code{"mean"}, \code{"median"} or
the name of any other function that returns a single numeric value.  It
specifies the function used to calculate the trends of the identified
clusters.  Default is \code{"median"}.}

\item{methodOptiClust}{character string indicating how the "optimal" number
of clusters is computed from the gap statistics and their standard
deviations. Possible values are \code{"globalmax"}, \code{"firstmax"},
\code{"Tibs2001SEmax"}, \code{"firstSEmax"} and \code{"globalSEmax"}.
Default is \code{"firstSEmax"}.  See \code{'method'} in
\code{\link[cluster:clusGap]{clusGap}}, Details and \emph{Tibshirani et al.,
2001} in References.}

\item{indiv}{a character string indicating by which unit observations are
aggregated (through \code{aggreg.fun}) before the clustering.  Possible
values are \code{"genes"} or \code{"patients"}.  Default is \code{"genes"}.}

\item{verbose}{logical flag enabling verbose messages to track the computing
status of the function.  Default is \code{TRUE}.}

\item{x}{an object of class '\code{ClusteredTrends}'.}

\item{\dots}{further arguments passed to or from other methods.}
}
\value{
An object of class \bold{\link{ClusteredTrends}} which is a list with
the 4 following components: \itemize{
\item \code{NbClust} a vector that contains the optimal number of clusters for
each analyzed gene set.
\item \code{ClustsMeds} a list of the same length as \code{NbClust} (the
number of analyzed gene sets).  Each element of the list is a data frame with
as many columns as the optimal number of clusters for the corresponding gene
set.  Each column of the data frame contains the median trend values for the
corresponding cluster.
\item \code{GenesPartition} a list of the same length as \code{NbClust} (the
number of analyzed gene sets).  Each element of the list is a vector which
gives the partition of the genes inside the corresponding gene set.
\item \code{MaxNbClust} an integer storing the maximum number of different
clusters tested, as given by the argument \code{'max_trends'}.
}
}
\description{
This function clusters the gene dynamics of each gene set into different
dominant trends.  The optimal number of clusters is computed using the gap
statistics.  See \code{\link[cluster:clusGap]{clusGap}}.
}
\details{
If \code{expr} is a matrix or a dataframe, then the genes dynamics are
clustered on the "original" data.  On the other hand, if \code{expr} is a
list returned in the \code{'Estimations'} element of \code{\link{TcGSA.LR}},
then the dynamics are computed on the estimations made by the
\code{\link{TcGSA.LR}} function.

This function uses the Gap statistics to determine the optimal number of
clusters in each analyzed gene set.  See
\code{\link[cluster:clusGap]{clusGap}}.
}
\examples{

\dontrun{
data(data_simu_TcGSA)

tcgsa_sim_1grp <- TcGSA.LR(expr=expr_1grp, gmt=gmt_sim, design=design, 
                          subject_name="Patient_ID", time_name="TimePoint",
                          time_func="linear", crossedRandom=FALSE)
 
CT <- clustTrend(tcgsa_sim_1grp,
    expr=expr_1grp, Subject_ID=design$Subject_ID, TimePoint=design$TimePoint)
CT
plot(CT)

CT$NbClust
CT$NbClust["Gene set 5"]
CT$ClustMeds[["Gene set 4"]]
CT$ClustMeds[["Gene set 5"]]
}

}
\references{
Tibshirani, R., Walther, G. and Hastie, T., 2001, Estimating the
number of data clusters via the Gap statistic, \emph{Journal of the Royal
Statistical Society, Series B (Statistical Methodology)}, \bold{63}, 2:
411--423.
}
\seealso{
\code{\link{plot1GS}}, \code{\link{TcGSA.LR}},
\code{\link[cluster:clusGap]{clusGap}}
}
\author{
Boris P. Hejblum
}
