\name{DiSC}
\alias{DiSC}
%- Also NEED an '\alias' for EACH other topic documented here.
\title{
DiSC: A statistical tool for differential expression analysis of individual-level single-cell RNA-Seq data
}
\description{
A statistical tool for differential expression analysis of individual-level single-cell RNA-Seq data.
}
\usage{
DiSC(data.mat, cell.ind, metadata, outcome, covariates = NULL,
     cell.id = "cell_id",
     individual.id = "individual", perm.no = 999,
     features = c('prev', 'm', 'nzsd'), verbose = TRUE,
     sequencing.data = TRUE)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
  \item{data.mat}{
A data matrix for single-cell RNA sequencing data, or other single-cell data such as CyTOF data. Rows are genes/features and columns are cells. Column names are cell ids.
}
  \item{cell.ind}{
A data frame of cell-individual relationship. It includes two columns for cell ids and individual ids. It links cell ids to individual ids.
}
  \item{metadata}{
A data frame of individual-level metadata. It includes a column for individual ids, a column for the outcome of interest and columns for other covariates if applicable.
}
  \item{outcome}{
A character string for the column name of the outcome variable in \code{metadata}.
}
  \item{covariates}{
A character string or vector of character strings for the covariates to be adjusted. Should be the column names in \code{metadata}. Default: \code{NULL}.
}
  \item{cell.id}{
A character string for the column name of cell ids in \code{cell.ind}.
}
  \item{individual.id}{
A character string for the column name of the individual ids in \code{cell.ind} and \code{metadata}.
}
  \item{perm.no}{
An integer, number of permutations used. Default: \code{999}. It can be reduced to \code{99} if adjusted P-values (false discovery rate) are the only interest.
}
  \item{features}{
Features of the distribution used to test for the differentially expressed genes. Choose from \code{"prev"} (logit(non-zero proportion)), \code{"nzm"} (sqrt(non-zero mean)),
\code{"nzsd"} (sqrt(non-zero standard deviation)), \code{"m"} (overall mean), \code{"sd"} (overall standard deviation), and \code{"nzm^1"}, \code{"nzsd^1"}, \code{"m^1"}, \code{"sd^1"}
(non-sqrt-transformed versions). Default: \code{"prev"}, \code{"m"} and \code{"nzsd"}.
}
  \item{verbose}{
Logical. Should the function print the processes? Default: \code{TRUE}.
}
  \item{sequencing.data}{
Logical. Is \code{data.mat} a sequencing data matrix (i.e., count data)? If \code{TRUE}, total sum scaling will be used to normalize the count data.
Users can normalize/transform the data themselves by setting it to \code{FALSE}. Default: \code{TRUE}.
}
}

\value{
  \describe{
\item{call}{How was the function called?}
\item{R2}{Description of R2}
\item{F0}{Description of F0}
\item{RSS}{Description of RSS}
\item{df.model}{Description of df.model}
\item{df.residual}{Description of df.residual}
\item{coef.list}{Description of coef.list}
\item{p.raw}{Raw, unadjusted P-values.}
\item{p.adj.fdr}{P-values which have been adjusted for false discovery rate.}
\item{p.adj.fwer}{P-values which have been adjusted for family-wise error rate.}
}
}
\references{
Zhang, L., Yang, L., Ren, Y., Zhang, S., Guan, W., & Chen, J. (Bioinformatics): DiSC: a Statistical Tool for Fast Differential Expression Analysis of Individual-level Single-cell RNA-seq Data.
}
\author{
Jun Chen <\email{chen.jun2@mayo.edu}> and Lujun Zhang
}

\examples{
# Fix the random seed so that the permutation-based results are reproducible.
set.seed(seed = 1234556)
data(sim_data)

# Unpack the components of the example data set:
#   count_matrix - passed to DiSC() as 'data.mat' (genes/features x cells)
#   meta_cell    - passed as 'cell.ind' (links cell ids to individual ids)
#   meta_ind     - passed as 'metadata' (individual-level outcome/covariates)
#   gene_index   - index sets of genes used below to evaluate the results
count_matrix <- sim_data$count_matrix
meta_cell <- sim_data$meta_cell
gene_index <- sim_data$gene_index
meta_ind <- sim_data$meta_ind

# Test the association with "phenotype", adjusting for "RIN".
obj1 <- DiSC(data.mat = count_matrix, cell.ind = meta_cell,
             metadata = meta_ind, outcome = "phenotype",
             covariates = "RIN", cell.id = "cell_id",
             individual.id = "individual", perm.no = 999,
             features = c('prev', 'm', 'nzsd'), verbose = TRUE,
             sequencing.data = TRUE)
# Type I error (the nominal level: 0.05)
mean(obj1$p.raw[gene_index$EE_index] <= 0.05)
# True positive rate (based on raw P-values, the higher the better.)
mean(obj1$p.raw[gene_index$mean_index] <= 0.05)
mean(obj1$p.raw[gene_index$var_index] <= 0.05)
mean(obj1$p.raw[gene_index$mean_var_index] <= 0.05)
# False discovery rate (the nominal level: 0.10)
# The denominator is bounded below by 1 so that the empirical FDR is 0
# (rather than NaN) when no gene is declared significant.
sum(obj1$p.adj.fdr[gene_index$EE_index] <= 0.10) /
  max(1, sum(obj1$p.adj.fdr <= 0.10))
# True positive rate (based on FDR-adjusted P-values, the higher the better.)
mean(obj1$p.adj.fdr[gene_index$mean_index] <= 0.10)
mean(obj1$p.adj.fdr[gene_index$var_index] <= 0.10)
mean(obj1$p.adj.fdr[gene_index$mean_var_index] <= 0.10)

# By default, DiSC normalizes the scRNA-seq data using TSS (total sum scaling),
# adjusted for log median sequencing depths
# Other user-specified normalization methods can also be used:
# log2 transformed, adjusted for log median sequencing depth
# data_mat <- count_matrix
# data_mat_log <- log2(data_mat + 1)
# inds <- unique(meta_cell[["individual"]])
# meta_ind <- meta_ind[base::match(inds, meta_ind[["individual"]]), ]
# depth <- colSums(data_mat)
# cell.list <- list()
# for (ind in inds)
#   cell.list[[ind]] <- meta_cell[meta_cell$individual == ind, ][["cell_id"]]
# log_md_depth <- numeric(length = length(inds))
# names(log_md_depth) <- inds
# for (ind in inds)
#   log_md_depth[ind] <- log(median(depth[cell.list[[ind]]]))
# meta_ind$log_md_depth <- log_md_depth
# obj_log <-
#   DiSC(data.mat = data_mat_log, cell.ind = meta_cell,
#        metadata = meta_ind, outcome = "phenotype",
#        covariates = c("RIN", "log_md_depth"),
#        cell.id = "cell_id", individual.id = "individual",
#        perm.no = 999, verbose = FALSE,
#        sequencing.data = FALSE, # sequencing.data needs to be FALSE
#        features = c('prev', 'm', 'nzsd'))
# Size factor: DESeq2, adjusted for log median sequencing depths
# (reuses 'data_mat' and 'meta_ind$log_md_depth' from the log2 example above)
# require(DESeq2)
# NOTE: 'each = 375' hard-codes the number of cells per individual and assumes
# the columns of data_mat are grouped by individual; adjust for other data.
# colData <- data.frame(condition = rep(meta_ind$phenotype, each = 375),
#                       row.names = colnames(data_mat))
# A pseudocount of 1 is added because size-factor estimation fails when
# every gene contains at least one zero.
# dds <- DESeq2::DESeqDataSetFromMatrix(countData = data_mat + 1,
#                                       colData = colData, design = ~ condition)
# dds <- DESeq2::estimateSizeFactors(dds)
# data_mat_des <- sweep(data_mat, 2, DESeq2::sizeFactors(dds), FUN = "/")
# obj_des <-
#   DiSC(data.mat = data_mat_des, cell.ind = meta_cell,
#        metadata = meta_ind, outcome = "phenotype",
#        covariates = c("RIN", "log_md_depth"),
#        cell.id = "cell_id", individual.id = "individual",
#        perm.no = 999, verbose = FALSE,
#        sequencing.data = FALSE, # sequencing.data needs to be FALSE
#        features = c('prev', 'm', 'nzsd'))
}

% Add one or more standard keywords, see file 'KEYWORDS' in the
% R documentation directory (show via RShowDoc("KEYWORDS")):
% \keyword{ ~kwd1 }
% \keyword{ ~kwd2 }
% Use only one keyword per line.
% For non-standard keywords, use \concept instead of \keyword:
% \concept{ ~cpt1 }
% \concept{ ~cpt2 }
% Use only one concept per line.
