% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/explanatory_performance.R
\name{Incremental}
\alias{Incremental}
\title{Incremental prediction performance in regression}
\usage{
Incremental(
  xdata,
  ydata,
  stability = NULL,
  family = NULL,
  implementation = NULL,
  prediction = NULL,
  resampling = "subsampling",
  n_predictors = NULL,
  K = 100,
  tau = 0.8,
  seed = 1,
  n_thr = NULL,
  ij_method = FALSE,
  time = 1000,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{xdata}{matrix of predictors with observations as rows and variables as
columns.}

\item{ydata}{vector or matrix of outcome(s). If \code{family} is set
to \code{"binomial"} or \code{"multinomial"}, \code{ydata} can be a vector
with character/numeric values or a factor.}

\item{stability}{output of \code{\link{VariableSelection}}. If
\code{stability=NULL} (the default), a model including all variables in
\code{xdata} as predictors is fitted. Argument \code{family} must be
provided in this case.}

\item{family}{type of regression model. Possible values include
\code{"gaussian"} (linear regression), \code{"binomial"} (logistic
regression), \code{"multinomial"} (multinomial regression), and
\code{"cox"} (survival analysis). If provided, this argument must be
consistent with input \code{stability}.}

\item{implementation}{optional function to refit the model. If
\code{implementation=NULL} and \code{stability} is the output of
\code{\link{VariableSelection}}, \code{\link[stats]{lm}} (linear
regression), \code{\link[survival]{coxph}} (Cox regression),
\code{\link[stats]{glm}} (logistic regression), or
\code{\link[nnet]{multinom}} (multinomial regression) is used.}

\item{prediction}{optional function to compute predicted values from the
model refitted with \code{implementation}.}

\item{resampling}{resampling approach to create the training set. The default
is \code{"subsampling"} for sampling without replacement of a proportion
\code{tau} of the observations. Alternatively, this argument can be a
function to use for resampling. This function must use arguments named
\code{data} and \code{tau} and return the IDs of observations to be
included in the resampled dataset.}

\item{n_predictors}{number of predictors to consider.}

\item{K}{number of training-test splits.}

\item{tau}{proportion of observations used in the training set.}

\item{seed}{value of the seed to ensure reproducibility of the results.}

\item{n_thr}{number of thresholds to use to construct the ROC curve. If
\code{n_thr=NULL}, all predicted probability values are iteratively used as
thresholds. For faster computations on large data, fewer thresholds can be
used. Only applicable to logistic regression.}

\item{ij_method}{logical indicating if the analysis should be done for only
one refitting/test split, with the variance of the concordance index
computed using the infinitesimal jackknife method as implemented in
\code{\link[survival]{concordance}}. If \code{ij_method=FALSE} (the
default), the concordance indices computed for different refitting/test
splits are reported. If \code{ij_method=TRUE}, the concordance index and
estimated confidence interval at level 0.05 are reported. Only applicable
to Cox regression.}

\item{time}{numeric indicating the time for which the survival probabilities
are computed. Only applicable to Cox regression.}

\item{verbose}{logical indicating if a loading bar and messages should be
printed.}

\item{...}{additional parameters passed to the function provided in
\code{resampling}.}
}
\value{
An object of class \code{incremental}.

  For logistic regression, a list with: \item{FPR}{A list with, for each of
  the models (sequentially added predictors), the False Positive Rates for
  different thresholds (columns) and different data splits (rows).}
  \item{TPR}{A list with, for each of the models (sequentially added
  predictors), the True Positive Rates for different thresholds (columns) and
  different data splits (rows).} \item{AUC}{A list with, for each of the
  models (sequentially added predictors), a vector of Area Under the Curve
  (AUC) values obtained with different data splits.} \item{Beta}{Estimated
  regression coefficients from visited models.} \item{names}{Names of the
  predictors by order of inclusion.}

  For Cox regression, a list with: \item{concordance}{If
  \code{ij_method=FALSE}, a list with, for each of the models (sequentially
  added predictors), a vector of concordance indices obtained with different
  data splits. If \code{ij_method=TRUE}, a vector of concordance indices for
  each of the models (sequentially added predictors).} \item{lower}{A vector
  of the lower bound of the confidence interval at level 0.05 for concordance
  indices for each of the models (sequentially added predictors). Only
  returned if \code{ij_method=TRUE}.} \item{upper}{A vector of the upper
  bound of the confidence interval at level 0.05 for concordance indices for
  each of the models (sequentially added predictors). Only returned if
  \code{ij_method=TRUE}.} \item{Beta}{Estimated regression coefficients from
  visited models.} \item{names}{Names of the predictors by order of
  inclusion.}

  For linear regression, a list with: \item{Q_squared}{A list with, for each
  of the models (sequentially added predictors), a vector of Q-squared
  obtained with different data splits.} \item{Beta}{Estimated regression
  coefficients from visited models.} \item{names}{Names of the predictors by
  order of inclusion.}
}
\description{
Computes the prediction performance of regression models where predictors are
sequentially added by order of decreasing selection proportion. This function
can be used to evaluate the marginal contribution of each of the selected
predictors over and above more stable predictors. Performances are evaluated
as in \code{\link{ExplanatoryPerformance}}.
}
\examples{
\donttest{
## Logistic regression

# Data simulation (seed fixed so that the example is reproducible)
set.seed(1)
simul <- SimulateRegression(n = 1000, pk = 50, family = "binomial")

# Balanced split: 50\% variable selection set and 50\% for evaluation of performances
# (ids_train holds the row IDs of the variable selection set; the remaining
# rows form the evaluation set)
ids_train <- Resample(
  data = simul$ydata,
  tau = 0.5, family = "binomial"
)
xtrain <- simul$xdata[ids_train, ]
ytrain <- simul$ydata[ids_train, ]
xtest <- simul$xdata[-ids_train, ]
ytest <- simul$ydata[-ids_train, ]

# Stability selection on the variable selection set
stab <- VariableSelection(xdata = xtrain, ydata = ytrain, family = "binomial")

# Evaluating marginal contribution of the predictors on the evaluation set,
# using K = 10 training-test splits
perf <- Incremental(xdata = xtest, ydata = ytest, stability = stab, K = 10)
summary(perf)

# Visualisation
PlotIncremental(perf)
plot(perf) # alternative formulation


## Partial Least Squares (single component)

# Stability selection using SparsePLS as implementation, on the xtrain and
# ytrain objects created in the logistic regression example
stab <- VariableSelection(
  xdata = xtrain, ydata = ytrain,
  implementation = SparsePLS,
  family = "binomial"
)
# Printing the output of SelectedVariables for this stability selection run
print(SelectedVariables(stab))

# Defining wrapping functions for PLS-DA
# Refits a single-component PLS-DA model via mixOmics::plsda, with the outcome
# coerced to a factor. The family argument is not used in the body; presumably
# it is kept for compatibility with how Incremental calls implementation
# functions (to be confirmed).
PLSDA <- function(xdata, ydata, family = "binomial") {
  outcome <- as.factor(ydata)
  mixOmics::plsda(X = xdata, Y = outcome, ncomp = 1)
}
# Computes predicted values from a fitted model. The columns of xdata are first
# restricted to the predictors listed in the row names of the X loadings of
# the model, then the [, 2, 1] slice of the predict array is returned
# (presumably second outcome class and first component; to be confirmed).
PredictPLSDA <- function(xdata, model) {
  kept <- rownames(model$loadings$X)
  scores <- predict(object = model, newdata = xdata[, kept, drop = FALSE])
  scores$predict[, 2, 1]
}

# Evaluation of the performances on refitted models (K=10)
# PLSDA is used to refit the models and PredictPLSDA to compute predicted values
incremental <- Incremental(
  xdata = xtest, ydata = ytest,
  stability = stab,
  implementation = PLSDA, prediction = PredictPLSDA,
  K = 10
)
PlotIncremental(incremental)


## Cox regression

# Data simulation (a binary outcome is simulated and reused as the event
# indicator, alongside uniformly drawn times)
set.seed(1)
simul <- SimulateRegression(n = 1000, pk = 50, family = "binomial")
ydata <- cbind(
  time = runif(nrow(simul$ydata), min = 100, max = 2000),
  case = simul$ydata[, 1]
) # including dummy time to event

# Balanced split: 50\% variable selection set and 50\% for evaluation of performances
# (IDs are drawn using the binary outcome in simul$ydata, then applied to the
# two-column ydata built above)
ids_train <- Resample(
  data = simul$ydata,
  tau = 0.5, family = "binomial"
)
xtrain <- simul$xdata[ids_train, ]
ytrain <- ydata[ids_train, ]
xtest <- simul$xdata[-ids_train, ]
ytest <- ydata[-ids_train, ]

# Stability selection on the variable selection set
stab <- VariableSelection(xdata = xtrain, ydata = ytrain, family = "cox")

# Marginal contribution (default ij_method = FALSE, with K = 10 training-test
# splits)
perf <- Incremental(xdata = xtest, ydata = ytest, stability = stab, K = 10)
PlotIncremental(perf)

# Faster computations on a single data split (ij_method = TRUE: the variance
# of the concordance index is obtained with the infinitesimal jackknife)
perf <- Incremental(xdata = xtest, ydata = ytest, stability = stab, ij_method = TRUE)
PlotIncremental(perf)


## Linear regression

# Data simulation (seed fixed so that the example is reproducible)
set.seed(1)
simul <- SimulateRegression(n = 1000, pk = 50, family = "gaussian")

# Balanced split: 50\% variable selection set and 50\% for evaluation of performances
# (ids_train holds the row IDs of the variable selection set; the remaining
# rows form the evaluation set)
ids_train <- Resample(
  data = simul$ydata,
  tau = 0.5, family = "gaussian"
)
xtrain <- simul$xdata[ids_train, ]
ytrain <- simul$ydata[ids_train, ]
xtest <- simul$xdata[-ids_train, ]
ytest <- simul$ydata[-ids_train, ]

# Stability selection on the variable selection set
stab <- VariableSelection(xdata = xtrain, ydata = ytrain, family = "gaussian")

# Evaluating marginal contribution of the predictors on the evaluation set,
# using K = 10 training-test splits
perf <- Incremental(xdata = xtest, ydata = ytest, stability = stab, K = 10)
PlotIncremental(perf)


## Partial Least Squares (single component)

# Stability selection using SparsePLS as implementation, on the xtrain and
# ytrain objects created in the linear regression example
stab <- VariableSelection(
  xdata = xtrain, ydata = ytrain,
  implementation = SparsePLS,
  family = "gaussian"
)
# Printing the output of SelectedVariables for this stability selection run
print(SelectedVariables(stab))

# Evaluation of the performances on refitted models (K=10)
# NOTE: PLS and PredictPLS are not defined in this example; presumably they are
# provided by the package (to be confirmed)
incremental <- Incremental(
  xdata = xtest, ydata = ytest,
  stability = stab,
  implementation = PLS, prediction = PredictPLS,
  K = 10
)
PlotIncremental(incremental)
}

}
\seealso{
\code{\link{VariableSelection}}, \code{\link{Refit}}

Other prediction performance functions: 
\code{\link{ExplanatoryPerformance}()},
\code{\link{PlotIncremental}()}
}
\concept{prediction performance functions}
