% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ffs.R
\name{ffs}
\alias{ffs}
\title{Forward feature selection}
\usage{
ffs(predictors, response, method = "rf",
  metric = ifelse(is.factor(response), "Accuracy", "RMSE"),
  maximize = ifelse(metric == "RMSE", FALSE, TRUE), withinSE = FALSE,
  trControl = caret::trainControl(), tuneLength = 3, tuneGrid = NULL,
  seed = sample(1:1000, 1), verbose = TRUE, ...)
}
\arguments{
\item{predictors}{see \code{\link[caret]{train}}}

\item{response}{see \code{\link[caret]{train}}}

\item{method}{see \code{\link[caret]{train}}}

\item{metric}{see \code{\link[caret]{train}}}

\item{maximize}{see \code{\link[caret]{train}}}

\item{withinSE}{Logical. If TRUE, models are only selected if they are better
than the currently best model by more than its standard error.}

\item{trControl}{see \code{\link[caret]{train}}}

\item{tuneLength}{see \code{\link[caret]{train}}}

\item{tuneGrid}{see \code{\link[caret]{train}}}

\item{seed}{A random number used for model training}

\item{verbose}{Logical. Should information about the progress be printed?}

\item{...}{arguments passed to the classification or regression routine
(such as randomForest). Errors will occur if values for tuning parameters are
passed here.}
}
\value{
A list of class train. Besides the usual train content,
the object contains the vectors "selectedvars" and "selectedvars_perf"
that give the order of the best variables selected as well as their corresponding
performance (starting from the first two variables). It also contains "perf_all"
that gives the performance of all model runs.
}
\description{
A simple forward feature selection algorithm
}
\details{
Models with two predictors are first trained using all possible
pairs of predictor variables. The best model of these initial models is kept.
On the basis of this best model the number of predictor variables is iteratively
increased and each of the remaining variables is tested for its improvement
of the currently best model. The process stops if none of the remaining
variables increases the model performance when added to the current best model.

The internal cross validation can be run in parallel. See information
on parallel processing of caret's train function for details.

Using withinSE will favour models with fewer variables and
probably shorten the calculation time.
}
\note{
This validation is particularly suitable for
leave-location-out cross validations where variable selection
MUST be based on the performance of the model on the hold out station.
See \href{https://doi.org/10.1016/j.envsoft.2017.12.001}{Meyer et al. (2018)}
for further details.
}
\examples{
\dontrun{
# basic usage: forward feature selection on the iris data,
# using the four measurements to classify the species
data(iris)
ffsmodel <- ffs(iris[,1:4],iris$Species)
# variables in the order in which they were selected, and their performance
ffsmodel$selectedvars
ffsmodel$selectedvars_perf
}

# or perform model with target-oriented validation (LLO CV)
# The example is taken from the GSIF package and is described
# in Gasch et al. (2015). The ffs approach for this dataset is described in
# Meyer et al. (2018). Due to the high computation time needed, only a small
# and thus not robust example is shown here.

\dontrun{
# packages used by this example in addition to CAST:
library(caret)      # train(), trainControl()
library(lubridate)  # year(), week()
library(ggplot2)    # ggplot(), aes(), geom_line()

#run the model on three cores:
library(doParallel)
cl <- makeCluster(3)
registerDoParallel(cl)

#load and prepare dataset:
dat <- get(load(system.file("extdata","Cookfarm.RData",package="CAST")))
trainDat <- dat[dat$altitude==-0.3&year(dat$Date)==2012&week(dat$Date)\%in\%c(13:14),]

#visualize dataset:
ggplot(data = trainDat, aes(x=Date, y=VW)) + geom_line(aes(colour=SOURCEID))

#create folds for Leave Location Out Cross Validation:
set.seed(10)
indices <- CreateSpacetimeFolds(trainDat,spacevar = "SOURCEID",k=3)
ctrl <- trainControl(method="cv",index = indices$index)

#define potential predictors:
predictors <- c("DEM","TWI","BLD","Precip_cum","cday","MaxT_wrcc",
"Precip_wrcc","NDRE.M","Bt","MinT_wrcc","Northing","Easting")

#run ffs model with Leave Location out CV
set.seed(10)
ffsmodel <- ffs(trainDat[,predictors],trainDat$VW,method="rf",
tuneLength=1,trControl=ctrl)
ffsmodel

#compare to model without ffs:
# a plain caret model on all predictors, with the same folds and settings
model <- train(trainDat[,predictors],trainDat$VW,method="rf",
tuneLength=1, trControl=ctrl)
model
stopCluster(cl)
}
}
\references{
\itemize{
\item Gasch, C.K., Hengl, T., Gräler, B., Meyer, H., Magney, T., Brown, D.J. (2015): Spatio-temporal interpolation of soil water, temperature, and electrical conductivity in 3D+T: the Cook Agronomy Farm data set. Spatial Statistics 14: 70-90.
\item Meyer, H., Reudenbach, C., Hengl, T., Katurji, M., Nauß, T. (2018): Improving performance of spatio-temporal machine learning models using forward feature selection and target-oriented validation. Environmental Modelling & Software 101: 1-9.
}
}
\seealso{
\code{\link[caret]{train}},
\code{\link[caret]{trainControl}},\code{\link{CreateSpacetimeFolds}}
}
\author{
Hanna Meyer
}
