% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/civis_ml_workflows.R
\name{civis_ml_gradient_boosting_regressor}
\alias{civis_ml_gradient_boosting_regressor}
\title{CivisML Gradient Boosting Regressor}
\usage{
civis_ml_gradient_boosting_regressor(x, dependent_variable,
  primary_key = NULL, excluded_columns = NULL, loss = c("ls", "lad",
  "huber", "quantile"), learning_rate = 0.1, n_estimators = 500,
  subsample = 1, criterion = c("friedman_mse", "mse", "mae"),
  min_samples_split = 2, min_samples_leaf = 1,
  min_weight_fraction_leaf = 0, max_depth = 2, min_impurity_split = 1e-07,
  random_state = 42, max_features = "sqrt", alpha = 0.9,
  max_leaf_nodes = NULL, presort = c("auto", TRUE, FALSE),
  fit_params = NULL, cross_validation_parameters = NULL,
  oos_scores_table = NULL, oos_scores_db = NULL,
  oos_scores_if_exists = c("fail", "append", "drop", "truncate"),
  model_name = NULL, cpu_requested = NULL, memory_requested = NULL,
  disk_requested = NULL, notifications = NULL, polling_interval = NULL,
  verbose = FALSE)
}
\arguments{
\item{x}{See the Data Sources section below.}

\item{dependent_variable}{The dependent variable of the training dataset.
For a multi-target problem, this should be a vector of column names of
dependent variables. Nulls in a single dependent variable will
automatically be dropped.}

\item{primary_key}{Optional, the unique ID (primary key) of the training
dataset. This will be used to index the out-of-sample scores. In
\code{predict.civis_ml}, the primary_key of the training task is used by
default (\code{primary_key = NA}). Use \code{primary_key = NULL} to
explicitly indicate the data have no primary_key.}

\item{excluded_columns}{Optional, a vector of columns which will be
considered ineligible to be independent variables.}

\item{loss}{The loss function to be optimized. \code{ls} refers to least
squares regression. \code{lad} (least absolute deviation) is a highly
robust loss function solely based on order information of the input
variables. \code{huber} is a combination of the two. \code{quantile}
allows quantile regression (use \code{alpha} to specify the quantile).}

\item{learning_rate}{The learning rate shrinks the contribution of each tree
by \code{learning_rate}. There is a trade-off between \code{learning_rate}
and \code{n_estimators}.}

\item{n_estimators}{The number of boosting stages to perform. Gradient
boosting is fairly robust to over-fitting, so a large number usually
results in better predictive performance.}

\item{subsample}{The fraction of samples to be used for fitting individual
base learners. If smaller than 1.0, this results in Stochastic Gradient
Boosting. \code{subsample} interacts with the parameter \code{n_estimators}.
Choosing \code{subsample < 1.0} leads to a reduction of variance and an
increase in bias.}

\item{criterion}{The function to measure the quality of a split. The default
value \code{criterion = "friedman_mse"} is generally the best as it can
provide a better approximation in some cases.}

\item{min_samples_split}{The minimum number of samples required to split
an internal node. If an integer, then consider \code{min_samples_split}
as the minimum number. If a float, then \code{min_samples_split} is a
fraction and \code{ceiling(min_samples_split * n_samples)} is the
minimum number of samples for each split.}

\item{min_samples_leaf}{The minimum number of samples required to be in
a leaf node. If an integer, then consider \code{min_samples_leaf} as the
minimum number. If a float, then \code{min_samples_leaf} is a fraction
and \code{ceiling(min_samples_leaf * n_samples)} are the minimum number
of samples for each leaf node.}

\item{min_weight_fraction_leaf}{The minimum weighted fraction of the sum
total of weights required to be at a leaf node.}

\item{max_depth}{Maximum depth of the individual regression estimators. The
maximum depth limits the number of nodes in the tree. Tune this parameter
for best performance. The best value depends on the interaction of the
input variables.}

\item{min_impurity_split}{Threshold for early stopping in tree growth. A node
will split if its impurity is above the threshold, otherwise it is a leaf.}

\item{random_state}{The seed of the random number generator.}

\item{max_features}{The number of features to consider when looking for the
best split.
\describe{
  \item{integer}{consider \code{max_features} features at each split.}
  \item{float}{then \code{max_features} is a fraction and
    \code{max_features * n_features} features are considered at each split.}
  \item{auto}{then \code{max_features = sqrt(n_features)}}
  \item{sqrt}{then \code{max_features = sqrt(n_features)}}
  \item{log2}{then \code{max_features = log2(n_features)}}
  \item{NULL}{then \code{max_features = n_features}}
}}

\item{alpha}{The alpha-quantile of the \code{huber} loss function and the
\code{quantile} loss function. Ignored unless \code{loss = "huber"} or
\code{loss = "quantile"}.}

\item{max_leaf_nodes}{Grow trees with \code{max_leaf_nodes} in best-first
fashion. Best nodes are defined by their relative reduction in impurity. If
\code{max_leaf_nodes = NULL}, the number of leaf nodes is unlimited.}

\item{presort}{Whether to presort the data to speed up the finding of best
splits in fitting.}

\item{fit_params}{Optional, a mapping from parameter names in the model's
\code{fit} method to the column names which hold the data, e.g.
\code{list(sample_weight = 'survey_weight_column')}.}

\item{cross_validation_parameters}{Optional, parameter grid for learner
parameters, e.g. \code{list(n_estimators = c(100, 200, 500),
learning_rate = c(0.01, 0.1), max_depth = c(2, 3))}
or \code{"hyperband"} for supported models.}

\item{oos_scores_table}{Optional, if provided, store out-of-sample
predictions on training set data to this Redshift "schema.tablename".}

\item{oos_scores_db}{Optional, the name of the database where the
\code{oos_scores_table} will be created. If not provided, this will default
to \code{database_name}.}

\item{oos_scores_if_exists}{Optional, action to take if
\code{oos_scores_table} already exists. One of \code{"fail"}, \code{"append"}, \code{"drop"}, or \code{"truncate"}.
The default is \code{"fail"}.}

\item{model_name}{Optional, the prefix of the Platform modeling jobs.
It will have \code{" Train"} or \code{" Predict"} added to become the Script title.}

\item{cpu_requested}{Optional, the number of CPU shares requested in the
Civis Platform for training jobs or prediction child jobs.
1024 shares = 1 CPU.}

\item{memory_requested}{Optional, the memory requested from Civis Platform
for training jobs or prediction child jobs, in MiB.}

\item{disk_requested}{Optional, the disk space requested on Civis Platform
for training jobs or prediction child jobs, in GB.}

\item{notifications}{Optional, model status notifications. See
\code{\link{scripts_post_custom}} for further documentation about email
and URL notification.}

\item{polling_interval}{The number of seconds to wait between checks for job completion.}

\item{verbose}{Optional, if \code{TRUE}, supply debug outputs in Platform
logs and make prediction child jobs visible.}
}
\value{
A \code{civis_ml} object, a list containing the following elements:
\item{job}{job metadata from \code{\link{scripts_get_custom}}.}
\item{run}{run metadata from \code{\link{scripts_get_custom_runs}}.}
\item{outputs}{CivisML metadata from \code{\link{scripts_list_custom_runs_outputs}} containing the locations of
 files produced by CivisML e.g. files, projects, metrics, model_info, logs, predictions, and estimators.}
\item{metrics}{Parsed CivisML output from \code{metrics.json} containing metadata from validation.
 A list containing the following elements:
  \itemize{
  \item run list, metadata about the run.
  \item data list, metadata about the training data.
  \item model list, the fitted scikit-learn model with CV results.
  \item metrics list, validation metrics (accuracy, confusion, ROC, AUC, etc).
  \item warnings list.
  \item data_platform list, training data location.
}}
\item{model_info}{Parsed CivisML output from \code{model_info.json} containing metadata from training.
 A list containing the following elements:
  \itemize{
  \item run list, metadata about the run.
  \item data list, metadata about the training data.
  \item model list, the fitted scikit-learn model.
  \item metrics empty list.
  \item warnings list.
  \item data_platform list, training data location.
  }}
}
\description{
CivisML Gradient Boosting Regressor
}
\section{Data Sources}{


For building models with \code{civis_ml}, the training data can reside in
four different places: a file in the Civis Platform, a CSV or feather-format file
on the local disk, a \code{data.frame} resident in the local R environment, or
a table in the Civis Platform. Use the following helpers to specify the
data source when calling \code{civis_ml}:

\describe{
  \item{\code{data.frame}}{\code{civis_ml(x = df, ...)}}
  \item{local CSV file}{\code{civis_ml(x = "path/to/data.csv", ...)}}
  \item{file in Civis Platform}{\code{civis_ml(x = civis_file(1234))}}
  \item{table in Civis Platform}{\code{civis_ml(x = civis_table(table_name = "schema.table", database_name = "database"))}}
}
}

\examples{
\dontrun{
data(ChickWeight)

m <- civis_ml_gradient_boosting_regressor(ChickWeight,
  dependent_variable = "weight",
  learning_rate = .01,
  n_estimators = 100,
  subsample = .5,
  max_depth = 5,
  max_features = NULL)
yhat <- fetch_oos_scores(m)

# Grid Search
cv_params <- list(
  n_estimators = c(100, 200, 500),
  learning_rate = c(.01, .1),
  max_depth = c(2, 3))

m <- civis_ml_gradient_boosting_regressor(ChickWeight,
  dependent_variable = "weight",
  subsample = .5,
  max_features = NULL,
  cross_validation_parameters = cv_params)

pred_info <- predict(m,  civis_table("schema.table", "my_database"),
   output_table = "schema.scores_table")
}
}
