% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ols_regression.R
\name{ols_regression}
\alias{ols_regression}
\title{OLS regression}
\usage{
ols_regression(
  ts_rdd,
  formula,
  weight = NULL,
  has_intercept = TRUE,
  ignore_const_vars = FALSE,
  const_var_threshold = 1e-12
)
}
\arguments{
\item{ts_rdd}{Timeseries RDD containing dependent and independent variables}

\item{formula}{An object of class "formula" (or one that can be coerced
to that class) which symbolically describes the model to be fitted, with
the left-hand-side being the column name of the dependent variable, and
the right-hand-side being column name(s) of independent variable(s)
delimited by `+`, e.g., `mpg ~ hp + weight + am` for predicting `mpg` based
on `hp`, `weight` and `am`}

\item{weight}{Name of the weight column if performing a weighted OLS
regression, or NULL if otherwise. Default: NULL.}

\item{has_intercept}{Whether to include an intercept term (default: TRUE).
If FALSE, then the resulting regression plane will always pass through the
origin.}

\item{ignore_const_vars}{Whether to ignore independent variables that are
constant or nearly constant based on const_var_threshold (default: FALSE).
If TRUE, the scalar fields of regression result are the same as if the
constant variables are not included as independent variables. The output
beta, tStat, and stdErr columns will still have the same number of elements
as the number of independent variables. However, entries
corresponding to independent variables that are considered constant will
have 0.0 for beta and stdErr, and Double.NaN for tStat.
If FALSE and at least one independent variable is considered constant, the
regression will output Double.NaN for all values. Note that if there are
multiple independent variables that can be considered constant and if the
resulting model should have an intercept term, then it is recommended to
set both ignore_const_vars and has_intercept to TRUE.}

\item{const_var_threshold}{Consider an independent variable `x` as constant
if ((number of observations) * variance(x)) is less than this value.
Default: 1e-12.}
}
\value{
A TimeSeries RDD with the following schema:
\itemize{
  \item \code{samples}: \code{LongType}, the number of samples
  \item \code{beta}: \code{ArrayType} of \code{DoubleType}, beta without the
      intercept component
  \item \code{intercept}: \code{DoubleType}, the intercept
  \item \code{hasIntercept}: \code{BooleanType}, whether the model has an
      intercept term
  \item \code{stdErr_intercept}: \code{DoubleType}, the standard error of the
      intercept
  \item \code{stdErr_beta}: \code{ArrayType} of \code{DoubleType}, the
      standard error of beta
  \item \code{rSquared}: \code{DoubleType}, the r-squared statistic
  \item \code{r}: \code{DoubleType}, the square root of the r-squared
      statistic
  \item \code{tStat_intercept}: \code{DoubleType}, the t-stats of the
      intercept
  \item \code{tStat_beta}: \code{ArrayType} of \code{DoubleType}, the t-stats
      of beta
  \item \code{logLikelihood}: \code{DoubleType}, the log-likelihood of the
      data given the fitted betas
  \item \code{akaikeIC}: \code{DoubleType}, the Akaike information criterion
  \item \code{bayesIC}: \code{DoubleType}, the Bayesian information criterion
  \item \code{cond}: \code{DoubleType}, the condition number of the Gram
      matrix X^TX where X is the matrix formed by row vectors of independent
      variables (including a constant entry corresponding to the intercept
      if \code{has_intercept} is TRUE)
  \item \code{const_columns}: \code{ArrayType} of \code{StringType}, the list
      of independent variables that are considered constants
}
}
\description{
Ordinary least squares regression
}
\examples{

library(sparklyr)
library(sparklyr.flint)

sc <- try_spark_connect(master = "local")

if (!is.null(sc)) {
  mtcars_sdf <- copy_to(sc, mtcars, overwrite = TRUE) \%>\%
    dplyr::mutate(time = 0L)
  mtcars_ts <- from_sdf(mtcars_sdf, is_sorted = TRUE, time_unit = "SECONDS")
  model <- ols_regression(
    mtcars_ts, mpg ~ cyl + disp + hp + drat + wt + vs + am + gear + carb
  ) \%>\%
      collect()
} else {
  message("Unable to establish a Spark connection!")
}

}
\seealso{
Other summarizers: 
\code{\link{summarize_avg}()},
\code{\link{summarize_corr2}()},
\code{\link{summarize_corr}()},
\code{\link{summarize_count}()},
\code{\link{summarize_covar}()},
\code{\link{summarize_dot_product}()},
\code{\link{summarize_ema_half_life}()},
\code{\link{summarize_ewma}()},
\code{\link{summarize_geometric_mean}()},
\code{\link{summarize_kurtosis}()},
\code{\link{summarize_max}()},
\code{\link{summarize_min}()},
\code{\link{summarize_nth_central_moment}()},
\code{\link{summarize_nth_moment}()},
\code{\link{summarize_product}()},
\code{\link{summarize_quantile}()},
\code{\link{summarize_skewness}()},
\code{\link{summarize_stddev}()},
\code{\link{summarize_sum}()},
\code{\link{summarize_var}()},
\code{\link{summarize_weighted_avg}()},
\code{\link{summarize_weighted_corr}()},
\code{\link{summarize_weighted_covar}()},
\code{\link{summarize_z_score}()}
}
\concept{summarizers}
