% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/fastcpd.R
\name{fastcpd}
\alias{fastcpd}
\title{Find change points efficiently}
\usage{
fastcpd(
  formula = y ~ . - 1,
  data,
  beta = NULL,
  segment_count = 10,
  trim = 0.025,
  momentum_coef = 0,
  k = function(x) 0,
  family = NULL,
  epsilon = 1e-10,
  min_prob = 10^10,
  winsorise_minval = -20,
  winsorise_maxval = 20,
  p = ncol(data) - 1,
  order = c(0, 0, 0),
  cost = NULL,
  cost_gradient = NULL,
  cost_hessian = NULL,
  cp_only = FALSE,
  vanilla_percentage = 0,
  warm_start = FALSE,
  lower = NULL,
  upper = NULL,
  line_search = c(1),
  ...
)
}
\arguments{
\item{formula}{A formula object specifying the model to be fitted. The
optional response variable should be on the left hand side of the formula
while the covariates should be on the right hand side. The intercept term
should be removed from the formula. The response variable is not
necessary if the data considered is not of regression type. For example,
a mean or variance change model does not necessarily have response
variables. By default an intercept column will be added to the data
similar to the \code{lm} function in \proglang{R}. Thus it is suggested
that users remove the intercept term from the formula by appending
\code{- 1} to the formula. The default formula is suitable for regression
data sets with one-dimensional response variable and the rest being
covariates without intercept. The naming of variables used in the formula
should be consistent with the column names in the data frame provided in
\code{data}.}

\item{data}{A data frame containing the data to be segmented where each row
denotes each data point. In one-dimensional response variable regression
settings, the first column is the response variable while the rest are
covariates. The response is not necessary in the case of mean change or
variance change, in which case the formula will need to be adjusted
accordingly.}

\item{beta}{Initial cost value specified in the algorithm in the paper.
For the proper choice of a value, please refer to the paper. If not
specified, BIC criterion is used to obtain a proper value, i.e.,
\code{beta = (p + 1) * log(nrow(data)) / 2}.}

\item{segment_count}{Number of segments for initial guess. If not specified,
the initial guess on the number of segments is 10.}

\item{trim}{Trimming for the boundary change points so that a change point
close to the boundary will not be counted as a change point. This
parameter also specifies the minimum distance between two change points.
If several change points have mutual distances smaller than
\code{trim * nrow(data)}, those change points will be merged into one
single change point. The value of this parameter should be between
0 and 1.}

\item{momentum_coef}{Momentum coefficient to be applied to each update. This
parameter is used when the loss function is badly shaped so that
maintaining a momentum from previous update is desired. Default value is
0, meaning the algorithm doesn't maintain a momentum by default.}

\item{k}{Function on number of epochs in SGD. \code{k} should be a function
taking only a parameter \code{x} meaning the current number of data
points considered since last segmentation. The return value of the
function should be an integer indicating how many epochs should be
performed apart from the default update. By default the function returns
0, meaning no multiple epochs will be used to update the parameters.
Example usage:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{  k = function(x) \{
    if (x < n / segment_count / 4 * 1) 3
    else if (x < n / segment_count / 4 * 2) 2
    else if (x < n / segment_count / 4 * 3) 1
    else 0
  \}
}\if{html}{\out{</div>}}

This function will perform 3 epochs for the first quarter of the data, 2
epochs for the second quarter of the data, 1 epoch for the third quarter
of the data and no multiple epochs for the last quarter of the data.
Experiments show that performing multiple epochs will significantly
affect the performance of the algorithm. This parameter is left for the
users to tune the performance of the algorithm if the result is not
ideal. Details are discussed in the paper.}

\item{family}{Family of the model. Can be \code{"lm"}, \code{"binomial"},
\code{"poisson"}, \code{"lasso"}, \code{"custom"}, \code{"ar"},
\code{"var"}, \code{"ma"}, \code{"arima"}, \code{"garch"} or
\code{NULL}. For simplicity, users can also omit this parameter,
indicating that they will be using their own cost functions. Omitting the
parameter is the same as specifying the parameter to be \code{"custom"}
or \code{NULL}, in which case, users must specify the cost function, with
optional gradient and corresponding Hessian matrix functions.}

\item{epsilon}{Epsilon to avoid numerical issues. Only used for the Hessian
computation in Logistic Regression and Poisson Regression.}

\item{min_prob}{Minimum probability to avoid numerical issues. Only used
for Poisson Regression.}

\item{winsorise_minval}{Minimum value for the parameter in Poisson Regression
to be winsorised.}

\item{winsorise_maxval}{Maximum value for the parameter in Poisson Regression
to be winsorised.}

\item{p}{Number of covariates in the model. If not specified, the number of
covariates will be inferred from the data, i.e.,
\code{p = ncol(data) - 1}. This parameter is superseded by \code{order} in the
case of time series models: "ar", "var", "arima".}

\item{order}{Order of the AR(p), VAR(p) or ARIMA(p, d, q) model.}

\item{cost}{Cost function to be used. This and the following two parameters
should not be specified at the same time with \code{family}. If not
specified, the default is the negative log-likelihood for the
corresponding family. Custom cost functions can be provided in the
following two formats:
\itemize{
\item \code{cost = function(data) \{...\}}
\item \code{cost = function(data, theta) \{...\}}
}

In both methods, users should implement the cost value calculation based
on the data provided, where the data parameter can be considered as a
segment of the original data frame in the form of a matrix. The first
method is used when the cost function has an explicit solution, in which
case the cost function value can be calculated directly from the data.
The second method is used when the cost function does not have an
explicit solution, in which case the cost function value can be
calculated from the data and the estimated parameters. In the case that
only one \code{data} argument is provided, \code{fastcpd} performs the
vanilla PELT algorithm since no parameter updating is performed.}

\item{cost_gradient}{Gradient function for the custom cost function.
Example usage:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{  cost_gradient = function(data, theta) \{
    ...
    return(gradient)
  \}
}\if{html}{\out{</div>}}

The gradient function should take two parameters, the first one being a
segment of the data in the format of a matrix, the second one being the
estimated parameters. The gradient function should return the gradient of
the cost function with respect to the data and parameters.}

\item{cost_hessian}{Hessian function for the custom cost function. Similar to
the gradient function, the Hessian function should take two parameters,
the first one being a segment of the data in the format of a matrix, the
second one being the estimated parameters. The Hessian function should
return the Hessian matrix of the cost function with respect to the data
and parameters.}

\item{cp_only}{If \code{TRUE}, only the change points are returned.
Otherwise, the cost function values together with the estimated
parameters for each segment are also returned. By default the value is
set to be \code{FALSE} so that \code{plot} can be used to visualize the
results for a built-in model. Leaving \code{cp_only} as \code{FALSE} has
some performance impact on the algorithm, since the cost values and
estimated parameters for each segment need to be calculated and stored.
If the users are only
interested in the change points, setting \code{cp_only} to be \code{TRUE}
will help with the computational cost.}

\item{vanilla_percentage}{Proportion of the data to be processed through
vanilla PELT. Range should be between 0 and 1. The \code{fastcpd}
algorithm is based on gradient descent and thus a starting estimate can
be crucial. At the beginning of the algorithm, vanilla PELT can be
performed to obtain a relatively accurate estimate of the parameters
despite the small amount of the data being used. If set to be 0, all data
will be processed through sequential gradient descent. If set to be 1,
all data will be processed through vanilla PELT. If the cost function
has an explicit solution, i.e. does not depend on coefficients like the
mean change case, this parameter will be set to be 1. If the value is set
to be between 0 and 1, the first \code{vanilla_percentage * nrow(data)}
data points will be processed through vanilla PELT and the rest will be
processed through sequential gradient descent.}

\item{warm_start}{If \code{TRUE}, the algorithm will use the estimated
parameters from the previous segment as the initial value for the
current segment. This parameter is only used for \code{"glm"} families.}

\item{lower}{Lower bound for the parameters. Used to specify the
domain of the parameter after each gradient descent step. If not specified,
the lower bound will be set to be \code{-Inf} for all parameters.}

\item{upper}{Upper bound for the parameters. Used to specify the
domain of the parameter after each gradient descent step. If not specified,
the upper bound will be set to be \code{Inf} for all parameters.}

\item{line_search}{If a vector of numeric values is provided, line
search will be performed to find the optimal step size for each update.}

\item{...}{Parameters specifically used for time series models. As of
the current implementation, only \code{include.mean} is recognized; it is
used in the ARIMA or GARCH model.}
}
\value{
A class \code{fastcpd} object.
}
\description{
\code{fastcpd} takes in formulas, data, families and extra
parameters and returns a \code{fastcpd} object.
}
\section{Gallery}{

\url{https://fastcpd.xingchi.li/articles/gallery.html}
}

\section{References}{

Zhang X, Dawn T (2023). ``Sequential Gradient Descent and Quasi-Newton's
Method for Change-Point Analysis.'' In Ruiz, Francisco, Dy, Jennifer,
van de Meent, Jan-Willem (eds.), \emph{Proceedings of The 26th International
Conference on Artificial Intelligence and Statistics}, volume 206 series
Proceedings of Machine Learning Research, 1129-1143.
\url{https://proceedings.mlr.press/v206/zhang23b.html}.
}

\examples{
\donttest{
# Install ggplot2 from the CRAN cloud mirror when it is not already
# available in the current library paths.
if (!requireNamespace("ggplot2", quietly = TRUE)) utils::install.packages(
  "ggplot2", repos = "https://cloud.r-project.org", quiet = TRUE
)

### linear regression with one-dimensional covariate
# Simulate 300 observations whose slope changes after observations 100
# and 200, then fit the built-in linear model.
library(fastcpd)
set.seed(1)
p <- 1
x <- mvtnorm::rmvnorm(300, rep(0, p), diag(p))
# One slope per segment: 1, -1 and 0.5.
theta_0 <- matrix(c(1, -1, 0.5))
# Three consecutive segments of 100 observations each, with standard
# normal noise added to the response.
y <- c(
  x[1:100, ] * theta_0[1, ] + rnorm(100, 0, 1),
  x[101:200, ] * theta_0[2, ] + rnorm(100, 0, 1),
  x[201:300, ] * theta_0[3, ] + rnorm(100, 0, 1)
)
result <- fastcpd(
  formula = y ~ . - 1,
  data = data.frame(y = y, x = x),
  family = "lm"
)
plot(result)
summary(result)

### custom logistic regression
# Simulate 375 binary responses with 5 covariates whose coefficients
# change after observation 200.
library(fastcpd)
set.seed(1)
p <- 5
x <- matrix(rnorm(375 * p, 0, 1), ncol = p)
# Two coefficient vectors, one per segment.
theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1))
# Rows 1 to 200 use the first coefficient vector and rows 201 to 375 use
# the second.
y <- c(
  rbinom(200, 1, 1 / (1 + exp(-x[1:200, ] \%*\% theta[1, ]))),
  rbinom(175, 1, 1 / (1 + exp(-x[201:375, ] \%*\% theta[2, ])))
)
data <- data.frame(y = y, x = x)
# Reference fit with the built-in binomial family; warnings raised during
# the fit are suppressed.
result_builtin <- suppressWarnings(fastcpd(
  formula = y ~ . - 1,
  data = data,
  family = "binomial"
))
logistic_loss <- function(data, theta) {
  # Negative log-likelihood of a logistic regression on one data segment.
  # Column 1 of data is the binary response, the other columns are the
  # covariates; theta is the coefficient vector.
  response <- data[, 1]
  covariates <- data[, -1]
  eta <- covariates \%*\% theta
  loss <- log(1 + exp(eta)) - response * eta
  # Where the linear predictor exceeds 10, log(1 + exp(eta)) is replaced
  # by eta itself (presumably for numerical stability).
  saturated <- eta > 10
  loss[saturated] <- eta[saturated] - response[saturated] * eta[saturated]
  sum(loss)
}
logistic_loss_gradient <- function(data, theta) {
  # Gradient of the logistic loss evaluated at the last row of the
  # segment only: (fitted probability - response) times the covariates.
  last <- nrow(data)
  covariates <- data[last, -1]
  response <- data[last, 1]
  fitted_prob <- 1 / (1 + exp(-covariates \%*\% theta))
  c(-(response - fitted_prob)) * covariates
}
logistic_loss_hessian <- function(data, theta) {
  # Hessian of the logistic loss evaluated at the last row of the segment
  # only: the outer product of the covariates scaled by
  # fitted probability times (1 - fitted probability).
  covariates <- data[nrow(data), -1]
  fitted_prob <- 1 / (1 + exp(-covariates \%*\% theta))
  weight <- c((1 - fitted_prob) * fitted_prob)
  outer(covariates, covariates) * weight
}
# Fit the same data with the user-supplied cost, gradient and Hessian
# functions; no family is given in this call.
result_custom <- fastcpd(
  formula = y ~ . - 1,
  data = data,
  epsilon = 1e-5,
  cost = logistic_loss,
  cost_gradient = logistic_loss_gradient,
  cost_hessian = logistic_loss_hessian
)
# Print the change points found by the built-in and the custom fit. Each
# set is collapsed with toString first, so that several change points are
# shown comma-separated instead of being run together by the empty
# separator passed to cat.
cat(
  "Change points detected by built-in logistic regression model: ",
  toString(result_builtin@cp_set), "\n",
  "Change points detected by custom logistic regression model: ",
  toString(result_custom@cp_set), "\n",
  sep = ""
)
# Same custom fit, but k always returns 1, i.e. one epoch is requested in
# addition to the default update.
result_custom_two_epochs <- fastcpd(
  formula = y ~ . - 1,
  data = data,
  k = function(x) 1,
  epsilon = 1e-5,
  cost = logistic_loss,
  cost_gradient = logistic_loss_gradient,
  cost_hessian = logistic_loss_hessian
)
summary(result_custom_two_epochs)

### custom cost function huber regression
library(fastcpd)
set.seed(1)
# Three segments of 400, 300 and 500 observations.
n <- 400 + 300 + 500
p <- 5
x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(p))
# One row of coefficients per segment: the first p - 3 entries are drawn
# around 0, 5 and 9 respectively.
theta <- rbind(
  mvtnorm::rmvnorm(1, mean = rep(0, p - 3), sigma = diag(p - 3)),
  mvtnorm::rmvnorm(1, mean = rep(5, p - 3), sigma = diag(p - 3)),
  mvtnorm::rmvnorm(1, mean = rep(9, p - 3), sigma = diag(p - 3))
)
# Pad every coefficient row with three zeros.
theta <- cbind(theta, matrix(0, 3, 3))
# Expand to one coefficient row per observation.
theta <- theta[rep(seq_len(3), c(400, 300, 500)), ]
y_true <- rowSums(x * theta)
# Sign multipliers: 1 with probability 0.95 and -1 otherwise, so a small
# share of the responses has its sign flipped.
factor <- c(
  2 * stats::rbinom(400, size = 1, prob = 0.95) - 1,
  2 * stats::rbinom(300, size = 1, prob = 0.95) - 1,
  2 * stats::rbinom(500, size = 1, prob = 0.95) - 1
)
y <- factor * y_true + stats::rnorm(n)
data <- cbind.data.frame(y, x)
# Threshold read by the Huber cost, gradient and Hessian functions below.
huber_threshold <- 1
huber_loss <- function(data, theta) {
  # Huber loss summed over one data segment. Column 1 of data is the
  # response and the remaining columns are the covariates. Residuals
  # within huber_threshold contribute quadratically, the others linearly.
  residual <- data[, 1] - data[, -1, drop = FALSE] \%*\% theta
  abs_residual <- abs(residual)
  is_small <- abs_residual <= huber_threshold
  quadratic_part <- residual^2 / 2 * is_small
  linear_part <-
    huber_threshold * (abs_residual - huber_threshold / 2) * (1 - is_small)
  sum(quadratic_part + linear_part)
}
huber_loss_gradient <- function(data, theta) {
  # Gradient of the Huber loss evaluated at the last row of the segment
  # only. The covariates are scaled by the negated residual, which is
  # replaced by its sign times huber_threshold once it exceeds the
  # threshold in absolute value.
  last <- nrow(data)
  covariates <- data[last, -1]
  residual <- c(data[last, 1] - covariates \%*\% theta)
  scale <- if (abs(residual) <= huber_threshold) {
    -residual
  } else {
    -huber_threshold * sign(residual)
  }
  scale * covariates
}
huber_loss_hessian <- function(data, theta) {
  # Hessian of the Huber loss evaluated at the last row of the segment
  # only.
  covariates <- data[nrow(data), -1]
  residual <- c(data[nrow(data), 1] - covariates \%*\% theta)
  # Beyond the threshold a small multiple of the identity is returned
  # (presumably to keep the matrix invertible where the loss is linear).
  if (abs(residual) > huber_threshold) {
    return(0.01 * diag(length(theta)))
  }
  outer(covariates, covariates)
}
# Fit with the custom Huber cost, gradient and Hessian; beta is passed
# explicitly as (p + 1) * log(n) / 2.
huber_regression_result <- fastcpd(
  formula = y ~ . - 1,
  data = data,
  beta = (p + 1) * log(n) / 2,
  cost = huber_loss,
  cost_gradient = huber_loss_gradient,
  cost_hessian = huber_loss_hessian
)
summary(huber_regression_result)
}
}
