% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_cut.R
\name{data_cut}
\alias{data_cut}
\alias{data_cut.numeric}
\alias{data_cut.data.frame}
\title{Recode (or "cut") data into groups of values.}
\usage{
data_cut(x, ...)

\method{data_cut}{numeric}(
  x,
  split = "median",
  n_groups = NULL,
  range = NULL,
  lowest = 1,
  labels = NULL,
  verbose = TRUE,
  ...
)

\method{data_cut}{data.frame}(
  x,
  split = "median",
  n_groups = NULL,
  range = NULL,
  lowest = 1,
  labels = NULL,
  select = NULL,
  exclude = NULL,
  force = FALSE,
  append = FALSE,
  verbose = TRUE,
  ...
)
}
\arguments{
\item{x}{A data frame, numeric vector or factor.}

\item{...}{not used.}

\item{split}{Character vector, indicating at which breaks to split variables,
or a numeric vector with values indicating the breaks. If character, may be one
of \code{"median"}, \code{"mean"}, \code{"quantile"}, \code{"equal_length"}, or \code{"equal_range"}.
\code{"median"} or \code{"mean"} will return dichotomous variables, split at their
median or mean, respectively. \code{"quantile"} and \code{"equal_length"} will split
the variable into \code{n_groups} groups, where each group refers to an interval
of a specific range of values. Thus, the length of each interval will be
based on the number of groups. \code{"equal_range"} also splits the variable
into multiple groups, however, the length of the interval is given, and
the number of resulting groups (and hence, the number of breaks) will be
determined by how many intervals can be generated, based on the full range
of the variable.}

\item{n_groups}{If \code{split} is \code{"quantile"} or \code{"equal_length"}, this defines
the number of requested groups (i.e. resulting number of levels or values)
for the recoded variable(s). \code{"quantile"} will define intervals based
on the distribution of the variable, while \code{"equal_length"} tries to
divide the range of the variable into pieces of equal length.}

\item{range}{If \code{split = "equal_range"}, this defines the range of values
that are recoded into a new value.}

\item{lowest}{Minimum value of the recoded variable(s). Defaults to \code{1}. If
\code{NULL}, for numeric variables, the minimum of the original input is preserved. For
factors, the default minimum is \code{1}. For \code{split = "equal_range"}, the
default minimum is always \code{1}, unless specified otherwise in \code{lowest}.}

\item{labels}{Character vector of value labels. If not \code{NULL}, \code{data_cut()}
will return factors instead of numeric variables, with \code{labels} used
for labelling the factor levels.}

\item{verbose}{Toggle warnings and messages on or off.}

\item{select}{Character vector of column names. If \code{NULL} (the default), all
variables will be selected.}

\item{exclude}{Character vector of column names to be excluded from selection.}

\item{force}{Logical, if \code{TRUE}, forces recoding of factors as well.}

\item{append}{Logical or string. If \code{TRUE}, recoded variables get new
column names (with the suffix \code{"_r"}) and are appended (column bind) to \code{x},
thus returning both the original and the recoded variables. If \code{FALSE},
original variables in \code{x} will be overwritten by their recoded versions.
If a character value, recoded variables are appended with new column
names (using the defined suffix) to the original data frame.}
}
\value{
\code{x}, recoded into groups. By default, the returned value is numeric, unless
\code{labels} is specified. In this case, a factor is returned, where the factor
levels (i.e. recoded groups) are labelled accordingly.
}
\description{
This function divides the range of variables into intervals and recodes
the values inside these intervals according to their related interval.
It is basically a wrapper around base R's \code{cut()}, providing a simplified
and more accessible way to define the interval breaks (cut-off values).
}
\details{
\subsection{Splits and breaks (cut-off values)}{
Breaks are in general \emph{exclusive}; this means that these values indicate
the lower bound of the next group or interval to begin. Take a simple
example, a numeric variable with values from 1 to 9. The median would be 5,
thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
would turn into 2 (compare \code{cbind(1:9, data_cut(1:9))}). The same variable,
using \code{split = "quantile"} and \code{n_groups = 3} would define breaks at 3.67
and 6.33 (see \verb{quantile(1:9, probs = c(1/3, 2/3))}), which means that values
from 1 to 3 belong to the first interval and are recoded into 1 (because
the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
}

\subsection{Recoding into groups with equal size or range}{
\code{split = "equal_length"} and \code{split = "equal_range"} try to divide the
range of \code{x} into intervals of similar (or same) length. The difference is
that \code{split = "equal_length"} will divide the range of \code{x} into \code{n_groups}
pieces and thereby defining the intervals used as breaks (hence, it is
equivalent to \code{cut(x, breaks = n_groups)}), while \code{split = "equal_range"}
will cut \code{x} into intervals that all have the length of \code{range}, where the
first interval by default starts at \code{1}. The lowest (or starting) value
of that interval can be defined using the \code{lowest} argument.
}
}
\examples{
set.seed(123)
x <- sample(1:10, size = 50, replace = TRUE)

table(x)

# by default, at median
table(data_cut(x))

# into 3 groups, based on distribution (quantiles)
table(data_cut(x, split = "quantile", n_groups = 3))

# into 3 groups, user-defined breaks
table(data_cut(x, split = c(3, 5)))

set.seed(123)
x <- sample(1:100, size = 500, replace = TRUE)

# into 5 groups, try to recode into intervals of similar length,
# i.e. the range within groups is the same for all groups
table(data_cut(x, split = "equal_length", n_groups = 5))

# into 5 groups, try to return same range within groups
# i.e. 1-20, 21-40, 41-60, etc. Since the range of "x" is
# 1-100, and we have a range of 20, this results in 5
# groups, and thus is for this particular case identical
# to the previous result.
table(data_cut(x, split = "equal_range", range = 20))

# return factor with value labels instead of numeric value
set.seed(123)
x <- sample(1:10, size = 30, replace = TRUE)
data_cut(x, "equal_length", n_groups = 3)
data_cut(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high"))
}
