% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/f_dataSquashing.R
\name{squashData}
\alias{squashData}
\title{Squash data for hyperparameter estimation}
\usage{
squashData(data, count = 1, bin_size = 50, keep_pts = 100, min_bin = 50,
  min_pts = 500)
}
\arguments{
\item{data}{A data frame (typically from \code{\link{processRaw}} or a
previous call to \code{\link{squashData}}) containing columns named
\emph{N}, \emph{E}, and (possibly) \emph{weight}. Can contain additional
columns, which will be ignored.}

\item{count}{A non-negative scalar whole number for the count size, \emph{N},
used for binning}

\item{bin_size}{A scalar whole number (>= 2)}

\item{keep_pts}{A non-negative scalar whole number for the number of points
with the largest expected counts to leave unsquashed. Used to help prevent
\dQuote{oversquashing}.}

\item{min_bin}{A positive scalar whole number for the minimum number of bins
needed. Used to help prevent \dQuote{oversquashing}.}

\item{min_pts}{A positive scalar whole number for the minimum number of
original (unsquashed) points needed for squashing. Used to help prevent
\dQuote{oversquashing}.}
}
\value{
A data frame with column names \emph{N}, \emph{E}, and
  \emph{weight} containing the reduced data set.
}
\description{
\code{squashData} squashes data by binning expected counts, \emph{E}, for a
  given actual count, \emph{N}, using bin means as the expected counts for
  the reduced data set. The squashed points are weighted by bin size. Data
  can be squashed to reduce computational burden (see DuMouchel and Pregibon,
  2001) when estimating the hyperparameters.
}
\details{
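\code{squashData} can be used iteratively (\code{count = 1}, then
  \code{count = 2}, etc.), with the output of each call supplied as the
  \code{data} argument of the next call.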

The \emph{N} column in \code{data} will be coerced using
  \code{\link{as.integer}}, and \emph{E} will be coerced using
  \code{\link{as.numeric}}. Missing data are not allowed.

Since the distribution of expected counts, \emph{E}, tends to be
  skewed to the right, the largest \emph{E}s are not squashed by default.
  This behavior can be changed by setting the \code{keep_pts} argument to
  zero (0); however, this is not recommended. Squashing the largest \emph{E}s
  could result in a large loss of information, so it is recommended to use a
  value of one (1) or more for \code{keep_pts}.

Values for \code{keep_pts}, \code{min_bin}, and \code{min_pts}
  should typically be at least as large as the default values.
}
\examples{
# Simulate 26 points with counts N of 0, 1, 2, and 3; var1 and var2 are
# additional columns, which squashData ignores
set.seed(483726)
dat <- data.frame(
  var1 = letters[1:26],
  var2 = LETTERS[1:26],
  N = rep(c(0, 1, 2, 3), times = c(11, 10, 4, 1)),
  E = round(abs(c(rnorm(11, 0), rnorm(10, 1), rnorm(4, 2), rnorm(1, 3))), 3)
)

# Squash iteratively, one count at a time, feeding each result into the
# next call
zeroes <- squashData(dat, count = 0, bin_size = 3, keep_pts = 1, min_bin = 2,
                     min_pts = 2)
zeroes

# count is not supplied here, so its default of 1 is used
ones <- squashData(zeroes, bin_size = 2, keep_pts = 1, min_bin = 2,
                   min_pts = 2)
ones

twos <- squashData(ones, count = 2, bin_size = 2, keep_pts = 1, min_bin = 2,
                   min_pts = 2)
twos

# Vary keep_pts, the number of points with the largest expected counts that
# are left unsquashed
squashData(zeroes, bin_size = 2, keep_pts = 0, min_bin = 2, min_pts = 2)
squashData(zeroes, bin_size = 2, keep_pts = 1, min_bin = 2, min_pts = 2)
squashData(zeroes, bin_size = 2, keep_pts = 2, min_bin = 2, min_pts = 2)
squashData(zeroes, bin_size = 2, keep_pts = 3, min_bin = 2, min_pts = 2)

}
\references{
DuMouchel W, Pregibon D (2001). "Empirical Bayes Screening for
  Multi-item Associations." In \emph{Proceedings of the Seventh ACM SIGKDD
  International Conference on Knowledge Discovery and Data Mining}, KDD '01,
  pp. 67-76. ACM, New York, NY, USA. ISBN 1-58113-391-X.
}
\seealso{
\code{\link{processRaw}} for data preparation
}
\keyword{openEBGM}
