% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sample_int_R.R, R/sample_int_ccrank.R, R/sample_int_crank.R, R/sample_int_expj.R, R/sample_int_expjs.R, R/sample_int_rank.R, R/sample_int_rej.R
\name{sample_int_R}
\alias{sample_int_R}
\alias{sample_int_ccrank}
\alias{sample_int_crank}
\alias{sample_int_expj}
\alias{sample_int_expjs}
\alias{sample_int_rank}
\alias{sample_int_rej}
\title{Weighted sampling without replacement}
\usage{
sample_int_R(n, size, prob)

sample_int_ccrank(n, size, prob)

sample_int_crank(n, size, prob)

sample_int_expj(n, size, prob)

sample_int_expjs(n, size, prob)

sample_int_rank(n, size, prob)

sample_int_rej(n, size, prob)
}
\arguments{
\item{n}{a positive number, the number of items to choose from.  See
    \sQuote{Details.}}

\item{size}{a non-negative integer giving the number of items to choose.}

\item{prob}{A vector of probability weights for obtaining the elements
    of the vector being sampled.}
}
\value{
An integer vector of length \code{size} with elements from
  \code{1:n}.
}
\description{
These functions implement weighted sampling without replacement using various
algorithms, i.e., they take a sample of the specified
\code{size} from the elements of \code{1:n} without replacement, using the
weights defined by \code{prob}.  The call
\code{sample_int_*(n, size, prob)} is equivalent
to \code{sample.int(n, size, replace=FALSE, prob)}.  (The results will
most probably be different for the same random seed, but the
returned samples are distributed identically for both calls.)
Except for \code{sample_int_R} (which
has quadratic complexity as of this writing), all functions have complexity
\eqn{O(n \log n)}{O(n log n)} or better and
often run faster than R's implementation, especially when \code{n} and
\code{size} are large.
}
\details{
\code{sample_int_R} is a simple wrapper for \code{\link[base]{sample.int}}.

\code{sample_int_expj} and \code{sample_int_expjs}
  implement one-pass random sampling with a reservoir with exponential jumps
  (Efraimidis and Spirakis, 2006, Algorithm A-ExpJ).  Both functions are
  implemented in \code{Rcpp}; \code{*_expj} uses log-transformed keys,
  \code{*_expjs} implements the algorithm in the paper verbatim
  (at the cost of numerical stability).

\code{sample_int_rank}, \code{sample_int_crank} and
  \code{sample_int_ccrank} implement one-pass random sampling
  (Efraimidis and Spirakis, 2006, Algorithm A).  The first function is
  implemented purely in R, the other two are optimized \code{Rcpp}
  implementations (\code{*_crank} uses R vectors internally, while
  \code{*_ccrank} uses \code{std::vector}; surprisingly, \code{*_crank} seems
  to be faster on most inputs). It can be
  shown that the order statistic of \eqn{U^{1/w_i}}{U^(1/w_i)} has the same
  distribution as random sampling without replacement (U=uniform(0,1)
  distribution). To increase numerical stability, \eqn{\log(U) /
  w_i}{log(U) / w_i} is computed instead; the log transform does not
  change the order statistic.

\code{sample_int_rej} uses repeated weighted sampling with
  replacement and a variant of rejection sampling. It is implemented purely
  in R.
  This function simulates weighted sampling without replacement using
  somewhat more draws \emph{with} replacement, and then discarding
  duplicate values (rejection sampling).  If too few items are
  sampled, the routine calls itself recursively on a (hopefully) much
  smaller problem.  See also
  \url{http://stats.stackexchange.com/q/20590/6432}.
}
\examples{
## Wrapper around sample.int from base R
s <- sample_int_R(2000, 1000, runif(2000))
# A sample drawn without replacement must not contain duplicates
stopifnot(anyDuplicated(s) == 0)
# Item 1 carries almost all of the weight and is nearly always drawn;
# the two remaining slots are shared evenly by the other five items,
# so each of those is expected in about 2/5 of all samples
p <- c(995, rep(1, 5))
n <- 1000
set.seed(42)
stopifnot(abs(table(replicate(sample_int_R(6, 3, p), n=n)) / n -
  c(1, rep(0.4, 5))) < 0.04)
## Algorithm A, Rcpp version using std::vector
s <- sample_int_ccrank(200000, 100000, runif(200000))
stopifnot(anyDuplicated(s) == 0)
p <- c(995, rep(1, 5))
n <- 1000
set.seed(42)
stopifnot(abs(table(replicate(sample_int_ccrank(6, 3, p), n=n)) / n -
  c(1, rep(0.4, 5))) < 0.04)
## Algorithm A, Rcpp version using R vectors
s <- sample_int_crank(200000, 100000, runif(200000))
stopifnot(anyDuplicated(s) == 0)
p <- c(995, rep(1, 5))
n <- 1000
set.seed(42)
stopifnot(abs(table(replicate(sample_int_crank(6, 3, p), n=n)) / n -
  c(1, rep(0.4, 5))) < 0.04)
## Algorithm A-ExpJ (with log-transformed keys)
\dontrun{
s <- sample_int_expj(200000, 100000, runif(200000))
# Checked inside the block so that the sample drawn here is the one tested
stopifnot(anyDuplicated(s) == 0)
}
p <- c(995, rep(1, 5))
n <- 1000
set.seed(42)
stopifnot(abs(table(replicate(sample_int_expj(6, 3, p), n=n)) / n -
  c(1, rep(0.4, 5))) < 0.04)
## Algorithm A-ExpJ (paper version)
\dontrun{
s <- sample_int_expjs(200000, 100000, runif(200000))
# Checked inside the block so that the sample drawn here is the one tested
stopifnot(anyDuplicated(s) == 0)
}
p <- c(995, rep(1, 5))
n <- 1000
set.seed(42)
stopifnot(abs(table(replicate(sample_int_expjs(6, 3, p), n=n)) / n -
  c(1, rep(0.4, 5))) < 0.04)
## Algorithm A
s <- sample_int_rank(200000, 100000, runif(200000))
stopifnot(anyDuplicated(s) == 0)
p <- c(995, rep(1, 5))
n <- 1000
set.seed(42)
stopifnot(abs(table(replicate(sample_int_rank(6, 3, p), n=n)) / n -
  c(1, rep(0.4, 5))) < 0.04)
## Rejection sampling
s <- sample_int_rej(200000, 100000, runif(200000))
stopifnot(anyDuplicated(s) == 0)
p <- c(995, rep(1, 5))
n <- 1000
set.seed(42)
stopifnot(abs(table(replicate(sample_int_rej(6, 3, p), n=n)) / n -
  c(1, rep(0.4, 5))) < 0.04)
}
\author{
Kirill Müller (for \code{*_expj*})

Dinre (for \code{*_rank}), Kirill Müller
  (for \code{*_*crank})

Kirill Müller (for \code{*_int_rej})
}
\references{
\url{http://stackoverflow.com/q/15113650/946850}

Efraimidis, Pavlos S., and Paul G. Spirakis. "Weighted
random sampling with a reservoir." \emph{Information Processing
Letters} 97, no. 5 (2006): 181-185.

Efraimidis, Pavlos S., and Paul G. Spirakis. "Weighted
random sampling with a reservoir." \emph{Information Processing
Letters} 97, no. 5 (2006): 181-185.
}
\seealso{
\code{\link[base]{sample.int}}
}

