% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/binsglm.R
\name{binsglm}
\alias{binsglm}
\title{Data-Driven Binscatter Generalized Linear Regression with Robust Inference Procedures and Plots}
\usage{
binsglm(y, x, w = NULL, data = NULL, at = NULL, family = gaussian(),
  deriv = 0, nolink = F, dots = NULL, dotsgrid = 0, dotsgridmean = T,
  line = NULL, linegrid = 20, ci = NULL, cigrid = 0, cigridmean = T,
  cb = NULL, cbgrid = 20, polyreg = NULL, polyreggrid = 20,
  polyregcigrid = 0, by = NULL, bycolors = NULL, bysymbols = NULL,
  bylpatterns = NULL, legendTitle = NULL, legendoff = F, nbins = NULL,
  binspos = "qs", binsmethod = "dpi", nbinsrot = NULL, pselect = NULL,
  sselect = NULL, samebinsby = F, randcut = NULL, nsims = 500,
  simsgrid = 20, simsseed = NULL, vce = "HC1", cluster = NULL,
  asyvar = F, level = 95, noplot = F, dfcheck = c(20, 30),
  masspoints = "on", weights = NULL, subset = NULL, plotxrange = NULL,
  plotyrange = NULL, ...)
}
\arguments{
\item{y}{outcome variable. A vector.}

\item{x}{independent variable of interest. A vector.}

\item{w}{control variables. A matrix, a vector or a \code{\link{formula}}.}

\item{data}{an optional data frame containing variables in the model.}

\item{at}{value of \code{w} at which the estimated function is evaluated.  The default is \code{at="mean"}, which corresponds to
the mean of \code{w}. Other options are: \code{at="median"} for the median of \code{w}, \code{at="zero"} for a vector of zeros.
\code{at} can also be a vector of the same length as the number of columns of \code{w} (if \code{w} is a matrix) or a data frame containing the same variables as specified in \code{w} (when
\code{data} is specified). Note that when \code{at="mean"} or \code{at="median"}, all factor variables (if specified) are excluded from the evaluation (set as zero).}

\item{family}{a description of the error distribution and link function to be used in the generalized linear model. (See \code{\link{family}} for details of family functions.)}

\item{deriv}{derivative order of the regression function for estimation, testing and plotting.
The default is \code{deriv=0}, which corresponds to the function itself. If \code{nolink=TRUE}, \code{deriv} cannot be greater than 1.}

\item{nolink}{if true, the function within the inverse link function is reported instead of the conditional mean function for the outcome.}

\item{dots}{a vector or a logical value. If \code{dots=c(p,s)}, a piecewise polynomial of degree \code{p} with
\code{s} smoothness constraints is used for point estimation and plotting as "dots".
The default is \code{dots=c(0,0)}, which corresponds to piecewise constant (canonical binscatter).
If \code{dots=T}, the default \code{dots=c(0,0)} is used unless the degree \code{p} and smoothness \code{s} selection
is requested via the option \code{pselect} (see more details in the explanation of \code{pselect}).
If \code{dots=F} is specified, the dots are not included in the plot.}

\item{dotsgrid}{number of dots within each bin to be plotted. Given the choice, these dots are point estimates
evaluated over an evenly-spaced grid within each bin. The default is \code{dotsgrid=0}, and only
the point estimates at the mean of \code{x} within each bin are presented.}

\item{dotsgridmean}{If true, the dots corresponding to the point estimates evaluated at the mean of \code{x} within each bin
are presented. By default, they are presented, i.e., \code{dotsgridmean=T}.}

\item{line}{a vector or a logical value. If \code{line=c(p,s)}, a piecewise polynomial of degree \code{p} with \code{s} smoothness constraints
is used for plotting as a "line". If \code{line=T} is specified, \code{line=c(0,0)} is used unless the degree \code{p} and smoothness \code{s}
selection is requested via the option \code{pselect} (see more details in the explanation of \code{pselect}).
If \code{line=F} or \code{line=NULL} is specified, the line is not included in the plot.  The default is \code{line=NULL}.}

\item{linegrid}{number of evaluation points of an evenly-spaced grid within each bin used for evaluation of
the point estimate set by the \code{line=c(p,s)} option. The default is \code{linegrid=20},
which corresponds to 20 evenly-spaced evaluation points within each bin for fitting/plotting the line.}

\item{ci}{a vector or a logical value. If \code{ci=c(p,s)} a piecewise polynomial of degree \code{p} with \code{s} smoothness constraints is used for
constructing confidence intervals. If \code{ci=T} is specified, \code{ci=c(1,1)} is used unless the degree \code{p} and smoothness \code{s}
selection is requested via the option \code{pselect} (see more details in the explanation of \code{pselect}).
If \code{ci=F} or \code{ci=NULL} is specified, the confidence intervals are not included in the plot.  The default is \code{ci=NULL}.}

\item{cigrid}{number of evaluation points of an evenly-spaced grid within each bin used for evaluation of the point
estimate set by the \code{ci=c(p,s)} option. The default is \code{cigrid=0}, and only
the confidence intervals at the mean of \code{x} within each bin are presented.}

\item{cigridmean}{If true, the confidence intervals corresponding to the point estimates evaluated at the mean of \code{x} within each bin
are presented. The default is \code{cigridmean=T}.}

\item{cb}{a vector or a logical value. If \code{cb=c(p,s)}, a piecewise polynomial of degree \code{p} with \code{s} smoothness constraints is used for
constructing the confidence band. If the option \code{cb=T} is specified, \code{cb=c(1,1)} is used unless the degree \code{p} and smoothness \code{s}
selection is requested via the option \code{pselect} (see more details in the explanation of \code{pselect}).
If \code{cb=F} or \code{cb=NULL} is specified, the confidence band is not included in the plot. The default is \code{cb=NULL}.}

\item{cbgrid}{number of evaluation points of an evenly-spaced grid within each bin used for evaluation of the point
estimate set by the \code{cb=c(p,s)} option. The default is \code{cbgrid=20}, which corresponds
to 20 evenly-spaced evaluation points within each bin for confidence band construction.}

\item{polyreg}{degree of a global polynomial regression model for plotting. By default, this fit is not included
in the plot unless explicitly specified. Recommended specification is \code{polyreg=3}, which
adds a cubic (global) polynomial fit of the regression function of interest to the binned scatter plot.}

\item{polyreggrid}{number of evaluation points of an evenly-spaced grid within each bin used for evaluation of
the point estimate set by the \code{polyreg=p} option. The default is \code{polyreggrid=20},
which corresponds to 20 evenly-spaced evaluation points within each bin for fitting/plotting the
global polynomial.}

\item{polyregcigrid}{number of evaluation points of an evenly-spaced grid within each bin used for constructing
confidence intervals based on polynomial regression set by the \code{polyreg=p} option.
The default is \code{polyregcigrid=0}, which corresponds to not plotting confidence
intervals for the global polynomial regression approximation.}

\item{by}{a vector containing the group indicator for subgroup analysis; both numeric and string variables
are supported. When \code{by} is specified, \code{binsglm} implements estimation and inference for each subgroup
separately, but produces a common binned scatter plot. By default, the binning structure is selected for each
subgroup separately, but see the option \code{samebinsby} below for imposing a common binning structure across subgroups.}

\item{bycolors}{an ordered list of colors for plotting each subgroup series defined by the option \code{by}.}

\item{bysymbols}{an ordered list of symbols for plotting each subgroup series defined by the option \code{by}.}

\item{bylpatterns}{an ordered list of line patterns for plotting each subgroup series defined by the option \code{by}.}

\item{legendTitle}{String, title of legend.}

\item{legendoff}{If true, no legend is added.}

\item{nbins}{number of bins for partitioning/binning of \code{x}.  If \code{nbins=T} or \code{nbins=NULL} (default) is specified, the number
of bins is selected via the companion command \code{\link{binsregselect}} in a data-driven, optimal way whenever possible.
If a vector with more than one number is specified, the number of bins is selected within this vector via the companion command \code{\link{binsregselect}}.}

\item{binspos}{position of binning knots. The default is \code{binspos="qs"}, which corresponds to quantile-spaced
binning (canonical binscatter).  The other options are \code{"es"} for evenly-spaced binning, or
a vector for manual specification of the positions of inner knots (which must be within the range of
\code{x}).}

\item{binsmethod}{method for data-driven selection of the number of bins. The default is \code{binsmethod="dpi"},
which corresponds to the IMSE-optimal direct plug-in rule.  The other option is: \code{"rot"}
for rule of thumb implementation.}

\item{nbinsrot}{initial number of bins value used to construct the DPI number of bins selector.
If not specified, the data-driven ROT selector is used instead.}

\item{pselect}{vector of numbers within which the degree of polynomial \code{p} for point estimation is selected.
 Piecewise polynomials of the selected optimal degree \code{p} are used to construct dots or line
 if \code{dots=T} or \code{line=T} is specified,
 whereas piecewise polynomials of degree \code{p+1} are used to construct confidence intervals
 or confidence band if \code{ci=T} or \code{cb=T} is specified.
 \emph{Note:} To implement the degree or smoothness selection, in addition to \code{pselect} or \code{sselect},
\code{nbins=#} must be specified.}

\item{sselect}{vector of numbers within which the number of smoothness constraints \code{s} for point estimation is selected.
Piecewise polynomials with the selected optimal \code{s} smoothness constraints are used to construct dots or line
 if \code{dots=T} or \code{line=T} is specified,
whereas piecewise polynomials with \code{s+1} constraints are used to construct
confidence intervals or confidence band if \code{ci=T} or \code{cb=T} is
specified.  If not specified, for each value \code{p} supplied in the option \code{pselect}, only the piecewise polynomial
with the maximum smoothness is considered, i.e., \code{s=p}.}

\item{samebinsby}{if true, a common partitioning/binning structure across all subgroups specified by the option \code{by} is forced.
The knot positions are selected according to the option \code{binspos} and using the full sample. If \code{nbins}
is not specified, then the number of bins is selected via the companion command \code{\link{binsregselect}} and
using the full sample.}

\item{randcut}{upper bound on a uniformly distributed variable used to draw a subsample for bins/degree/smoothness selection.
Observations for which \code{runif()<=#} are used. # must be between 0 and 1.  By default, \code{max(5,000, 0.01n)} observations
are used if the sample size \code{n>5,000}.}

\item{nsims}{number of random draws for constructing confidence bands. The default is
\code{nsims=500}, which corresponds to 500 draws from a standard Gaussian random vector of size
\code{[(p+1)*J - (J-1)*s]}. A larger number of draws is recommended to obtain the final results.}

\item{simsgrid}{number of evaluation points of an evenly-spaced grid within each bin used for evaluation of
the supremum operation needed to construct confidence bands. The default is \code{simsgrid=20}, which corresponds to 20 evenly-spaced
evaluation points within each bin for approximating the supremum operator.
A larger number of evaluation points is recommended to obtain the final results.}

\item{simsseed}{seed for simulation.}

\item{vce}{Procedure to compute the variance-covariance matrix estimator. Options are
\itemize{
\item \code{"const"} homoskedastic variance estimator.
\item \code{"HC0"} heteroskedasticity-robust plug-in residuals variance estimator
                   without weights.
\item \code{"HC1"} heteroskedasticity-robust plug-in residuals variance estimator
                   with hc1 weights. Default.
\item \code{"HC2"} heteroskedasticity-robust plug-in residuals variance estimator
                   with hc2 weights.
\item \code{"HC3"} heteroskedasticity-robust plug-in residuals variance estimator
                   with hc3 weights.
}}

\item{cluster}{cluster ID. Used to compute cluster-robust standard errors.}

\item{asyvar}{if true, the standard error of the nonparametric component is computed and the uncertainty related to control
variables is omitted. Default is \code{asyvar=FALSE}, that is, the uncertainty related to control variables is taken into account.}

\item{level}{nominal confidence level for confidence interval and confidence band estimation. Default is \code{level=95}.}

\item{noplot}{if true, no plot is produced.}

\item{dfcheck}{adjustments for minimum effective sample size checks, which take into account number of unique
values of \code{x} (i.e., number of mass points), number of clusters, and degrees of freedom of
the different statistical models considered. The default is \code{dfcheck=c(20, 30)}.
See \href{https://nppackages.github.io/references/Cattaneo-Crump-Farrell-Feng_2022_Stata.pdf}{Cattaneo, Crump, Farrell and Feng (2022b)} for more details.}

\item{masspoints}{how mass points in \code{x} are handled. Available options:
\itemize{
\item \code{"on"} all mass point and degrees of freedom checks are implemented. Default.
\item \code{"noadjust"} mass point checks and the corresponding effective sample size adjustments are omitted.
\item \code{"nolocalcheck"} within-bin mass point and degrees of freedom checks are omitted.
\item \code{"off"} "noadjust" and "nolocalcheck" are set simultaneously.
\item \code{"veryfew"} forces the function to proceed as if \code{x} has only a few mass points (i.e., distinct values).
                       In other words, forces the function to proceed as if the mass point and degrees of freedom checks had failed.
}}

\item{weights}{an optional vector of weights to be used in the fitting process. Should be \code{NULL} or
a numeric vector. For more details, see \code{\link{lm}}.}

\item{subset}{optional rule specifying a subset of observations to be used.}

\item{plotxrange}{a vector. \code{plotxrange=c(min, max)} specifies a range of the x-axis for binscatter plot. Observations outside the range are dropped in the plot.}

\item{plotyrange}{a vector. \code{plotyrange=c(min, max)} specifies a range of the y-axis for binscatter plot. Observations outside the range are dropped in the plot.}

\item{...}{optional arguments used by \code{\link{glm}}.}
}
\value{
\item{\code{bins_plot}}{A \code{ggplot} object for binscatter plot.}
       \item{\code{data.plot}}{A list containing data for plotting. Each item is a sublist of data frames for each group. Each sublist may contain the following data frames:
       \itemize{
       \item \code{data.dots} Data for dots. It contains: \code{x}, evaluation points; \code{bin}, the indicator of bins;
                              \code{isknot}, indicator of inner knots; \code{mid}, midpoint of each bin; and \code{fit}, fitted values.
       \item \code{data.line} Data for line. It contains: \code{x}, evaluation points; \code{bin}, the indicator of bins;
                               \code{isknot}, indicator of inner knots; \code{mid}, midpoint of each bin; and \code{fit}, fitted values.
       \item \code{data.ci} Data for CI. It contains: \code{x}, evaluation points; \code{bin}, the indicator of bins;
                               \code{isknot}, indicator of inner knots; \code{mid}, midpoint of each bin;
                               \code{ci.l} and \code{ci.r}, left and right boundaries of each confidence interval.
       \item \code{data.cb} Data for CB. It contains: \code{x}, evaluation points; \code{bin}, the indicator of bins;
                               \code{isknot}, indicator of inner knots; \code{mid}, midpoint of each bin;
                               \code{cb.l} and \code{cb.r}, left and right boundaries of the confidence band.
       \item \code{data.poly} Data for polynomial regression. It contains: \code{x}, evaluation points;
                               \code{bin}, the indicator of bins;
                               \code{isknot}, indicator of inner knots; \code{mid}, midpoint of each bin; and
                               \code{fit}, fitted values.
       \item \code{data.polyci} Data for confidence intervals based on polynomial regression. It contains: \code{x}, evaluation points;
                               \code{bin}, the indicator of bins;
                               \code{isknot}, indicator of inner knots; \code{mid}, midpoint of each bin;
                               \code{polyci.l} and \code{polyci.r}, left and right boundaries of each confidence interval.}}
       \item{\code{imse.var.rot}}{Variance constant in IMSE, ROT selection.}
       \item{\code{imse.bsq.rot}}{Bias constant in IMSE, ROT selection.}
       \item{\code{imse.var.dpi}}{Variance constant in IMSE, DPI selection.}
       \item{\code{imse.bsq.dpi}}{Bias constant in IMSE, DPI selection.}
       \item{\code{cval.by}}{A vector of critical values for constructing confidence band for each group.}
       \item{\code{opt}}{ A list containing options passed to the function, as well as \code{N.by} (total sample size for each group),
                          \code{Ndist.by} (number of distinct values in \code{x} for each group), \code{Nclust.by} (number of clusters for each group),
                          \code{nbins.by} (number of bins for each group), and \code{byvals} (number of distinct values in \code{by}).
                          The degree and smoothness of polynomials for dots, line, confidence intervals and confidence band for each group are saved
                          in \code{dots}, \code{line}, \code{ci}, and \code{cb}.}
}
\description{
\code{binsglm} implements binscatter generalized linear regression with robust inference procedures and plots, following the
            results in \href{https://arxiv.org/abs/1902.09608}{Cattaneo, Crump, Farrell and Feng (2022a)}.
            Binscatter provides a flexible way to describe the relationship between two variables, after
            possibly adjusting for other covariates, based on partitioning/binning of the independent variable of interest.
            The main purpose of this function is to generate binned scatter plots with curve estimation, robust pointwise confidence intervals and
            a uniform confidence band. If the binning scheme is not set by the user, the companion function
            \code{\link{binsregselect}} is used to implement binscatter in a data-driven way. Hypothesis testing about the function of interest can be conducted via the companion
            function \code{\link{binstest}}.
}
\examples{
 x <- runif(500); d <- 1*(runif(500)<=x)
 ## Binned scatterplot
 binsglm(d, x, family=binomial())
}
\references{
Cattaneo, M. D., R. K. Crump, M. H. Farrell, and Y. Feng. 2022a: \href{https://arxiv.org/abs/1902.09608}{On Binscatter}. Working Paper.

Cattaneo, M. D., R. K. Crump, M. H. Farrell, and Y. Feng. 2022b: \href{https://arxiv.org/abs/1902.09615}{Binscatter Regressions}. Working Paper.
}
\seealso{
\code{\link{binsregselect}}, \code{\link{binstest}}.
}
\author{
Matias D. Cattaneo, Princeton University, Princeton, NJ. \email{cattaneo@princeton.edu}.

Richard K. Crump, Federal Reserve Bank of New York, New York, NY. \email{richard.crump@ny.frb.org}.

Max H. Farrell, University of Chicago, Chicago, IL. \email{max.farrell@chicagobooth.edu}.

Yingjie Feng (maintainer), Tsinghua University, Beijing, China. \email{fengyingjiepku@gmail.com}.
}
