% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/gap.R
\name{gap}
\alias{gap}
\title{Gap Analysis}
\usage{
gap(
  variable,
  data,
  groupA = "default",
  groupB = "default",
  percentiles = NULL,
  achievementLevel = NULL,
  achievementDiscrete = FALSE,
  stDev = FALSE,
  targetLevel = NULL,
  weightVar = NULL,
  jrrIMax = 1,
  varMethod = c("jackknife"),
  dropOmittedLevels = TRUE,
  defaultConditions = TRUE,
  recode = NULL,
  referenceDataIndex = 1,
  returnVarEstInputs = FALSE,
  returnSimpleDoF = FALSE,
  returnSimpleN = FALSE,
  returnNumberOfPSU = FALSE,
  noCov = FALSE,
  pctMethod = c("unbiased", "symmetric", "simple"),
  includeLinkingError = FALSE,
  omittedLevels = deprecated()
)
}
\arguments{
\item{variable}{a character indicating the variable to be compared,
potentially with a subject scale or subscale}

\item{data}{an \code{edsurvey.data.frame}, a \code{light.edsurvey.data.frame}, or an \code{edsurvey.data.frame.list}}

\item{groupA}{an expression or character expression that defines a condition for the subset.
This subset will be compared to \code{groupB}. If not specified, the subset is
the whole sample in \code{data}.}

\item{groupB}{an expression or character expression that defines a condition for the subset.
This subset will be compared to \code{groupA}. If not specified, the subset is
the whole sample in \code{data}. If set to \code{NULL}, estimates for the second group
will be dropped.}

\item{percentiles}{a numeric vector. The \code{gap} function calculates the
mean when this
argument is omitted or set to \code{NULL}. Otherwise,
the gap at the percentile given is calculated.}

\item{achievementLevel}{the achievement level(s) at which percentages
should be calculated}

\item{achievementDiscrete}{a logical indicating if the achievement level
specified in the \code{achievementLevel}
argument should be interpreted as discrete
so that
just the percentage in that particular achievement
level
will be included. Defaults to \code{FALSE}
so that
the percentage at or above that achievement level
will be
included in the percentage.}

\item{stDev}{a logical, set to \code{TRUE} to calculate the gap in standard deviations.}

\item{targetLevel}{a character string. When specified, calculates the gap in
the percentage of students at
\code{targetLevel} in the \code{variable} argument. This is useful for
comparing the gap in the percentage of students at a
survey response level.}

\item{weightVar}{a character indicating the weight variable to use.
See Details.}

\item{jrrIMax}{a numeric value; when using the jackknife variance estimation method, the default estimation option, \code{jrrIMax=1}, uses the
sampling variance from the first plausible value as the component for sampling variance estimation. The \code{Vjrr}
term, or sampling variance term, can be estimated with any number of plausible values, and values larger than the number of
plausible values on the survey (including \code{Inf}) will result in all plausible values being used.
Higher values of \code{jrrIMax} lead to longer computing times and more accurate variance estimates.}

\item{varMethod}{deprecated parameter, \code{gap} always uses the jackknife variance estimation}

\item{dropOmittedLevels}{a logical value. When set to the default value of
\code{TRUE}, drops the omitted levels of
all factor variables.
Use \code{print} on an \code{edsurvey.data.frame}
to see the omitted levels.}

\item{defaultConditions}{a logical value. When set to the default value
of \code{TRUE}, uses the default
conditions stored in \code{edsurvey.data.frame}
to subset the data.
Use \code{print} on an \code{edsurvey.data.frame}
to see the default conditions.}

\item{recode}{a list of lists to recode variables. Defaults to \code{NULL}.
Can be set as
\code{recode} \code{=} \code{list(var1} \code{=}
\code{list(from} \code{=} \code{c("a",} \code{"b",}
\code{"c"),} \code{to} \code{=} \code{"d"))}.}

\item{referenceDataIndex}{a numeric used only when the \code{data} argument is an
\code{edsurvey.data.frame.list},
indicating which dataset is the reference
dataset that other datasets are compared with.
Defaults to 1.}

\item{returnVarEstInputs}{a logical value; set to \code{TRUE} to return the
inputs to the jackknife and imputation variance
estimates which allows for the
computation
of covariances between estimates.}

\item{returnSimpleDoF}{a logical value set to \code{TRUE} to return the degrees
of freedom for some statistics (see Value
section) that do not have a
\emph{t}-test; useful primarily for further computation}

\item{returnSimpleN}{a logical value set to \code{TRUE} to add the count
(\emph{n}-size) of observations included in groups A and B
in the percentage object}

\item{returnNumberOfPSU}{a logical value set to \code{TRUE} to return the number of
PSUs used in the calculation}

\item{noCov}{set the covariances to zero in result}

\item{pctMethod}{a character that is one of \code{unbiased}, \code{symmetric}, or \code{simple}.
See the help for \code{\link{percentile}} for more information.}

\item{includeLinkingError}{a logical value set to \code{TRUE} to include the
linking error in variance estimation.
Standard errors (e.g., \code{diffAAse}, \code{diffBBse},
and \code{diffABABse}) and \emph{p}-values (e.g., \code{diffAApValue},
\code{diffBBpValue}, and \code{diffABABpValue}) would be adjusted for
comparisons between digitally based assessments (DBA) and
paper-based assessments (PBA) data.
This option is supported only for NAEP data.}

\item{omittedLevels}{this argument is deprecated. Use \code{dropOmittedLevels}.}
}
\value{
The return type depends on whether the class of the \code{data} argument is an
\code{edsurvey.data.frame} or an \code{edsurvey.data.frame.list}. Both
include the call (called \code{call}), a list called \code{labels},
an object named \code{percentage}
that shows the percentage in \code{groupA} and \code{groupB}, and an object
that shows the gap called \code{results}.

The labels include the following elements:
  \item{definition}{the definitions of the groups}
  \item{nFullData}{the \emph{n}-size for the full dataset (before applying the definition)}
  \item{nUsed}{the \emph{n}-size for the data after the group is subsetted and other
               restrictions (such as omitted values) are applied}
  \item{nPSU}{the number of PSUs used in calculation--only returned when
              \code{returnNumberOfPSU} \code{=} \code{TRUE}}

The percentages are computed according to the vignette titled
\href{https://www.air.org/sites/default/files/EdSurvey-Statistics.pdf}{\emph{Statistical Methods Used in EdSurvey}}
 in the section
\dQuote{Estimation of Weighted Percentages When Plausible Values Are Not Present.}
The standard errors are calculated according to
\dQuote{Estimation of the Standard Error of Weighted Percentages When Plausible Values Are Not Present, Using the Jackknife Method.}
Standard errors of differences are calculated as the square root of the typical
variance formula
\deqn{Var(A-B) = Var(A) + Var(B) - 2 Cov(A,B)}
where the covariance term is calculated as described in the vignette titled
\href{https://www.air.org/sites/default/files/EdSurvey-Statistics.pdf}{\emph{Statistical Methods Used in EdSurvey}}
 in the section
\dQuote{Estimation of Covariances.} These degrees of freedom are available only
with the jackknife variance estimation. The degrees of freedom used for hypothesis testing
are always set to the number of jackknife replicates in the data.

\strong{the data argument is an edsurvey.data.frame}
  When the \code{data} argument is an \code{edsurvey.data.frame},
  \code{gap} returns an S3 object of class \code{gap}.

  The \code{percentage} object is a numeric vector with the following elements:
    \item{pctA}{the percentage of respondents in \code{groupA} compared with the whole sample in \code{data}}
    \item{pctAse}{the standard error on the percentage of respondents in
                      \code{groupA}}
    \item{dofA}{degrees of freedom appropriate for a \emph{t}-test involving \code{pctA}.
                This value is returned only if
                \code{returnSimpleDoF}\code{=}\code{TRUE}.}
    \item{pctB}{the percentage of respondents in \code{groupB}.}
    \item{pctBse}{the standard error on the percentage of respondents in
                      \code{groupB}}
    \item{dofB}{degrees of freedom appropriate for a \emph{t}-test involving \code{pctB}.
                This value is returned only if
                \code{returnSimpleDoF}\code{=}\code{TRUE}.}
    \item{diffAB}{the value of \code{pctA} minus \code{pctB}}
    \item{covAB}{the covariance of \code{pctA} and \code{pctB}; used in
                 calculating \code{diffABse}.}
    \item{diffABse}{the standard error of \code{pctA}
                           minus \code{pctB}}
    \item{diffABpValue}{the \emph{p}-value associated with the \emph{t}-test used
                        for the hypothesis test that \code{diffAB}
                        is zero}
    \item{dofAB}{degrees of freedom used in calculating
                      \code{diffABpValue}}

  The \code{results} object is a numeric data frame with the following elements:
    \item{estimateA}{the mean estimate of \code{groupA} (or the percentage estimate
                     if \code{achievementLevel} or \code{targetLevel} is specified)}
    \item{estimateAse}{the standard error of \code{estimateA}}
    \item{dofA}{degrees of freedom appropriate for a \emph{t}-test involving \code{estimateA}.
                This value is returned only if
                \code{returnSimpleDoF}\code{=}\code{TRUE}.}
    \item{estimateB}{the mean estimate of \code{groupB} (or the percentage estimate
                     if \code{achievementLevel} or \code{targetLevel} is specified)}
    \item{estimateBse}{the standard error of \code{estimateB}}
    \item{dofB}{degrees of freedom appropriate for a \emph{t}-test involving \code{estimateB}.
                This value is returned only if
                \code{returnSimpleDoF}\code{=}\code{TRUE}.}
    \item{diffAB}{the value of \code{estimateA} minus \code{estimateB}}
    \item{covAB}{the covariance of \code{estimateA} and \code{estimateB}. Used in
                 calculating \code{diffABse}.}
    \item{diffABse}{the standard error of \code{diffAB}}
    \item{diffABpValue}{the \emph{p}-value associated with the \emph{t}-test used
                        for the hypothesis test that \code{diffAB}
                        is zero.}
    \item{dofAB}{degrees of freedom used for the \emph{t}-test on \code{diffAB}}

  If the gap was in achievement levels or percentiles and more
  than one percentile or achievement level is requested,
  then an additional column
  labeled \code{percentiles} or \code{achievementLevel} is included
  in the \code{results} object.

  When \code{results} has a single row and when \code{returnVarEstInputs}
  is \code{TRUE}, the additional elements \code{varEstInputs} and
  \code{pctVarEstInputs} also are returned. These can be used for calculating
  covariances with \code{\link{varEstToCov}}.

\strong{the data argument is an edsurvey.data.frame.list}
  When the \code{data} argument is an \code{edsurvey.data.frame.list},
  \code{gap} returns an S3 object of class \code{gapList}.

  The \code{results} object in the \code{edsurveyResultList} is
  a \code{data.frame}. Each row regards a particular dataset from the
  \code{edsurvey.data.frame.list}, and a reference dataset is dictated by
  the \code{referenceDataIndex} argument.

  The \code{percentage} object is a \code{data.frame} with the following elements:
    \item{covs}{a data frame with a column for each column in the \code{covs}. See previous
                section for more details.}
    \item{...}{all elements in the \code{percentage} object in the
               previous section}
    \item{diffAA}{the difference in \code{pctA} between the reference data
                  and this dataset. Set to \code{NA} for the
                  reference dataset.}
    \item{covAA}{the covariance of \code{pctA} in the reference data and
                 \code{pctA} on this row. Used in
                 calculating \code{diffAAse}.}
    \item{diffAAse}{the standard error for \code{diffAA}}
    \item{diffAApValue}{the \emph{p}-value associated with the \emph{t}-test used
                        for the hypothesis test that \code{diffAA}
                        is zero}
    \item{diffBB}{the difference in \code{pctB} between the reference data
                  and this dataset. Set to \code{NA} for the
                  reference dataset.}
    \item{covBB}{the covariance of \code{pctB} in the reference data and
                 \code{pctB} on this row. Used in
                 calculating \code{diffBBse}.}
    \item{diffBBse}{the standard error for \code{diffBB}}
    \item{diffBBpValue}{the \emph{p}-value associated with the \emph{t}-test used
                        for the hypothesis test that \code{diffBB}
                        is zero}
    \item{diffABAB}{the value of \code{diffAB} in the reference dataset
                           minus the value of \code{diffAB} in this dataset. Set
                           to \code{NA} for the reference dataset.}
    \item{covABAB}{the covariance of \code{diffAB} in the reference data and
                   \code{diffAB} on this row. Used in
                   calculating \code{diffABABse}.}
    \item{diffABABse}{the standard error for \code{diffABAB}}
    \item{diffABABpValue}{the \emph{p}-value associated with the \emph{t}-test used
                        for the hypothesis test that \code{diffABAB}
                        is zero}

  The \code{results} object is a \code{data.frame} with the following elements:
    \item{...}{all elements in the \code{results} object in the
               previous section}
    \item{diffAA}{the value of \code{groupA} in the reference dataset minus
                         the value in this dataset. Set to \code{NA} for the
                         reference dataset.}
    \item{covAA}{the covariance of \code{estimateA} in the reference data and
                 \code{estimateA} on this row. Used in
                 calculating \code{diffAAse}.}
    \item{diffAAse}{the standard error for \code{diffAA}}
    \item{diffAApValue}{the \emph{p}-value associated with the \emph{t}-test used
                        for the hypothesis test that \code{diffAA}
                        is zero}
    \item{diffBB}{the value of \code{groupB} in the reference dataset minus
                         the value in this dataset. Set to \code{NA} for the
                         reference dataset.}
    \item{covBB}{the covariance of \code{estimateB} in the reference data and
                 \code{estimateB} on this row. Used in
                 calculating \code{diffBBse}.}
    \item{diffBBse}{the standard error for \code{diffBB}}
    \item{diffBBpValue}{the \emph{p}-value associated with the \emph{t}-test used
                        for the hypothesis test that \code{diffBB}
                        is zero}
    \item{diffABAB}{the value of \code{diffAB} in the reference dataset
                           minus the value of \code{diffAB}
                           in this dataset. Set
                           to \code{NA} for the reference dataset.}
    \item{covABAB}{the covariance of \code{diffAB} in the reference data and
                   \code{diffAB} on this row. Used in
                   calculating \code{diffABABse}.}
    \item{diffABABse}{the standard error for \code{diffABAB}}
    \item{diffABABpValue}{the \emph{p}-value associated with the \emph{t}-test used
                        for the hypothesis test that \code{diffABAB}
                        is zero}
    \item{sameSurvey}{a logical value indicating if this line uses the same
                      survey as the reference line. Set to \code{NA} for the
                      reference line.}
}
\description{
Compares the average levels of a variable between two groups
             that potentially share members.
}
\details{
This function calculates the gap between \code{groupA} and \code{groupB} (which
may be omitted to indicate the full sample). The gap is
calculated for one of four statistics:
\describe{
  \item{the gap in means}{The mean score gap (in the score
     variable) identified in the \code{variable} argument.
     This is the default. The means and their standard errors are
     calculated using the methods
     described in the \code{\link{lm.sdf}} function documentation.}
  \item{the gap in percentiles}{The gap between respondents at
     the percentiles specified in the \code{percentiles} argument.
     This is returned when the \code{percentiles} argument is
     defined. The mean and standard error are computed as described in the
     \code{\link{percentile}} function documentation.}
  \item{the gap in achievement levels}{The gap in the percentage of
     students at (when \code{achievementDiscrete} is \code{TRUE}) or at
     or above (when \code{achievementDiscrete} is \code{FALSE}) a
     particular achievement level. This is used when the
     \code{achievementLevel} argument is defined. The mean and standard error
     are calculated as described in the \code{\link{achievementLevels}}
     function documentation.}
  \item{the gap in a survey response}{The gap in the percentage of
     respondents responding at \code{targetLevel} to
     \code{variable}. This is used when \code{targetLevel} is
     defined. The mean and standard error are calculated as described in
     the \code{\link{edsurveyTable}} function documentation.}
}
}
\examples{
\dontrun{
# read in the example data (generated, not real student data)
sdf <- readNAEP(system.file("extdata/data", "M36NT2PM.dat", package = "NAEPprimer"))

# find the mean score gap in the primer data between males and females
gap("composite", sdf, dsex=="Male", dsex=="Female")

# find the score gap of the quartiles in the primer data between males and females
gap("composite", sdf, dsex=="Male", dsex=="Female", percentile=50)
gap("composite", sdf, dsex=="Male", dsex=="Female", percentile=c(25, 50, 75))

# find the gap in the percentage at or above each achievement level
# in the primer data between males and females
gap("composite", sdf, dsex=="Male", dsex=="Female", 
    achievementLevel=c("Basic", "Proficient", "Advanced"))

# find the discrete achievement level gap--this is harder to interpret
gap("composite", sdf, dsex=="Male", dsex=="Female",
    achievementLevel="Proficient", achievementDiscrete=TRUE)

# find the percent talk about studies at home (b017451) never or hardly
# ever gap in the primer data between males and females
gap("b017451", sdf, dsex=="Male", dsex=="Female", 
    targetLevel="Never or hardly ever")

# example showing how to compare multiple levels
gap("b017451",sdf, dsex=="Male", dsex=="Female", targetLevel="Infrequently",
    recode=list(b017451=list(from=c("Never or hardly ever",
                                    "Once every few weeks",
                                    "About once a week"),
                             to=c("Infrequently"))))

# make subsets of sdf by scrpsu, "Scrambled PSU and school code"
sdfA <- subset(sdf, scrpsu \%in\% c(5,45,56))
sdfB <- subset(sdf, scrpsu \%in\% c(75,76,78))
sdfC <- subset(sdf, scrpsu \%in\% 100:200)
sdfD <- subset(sdf, scrpsu \%in\% 201:300)

sdfl <- edsurvey.data.frame.list(list(sdfA, sdfB, sdfC, sdfD),
                                 labels=c("A locations", "B locations",
                                          "C locations", "D locations"))

gap("composite", sdfl, dsex=="Male", dsex=="Female", percentile=c(50))
}

\dontrun{
# example showing using linking error with gap
# load Grade 4 math data
# requires NAEP RUD license with these files in the folder the user is currently in
g4math2015 <- readNAEP("M46NT1AT.dat")
g4math2017 <- readNAEP("M48NT1AT.dat")
g4math2019 <- readNAEP("M50NT1AT.dat")

# make an edsurvey.data.frame.list from math grade 4 2015, 2017, and 2019 data
g4math <- edsurvey.data.frame.list(list(g4math2019, g4math2017, g4math2015),
                                   labels = c("2019", "2017", "2015"))

# gap analysis with linking error in variance estimation across surveys
gap("composite", g4math, dsex == "Male", dsex == "Female", includeLinkingError=TRUE)
gap("composite", g4math, dsex == "Male", dsex == "Female", percentiles = c(10, 25), 
    includeLinkingError=TRUE)
gap("composite", g4math, dsex == "Male", dsex == "Female", 
    achievementDiscrete = TRUE, achievementLevel=c("Basic", "Proficient", "Advanced"), 
    includeLinkingError=TRUE)
}
}
\author{
Paul Bailey, Trang Nguyen, and Huade Huo
}
