% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/summarize_balances.R
\name{summarize_balances}
\alias{summarize_balances}
\title{Summarize group balances}
\usage{
summarize_balances(
  data,
  group_cols,
  cat_cols = NULL,
  num_cols = NULL,
  id_cols = NULL,
  summarize_size = TRUE,
  include_normalized = FALSE,
  rank_weights = NULL,
  cat_levels_rank_weights = NULL,
  num_normalize_fn = function(x) {     rearrr::min_max_scale(x, old_min = quantile(x,
    0.025), old_max = quantile(x, 0.975), new_min = 0, new_max = 1) }
)
}
\arguments{
\item{data}{\code{data.frame} with group columns to summarize
by.

Can be \emph{grouped} (see \code{\link[dplyr:group_by]{dplyr::group_by()}}),
in which case the function is applied group-wise. This is not to
be confused with \code{`group_cols`}.}

\item{group_cols}{Names of columns with group identifiers to summarize columns
in \code{`data`} by.}

\item{cat_cols}{Names of categorical columns to summarize.

Each categorical level is counted per group.

To distinguish between levels with the same name from different
\code{`cat_cols`} columns, we prefix the count column name for each
categorical level with parts of the name of the categorical column.
This amount can be controlled with \code{`max_cat_prefix_chars`}.

Normalization when \code{`include_normalized`} is enabled:
The count of each categorical level is normalized with \code{log(1 + count)}.}

\item{num_cols}{Names of numerical columns to summarize.

For each column, the \code{mean} and \code{sum} are calculated per group.

Normalization when \code{`include_normalized`} is enabled:
Each column is normalized with \code{`num_normalize_fn`} before
calculating the \code{mean} and \code{sum} per group.}

\item{id_cols}{Names of \code{factor} columns with IDs to summarize.

The number of unique IDs is counted per group.

Normalization when \code{`include_normalized`} is enabled:
The count of unique IDs is normalized with \code{log(1 + count)}.}

\item{summarize_size}{Whether to summarize the number of rows per group.}

\item{include_normalized}{Whether to calculate and include the
normalized summary in the output.}

\item{rank_weights}{A named \code{vector} with weights for averaging the rank columns when calculating the \code{`SD_rank`} column.
The name is one of the balancing columns and the number is its weight. Non-specified columns are given the weight \code{1}.
The weights are automatically scaled to sum to 1.

When summarizing size (see \code{`summarize_size`}), name its weight \code{"size"}.

E.g. \code{c("size" = 1, "a_cat_col" = 2, "a_num_col" = 4, "an_id_col" = 2)}.}

\item{cat_levels_rank_weights}{Weights for averaging ranks of the categorical levels in \code{`cat_cols`}.
Given as a named \code{list} with a named \code{vector} for each column in \code{`cat_cols`}.
Non-specified levels are given the weight \code{1}.
The weights are automatically scaled to sum to 1.

E.g. \code{list("a_cat_col" = c("a" = 3, "b" = 5), "b_cat_col" = c("1" = 3, "2" = 9))}.}

\item{num_normalize_fn}{Function for normalizing the \code{`num_cols`} columns before
calculating normalized group summaries.

Only used when \code{`include_normalized`} is enabled.}
}
\value{
\code{list} with two/three \code{data.frames}:

\subsection{Groups}{
A summary per group.

\code{`cat_cols`}: Each level has its own column with the count
of the level per group.

\code{`num_cols`}: The \code{mean} and \code{sum} per group.

\code{`id_cols`}: The count of unique IDs per group.
}

\subsection{Summary}{
Statistical descriptors of the columns in \code{`Groups`}.

Contains the \code{mean}, \code{median}, standard deviation (\code{SD}),
interquartile range (\code{IQR}), \code{min}, and \code{max} measures.

Especially the standard deviations and IQR measures can tell us about how
balanced the groups are. When comparing multiple \code{`group_cols`},
the group column with the lowest \code{SD} and \code{IQR}
can be considered the most balanced.
}

\subsection{Normalized Summary}{
(Disabled by default)

Same statistical descriptors as in \code{`Summary`} but for a
"normalized" version of the group summaries. The motivation
is that these normalized measures can more easily be compared
or combined to a single "balance score".

First, we normalize each balance column:

\code{`cat_cols`}: The level counts in the original group summaries are
normalized with \code{log(1 + count)}. This eases comparison
of the statistical descriptors (especially standard deviations)
of levels with very different count scales.

\code{`num_cols`}: The numeric columns are normalized prior to
summarization by group, using the \code{`num_normalize_fn`} function.
By default this applies MinMax scaling to columns such that ~95\% of the values
are expected to be in the \code{[0, 1]} range.

\code{`id_cols`}: The counts of unique IDs in the original group summaries are
normalized with \code{log(1 + count)}.

Contains the \code{mean}, \code{median}, standard deviation (\code{SD}),
interquartile range (\code{IQR}), \code{min}, and \code{max} measures.
}
}
\description{
\Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}

Summarize the balances of numeric, categorical, and ID columns
in and between groups in one or more group columns.

This tool allows you to quickly and thoroughly assess the balance
of different columns between groups. This is for instance useful
after creating groups with \code{\link[groupdata2:fold]{fold()}},
\code{\link[groupdata2:partition]{partition()}}, or
\code{\link[groupdata2:collapse_groups]{collapse_groups()}} to
check how well they did and to compare multiple
groupings.

The output contains:
\enumerate{
\item \code{`Groups`}: a summary per group (per grouping column).
\item \code{`Summary`}: statistical descriptors of the group summaries.
\item \code{`Normalized Summary`}: statistical descriptors of a set of
"normalized" group summaries. (Disabled by default)
}

When comparing how balanced the grouping columns are, we can use
the standard deviations of the group summary columns. The lower a standard
deviation is, the more similar the groups are in that column. To quickly
extract these standard deviations, ordered by an aggregated rank,
use \code{\link[groupdata2:ranked_balances]{ranked_balances()}} on the
\code{"Summary" data.frame} in the output.
}
\examples{
# Attach packages
library(groupdata2)
library(dplyr)

# Fix the random seed so the sampled values below are reproducible
set.seed(1)

# Create data frame
# 6 participants with 3 rows each (18 rows in total)
df <- data.frame(
  "participant" = factor(rep(c("1", "2", "3", "4", "5", "6"), 3)),
  "age" = rep(sample(c(1:100), 6), 3),
  "diagnosis" = factor(rep(c("a", "b", "a", "a", "b", "b"), 3)),
  "score" = sample(c(1:100), 3 * 6)
)
# Order the rows by participant, so that the repeated
# session labels (1, 2, 3) line up within each participant
df <- df \%>\% arrange(participant)
df$session <- rep(c("1", "2", "3"), 6)

# Using fold()

## Without balancing
set.seed(1)
df_folded <- fold(data = df, k = 3)

# Check the balances of the various columns.
# As we have not used balancing in `fold()`,
# we should not expect them to be amazingly balanced.
# Any grouping is removed first, as `summarize_balances()`
# is applied group-wise when given grouped data.
df_folded \%>\%
  dplyr::ungroup() \%>\%
  summarize_balances(
    group_cols = ".folds",
    num_cols = c("score", "age"),
    cat_cols = "diagnosis",
    id_cols = "participant"
  )

## With balancing
set.seed(1)
df_folded <- fold(
  data = df,
  k = 3,
  cat_col = "diagnosis",
  num_col = 'score',
  id_col = 'participant'
)

# Now the balance should be better,
# although it may be difficult to get a good balance in
# the 'score' column when also balancing on 'diagnosis'
# and keeping all rows per participant in the same fold
df_folded \%>\%
  dplyr::ungroup() \%>\%
  summarize_balances(
    group_cols = ".folds",
    num_cols = c("score", "age"),
    cat_cols = "diagnosis",
    id_cols = "participant"
  )

# Comparing multiple grouping columns
# Create 3 fold columns that only balance "score"
set.seed(1)
df_folded <- fold(
  data = df,
  k = 3,
  num_fold_cols = 3,
  num_col = 'score'
)

# Summarize all three grouping cols at once
# (".folds_1", ".folds_2" and ".folds_3").
# The outer parentheses print the result while also assigning it.
(summ <- df_folded \%>\%
  dplyr::ungroup() \%>\%
  summarize_balances(
    group_cols = paste0(".folds_", 1:3),
    num_cols = c("score")
  )
)

# Extract the across-group standard deviations
# The group column with the lowest standard deviation(s)
# is the most balanced group column
summ \%>\% ranked_balances()

}
\seealso{
Other summarization functions: 
\code{\link{ranked_balances}()},
\code{\link{summarize_group_cols}()}
}
\author{
Ludvig Renbo Olsen, \email{r-pkgs@ludvigolsen.dk}
}
\concept{summarization functions}
