% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/FamiliarObjectConversion.R
\name{as_familiar_data}
\alias{as_familiar_data}
\alias{as_familiar_data,familiarData-method}
\alias{as_familiar_data,familiarEnsemble-method}
\alias{as_familiar_data,familiarModel-method}
\alias{as_familiar_data,list-method}
\alias{as_familiar_data,character-method}
\alias{as_familiar_data,ANY-method}
\title{Conversion to familiarData object.}
\usage{
as_familiar_data(object, ...)

\S4method{as_familiar_data}{familiarData}(object, ...)

\S4method{as_familiar_data}{familiarEnsemble}(object, name = NULL, ...)

\S4method{as_familiar_data}{familiarModel}(object, ...)

\S4method{as_familiar_data}{list}(object, ...)

\S4method{as_familiar_data}{character}(object, ...)

\S4method{as_familiar_data}{ANY}(object, ...)
}
\arguments{
\item{object}{A \code{familiarData} object, or a \code{familiarEnsemble} or
\code{familiarModel} objects that will be internally converted to a
\code{familiarData} object. Paths to such objects can also be provided.}

\item{...}{
  Arguments passed on to \code{\link[=extract_data]{extract_data}}
  \describe{
    \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that
constitutes the data that are assessed.}
    \item{\code{is_pre_processed}}{Flag that indicates whether the data was already
pre-processed externally, e.g. normalised and clustered. Only used if the
\code{data} argument is a \code{data.table} or \code{data.frame}.}
    \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then
used to speed up computation through parallellisation.}
    \item{\code{time_max}}{Time point which is used as the benchmark for e.g. cumulative
risks generated by random forest, or the cut-off value for Uno's concordance
index. If not provided explicitly, this parameter is read from settings used
at creation of the underlying \code{familiarModel} objects. Only used for
\code{survival} outcomes.}
    \item{\code{evaluation_times}}{One or more time points that are used for in analysis of
survival problems when data has to be assessed at a set time, e.g.
calibration. If not provided explicitly, this parameter is read from
settings used at creation of the underlying \code{familiarModel} objects. Only
used for \code{survival} outcomes.}
    \item{\code{aggregation_method}}{Method for aggregating variable importances for the
purpose of evaluation. Variable importances are determined during feature
selection steps and after training the model. Both types are evaluated, but
feature selection variable importance is only evaluated at run-time.

See the documentation for the \code{vimp_aggregation_method} argument in
\code{summon_familiar} for information concerning the different available
methods.

If not provided explicitly, this parameter is read from settings used at
creation of the underlying \code{familiarModel} objects.}
    \item{\code{rank_threshold}}{The threshold used to  define the subset of highly
important features during evaluation.

See the documentation for the \code{vimp_aggregation_rank_threshold} argument in
\code{summon_familiar} for more information.

If not provided explicitly, this parameter is read from settings used at
creation of the underlying \code{familiarModel} objects.}
    \item{\code{ensemble_method}}{Method for ensembling predictions from models for the
same sample. Available methods are:
\itemize{
\item \code{median} (default): Use the median of the predicted values as the ensemble
value for a sample.
\item \code{mean}: Use the mean of the predicted values as the ensemble value for a
sample.
}}
    \item{\code{metric}}{One or more metrics for assessing model performance. See the
vignette on performance metrics for the available metrics. If not provided
explicitly, this parameter is read from settings used at creation of the
underlying \code{familiarModel} objects.}
    \item{\code{feature_cluster_method}}{The method used to perform clustering. These are
the same methods as for the \code{cluster_method} configuration parameter:
\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}.

\code{none} cannot be used when extracting data regarding mutual correlation or
feature expressions.

If not provided explicitly, this parameter is read from settings used at
creation of the underlying \code{familiarModel} objects.}
    \item{\code{feature_linkage_method}}{The method used for agglomerative clustering in
\code{hclust} and \code{agnes}. These are the same methods as for the
\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single},
\code{complete}, \code{weighted}, and \code{ward}.

If not provided explicitly, this parameter is read from settings used at
creation of the underlying \code{familiarModel} objects.}
    \item{\code{feature_cluster_cut_method}}{The method used to divide features into
separate clusters. The available methods are the same as for the
\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and
\code{dynamic_cut}.

\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only
applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and
\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only
be used with \code{agnes} and \code{hclust}.

If not provided explicitly, this parameter is read from settings used at
creation of the underlying \code{familiarModel} objects.}
    \item{\code{feature_similarity_threshold}}{The threshold level for pair-wise
similarity that is required to form feature clusters with the \code{fixed_cut}
method.

If not provided explicitly, this parameter is read from settings used at
creation of the underlying \code{familiarModel} objects.}
    \item{\code{feature_similarity_metric}}{Metric to determine pairwise similarity
between features. Similarity is computed in the same manner as for
clustering, and \code{feature_similarity_metric} therefore has the same options
as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2},
\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}.

If not provided explicitly, this parameter is read from settings used at
creation of the underlying \code{familiarModel} objects.}
    \item{\code{sample_cluster_method}}{The method used to perform clustering based on
distance between samples. These are the same methods as for the
\code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} and
\code{pam}.

\code{none} cannot be used when extracting data for feature expressions.

If not provided explicitly, this parameter is read from settings used at
creation of the underlying \code{familiarModel} objects.}
    \item{\code{sample_linkage_method}}{The method used for agglomerative clustering in
\code{hclust} and \code{agnes}. These are the same methods as for the
\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single},
\code{complete}, \code{weighted}, and \code{ward}.

If not provided explicitly, this parameter is read from settings used at
creation of the underlying \code{familiarModel} objects.}
    \item{\code{sample_similarity_metric}}{Metric to determine pairwise similarity
between samples. Similarity is computed in the same manner as for
clustering, but \code{sample_similarity_metric} has different options that are
better suited to computing distance between samples instead of between
features: \code{gower}, \code{euclidean}.

The underlying feature data is scaled to the \eqn{[0, 1]} range (for
numerical features) using the feature values across the samples. The
normalisation parameters required can optionally be computed from feature
data with the outer 5\% (on both sides) of feature values trimmed or
winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to
the metric name. This reduces the effect of outliers somewhat.

If not provided explicitly, this parameter is read from settings used at
creation of the underlying \code{familiarModel} objects.}
    \item{\code{icc_type}}{String indicating the type of intraclass correlation
coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute robustness for
features in repeated measurements during the evaluation of univariate
importance. These types correspond to the types in Shrout and Fleiss (1979).
If not provided explicitly, this parameter is read from settings used at
creation of the underlying \code{familiarModel} objects.}
    \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the
computation and extraction of various data elements.}
    \item{\code{message_indent}}{Number of indentation steps for messages shown during
computation and extraction of various data elements.}
    \item{\code{data_element}}{String indicating which data elements are to be extracted.
Default is \code{all}, but specific elements can be specified to speed up
computations if not all elements are to be computed. This is an internal
parameter that is set by, e.g. the \code{export_model_vimp} method.}
    \item{\code{sample_limit}}{(\emph{optional}) Set the upper limit of the number of samples
that are used during evaluation steps. Cannot be less than 20.

This setting can be specified per data element by providing a parameter
value in a named list with data elements, e.g.
\code{list("sample_similarity"=100, "permutation_vimp"=1000)}.

This parameter can be set for the following data elements:
\code{sample_similarity} and \code{ice_data}.}
    \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed
and aggregated.
\itemize{
\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all
models in the ensemble. This means that, for example, bias-corrected
estimates of model performance are assessed by creating (at least) 20
bootstraps and computing the model performance of the ensemble model for
each bootstrap.
\item \code{hybrid} (default): Results are computed at the level of models in an
ensemble. This means that, for example, bias-corrected estimates of model
performance are directly computed using the models in the ensemble. If there
are at least 20 trained models in the ensemble, performance is computed for
each model, in contrast to \code{ensemble} where performance is computed for the
ensemble of models. If there are less than 20 trained models in the
ensemble, bootstraps are created so that at least 20 point estimates can be
made.
\item \code{model}: Results are computed at the model level. This means that, for
example, bias-corrected estimates of model performance are assessed by
creating (at least) 20 bootstraps and computing the performance of the model
for each bootstrap.
}

Note that each level of detail has a different interpretation for bootstrap
confidence intervals. For \code{ensemble} and \code{model} these are the confidence
intervals for the ensemble and an individual model, respectively. That is,
the confidence interval describes the range where an estimate produced by a
respective ensemble or model trained on a repeat of the experiment may be
found with the probability of the confidence level. For \code{hybrid}, it
represents the range where any single model trained on a repeat of the
experiment may be found with the probability of the confidence level. By
definition, confidence intervals obtained using \code{hybrid} are at least as
wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if
the goal of the analysis is to assess the result of a single, unspecified,
model.

\code{hybrid} is generally computationally less expensive then \code{ensemble}, which
in turn is somewhat less expensive than \code{model}.

A non-default \code{detail_level} parameter can be specified for separate
evaluation steps by providing a parameter value in a named list with data
elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}.
This parameter can be set for the following data elements: \code{auc_data},
\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp},
\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.}
    \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be
possible. This has the following options:
\itemize{
\item \code{point}: Point estimates.
\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected
estimate is computed from (at least) 20 point estimates, and \code{familiar} may
bootstrap the data to create them.
\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected
estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The
number of point estimates required depends on the \code{confidence_level}
parameter, and \code{familiar} may bootstrap the data to create them.
}

As with \code{detail_level}, a non-default \code{estimation_type} parameter can be
specified for separate evaluation steps by providing a parameter value in a
named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following
data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance},
\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.}
    \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results
should be aggregated during evaluation. If \code{estimation_type} is
\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected
estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci},
aggregation leads to a single bias-corrected estimate with lower and upper
boundaries of the confidence interval. This has no effect if
\code{estimation_type} is \code{point}.

The default value is equal to \code{TRUE} except when assessing metrics to assess
model performance, as the default violin plot requires underlying data.

As with \code{detail_level} and \code{estimation_type}, a non-default
\code{aggregate_results} parameter can be specified for separate evaluation steps
by providing a parameter value in a named list with data elements, e.g.
\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists
for the same elements as \code{estimation_type}.}
    \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which
confidence intervals are determined. In the case bootstraps are used to
determine the confidence intervals bootstrap estimation, \code{familiar} uses the
rule of thumb \eqn{n = 20 / ci.level} to determine the number of required
bootstraps.

The default value is \code{0.95}.}
    \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap
confidence intervals (Efron and Hastie, 2016). The following methods are
implemented:
\itemize{
\item \code{percentile} (default): Confidence intervals obtained using the percentile
method.
\item \code{bc}: Bias-corrected confidence intervals.
}

Note that the standard method is not implemented because this method is
often not suitable due to non-normal distributions. The bias-corrected and
accelerated (BCa) method is not implemented yet.}
    \item{\code{stratification_method}}{(\emph{optional}) Method for determining the
stratification threshold for creating survival groups. The actual,
model-dependent, threshold value is obtained from the development data, and
can afterwards be used to perform stratification on validation data.

The following stratification methods are available:
\itemize{
\item \code{median} (default): The median predicted value in the development cohort
is used to stratify the samples into two risk groups. For predicted outcome
values that build a continuous spectrum, the two risk groups in the
development cohort will be roughly equal in size.
\item \code{mean}: The mean predicted value in the development cohort is used to
stratify the samples into two risk groups.
\item \code{mean_trim}: As \code{mean}, but based on the set of predicted values
where the 5\% lowest and 5\% highest values are discarded. This reduces the
effect of outliers.
\item \code{mean_winsor}: As \code{mean}, but based on the set of predicted values where
the 5\% lowest and 5\% highest values are winsorised. This reduces the effect
of outliers.
\item \code{fixed}: Samples are stratified based on the sample quantiles of the
predicted values. These quantiles are defined using the
\code{stratification_threshold} parameter.
\item \code{optimised}: Use maximally selected rank statistics to determine the
optimal threshold (Lausen and Schumacher, 1992; Hothorn et al., 2003) to
stratify samples into two optimally separated risk groups.
}

One or more stratification methods can be selected simultaneously.

This parameter is only relevant for \code{survival} outcomes.}
    \item{\code{dynamic_model_loading}}{(\emph{optional}) Enables dynamic loading of models
during the evaluation process, if \code{TRUE}. Defaults to \code{FALSE}. Dynamic
loading of models may reduce the overall memory footprint, at the cost of
increased disk or network IO. Models can only be dynamically loaded if they
are found at an accessible disk or network location. Setting this parameter
to \code{TRUE} may help if parallel processing causes out-of-memory issues during
evaluation.}
  }}

\item{name}{Name of the \code{familiarData} object. If not set, a name is
automatically generated.}
}
\value{
A \code{familiarData} object.
}
\description{
Creates \code{familiarData} a object from \code{familiarEnsemble} or
\code{familiarModel} objects.
}
\details{
The \code{data} argument is required if \code{familiarEnsemble} or
\code{familiarModel} objects are provided.
}
