# Handling CRAN warnings for data.table syntax:
if (getRversion() >= '2.15.1') utils::globalVariables(c(
    'allele.matrix.row.index', 'dosage.with.replaced.missing', 'dosage', 'PGS',
    'PGS.with.normalized.missing', 'PGS.with.replaced.missing', 'n.pgm.sites', 'n.missing.genotypes',
    'percent.missing.genotypes', 'n.non.missing.alleles'
    ));

validate.vcf.input <- function(vcf.data, vcf.long.format) {
    if (vcf.long.format) {
        # check that inputs are data.frames
        if (!is.data.frame(vcf.data)) {
            stop('vcf.data must be a data.frame');
            }

        # check that vcf.data contains required columns
        required.vcf.columns <- c('CHROM', 'POS', 'REF', 'ALT', 'Indiv', 'gt_GT_alleles');
        if (!all(required.vcf.columns %in% colnames(vcf.data))) {
            stop(paste0('vcf.data must contain columns named ', paste(required.vcf.columns, collapse = ', ')));
            }

        # check that all samples have variant data represented for all variants
        n.samples <- length(unique(vcf.data$Indiv));
        n.variants <- length(unique(paste(vcf.data$CHROM, vcf.data$POS, vcf.data$REF, vcf.data$ALT, sep = ':')));
        if (nrow(vcf.data) != n.samples * n.variants) {
            stop('Number of vcf data rows is not equivalent to number of samples times number of variants. Please ensure that all samples have variant data represented for all variants.');
            }

        } else {
            # check that vcf.data is a list of two matrices
            if (!is.list(vcf.data)) {
                stop('vcf.data must be a list of a data.frame and a matrix');
                }

            # check that vcf.data contains expected named elements
            expected.names <- c('genotyped.alleles', 'vcf.fixed.fields');
            if (!all(expected.names %in% names(vcf.data))) {
                stop(paste('vcf.data must contain named elements:', paste(expected.names, collapse = ', ')));
                }

            if (!is.data.frame(vcf.data$vcf.fixed.fields)) {
                stop('vcf.data must be a list of a data.frame and a matrix');
                }

            if (!is.matrix(vcf.data$genotyped.alleles)) {
                stop('vcf.data must be a list of a data.frame and a matrix');
                }

            # check that vcf.fixed.fields contains required columns
            required.vcf.fixed.columns <- c('CHROM', 'POS', 'REF', 'ALT', 'allele.matrix.row.index');
            if (!all(required.vcf.fixed.columns %in% colnames(vcf.data$vcf.fixed.fields))) {
                stop(paste0('vcf.data$vcf.fixed.fields must contain columns named ', paste(required.vcf.fixed.columns, collapse = ', ')));
                }

            # check that genotyped.alleles and vcf.fixed.fields have the same number of rows
            if (nrow(vcf.data$genotyped.alleles) != nrow(vcf.data$vcf.fixed.fields)) {
                stop('vcf.data$genotyped.alleles and vcf.data$vcf.fixed.fields must have the same number of rows');
                }

            }

    }

validate.pgs.data.input <- function(pgs.weight.data, use.external.effect.allele.frequency, correct.strand.flips, remove.ambiguous.allele.matches, remove.mismatched.indels) {
    if (!is.data.frame(pgs.weight.data)) {
        stop('pgs.weight.data must be a data.frame');
        }

    required.pgs.columns <- c('CHROM', 'POS', 'effect_allele', 'beta');

    if (!all(required.pgs.columns %in% colnames(pgs.weight.data))) {
        stop('pgs.weight.data must contain columns named CHROM, POS, effect_allele, and beta');
        }

    # additional required columns if strand flip correction is enabled
    if (correct.strand.flips || remove.ambiguous.allele.matches || remove.mismatched.indels) {
        if (!('other_allele' %in% colnames(pgs.weight.data))) {
            stop('pgs.weight.data must contain a column named other_allele if correct.strand.flips, remove.ambiguous.allele.matches, or remove.mismatched.indels is TRUE');
            }
        }

    if (use.external.effect.allele.frequency) {
        required.eaf.column <- 'allelefrequency_effect';
        if (!(required.eaf.column %in% colnames(pgs.weight.data))) {
            stop('pgs.weight.data must contain a column named allelefrequency_effect if use.external.effect.allele.frequency is TRUE');
            }
        }

    # check for duplicate variants in PGS data
    if (any(duplicated(paste(pgs.weight.data$CHROM, pgs.weight.data$POS, pgs.weight.data$effect_allele, sep = ':')))) {
        stop('Duplicate variants detected in the PGS weight data. Please ensure only unique coordinate:effect allele combinations are present.');
        }

    # check for duplicate coordinates in PGS data
    if (any(duplicated(paste(pgs.weight.data$CHROM, pgs.weight.data$POS, sep = ':')))) {
        warning('Duplicate variants detected in the PGS weight data. These will be treated as multiallelic sites.');
        }
    }

validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.columns, vcf.data, vcf.long.format) {
    if (!is.null(phenotype.data)) {
        if (!is.data.frame(phenotype.data)) {
            stop('phenotype.data must be a data.frame');
            }

        required.phenotype.columns <- 'Indiv';
        if (!all(required.phenotype.columns %in% colnames(phenotype.data))) {
                stop('phenotype.data must contain columns named Indiv');
                }

        if (vcf.long.format) {
            # check for at least one matching Indiv between phenotype.data and vcf.data
            if (length(intersect(phenotype.data$Indiv, vcf.data$Indiv)) == 0) {
                stop('No matching Indiv between phenotype.data and vcf.data');
                }
            } else {
                # check for at least one matching Indiv between phenotype.data and vcf.data$genotyped.alleles columns
                if (length(intersect(phenotype.data$Indiv, colnames(vcf.data$genotyped.alleles))) == 0) {
                    stop('No matching Indiv between phenotype.data and vcf.data');
                    }
            }

        # validate phenotype.analysis.columns if provided
        if (!is.null(phenotype.analysis.columns)) {
            if (!all(phenotype.analysis.columns %in% colnames(phenotype.data))) {
                stop('phenotype.analysis.columns must be columns in phenotype.data');
                }
            }

        } else if (!is.null(phenotype.analysis.columns)) {
            stop('phenotype.analysis.columns provided but no phenotype data detected');
            }

    }

#' @title Apply polygenic score to VCF data
#' @description Apply a polygenic score model to VCF data.
#' @param vcf.data VCF genotype data as formatted by \code{import.vcf()}. Two formats are accepted: wide format (a list of elements named \code{genotyped.alleles} and \code{vcf.fixed.fields}) or long format(a data frame). See \code{vcf.import} for more details.
#' @param vcf.long.format A logical indicating whether \code{vcf.data} is provided in long format. Default is \code{FALSE}.
#' @param pgs.weight.data A data.frame containing PGS weight data as formatted by \code{import.pgs.weight.file()}.
#' @param phenotype.data A data.frame containing phenotype data. Must have an Indiv column matching vcf.data. Default is \code{NULL}.
#' @param phenotype.analysis.columns A character vector of phenotype columns from phenotype.data to analyze in a regression analsyis. Default is \code{NULL}.
#' Phenotype variables are automatically classified as continuous, binary, or neither based on data type and number of unique values. The calculated PGS is associated
#' with each phenotype variable using linear or logistic regression for continuous or binary phenotypes, respectively. See \code{run.pgs.regression} for more details.
#' If no phenotype.analysis.columns are provided, no regression analysis is performed.
#' @param correct.strand.flips A logical indicating whether to check PGS weight data/VCF genotype data matches for strand flips and correct them. Default is \code{TRUE}.
#' The PGS catalog standard column \code{other_allele} in \code{pgs.weight.data} is required for this check.
#' @param remove.ambiguous.allele.matches A logical indicating whether to remove PGS variants with ambiguous allele matches between PGS weight data and VCF genotype data. Default is \code{FALSE}.
#' The PGS catalog standard column \code{other_allele} in \code{pgs.weight.data} is required for this check.
#' @param max.strand.flips An integer indicating the number of unambiguous strand flips that need to be detected in order to discard all variants with ambiguous allele matches. Only applies if \code{return.ambiguous.as.missing == TRUE}.
#' Default is \code{0} which means that all ambiguous variants are removed regardless of the status of any other variant.
#' @param remove.mismatched.indels A logical indicating whether to remove indel variants that are mismatched between PGS weight data and VCF genotype data. Default is \code{FALSE}.
#' The PGS catalog standard column \code{other_allele} in \code{pgs.weight.data} is required for this check.
#' @param output.dir A character string indicating the directory to write output files. Separate files are written for per-sample pgs results and optional regression results.
#' Files are tab-separate .txt files. Default is NULL in which case no files are written.
#' @param file.prefix A character string to prepend to the output file names. Default is \code{NULL}.
#' @param missing.genotype.method A character string indicating the method to handle missing genotypes. Options are "mean.dosage", "normalize", or "none". Default is "mean.dosage".
#' @param use.external.effect.allele.frequency A logical indicating whether to use an external effect allele frequency for calculating mean dosage when handling missing genotypes. Default is \code{FALSE}.
#' Provide allele frequency as a column is \code{pgs.weight.data} named \code{allelefrequency_effect}.
#' @param n.percentiles An integer indicating the number of percentiles to calculate for the PGS. Default is \code{NULL}.
#' @param analysis.source.pgs A character string indicating the source PGS for percentile calculation and regression analyses. Options are "mean.dosage", "normalize", or "none".
#' When not specified, defaults to \code{missing.genotype.method} choice and if more than one PGS missing genotype method is chosen, calculation defaults to the first selection.
#' @param validate.inputs.only A logical indicating whether to only perform input data validation checks without running PGS application.
#' If no errors are triggered, a message is printed and TRUE is returned. Default is \code{FALSE}.
#' @return A list containing per-sample PGS output and per-phenotype regression output if phenotype analysis columns are provided.
#'
#' \strong{Output Structure}
#'
#' The outputed list contains the following elements:
#' \itemize{
#' \item pgs.output: A data.frame containing the PGS per sample and optional phenotype data.
#' \item regression.output: A data.frame containing the results of the regression analysis if phenotype.analysis.columns are provided, otherwise \code{NULL}.
#' }
#'
#' pgs.output columns:
#' \itemize{
#' \item \code{Indiv}: A character string indicating the sample ID.
#' \item \code{PGS}: A numeric vector indicating the PGS per sample. (only if missing.genotype.method includes "none")
#' \item \code{PGS.with.normalized.missing}: A numeric vector indicating the PGS per sample with missing genotypes normalized. (only if missing.genotype.method includes "normalize")
#' \item \code{PGS.with.replaced.missing}: A numeric vector indicating the PGS per sample with missing genotypes replaced by mean dosage. (only if missing.genotype.method includes "mean.dosage")
#' \item \code{percentile}: A numeric vector indicating the percentile rank of the PGS.
#' \item \code{decile}: A numeric vector indicating the decile rank of the PGS.
#' \item \code{quartile}: A numeric vector indicating the quartile rank of the PGS.
#' \item \code{percentile.X:} A numeric vector indicating the user-specified percentile rank of the PGS where "X" is substituted by \code{n.percentiles}. (only if \code{n.percentiles} is specified)
#' \item \code{n.missing.genotypes}: A numeric vector indicating the number of missing genotypes per sample.
#' \item \code{percent.missing.genotypes}: A numeric vector indicating the percentage of missing genotypes per sample.
#' \item All columns in \code{phenotype.data} if provided.
#' }
#'
#' regression.output columns:
#' \itemize{
#' \item phenotype: A character vector of phenotype names.
#' \item \code{model}: A character vector indicating the regression model used. One of "logistic.regression" or "linear.regression".
#' \item \code{beta}: A numeric vector indicating the beta coefficient of the regression analysis.
#' \item \code{se}: A numeric vector indicating the standard error of the beta coefficient.
#' \item \code{p.value}: A numeric vector indicating the p-value of the beta coefficient.
#' \item \code{r.squared}: A numeric vector indicating the r-squared value of linear regression analysis. NA for logistic regression.
#' \item \code{AUC}: A numeric vector indicating the area under the curve of logistic regression analysis. NA for linear regression.
#' }
#'
#' \strong{PGS Calculation}
#'
#' PGS for each individual \emph{i} is calculated as the sum of the product of the dosage and beta coefficient for each variant in the PGS:
#' \deqn{PGS_i = \sum_{m=1}^{M} \left( \beta_m \times dosage_{im} \right)}
#' Where \emph{m} is a PGS component variant out of a total \emph{M} variants.
#'
#' \strong{Missing Genotype Handling}
#'
#' VCF genotype data are matched to PGS data by chromosome and position. If a SNP cannot be matched by genomic coordinate,
#' an attempt is made to match by rsID (if available). If a SNP from the PGS weight data is not found in the VCF data after these two matching attempts,
#' it is considered a cohort-wide missing variant.
#'
#' Missing genotypes (in individual samples) among successfully matched variants are handled by three methods:
#'
#' \code{none}: Missing genotype dosages are excluded from the PGS calculation.
#' This is equivalent to assuming that all missing genotypes are homozygous for the non-effect allele, resulting in a dosage of 0.
#'
#' \code{normalize}: Missing genotypes are excluded from score calculation but the final score is normalized by the number of non-missing alleles.
#' The calculation assumes a diploid genome:
#' \deqn{PGS_i = \dfrac{\sum \left( \beta_m \times dosage_{im} \right)}{P_i * M_{non-missing}}}
#' Where \emph{P} is the ploidy and has the value \code{2} and \eqn{M_{non-missing}} is the number of non-missing genotypes.
#'
#' \code{mean.dosage}: Missing genotype dosages are replaced by the mean population dosage of the variant which is calculated as the product of the effect allele frequency \emph{EAF} and the ploidy of a diploid genome:
#' \deqn{\overline{dosage_{k}} = EAF_k * P}
#' where \emph{k} is a PGS component variant that is missing in between 1 and n-1 individuals in the cohort and \emph{P} = ploidy = 2
#' This dosage calculation holds under assumptions of Hardy-Weinberg equilibrium.
#' By default, the effect allele frequency is calculated from the provided VCF data.
#' For variants that are missing in all individuals (cohort-wide), dosage is assumed to be zero (homozygous non-reference) for all individuals.
#' An external allele frequency can be provided in the \code{pgs.weight.data} as a column named \code{allelefrequency_effect} and by setting \code{use.external.effect.allele.frequency} to \code{TRUE}.
#'
#' \strong{Multiallelic Site Handling}
#'
#' If a PGS weight file provides weights for multiple effect alleles, the appropriate dosage is calculated for the alleles that each individual carries.
#' It is assumed that multiallelic variants are encoded in the same row in the VCF data. This is known as "merged" format. Split multiallelic sites are not accepted.
#' VCF data can be formatted to merged format using external tools for VCF file manipulation.
#'
#' \strong{Allele Mismatch Handling}
#' Variants from the PGS weight data are merged with records in the VCF data by genetic coordinate.
#' After the merge is complete, there may be cases where the VCF reference (REF) and alternative (ALT) alleles do not match their conventional counterparts in the
#' PGS weight data (other allele and effect allele, respectively).
#' This is usually caused by a strand flip: the variant in question was called against opposite DNA reference strands in the PGS training data and the VCF data.
#' Strand flips can be detected and corrected by flipping the affected allele to its reverse complement.
#' \code{apply.polygenic.score} uses \code{assess.pgs.vcf.allele.match} to assess allele concordance, and is controlled through the following arguments:
#'
#' \itemize{
#' \item \code{correct.strand.flips}: When \code{TRUE}, detected strand flips are corrected by flipping the affected value in the \code{effect_allele} column prior to dosage calling.
#' \item \code{remove.ambiguous.allele.matches}: Corresponds to the \code{return.ambiguous.as.missing} argument in \code{assess.pgs.vcf.allele.match}. When \code{TRUE}, non-INDEL allele
#' mismatches that cannot be resolved (due to palindromic alleles or causes other than strand flips) are removed by marking the affected value in the \code{effect_allele} column as missing
#' prior to dosage calling and missing genotype handling. The corresponding dosage is set to NA and the variant is handled according to the chosen missing genotype method.
#' \item \code{max.strand.flips}: This argument only applies when \code{remove.ambiguous.allele.matches} is on and modifies its behavior. In cases where none or very few unambiguous strand flips are detected,
#' it is likely that all ambiguous allele matches are simply palindromic effect size flips. This option facilitates handling of ambiguous allele matches conditional on a maximum number of unambiguous strand flips.
#' Variants with ambiguous strand flips will be marked as missing only if the number of unambiguous strand flips is greater than or equal to \code{max.strand.flips}.
#' \item \code{remove.mismatched.indels}: Corresponds to the \code{return.indels.as.missing} argument in \code{assess.pgs.vcf.allele.match}. When \code{TRUE}, INDEL allele mismatches
#' (which cannot be assessed for strand flips) are removed by marking the affected value in the \code{effect_allele} column as missing prior to dosage calling and missing genotype handling.
#' The corresponding dosage is set to NA and the variant is handled according to the chosen missing genotype method.
#' }
#'
#' Note that an allele match assessment requires the presence of both the \code{other_allele} and \code{effect_allele} in the PGS weight data.
#' The \code{other_allele} column is not required by the PGS Catalog, and so is not always available.
#'
#' @examples
#' # Example VCF
#' vcf.path <- system.file(
#'     'extdata',
#'     'HG001_GIAB.vcf.gz',
#'     package = 'ApplyPolygenicScore',
#'     mustWork = TRUE
#'     );
#' vcf.import <- import.vcf(vcf.path, long.format = TRUE);
#'
#' # Example pgs weight file
#' pgs.weight.path <- system.file(
#'     'extdata',
#'     'PGS000662_hmPOS_GRCh38.txt.gz',
#'     package = 'ApplyPolygenicScore',
#'     mustWork = TRUE
#'     );
#' pgs.import <- import.pgs.weight.file(pgs.weight.path);
#'
#' pgs.data <- apply.polygenic.score(
#'     vcf.data = vcf.import$split.wide.vcf.matrices,
#'     pgs.weight.data = pgs.import$pgs.weight.data,
#'     missing.genotype.method = 'none'
#'     );
#'
#' # Use long format
#' pgs.data <- apply.polygenic.score(
#'     vcf.data = vcf.import$combined.long.vcf.df$dat,
#'     vcf.long.format = TRUE,
#'     pgs.weight.data = pgs.import$pgs.weight.data,
#'     missing.genotype.method = 'none'
#'     );
#'
#' # Specify different methods for handling missing genotypes
#' pgs.import$pgs.weight.data$allelefrequency_effect <- rep(0.5, nrow(pgs.import$pgs.weight.data));
#' pgs.data <- apply.polygenic.score(
#'     vcf.data = vcf.import$split.wide.vcf.matrices,
#'     pgs.weight.data = pgs.import$pgs.weight.data,
#'     missing.genotype.method = c('none', 'mean.dosage', 'normalize'),
#'     use.external.effect.allele.frequency = TRUE
#'     );
#'
#' # Specify allele mismatch handling
#' pgs.data <- apply.polygenic.score(
#'    vcf.data = vcf.import$split.wide.vcf.matrices,
#'    pgs.weight.data = pgs.import$pgs.weight.data,
#'    correct.strand.flips = TRUE,
#'    remove.ambiguous.allele.matches = TRUE,
#'    remove.mismatched.indels = FALSE
#'    );
#'
#' # Provide phenotype data for basic correlation analysis
#' n.samples <- length(colnames(vcf.import$split.wide.vcf.matrices$genotyped.alleles))
#' phenotype.data <- data.frame(
#'     Indiv = colnames(vcf.import$split.wide.vcf.matrices$genotyped.alleles),
#'     continuous.phenotype = rnorm(n.samples),
#'     binary.phenotype = sample(
#'         c('a', 'b'),
#'         n.samples,
#'         replace = TRUE
#'         )
#'     );
#'
#' pgs.data <- apply.polygenic.score(
#'     vcf.data = vcf.import$split.wide.vcf.matrices,
#'     pgs.weight.data = pgs.import$pgs.weight.data,
#'     phenotype.data = phenotype.data
#'     );
#'
#' # Only run validation checks on input data and report back
#' apply.polygenic.score(
#'     vcf.data = vcf.import$split.wide.vcf.matrices,
#'     pgs.weight.data = pgs.import$pgs.weight.data,
#'     validate.inputs.only = TRUE
#'     );
#' @export
apply.polygenic.score <- function(
    vcf.data,
    vcf.long.format = FALSE,
    pgs.weight.data,
    phenotype.data = NULL,
    phenotype.analysis.columns = NULL,
    correct.strand.flips = TRUE,
    remove.ambiguous.allele.matches = FALSE,
    max.strand.flips = 0,
    remove.mismatched.indels = FALSE,
    output.dir = NULL,
    file.prefix = NULL,
    missing.genotype.method = 'mean.dosage',
    use.external.effect.allele.frequency = FALSE,
    n.percentiles = NULL,
    analysis.source.pgs = NULL,
    validate.inputs.only = FALSE
    ) {

    ### Start Input Validation ###

    validate.vcf.input(vcf.data = vcf.data, vcf.long.format = vcf.long.format);
    validate.pgs.data.input(
        pgs.weight.data = pgs.weight.data,
        use.external.effect.allele.frequency = use.external.effect.allele.frequency,
        correct.strand.flips = correct.strand.flips,
        remove.ambiguous.allele.matches = remove.ambiguous.allele.matches,
        remove.mismatched.indels = remove.mismatched.indels
        );
    validate.phenotype.data.input(phenotype.data = phenotype.data, phenotype.analysis.columns = phenotype.analysis.columns, vcf.data = vcf.data, vcf.long.format = vcf.long.format);

    if (validate.inputs.only) {
        message('Input data passed validation');
        return(TRUE);
        }

    # check missing genotype method input
    if (all(missing.genotype.method %in% c('mean.dosage', 'normalize', 'none'))) {
        missing.genotype.method <- missing.genotype.method;
        } else {
        stop('missing.genotype.method must be either "mean.dosage", "normalize", or "none"');
        }

    # check that n.percentiles is a mathematical integer
    if (!is.null(n.percentiles) && (n.percentiles %% 1 != 0)) {
        stop('n.percentiles must be an integer');
        }

    # check that analysis.source.pgs is NULL or a character string representing a missing genotype method
    if (!is.null(analysis.source.pgs)) {
        if (length(analysis.source.pgs) > 1) {
            stop('analysis.source.pgs must be one of the chosen missing genotype methods');
            }
        if (!(analysis.source.pgs %in% missing.genotype.method)) {
            stop('analysis.source.pgs must be one of the chosen missing genotype methods');
            }
        } else {
        # if no source is provided, set analysis.source.pgs to the first chosen method.
        analysis.source.pgs <- missing.genotype.method[1];
        }

    ### End Input Validation ###

    # handle long format VCF data by converting to wide
    if (vcf.long.format) {
        # convert long format VCF data to wide format
        vcf.data <- convert.long.vcf.to.wide.vcf(long.vcf = vcf.data);
        # set vcf.long.format to FALSE for further processing
        vcf.long.format <- FALSE;
        }

    # merge VCF and PGS data
    merged.vcf.with.pgs.data <- combine.vcf.with.pgs(
        vcf.data = vcf.data$vcf.fixed.fields,
        pgs.weight.data = pgs.weight.data
        )$merged.vcf.with.pgs.data;

    # Sort merged VCF data by allele.matrix.row.index with NAs last (in-place for efficiency)
    data.table::setorder(merged.vcf.with.pgs.data, allele.matrix.row.index, na.last = TRUE);

    # Pre-allocate a new, correctly-sized allele matrix filled with NAs
    merged.vcf.allele.matrix <- matrix(
        data = NA,
        nrow = nrow(merged.vcf.with.pgs.data),
        ncol = ncol(vcf.data$genotyped.alleles),
        dimnames = list(NULL, colnames(vcf.data$genotyped.alleles))
        );

    # Get the indices for non-missing variants from the original VCF allele matrix
    non.missing.vcf.indices <- merged.vcf.with.pgs.data[!is.na(allele.matrix.row.index),
                                                        as.numeric(allele.matrix.row.index)];

    # Get the indices for where the non-missing variants should go in the new matrix
    new.matrix.indices <- which(!is.na(merged.vcf.with.pgs.data$allele.matrix.row.index));

    # Fill the pre-allocated matrix with the correct data
    merged.vcf.allele.matrix[new.matrix.indices, ] <- vcf.data$genotyped.alleles[non.missing.vcf.indices, ];

    # Update row names of allele matrix to unique variant identifiers from merged data
    unique.var.id <- paste(
        merged.vcf.with.pgs.data$CHROM,
        merged.vcf.with.pgs.data$POS,
        merged.vcf.with.pgs.data$effect_allele,
        sep = ':'
        );
    rownames(merged.vcf.allele.matrix) <- unique.var.id;

    # save sample names from matrix
    sample.names <- colnames(merged.vcf.allele.matrix);

    # free up some memory
    rm(vcf.data);

    ### Start Allele Match Check ###
    if (remove.ambiguous.allele.matches || correct.strand.flips) {

        match.assessment <- ApplyPolygenicScore::assess.pgs.vcf.allele.match(
            vcf.ref.allele = merged.vcf.with.pgs.data$REF,
            vcf.alt.allele = merged.vcf.with.pgs.data$ALT,
            pgs.ref.allele = merged.vcf.with.pgs.data$other_allele,
            pgs.effect.allele = merged.vcf.with.pgs.data$effect_allele,
            return.ambiguous.as.missing = remove.ambiguous.allele.matches,
            max.strand.flips = max.strand.flips,
            return.indels.as.missing = remove.mismatched.indels
            );
        merged.vcf.with.pgs.data$effect_allele <- match.assessment$new.pgs.effect.allele;
        }

    # calculate dosage
    # in split matrix format, calculate dosage from allele matrix and save to new matrix
    dosage.matrix <- convert.alleles.to.pgs.dosage(
        called.alleles = merged.vcf.allele.matrix,
        risk.alleles = merged.vcf.with.pgs.data$effect_allele
        );

    ### Start Missing Genotype Handling ###
    if ('mean.dosage' %in% missing.genotype.method) {
        # calculate dosage to replace missing genotypes
        if (use.external.effect.allele.frequency) {
            # calculate mean dosage from user-provided allele frequency
            missing.genotype.dosage <- convert.allele.frequency.to.dosage(allele.frequency = merged.vcf.with.pgs.data$allelefrequency_effect);
            } else {
            # calculate mean dosage from allele matrix
            missing.genotype.dosage <- calculate.missing.genotype.dosage(dosage.matrix = dosage.matrix);
            }
        # replace missing genotypes with mean dosage
        mean.dosage.matrix <- dosage.matrix;
        missing.dosage.index <- is.na(mean.dosage.matrix);
        mean.dosage.matrix[missing.dosage.index] <- missing.genotype.dosage[row(mean.dosage.matrix)[missing.dosage.index]];
        }

    ### End Missing Genotype Handling ###

    # calculate weighted dosage
    if ('mean.dosage' %in% missing.genotype.method) {
        # calculate weighted dosage with replaced missing genotypes
        weighted.dosage.with.replaced.missing.matrix <- mean.dosage.matrix * merged.vcf.with.pgs.data$beta;
        # clean up the memory
        rm(mean.dosage.matrix);
        }
    if ('normalize' %in% missing.genotype.method || 'none' %in% missing.genotype.method) {
        # calculate weighted dosage without missing genotypes
        weighted.dosage.matrix <- dosage.matrix * merged.vcf.with.pgs.data$beta;
        }

    ### Start Multiallelic Site Handling ###
    # Apply multiallelic site handling function by CHROM POS groupings of data.table
    # Function returns a logical mask matrix for each multiallelic site indicating which alleles are non-risk and should be
    # excluded from dosage calculation.
    non.risk.multiallelic.entries.dt <- merged.vcf.with.pgs.data[
        ,
        get.non.risk.multiallelic.site.row(
            merged.vcf.with.pgs.data = .SD,
            vcf.long.format = vcf.long.format,
            original.df.row.index = .I,
            merged.vcf.allele.matrix = merged.vcf.allele.matrix,
            current.chrom = CHROM,
            current.pos = POS
            ),
        by = c('CHROM', 'POS')
        ];

    # Initialize a global mask matrix with FALSE values
    global.non.risk.mask <- matrix(
        FALSE,
        nrow = nrow(dosage.matrix),
        ncol = ncol(dosage.matrix)
        );

    # Populate the global mask by iterating through each multiallelic site's mask.
    # This loop runs only for each unique multiallelic site (CHROM, POS), not for every SNP.
    if (nrow(non.risk.multiallelic.entries.dt) > 0) {
        for (i in seq_len(nrow(non.risk.multiallelic.entries.dt))) {
            # Extract the global row indices for the current multiallelic site
            rows.for.site <- non.risk.multiallelic.entries.dt$original.df.row.index[[i]];
            # Extract the logical mask matrix for the current multiallelic site
            mask.for.site <- non.risk.multiallelic.entries.dt$mask.matrix[[i]];

            # Assign the site-specific mask to the correct rows in the global mask.
            # Using '|' (OR) ensures that if a row is marked TRUE as non-risk in any sub-mask,
            # it remains TRUE in the global mask.
            global.non.risk.mask[rows.for.site, ] <- global.non.risk.mask[rows.for.site, ] | mask.for.site;
            }
        }

    # Apply the single, global non-risk mask to the dosage matrices
    if ('mean.dosage' %in% missing.genotype.method) {
        weighted.dosage.with.replaced.missing.matrix[global.non.risk.mask] <- NA;
        }
    if ('normalize' %in% missing.genotype.method || 'none' %in% missing.genotype.method) {
        weighted.dosage.matrix[global.non.risk.mask] <- NA;
        }

    ### End Multiallelic Site Handling ###

    ### Start Missing SNP Count ###
    ploidy <- 2; # hard-coded ploidy for human diploid genome
    bialellic.variant.id <- paste(merged.vcf.with.pgs.data$CHROM, merged.vcf.with.pgs.data$POS, sep = ':');
    n.variant.in.pgs <- length(unique(bialellic.variant.id));

    # find NAs in dosage matrix
    na.dosage.matrix <- +is.na(dosage.matrix);
    # count NAs per sample per variant site
    missingness.matrix <- rowsum(na.dosage.matrix, group = bialellic.variant.id);
    # Any values above 1 represent multiallelics and are double-counted, set to one instead
    missingness.matrix[missingness.matrix > 1] <- 1;
    # Sum up missingness per sample (colSums)
    per.sample.missing.genotype.count <- colSums(missingness.matrix);
    per.sample.missing.genotype.percent <- round(per.sample.missing.genotype.count / n.variant.in.pgs, 2);
    per.sample.non.missing.genotype.count <- n.variant.in.pgs - per.sample.missing.genotype.count;
    per.sample.non.missing.allele.count <- per.sample.non.missing.genotype.count * ploidy; # assuming diploid genome
    # clean up memory
    rm(missingness.matrix, na.dosage.matrix, dosage.matrix);

    ### End Missing SNP Count ###

    ### Start PGS Application ###

    missing.method.to.colname.ref <- c(
        'mean.dosage' = 'PGS.with.replaced.missing',
        'normalize' = 'PGS.with.normalized.missing',
        'none' = 'PGS'
        );

    # Initialize output list
    pgs.output.list <- list();

    # Calculate colSums for weighted.dosage.matrix only once if multiple methods need it
    weighted.dosage.matrix.colsums <- NULL; # Initialize to NULL
    if (('none' %in% missing.genotype.method) || ('normalize' %in% missing.genotype.method)) {
        weighted.dosage.matrix.colsums <- colSums(weighted.dosage.matrix, na.rm = TRUE);
        }

    if ('none' %in% missing.genotype.method) {
        # calculate PGS per sample
        pgs.output.list$PGS <- weighted.dosage.matrix.colsums;
        }

    if ('normalize' %in% missing.genotype.method) {
        # calculate PGS per sample with normalized missing genotypes
        pgs.output.list$PGS.with.normalized.missing <- weighted.dosage.matrix.colsums;
        # divide sum by non-missing allele count
        pgs.output.list$PGS.with.normalized.missing <- pgs.output.list$PGS.with.normalized.missing / per.sample.non.missing.allele.count;
        # account for division by zero
        pgs.output.list$PGS.with.normalized.missing[is.nan(pgs.output.list$PGS.with.normalized.missing)] <- NA;
        }

    if ('mean.dosage' %in% missing.genotype.method) {
        # calculate PGS per sample with replaced missing genotypes
        pgs.output.list$PGS.with.replaced.missing <- colSums(weighted.dosage.with.replaced.missing.matrix, na.rm = TRUE);
        }

        # format outputs
        pgs.output <- data.table::data.table(Indiv = sample.names);

        # Add PGS columns by reference if they exist
        if (!is.null(pgs.output.list[['PGS']])) { # due to pattern-matching, cannot use pgs.output.list$PGS
            pgs.output[, PGS := pgs.output.list[['PGS']]];
            }
        if (!is.null(pgs.output.list$PGS.with.normalized.missing)) {
            pgs.output[, PGS.with.normalized.missing := pgs.output.list$PGS.with.normalized.missing];
            }
        if (!is.null(pgs.output.list$PGS.with.replaced.missing)) {
            pgs.output[, PGS.with.replaced.missing := pgs.output.list$PGS.with.replaced.missing];
            }
        # free up memory conditionally
        if ('mean.dosage' %in% missing.genotype.method) {
            rm(weighted.dosage.with.replaced.missing.matrix);
            }
        if ('normalize' %in% missing.genotype.method || 'none' %in% missing.genotype.method) {
            rm(weighted.dosage.matrix);
            }



    ### End PGS Application ###


    # retrieve pgs for statisitical analyses
    pgs.for.stats <- pgs.output[[missing.method.to.colname.ref[analysis.source.pgs]]];

    # calculate percentiles
    percentiles <- get.pgs.percentiles(pgs = pgs.for.stats, n.percentiles = n.percentiles);
    data.table::setDT(percentiles);

    pgs.output[, names(percentiles) := percentiles];

    # add missing genotype count
    pgs.output[, n.pgm.sites := n.variant.in.pgs];
    pgs.output[, n.missing.genotypes := per.sample.missing.genotype.count];
    pgs.output[, percent.missing.genotypes := per.sample.missing.genotype.percent];

    # add non-missing genotype count
    pgs.output[, n.non.missing.alleles := per.sample.non.missing.allele.count];

    # initialize regression output
    regression.output <- NULL;

    # merge PGS data with phenotype data by Indiv column
    if (!is.null(phenotype.data)) {
        data.table::setDT(phenotype.data);
        pgs.output <- merge(
            x = pgs.output,
            y = phenotype.data,
            by = 'Indiv',
            all.x = TRUE,
            all.y = TRUE
            );

        ### Begin Phenotype Analysis ###

        if (!is.null(phenotype.analysis.columns)) {
            # post merge data selection
            pgs.for.phenotype.stats <- pgs.output[[missing.method.to.colname.ref[analysis.source.pgs]]];

            regression.output <- run.pgs.regression(
                pgs = pgs.for.phenotype.stats,
                phenotype.data = pgs.output[ , phenotype.analysis.columns, with = FALSE]
                );
            }
        ### End Phenotype Analysis ###

        }
    rownames(pgs.output) <- 1:nrow(pgs.output);
    final.output <- list(
        pgs.output = as.data.frame(pgs.output),
        regression.output = regression.output
        );

    # If requested, write output to file
    if (!is.null(output.dir)) {

        if (is.null(file.prefix)) {
            file.prefix <- 'ApplyPolygenicScore';
            }

        write.apply.polygenic.score.output.to.file(
            apply.polygenic.score.output = final.output,
            output.dir = output.dir,
            file.prefix = file.prefix
            );
        }

    return(final.output);
    }
