R/utils_groupcomparison.R

Defines functions .groupComparisonWithSingleCore .groupComparisonWithMultipleCores .countMissingPercentage .getContrast .handleSingleContrast .handleEmptyConditions .getAllComparisons .getEmptyComparison .getModelParameters .fitModelForGroupComparison .fitModelSingleProtein .prepareSingleProteinForGC getSamplesInfo checkRepeatedDesign

Documented in checkRepeatedDesign .countMissingPercentage .fitModelForGroupComparison .fitModelSingleProtein .getAllComparisons .getContrast .getEmptyComparison .getModelParameters getSamplesInfo .groupComparisonWithMultipleCores .groupComparisonWithSingleCore .handleEmptyConditions .handleSingleContrast .prepareSingleProteinForGC

#' Check if data represents repeated measurements design
#' 
#' @param summarization_output output of the dataProcess function
#' 
#' @return logical, TRUE if data represent repeated measurements design
#' 
#' @details This extracts information required by the group comparison workflow
#' 
#' @export
#' 
#' @examples
#' QuantData1 <- dataProcess(SRMRawData, use_log_file = FALSE)
#' checkRepeatedDesign(QuantData1)
#' 
checkRepeatedDesign = function(summarization_output) {
    SUBJECT = GROUP = NULL
    
    input = as.data.table(summarization_output$ProteinLevelData)
    subject_by_group = table(input[, list(SUBJECT, GROUP)])
    subject_appearances = apply(subject_by_group, 1, function(x) sum(x > 
                                                                         0))
    repeated = any(subject_appearances > 1)
    if (repeated) {
        msg = "Time course design of experiment - okay"
    }
    else {
        msg = "Case control design of experiment - okay"
    }
    getOption("MSstatsLog")("INFO", msg)
    repeated
}

#' Get information about number of measurements for each group
#' 
#' @param summarization_output output of the dataProcess function
#' 
#' @return data.table
#' 
#' @details This function extracts information required to compute percentages
#' of missing and imputed values in group comparison.
#' 
#' @export
#' 
#' @examples
#' QuantData <- dataProcess(DDARawData, use_log_file = FALSE)
#' samples_info <- getSamplesInfo(QuantData)
#' samples_info
#' 
getSamplesInfo = function(summarization_output) {
    RUN = NULL
    
    summarized = as.data.table(summarization_output$ProteinLevelData)
    summarized[, list(NumRuns = data.table::uniqueN(RUN)),
               by = "GROUP"]
}


#' Prepare data for a single protein for group comparison
#' @param single_protein data.table
#' @keywords internal
.prepareSingleProteinForGC = function(single_protein) {
    ABUNDANCE = GROUP = SUBJECT = RUN = NULL
    
    data.table::setnames(single_protein,
                         c("LogIntensities"),
                         c("ABUNDANCE"),
                         skip_absent = TRUE)
    single_protein = single_protein[!is.na(ABUNDANCE)]
    single_protein[, GROUP := factor(GROUP)]
    single_protein[, SUBJECT := factor(SUBJECT)]
    single_protein[, RUN := factor(RUN)]
    single_protein
}


#' Fit model and perform group comparison for a single protein
#' @param input data.table of summarized data
#' @param contrast_matrix contrast matrix
#' @param has_tech_replicates if TRUE, there are technical replicates
#' @param is_single_subject if TRUE, experiment consists of a single subject
#' @param repeated if TRUE, experiment consists of repeated measurements
#' @param groups unique labels for experimental conditions
#' @param samples_info number of runs per group
#' @param save_fitted_models if TRUE, fitted model will be saved. If FALSE,
#' it will be replaced by NULL
#' @param has_imputed if TRUE, missing values have been imputed by dataProcess
#' @importFrom stats resid fitted
#' @keywords internal
.fitModelSingleProtein = function(input, contrast_matrix, has_tech_replicates,
                                  is_single_subject, repeated, groups,
                                  samples_info,
                                  save_fitted_models, has_imputed) {
    GROUP = SUBJECT = NULL
    
    input[, GROUP := factor(GROUP)]
    input[, SUBJECT := factor(SUBJECT)]
    
    protein = unique(input$Protein)
    n_groups = nlevels(input$GROUP)
    if (n_groups == 1) {
        result = .getEmptyComparison(input, contrast_matrix,
                                     groups, protein)
        residuals = NA
        fitted_values = NA
        fit = NULL
    } else {
        fitted_model = .fitModelForGroupComparison(input, repeated,
                                                   is_single_subject,
                                                   has_tech_replicates)
        result = .getAllComparisons(input, fitted_model, contrast_matrix,
                                    groups, protein)
        result = .countMissingPercentage(contrast_matrix,
                                         input, result, samples_info,
                                         has_imputed)
        
        if (inherits(fitted_model[["full_fit"]], "lm")) {
            residuals = fitted_model[["full_fit"]][["residuals"]]
            fitted_values = fitted_model[["full_fit"]][["fitted.values"]]
        } else {
            residuals = resid(fitted_model[["full_fit"]])
            fitted_values = fitted(fitted_model[["full_fit"]])
        }
        fit = fitted_model[["full_fit"]]
    }
    
    if (!save_fitted_models) {
        fit = NULL
    }
    
    input[, residuals := residuals]
    input[, fitted := fitted_values]
    list(result, fit)
}


#' Choose a model type (fixed/mixed effects) and fit it for a single protein
#' @inheritParams .fitModelSingleProtein
#' @keywords internal
.fitModelForGroupComparison = function(input, repeated, is_single_subject,
                                       has_tech_replicates) {
    if (!repeated) {
        if (!has_tech_replicates | is_single_subject) {
            full_fit = lm(ABUNDANCE ~ GROUP, data = input)
            df_full = full_fit[["df.residual"]]
        } else {
            full_fit = suppressMessages(try(
                lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT), data = input),
                TRUE
            ))
            df_full = suppressMessages(try(
                lm(ABUNDANCE ~ GROUP + SUBJECT, data = input)$df.residual,
                TRUE
            ))
        }
    } else {
        ## time-course
        if (is_single_subject) {
            full_fit = lm(ABUNDANCE ~ GROUP,
                          data = input)
            df_full = full_fit$df.residual
        } else {
            ## no single subject
            if (!has_tech_replicates) {
                full_fit = suppressMessages(try(
                    lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT), data = input),
                    TRUE
                ))
                df_full = suppressMessages(try(
                    lm(ABUNDANCE ~ GROUP + SUBJECT, data = input)$df.residual,
                    TRUE))
            } else {
                full_fit = suppressMessages(try(
                    lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT) + (1|GROUP:SUBJECT),
                               data = input),
                    TRUE
                ))
                df_full = suppressMessages(try(
                    lm(ABUNDANCE ~ GROUP + SUBJECT + GROUP:SUBJECT,
                       data = input)$df.residual,
                    TRUE
                ))
            }
        }
    }
    list(full_fit = full_fit,
         df_full = df_full)
}


#' Get params (coefficients, covariance matrix, degrees of freedom) from a model
#' @param fitted_model object of class lm or lmerMod
#' @importFrom stats vcov
#' @importFrom methods is
#' @keywords internal
.getModelParameters = function(fitted_model) {
    if (is(fitted_model[["full_fit"]], "lm")) {
        model_summary = summary(fitted_model[["full_fit"]])
        cf = model_summary[["coefficients"]]
        vcv = model_summary[["cov.unscaled"]] * (model_summary[["sigma"]] ^ 2)
    } else {
        cf = as.matrix(lme4::fixef(fitted_model[["full_fit"]]))
        vcv = as.matrix(vcov(fitted_model[["full_fit"]]))
    }
    list(cf = cf, vcv = vcv, df = fitted_model[["df_full"]])
}


#' Comparison output when there are measurements only in a single condition
#' @param input summarized data
#' @param contrast_matrix contrast matrix
#' @param groups unique labels of experimental conditions
#' @param protein name of a protein
#' @keywords internal
.getEmptyComparison = function(input, contrast_matrix, groups, protein) {
    all_comparisons = lapply(seq_len(nrow(contrast_matrix)), function(row_id) {
        ith_comparison = contrast_matrix[row_id, , drop = FALSE]
        
        if (any(groups[ith_comparison != 0] %in% unique(input$GROUP))) {
            msg = paste("*** error: results of protein", protein,
                        "for comparison", row.names(ith_comparison),
                        "are NA because there are measurements",
                        "only in a single group")
            getOption("MSstatsLog")("INFO", msg)
            
            if (ith_comparison[ith_comparison != 0 &
                               (groups %in% unique(input$GROUP))] > 0) {
                list(
                    logFC = Inf,
                    issue = "oneConditionMissing"
                )
            } else {
                list(
                    logFC = -Inf,
                    issue = "oneConditionMissing"
                )
            }
        } else {
            msg = paste("*** error: results of protein", protein,
                        "for comparison", row.names(ith_comparison),
                        "are NA because there are no measurements",
                        "in both conditions.")
            getOption("MSstatsLog")("INFO", msg)
            
            list(logFC = NA,
                 issue = "completeMissing")
        }
    })
    empty_result = data.table::rbindlist(all_comparisons, fill = TRUE)
    empty_result = cbind(empty_result,
                         data.table::data.table(
                             Protein = protein, 
                             Label = row.names(contrast_matrix),
                             SE = NA, Tvalue = NA,
                             DF = NA, pvalue = NA
                         ))
    empty_result
}


#' Get all comparisons for a single protein and a contrast matrix
#' @param input summarized data
#' @param fitted_model model fitted by the .fitModelForGroupComparison function
#' @param contrast_matrix contrast matrix
#' @param groups unique labels of experimental conditions
#' @param protein name of a protein
#' @keywords internal
.getAllComparisons = function(input, fitted_model, contrast_matrix,
                              groups, protein) {
    empty_conditions = setdiff(groups, unique(input$GROUP))
    parameters = .getModelParameters(fitted_model)
    fit = fitted_model[["full_fit"]]
    coefs = parameters$cf[, 1]
    
    all_comparisons = vector("list", nrow(contrast_matrix))
    for (row_id in seq_len(nrow(contrast_matrix))) {
        ith_contrast = contrast_matrix[row_id, , drop = FALSE]
        if (length(empty_conditions) != 0) {
            result = .handleEmptyConditions(input, fit, ith_contrast,
                                            groups, parameters, protein,
                                            empty_conditions, coefs)
        } else {
            result = .handleSingleContrast(input, fit, ith_contrast, groups,
                                           parameters, protein, coefs)
        }
        all_comparisons[[row_id]] = result
    }
    data.table::rbindlist(all_comparisons, fill = TRUE)
}


#' Handle contrast when some of the conditions are missing
#' @param input summarized data
#' @param contrast single row of a contrast matrix
#' @param groups unique labels of experimental conditions
#' @param parameters parameters extracted from the model 
#' @param protein name of a protein
#' @param empty_conditions labels of empty conditions
#' @param coefs coefficient of the fitted model
#' @keywords internal
.handleEmptyConditions = function(input, fit, contrast,
                                  groups, parameters, protein,
                                  empty_conditions, coefs) {
    count_diff_pos = intersect(colnames(contrast)[contrast != 0 & contrast > 0],
                               empty_conditions)
    count_diff_neg = intersect(colnames(contrast)[contrast != 0 & contrast < 0],
                               empty_conditions)
    flag_issue_pos = length(count_diff_pos) != 0
    flag_issue_neg = length(count_diff_neg) != 0
    
    if (any(c(flag_issue_pos, flag_issue_neg))) {
        if (flag_issue_pos & flag_issue_neg) {
            issue = "completeMissing"
            logFC = NA
        } else if (flag_issue_pos & !flag_issue_neg) {
            issue_side = count_diff_pos
            logFC = -Inf
            issue = "oneConditionMissing"
        } else if (!flag_issue_pos & flag_issue_neg) {
            issue_side = count_diff_neg
            logFC = Inf
            issue = "oneConditionMissing"
        }
        result = list(Protein = protein,
                      logFC = logFC,
                      Label = row.names(contrast),
                      SE = NA, Tvalue = NA, DF = NA, pvalue = NA,
                      issue = issue)
    } else {
        result = .handleSingleContrast(input, fit, contrast, groups,
                                       parameters, protein, coefs)
    }
    result
}


#' Group comparison for a single contrast
#' @inheritParams .handleEmptyConditions
#' @keywords internal
.handleSingleContrast = function(input, fit, contrast, groups,
                                 parameters, protein, coefs) {
    groups = sort(groups)
    contrast_values = .getContrast(input, contrast, coefs, groups)
    parameters$cf = parameters$cf[names(contrast_values), , drop = FALSE]
    parameters$vcv = parameters$vcv[names(contrast_values), names(contrast_values)]
    result = get_estimable_fixed_random(parameters, contrast_values)
    if (is.null(result)) {
        result = list(Protein = protein,
                      Label = row.names(contrast),
                      logFC = NA, SE = NA, Tvalue = NA,
                      DF = NA, pvalue = NA, issue = NA)
    } else {
        result$Protein = protein
        result$Label = row.names(contrast)
        result$issue = NA
    }
    result
}


#' Create a contrast for a model with only group as a fixed effect
#' @param input summarized data for a single protein
#' @param contrast_matrix row of a contrast_matrix
#' @param coefs coefficients of a linear model (named vector)
#' @param groups unique group labels
#' @keywords internal
.getContrast = function(input, contrast, coefs, groups) {
    coef_names = names(coefs)
    intercept = grep("Intercept", coef_names, value = TRUE)
    if (length(intercept) > 0) {
        intercept_term = rep(0, length(intercept))
        names(intercept_term) = intercept
    } else {
        intercept_term = NULL
    }
    group = grep("GROUP", coef_names, value = TRUE)
    interaction = grep(":", coef_names, value = TRUE)
    group = setdiff(group, interaction)
    if (length(group) > 0) {
        group_term = contrast[, as.character(groups[groups %in% unique(input$GROUP)])]
        names(group_term) = paste0("GROUP", names(group_term))
        group_term = group_term[-1]
    } else {
        group_term = NULL
    }
    contrast = c(intercept_term, group_term)
    contrast[!is.na(coefs)]
}


#' Count percentage of missing values in given conditions
#' @param contrast_matrix contrast matrix
#' @param summarized data.table summarized by the dataProcess function
#' @param result result of groupComparison
#' @param samples_info number of runs per group
#' @param has_imputed if TRUE, missing values have been imputed by dataProcess
#' @keywords internal
.countMissingPercentage = function(contrast_matrix, summarized, 
                                   result, samples_info, has_imputed) {
    TotalGroupMeasurements = NumMeasuredFeature = NumImputedFeature = NULL
    NumFea = NumRuns = totalN = NULL
        
        counts = summarized[,
                            list(totalN = unique(TotalGroupMeasurements),
                                 NumMeasuredFeature = sum(NumMeasuredFeature, 
                                                          na.rm = TRUE),
                                 NumImputedFeature = sum(NumImputedFeature, 
                                                         na.rm = TRUE)),
                            by = "GROUP"]
    
    empty_conditions = setdiff(samples_info$GROUP, unique(counts$GROUP))
    if (length(empty_conditions) !=0) {
        counts = merge(samples_info, counts, by = "GROUP", all.x = TRUE)
        counts[, NumFea := totalN / NumRuns ] # calculate number of features, to get the expected number of measurements in missing conditions
        nofea = max(ceiling(counts$NumFea), na.rm = TRUE) # it should be integer, just in case of double, use ceiling to get interger
        counts[is.na(totalN), NumMeasuredFeature := 0]
        counts[is.na(totalN), NumImputedFeature := 0]
        counts[is.na(totalN), totalN := NumRuns * nofea]
    }
    
    missing_vector = numeric(nrow(contrast_matrix))
    imputed_vector = numeric(nrow(contrast_matrix))
    for (i in seq_len(nrow(contrast_matrix))) {
        conditions = contrast_matrix[i, ] != 0
        missing_percentage = 1 - sum(counts$NumMeasuredFeature[conditions],
                                     na.rm = TRUE) / sum(counts$totalN[conditions],
                                                         na.rm = TRUE)
        if (has_imputed) {
            imputed_percentage = sum(counts$NumImputedFeature[conditions],
                                     na.rm = TRUE) / sum(counts$totalN[conditions],
                                                         na.rm = TRUE)
            imputed_vector[i] = imputed_percentage
        }
        missing_vector[i] = missing_percentage
    }
    result$MissingPercentage = missing_vector
    if (has_imputed) {
        result$ImputationPercentage = imputed_vector
    }
    result
}

#' Perform group comparison per protein in parallel
#' @param summarized_list output of MSstatsPrepareForGroupComparison
#' @param contrast_matrix contrast matrix
#' @param save_fitted_models if TRUE, fitted models will be included in the output
#' @param repeated logical, output of checkRepeatedDesign function
#' @param samples_info data.table, output of getSamplesInfo function
#' @param numberOfCores Number of cores for parallel processing. 
#' A logfile named `MSstats_groupComparison_log_progress.log` is created to 
#' track progress. Only works for Linux & Mac OS.
#' @importFrom parallel makeCluster clusterExport parLapply stopCluster
#' @keywords internal
.groupComparisonWithMultipleCores = function(summarized_list, contrast_matrix,
                                             save_fitted_models, repeated, samples_info, 
                                             numberOfCores) {
    groups = colnames(contrast_matrix)
    has_imputed = attr(summarized_list, "has_imputed")
    all_proteins_id = seq_along(summarized_list)
    function_environment = environment()
    cl = parallel::makeCluster(numberOfCores)
    parallel::clusterExport(cl, c("MSstatsGroupComparisonSingleProtein", 
                                  "contrast_matrix", "repeated", "groups", 
                                  "samples_info", "save_fitted_models", "has_imputed"), 
                            envir = function_environment)
    cat(paste0("Number of proteins to process: ", length(all_proteins_id)), 
        sep = "\n", file = "MSstats_groupComparison_log_progress.log")
    test_results = parallel::parLapply(cl, all_proteins_id, function(i) {
        if (i %% 100 == 0) {
            cat("Finished processing an additional 100 protein comparisons", 
                sep = "\n", file = "MSstats_groupComparison_log_progress.log", append = TRUE)
        }
        MSstatsGroupComparisonSingleProtein(
            summarized_list[[i]], contrast_matrix, repeated,
            groups, samples_info, save_fitted_models, has_imputed
        )
    })
    parallel::stopCluster(cl)
    test_results
}

#' Perform group comparison per protein iteratively with a single loop
#' @param summarized_list output of MSstatsPrepareForGroupComparison
#' @param contrast_matrix contrast matrix
#' @param save_fitted_models if TRUE, fitted models will be included in the output
#' @param repeated logical, output of checkRepeatedDesign function
#' @param samples_info data.table, output of getSamplesInfo function
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @keywords internal
.groupComparisonWithSingleCore = function(summarized_list, contrast_matrix,
                                             save_fitted_models, repeated, 
                                             samples_info) {
    groups = colnames(contrast_matrix)
    has_imputed = attr(summarized_list, "has_imputed")
    all_proteins_id = seq_along(summarized_list)
    test_results = vector("list", length(all_proteins_id))
    pb = txtProgressBar(max = length(all_proteins_id), style = 3)
    for (i in all_proteins_id) {
        comparison_outputs = MSstatsGroupComparisonSingleProtein(
            summarized_list[[i]], contrast_matrix, repeated,
            groups, samples_info, save_fitted_models, has_imputed
        )
        test_results[[i]] = comparison_outputs
        setTxtProgressBar(pb, i)
    }
    close(pb)
    test_results
}
Vitek-Lab/MSstats documentation built on Nov. 4, 2024, 3:08 p.m.