#' Finding differentially abundant proteins across conditions in TMT experiment
#'
#' Tests for significant changes in protein abundance across conditions based on a family of linear mixed-effects models in TMT experiment.
#' Experimental design of case-control study (patients are not repeatedly measured) is automatically determined based on proper statistical model.
#'
#' @param data the output of \code{\link{proteinSummarization}} function. It is a list with data frames `FeatureLevelData` and `ProteinLevelData`
#' @param contrast.matrix Comparison between conditions of interests. 1) default is "pairwise", which compare all possible pairs between two conditions. 2) Otherwise, users can specify the comparisons of interest. Based on the levels of conditions, specify 1 or -1 to the conditions of interests and 0 otherwise. The levels of conditions are sorted alphabetically.
#' @param moderated TRUE will moderate t statistic; FALSE (default) uses ordinary t statistic.
#' @param adj.method adjusted method for multiple comparison. "BH" is default.
#' @param remove_norm_channel TRUE(default) removes "Norm" channels from protein level data.
#' @param remove_empty_channel TRUE(default) removes "Empty" channels from protein level data.
#' @param save_fitted_models logical, if TRUE, fitted models will be added to
#' @inheritParams .documentFunction
#'
#' @return a list that consists of the following elements:
#' (1) ComparisonResult: statistical testing results;
#' (2) FittedModel: the fitted linear models
#'
#' @export
#' @importFrom utils setTxtProgressBar txtProgressBar
#' @importFrom stats anova lm median model.matrix na.omit p.adjust pt
#' @importFrom MSstats MSstatsContrastMatrix
#'
#' @examples
#' data(input.pd)
#' # use protein.summarization() to get protein abundance data
#' quant.pd.msstats = proteinSummarization(input.pd,
#' method="msstats",
#' global_norm=TRUE,
#' reference_norm=TRUE)
#'
#' test.pairwise = groupComparisonTMT(quant.pd.msstats, moderated = TRUE)
#' head(test.pairwise$ComparisonResult)
#'
#' # Only compare condition 0.125 and 1
#' levels(quant.pd.msstats$ProteinLevelData$Condition)
#'
#' # Compare condition 1 and 0.125
#' comparison=matrix(c(-1,0,0,1),nrow=1)
#'
#' # Set the nafmes of each row
#' row.names(comparison)="1-0.125"
#'
#' # Set the column names
#' colnames(comparison)= c("0.125", "0.5", "0.667", "1")
#' test.contrast = groupComparisonTMT(data = quant.pd.msstats,
#' contrast.matrix = comparison,
#' moderated = TRUE)
#' head(test.contrast$ComparisonResult)
#'
groupComparisonTMT = function(
data, contrast.matrix = "pairwise",
moderated = FALSE, adj.method = "BH",
remove_norm_channel = TRUE, remove_empty_channel = TRUE,
save_fitted_models = FALSE,
use_log_file = TRUE, append = FALSE,
verbose = TRUE, log_file_path = NULL
){
MSstatsConvert::MSstatsLogsSettings(
use_log_file, append, verbose, log_file_path,
base = "MSstatsTMT_log_groupComparison_", pkg_name = "MSstatsTMT"
)
getOption("MSstatsTMTLog")("INFO", "MSstatsTMT - groupComparisonTMT function")
getOption("MSstatsTMTLog")("INFO", paste("Moderated t-stat :", moderated))
getOption("MSstatsTMTLog")("INFO", paste("Adjust p-value :", adj.method))
getOption("MSstatsTMTLog")("INFO", paste("Remove empty channels :",
remove_empty_channel))
getOption("MSstatsTMTLog")("INFO", paste("Remove normalization channels :",
remove_norm_channel))
summarized = MSstatsPrepareForGroupComparisonTMT(data$ProteinLevelData,
remove_norm_channel,
remove_empty_channel)
contrast_matrix = MSstats::MSstatsContrastMatrix(contrast.matrix,
unique(summarized$Group))
fitted_models = MSstatsFitComparisonModelsTMT(summarized)
FittedModel <- fitted_models$fitted_model
names(FittedModel) <- fitted_models$protein
fitted_models = MSstatsModerateTTest(summarized, fitted_models, moderated)
testing_results = MSstatsGroupComparisonTMT(fitted_models, contrast_matrix)
testing_results = MSstatsGroupComparisonOutputTMT(testing_results,
adj.method)
if(save_fitted_models){
list(ComparisonResult = testing_results,
ModelQC = NULL,
FittedModel = FittedModel)
} else{
list(ComparisonResult = testing_results,
ModelQC = NULL,
FittedModel = NULL)
}
}
#' Prepare output of proteinSummarization for group comparison
#'
#' @param input output of proteinSummarization
#' @param remove_norm_channel if TRUE, "Norm" channel will be removed
#' @param remove_empty_channel, if TRUE, empty channel will be removed
#'
#' @return data.table
#'
#' @keywords internal
#'
MSstatsPrepareForGroupComparisonTMT = function(input, remove_norm_channel,
remove_empty_channel) {
Condition <- Abundance <- NULL
input = data.table::as.data.table(input)
input = .checkGroupComparisonInput(input)
if (remove_empty_channel & is.element("Empty", unique(input$Condition))) {
input = input[Condition != "Empty",]
input$Condition = factor(input$Condition)
}
if (remove_norm_channel & is.element("Norm", unique(input$Condition))) {
input = input[Condition != "Norm",]
input$Condition = factor(input$Condition)
}
input = input[!is.na(Abundance),]
data.table::setnames(input, c("BioReplicate", "Condition"),
c("Subject", "Group"))
## Ting: need to change later for time course design
input$Group = factor(input$Group)
input$Subject = factor(input$Subject)
input$Run = factor(input$Run)
input$Channel = factor(input$Channel)
input$TechRepMixture = factor(input$TechRepMixture)
input$Mixture = factor(input$Mixture)
input$Protein = as.character(input$Protein)
input
}
#' Fit linear models for group comparison
#'
#' @param input output of the MSstatsPrepareForGroupComparisonTMT function
#'
#' @return list
#'
#' @keywords internal
#'
MSstatsFitComparisonModelsTMT = function(input) {
Abundance = Group = Protein = NULL
Channel = Mixture = Run = Subject = TechRepMixture = NULL
all_proteins = as.character(unique(input$Protein))
num_proteins = length(all_proteins)
linear_models = vector("list", num_proteins)
annotation = unique(input[, list(Run, Channel, Subject,
Group, Mixture, TechRepMixture)])
has_single_subject = .checkSingleSubject(annotation)
has_techreps = .checkTechReplicate(annotation)
has_biomixtures = .checkMulBioMixture(annotation)
has_single_run = .checkSingleRun(annotation)
has_Repeated_Measures <- .checkRepeatedMeasures(annotation)
if(has_biomixtures){
msg = paste0("Design: ", length(unique(annotation$Mixture)), " mixtures.")
} else{
msg = paste0("Design: 1 mixture.")
}
getOption("MSstatsTMTLog")("INFO", msg)
getOption("MSstatsTMTMsg")("INFO", msg)
if(has_techreps){
msg = paste0("Design: ", length(unique(annotation$TechRepMixture)),
" technical replicated MS runs per mixture.")
} else{
msg = paste0("Design: 1 MS run per mixture.")
}
getOption("MSstatsTMTLog")("INFO", msg)
getOption("MSstatsTMTMsg")("INFO", msg)
if(has_single_subject){
msg = paste0("Design: 1 subject per condition (No biological variation).")
getOption("MSstatsTMTLog")("INFO", msg)
getOption("MSstatsTMTMsg")("INFO", msg)
} else{
if(has_Repeated_Measures){
msg = paste0("Design: repeated measures design (A biological subject is measured in multiple conditions).")
} else{
msg = paste0("Design: group comparison design (Different conditions contains different biological subjects).")
}
getOption("MSstatsTMTLog")("INFO", msg)
getOption("MSstatsTMTMsg")("INFO", msg)
}
msg = paste0("Model fitting for ", num_proteins , " proteins.")
getOption("MSstatsTMTLog")("INFO", msg)
getOption("MSstatsTMTMsg")("INFO", msg)
pb = txtProgressBar(max=num_proteins, style=3)
for (i in seq_along(all_proteins)) {
protein = all_proteins[i]
single_protein = input[Protein == protein]
model_fit_result = MSstatsComparisonModelSingleTMT(single_protein,
protein)
linear_models[[i]] = model_fit_result
setTxtProgressBar(pb, i)
}
close(pb)
rbindlist(linear_models)
}
#' Fit a linear model for group comparison for a single protein
#'
#' @param single_protein protein-level data for a single protein (single element
#' of list created by the MSstatsPrepareForGroupComparisonTMT function)
#' @param protein_name name of a protein from the single_protein data.table
#'
#' @return list
#'
#' @keywords internal
#'
MSstatsComparisonModelSingleTMT = function(single_protein, protein_name) {
Run <- Channel <- Subject <- Group <- Mixture <- TechRepMixture <- NULL
annotation = unique(single_protein[, list(Run, Channel, Subject,
Group, Mixture, TechRepMixture)])
has_single_subject = .checkSingleSubject(annotation)
has_techreps = .checkTechReplicate(annotation)
has_biomixtures = .checkMulBioMixture(annotation)
has_single_run = .checkSingleRun(annotation)
has_Repeated_Measures <- .checkRepeatedMeasures(annotation)
fitted_model = .fitModelTMT(single_protein, has_single_subject,
has_techreps, has_biomixtures, has_single_run,
has_Repeated_Measures)
result = .addVarianceInformation(fitted_model, protein_name)
result
}
#' Moderate T statistic for group comparison
#'
#' @param summarized protein-level data produced by the proteinSummarization function
#' @param fitted_models output of the MSstatsFitComparisonModelsTMT function
#' @param moderated if TRUE, moderation will be performed
#'
#' @return list
#'
#' @keywords internal
#'
MSstatsModerateTTest = function(summarized, fitted_models, moderated) {
variance_df <- variance <- Protein <- NULL
eb_input_s2 = fitted_models[variance_df != 0 & !is.na(variance_df),
variance]
eb_input_df = fitted_models[variance_df != 0 & !is.na(variance_df),
variance_df]
if (moderated) {
eb_fit = limma::squeezeVar(eb_input_s2, eb_input_df, legacy = TRUE)
df_prior = eb_fit$df.prior
variance_prior = eb_fit$var.prior
} else { ## ordinary t statistic
variance_prior = 0
df_prior = 0
}
fitted_models$df_prior = df_prior
fitted_models$variance_prior = variance_prior
fitted_models$total_df <- sum(eb_input_df, na.rm=TRUE)
result = lapply(split(fitted_models, fitted_models$protein), as.list)
result = lapply(result, function(single_fitted_model) {
protein = single_fitted_model$protein
single_fitted_model$data = summarized[Protein == protein]
single_fitted_model
})
}
#' Group comparison for TMT data
#'
#' @param fitted_models output of the MSstatsModerateTTest function
#' @param contrast_matrix contrast matrix
#'
#' @return data.table
#'
#' @keywords internal
#'
MSstatsGroupComparisonTMT = function(fitted_models, contrast_matrix) {
msg = paste0("Testing for ", length(fitted_models) , " proteins:")
getOption("MSstatsTMTLog")("INFO", msg)
getOption("MSstatsTMTMsg")("INFO", msg)
testing_results = vector("list", length(fitted_models))
pb = txtProgressBar(max = length(fitted_models), style = 3)
for (i in seq_along(fitted_models)) {
testing_result = MSstatsTestSingleProteinTMT(fitted_models[[i]],
contrast_matrix)
testing_results[[i]] = testing_result
setTxtProgressBar(pb, i)
}
close(pb)
testing_results
}
#' Hypothesis tests for a single protein in TMT data
#'
#' @param fitted_model single element of the MSstatsModerateTTest output
#' @param contrast_matrix contrast matrix
#'
#' @return list
#'
#' @keywords internal
#'
MSstatsTestSingleProteinTMT = function(fitted_model, contrast_matrix) {
single_protein = fitted_model[["data"]]
groups = as.character(unique(single_protein$Group))
groups = sort(groups)
coefs = fitted_model[["coefs"]][[1]]
s2 = fitted_model[["variance"]]
s2_df = fitted_model[["variance_df"]]
fit = fitted_model[["fitted_model"]][[1]]
s2_prior = fitted_model[["variance_prior"]]
df_prior = fitted_model[["df_prior"]]
total_df = fitted_model[["total_df"]]
protein = fitted_model[["protein"]]
results = vector("list", nrow(contrast_matrix))
if (!inherits(fit, "try-error")) {
if (is.infinite(df_prior)) {
s2_posterior <- s2_prior
} else{
s2_posterior = (s2_prior * df_prior + s2 * s2_df) / (df_prior + s2_df)
}
for (row_id in seq_len(nrow(contrast_matrix))) {
contrast = contrast_matrix[row_id, , drop = FALSE]
positive.groups = colnames(contrast)[contrast > 0]
negative.groups = colnames(contrast)[contrast < 0]
if (any(positive.groups %in% groups) &
any(negative.groups %in% groups)) {
cm = .makeContrastSingleTMT(fit, contrast, single_protein, coefs)
FC = (cm %*% coefs)[1, 1]
if (inherits(fit, "lm")) {
se2.post = diag(t(cm) %*% summary(fit)$cov.unscaled %*% cm) * s2_posterior
df.post = s2_df + df_prior
} else {
# Acknowlege: Tyler Bradshawthis contributed to this part of implementation
vcov = fit@vcov_beta
se2 = as.matrix(t(cm) %*% as.matrix(vcov) %*% cm)
## calculate posterior variance
vcov.post = fit@pp$unsc() * s2_posterior
se2.post = as.matrix(t(cm) %*% as.matrix(vcov.post) %*% cm)
## calculate posterior df
g <- sapply(fit@Jac_list, function(gm) cm %*% gm %*% cm)
denom <- try(t(g) %*% fit@vcov_varpar %*% g, silent=TRUE)
if (inherits(denom, "try-error")) {
df.post = s2_df + df_prior
} else{
df.post = 2*(se2)^2/denom + df_prior
}
}
df.post <- pmin(df.post, total_df)
t = FC / sqrt(se2.post)
p = 2*pt(-abs(t), df = df.post)
se = sqrt(se2.post)
DF = df.post
if (s2_df == 0) {
issue = "SingleMeasurePerCondition"
} else{
issue = NA
}
} else {
result = .handleMissingConditionTMT(single_protein, contrast[1, ])
FC = result[["logFC"]]
p = NA
se = NA
DF = NA
issue = result[["issue"]]
}
results[[row_id]] = list(Protein = protein, Comparison = row.names(contrast),
log2FC = FC, pvalue = p, SE = se, DF = DF, issue = issue)
# This code cannot be extracted to a new function yet due to
# performance issues with environments
}
} else {
# very few measurements so that the model is unfittable
for (row_id in 1:nrow(contrast_matrix)) {
contrast = contrast_matrix[row_id, , drop = FALSE]
result = .handleMissingConditionTMT(single_protein, contrast[1, ])
results[[row_id]] = list(Protein = protein,
Comparison = row.names(contrast),
log2FC = result$logFC, pvalue = NA, SE = NA,
DF = NA, issue = result$issue)
}
}
data.table::rbindlist(results)
}
#' Combine testing results for individual proteins
#'
#' @param testing_results output of the MSstatsGroupComparisonTMT function
#' @param adj_method method that will be used to adjust p-values for multiple comparisons
#'
#' @return data.table
#'
#' @keywords internal
#'
MSstatsGroupComparisonOutputTMT = function(testing_results, adj_method) {
adj.pvalue <- pvalue <- Protein <- Comparison <- log2FC <- SE <- DF <- issue <- NULL
testing_results = data.table::rbindlist(testing_results)
testing_results$Protein = as.factor(testing_results$Protein)
testing_results[, adj.pvalue := p.adjust(pvalue, adj_method),
by = "Comparison"]
testing_results[, list(Protein, Label = Comparison, log2FC, SE, DF,
pvalue, adj.pvalue, issue)]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.