R/quantifyViaDecoupleR.R

Defines functions plotSampleScores permuteFeatures decoupleFeatures scoreSamples

Documented in decoupleFeatures permuteFeatures plotSampleScores scoreSamples

#' @title scoreSamples
#'
#' @description 
#' This function performs an enrichment analysis to test whether a given
#' proteomic sample is contaminated with markers from apoptosis or necroptosis.
#'
#' @details 
#' The parameter \code{values} has to have (gene) symbols as \code{row.names}.
#' \code{values} can be either a numeric \code{vector}, \code{matrix}, or 
#' \code{data.frame}. It holds the normalized intensity values of features 
#' (e.g. proteins).
#' 
#' The signatures are taken per default from Tanzer et al. (2020), but also 
#' a \code{tibble} containing the contamination markers can be given to the 
#' function as argument \code{signatures}. 
#' In that case the \code{tibble} has to contain at least the 
#' column \code{"feature"} that contains the marker features (e.g. proteins).  
#' 
#' The function will either run \code{decoupleFeatures} or 
#' \code{permuteFeatures} to calculate scores. The behaviour will be triggered
#' by the presence of the column \code{value} in \code{signatures} (in that 
#' case, \code{decoupleFeatures} is run).
#' 
#' @param values numeric \code{vector}, \code{matrix}, or \code{data.frame}
#' @param contamination \code{character(1)}, either \code{"apoptosis"} or 
#' \code{"necroptosis"}
#' @param ... further arguments passed to \code{readMarkers}
#'  
#' @importFrom methods formalArgs
#' @importFrom stats na.omit
#' @importFrom decoupleR decouple
#' 
#' @export
#' 
#' @return data.frame, containining normalized enrichment (mean) scores for the
#' contamination signature (the higher the score, the higher the potential
#' contamination). If there is no quantitative information on changes 
#' (e.g. fold changes or t-values), the function will return an empirical
#' distribution of difference of means of the markers to all features in the 
#' data set
#' 
#' @references
#' Tanzer et al. (2020): Quantitative and Dynamic Catalogs of Proteins 
#' Released during Apoptotic and Necroptotic Cell Death. 
#' \emph{Cell Reports}, 30,  1260-1270.e5. 10.1016/j.celrep.2019.12.079.
#' 
#' @examples
#' library(SummarizedExperiment)
#' 
#' f <- system.file("protein_datasets/tanzer2020.RDS", 
#'     package = "apoptosisQuantification")
#' tanzer2020 <- readRDS(f)
#' prot <- assay(tanzer2020)
#'
#' ## use the intensities to calculate scores, the function will load the 
#' ## data set of Tanzer et al. (2020)
#' scoreSamples(values = prot, contamination = "apoptosis", n_perm = 100)
#' 
#' ## use the contamination scores of Tanzer et al. (2020), but with specified
#' ## fc and n_significan
scoreSamples <- function(values, contamination = c("apoptosis", "necroptosis"),
    ...) {

    ## match the contamination argument
    contamination <- match.arg(contamination)

    ## obtain the contamination (apoptoss, necroptosis) markers and prepare the
    ## object for decoupleR
    args_fct <- list(...)
    args_fct[["type"]] <- contamination

    ## match the arguments ... with the arguments of readMarkers
    args <- args_fct[names(args_fct) %in% methods::formalArgs("readMarkers")]
    if (!("signatures" %in% names(args_fct))) {
        args_fct[["signatures"]] <- do.call("readMarkers", args)
    }

    if (is.vector(values)) {
        values <- values |> 
            as.data.frame()
        colnames(values) <- "sample"
    }

    ## z-scale
    if ("scale" %in% names(args_fct))
        if (args_fct[["scale"]]) 
            values <- scale(values)

    ## fix multiple assignments: if there are multiple assignments write only
    ## one marker feature as feature_names
    feature_names <- splitNames(feature_names = rownames(values), na.rm = TRUE)
    ind_markers <- lapply(feature_names, 
        function(names_i) names_i %in% args_fct[["signatures"]]$feature)
    feature_names <- lapply(seq_along(feature_names),
        function(i) {
            ind_i <- which(ind_markers[[i]])
            if (length(ind_i) > 0) feature_names[[i]][min(ind_i)]
            else feature_names[[i]][1]
    })
    feature_names <- unlist(feature_names)

    ## convert the matrix to a data.frame, remove duplicated entries
    values <- values |> as.data.frame()
    feature_names_rem <- feature_names[!duplicated(feature_names)]
    values <- values[!duplicated(feature_names), ]
    rownames(values) <- feature_names_rem

    ## define the arguments for decouple
    if (!("n_perm" %in% names(args_fct))) 
        args_fct[["n_perm"]] <- 1000

    ## depening if the column value is present in signatures, either run the
    ## function decoupleFeatures or permuteFeatures (otherwise)
    if ("change" %in% colnames(args_fct[["signatures"]])) {
        score_values <- decoupleFeatures(values = values, args_fct = args_fct,
            contamination = contamination)
    } else {
        score_values <- permuteFeatures(values = values, args_fct = args_fct,
            contamination = contamination)
    }

    score_values
}

#' @name decoupleFeatures
#' 
#' @title Calculate score values using decoupleR/run_wmean
#' 
#' @description 
#' The function \code{decoupleFeatures} calculates contamination scores
#' using the \code{run_wmean} function from the \code{decoupleR}. The 
#' function will return score values (per sample) that can be interpreted as the 
#' standard deviations away from an empirical null distribution.
#' 
#' @details 
#' The function is insprired by the work of Aurelien Dugourd and his 
#' plasmaContamination package 
#' (https://github.com/saezlab/plasmaContamination/).
#'
#' The function will be called in \code{scoreSamples}.
#' 
#' @param values matrix, the rownames contain the feature names (SYMBOL ids) and
#' colnames the samples
#' @param args_fct list, containing the entries \code{"n_perm"} 
#' (\code{numeric(1)} and \code{"signatures"} (\code{tibble})
#' @param contamination \code{character(1)}, either \code{"apoptosis"} or
#' \code{"necroptosis"}
#' 
#' @return tibble
#' 
#' @export
#' 
#' @examples 
#' library(SummarizedExperiment)
#' f <- system.file("protein_datasets/tanzer2020.RDS", 
#'     package = "apoptosisQuantification")
#' tanzer2020 <- readRDS(f)
#' prot <- assay(tanzer2020) |>
#'     as.data.frame()
#' 
#' ## define a list with paramters
#' args_fct <- list()
#' args_fct[["signatures"]] <- readMarkers(type = "apoptosis", fc = 2, n = 1)
#' args_fct[["n_perm"]] <- 100
#' 
#' ## run the function
#' decoupleFeatures(values = prot, args_fct = args_fct, 
#'     contamination = "apoptosis")
decoupleFeatures <- function(values, args_fct, 
    contamination = c("apoptosis", "necroptosis")) {
    
    contamination <- match.arg(contamination)
    
    if (!("n_perm" %in% names(args_fct)))
        stop("'n_perm' not in 'names(args_fct)'")
    
    n_perm <- args_fct[["n_perm"]]
    if (!(is.numeric(n_perm) & length(n_perm) == 1))
        stop("'n_perm' has to be numeric of length 1")
    
    if (!("signatures" %in% names(args_fct)))
        stop("'signatures' not in 'names(args_fct)'")
    signatures <- args_fct[["signatures"]]
    
    signatures <- tibble::add_column(signatures, 
        likelihood = 1, set = contamination, mor = signatures$change)
    
    ## create an empty list to store the results
    score_list <- list()
    
    for(i in seq_len(ncol(values))) {
        sub_values <- values |>
            dplyr::select(all_of(i)) |> 
            stats::na.omit()
        sub_values$decouplerIsGreat <- sub_values[, 1]
        
        ## run decoupleR
        means <- decoupleR::run_wmean(mat = as.matrix(sub_values), 
            network = signatures, .source = "set", .target = "feature", 
            .mor = "mor", .likelihood = "likelihood", 
            times = args_fct[["n_perm"]])
        
        ## filter the output of decouple, remove some rows and columns and 
        ## store the object in the list
        means <- means |> 
            dplyr::filter(!!as.symbol("statistic") == "norm_wmean") |>
            dplyr::filter(!!as.symbol("condition") != "decouplerIsGreat") |> 
            dplyr::select(-all_of(c("statistic", "p_value")))
        score_list[[i]] <- means
    }
    
    ## combine the results (norm_wmean scores) and return the object
    score_tbl <- Reduce("rbind", score_list)
    colnames(score_tbl)[colnames(score_tbl) == "source"] <- "set"
    colnames(score_tbl)[colnames(score_tbl) == "score"] <- "value"
    tibble::tibble(score_tbl)
}

#' @name permuteFeatures
#' 
#' @title Calculate score values by calculating difference between
#' means of permuted features and markers
#' 
#' @description 
#' The function \code{permuteFeatures} calculates the difference between the
#' mean of marker features (e.g. proteins) and random samples of the 
#' feature universe (features in \code{values}). The scores will be a 
#' relative quantification of the marker features to the universe 
#' feature intensities. Samples that 
#' undergo apoptosis (or any other contamination) will have different 
#' differences than uncontaminated samples.
#' 
#' The output will be a \code{tibble} containing the differences in means
#' for the samples and repetitions. 
#' 
#' @details 
#' The number of repetitions is defined by \code{n_perm}.
#' 
#' The function will be called in \code{scoreSamples}.
#' 
#' @param values \code{data.frame}
#' @param args_fct list of arguments, has to contain the entries
#' \code{n_perm} (\code{numeric(1)}) and \code{signatures} (\code{tibble})
#' @param contamination \code{character(1)}, either \code{"apoptosis"} or 
#' \code{"necroptosis"}
#' @param seed \code{numeric(1)}
#'
#' @importFrom dplyr everything
#' @importFrom tidyr pivot_longer
#' 
#' @export
#' 
#' @examples
#' library(SummarizedExperiment)
#' f <- system.file("protein_datasets/tanzer2020.RDS", 
#'     package = "apoptosisQuantification")
#' tanzer2020 <- readRDS(f)
#' prot <- assay(tanzer2020) |>
#'     as.data.frame()
#' 
#' ## define a list with paramters
#' args_fct <- list()
#' args_fct[["signatures"]] <- readMarkers(type = "apoptosis", fc = 2, n = 1)
#' args_fct[["n_perm"]] <- 100
#' 
#' ## run the function
#' permuteFeatures(values = prot, args_fct = args_fct, 
#'     contamination = "apoptosis", seed = 2022)
permuteFeatures <- function(values, args_fct, 
    contamination = c("apoptosis", "necroptosis"), seed = 2022) {

    contamination <- match.arg(contamination)

    if (!("n_perm" %in% names(args_fct)))
        stop("'n_perm' not in 'names(args_fct)'")

    n_perm <- args_fct[["n_perm"]]
    if (!(is.numeric(n_perm) & length(n_perm) == 1))
        stop("'n_perm' has to be numeric of length 1")

    if (!("signatures" %in% names(args_fct)))
        stop("'signatures' not in 'names(args_fct)'")
    signatures <- args_fct[["signatures"]]

    ## calculate the means of the markers
    mean_marker <- apply(values[signatures$feature, ], 2, mean, na.rm = TRUE)

    ## create random numbers and calculate the means for these random draws
    set.seed(seed)
    inds <- lapply(seq_len(n_perm), function(i) {
        sample(x = seq_len(nrow(values)), size = nrow(signatures), 
               replace = FALSE)
    })
    means_inds <- lapply(inds, function(inds_i) {
          apply(values[inds_i, ], 2, mean, na.rm = TRUE)
    })

    ## calculate the difference between the markers and the randomly sampled
    ## features
    diff_means_inds <- lapply(means_inds, function(means_inds_i) {
        means_inds_i - mean_marker
    })

    score_values <- do.call("rbind", diff_means_inds) |>
        as.data.frame() |>
        tidyr::pivot_longer(cols = dplyr::everything(), 
            names_to = "condition", values_to = "value")

    tibble::tibble(score_values, set = contamination)
}


#' @name plotSampleScores
#' 
#' @title Plot the contamination scores of samples
#' 
#' @description
#' The function \code{plotSampleScores} creates a barplot or boxplot that 
#' visualizes the contamination score values obtained by 
#' \code{scoreSamples}.
#' 
#' @details
#' The function takes the argument \code{scores} that can be created via 
#' \code{scoreSamples}.
#' 
#' @param scores tibble
#' 
#' @return \code{gg}
#' 
#' @export
#' 
#' @importFrom ggplot2 ggplot aes_string geom_bar theme_minimal theme 
#' @importFrom ggplot2 element_text ylab xlab
#' 
#' @examples
#' library(SummarizedExperiment)
#' library(tibble)
#' 
#' f <- system.file("protein_datasets/tanzer2020.RDS", 
#'     package = "apoptosisQuantification")
#' tanzer2020 <- readRDS(f)
#' prot <- assay(tanzer2020)
#'
#' ## use the intensities to calculate scores
#' scores <- scoreSamples(values = prot, contamination = "apoptosis",fc = 2, n = 1, n_perm = 100)
#' scores$treatment <- tanzer2020$treatment
#' plotSampleScores(scores = scores) +
#'     ggplot2::facet_wrap(~ treatment, scales = "free_x") +
#'     ggplot2::theme(legend.position = "none")
#' 
#' ## use the intensities to calculate scores
#' scores <- scoreSamples(values = prot, contamination = "necroptosis", n_perm = 100)
#' scores$treatment <- tanzer2020$treatment
#' plotSampleScores(scores = scores) +
#'     ggplot2::facet_wrap(~ treatment, scales = "free_x") +
#'     ggplot2::theme(legend.position = "none")
plotSampleScores <- function(scores) {
    
    ## check scores
    if (!"value" %in% colnames(scores)) 
        stop("column 'value' not in 'scores'")
    if (!"condition" %in% colnames(scores)) 
        stop("column 'condition' not in 'scores'")
    if (!"set" %in% colnames(scores)) 
        stop("column 'set' not in 'scores'")
    
    if (any(duplicated(scores$condition))) {
        .method <- "permuteFeatures"
    } else {
        .method <- "decoupleFeatures"
    }
    
    ## factorize the condition column (used for x-axis)
    scores$condition <- factor(scores$condition, 
        levels = unique(scores$condition))
    
    ## do the actual plotting
    g <- ggplot2::ggplot(scores, 
        ggplot2::aes_string(y = "value", x = "condition"))
    
    ## depending on the function that was used to calculate the scores, plot 
    ## either a barplot (function decoupleFeatures)or a boxplot 
    ## (function permuteFeatures)
    if (.method == "decoupleFeatures")
        g <- g + ggplot2::geom_bar(
            ggplot2::aes_string(fill = "set", col = "set"), 
            position = "dodge", stat = "identity", color="black", ) +
            ggplot2::ylab("s.d. to empirical null distribution")

    if (.method == "permuteFeatures")
        g <- g + ggplot2::geom_boxplot(ggplot2::aes_string(x = "condition")) + 
            ggplot2::ylab("difference to markers")

    ## continue with changing the theme and return the plot
    g + ggplot2::theme_minimal() +
        ggplot2::xlab("samples") +
        ggplot2::theme(axis.text.x = ggplot2::element_text(size = 5, 
            angle = 90, hjust = 1))
}
tnaake/apoptosisQuantification documentation built on Feb. 20, 2022, 5:37 p.m.