BatchQC: Batch Effects Quality Control Software

Documented in counts2pvalue goodness_of_fit_DESeq2 nb_histogram

#' This function calculates p-values for each gene given counts, estimated NB
#' size, and estimated NB mean
#' @param counts a vector of gene expression values (in counts)
#' @param size an estimated size parameter of the NB distributions for the gene
#' @param mu a vector of estimated mu parameter of the NB distributions for
#'   different samples of the gene
#' @importFrom stats pnbinom ks.test
#' @return a p-value based on estimated NB size and mean
#' @keywords internal

counts2pvalue <- function(counts, size, mu) {
    counts <- as.numeric(counts)
    if (max(counts) <= 3) {
        p.fit <- NA
    }else {
        p <- pnbinom(counts, size = size, mu = mu)
        p.fit <- ks.test(p, 'punif')$p.value
    }
    return(p.fit)
}

#' This function calculates goodness-of-fit pvalues for all genes by looking at
#' how the NB model by DESeq2 fit the data
#' @import DESeq2
#' @import SummarizedExperiment
#' @importFrom S4Vectors DataFrame
#' @importFrom stats na.omit
#' @param se the se object where all the data is contained
#' @param count_matrix name of the assay with gene expression matrix (in counts)
#' @param condition name of the se colData with the condition status
#' @param other_variables name of the se colData containing other variables of
#'   interest that should be considered in the DESeq2 model
#' @param num_genes downsample value, default is 500 (or all genes if less)
#' @param seeding integer to set the seed to for reproducibility; default is 13
#' @return a matrix of pvalues where each row is a gene and each column is a
#'   level within the condition of interest
#' @export
#' @examples
#' # example code
#' library(scran)
#' se <- mockSCE(ncells = 20)
#' se$Treatment <- as.factor(se$Treatment)
#' se$Mutation_Status <- as.factor(se$Mutation_Status)
#' nb_results <- goodness_of_fit_DESeq2(se = se, count_matrix = "counts",
#'   condition = "Treatment", other_variables = "Mutation_Status")
#' nb_results[1]
#' nb_results[2]
#' nb_results[3]


goodness_of_fit_DESeq2 <- function(se, count_matrix, condition,
    other_variables = NULL, num_genes = 500, seeding = 13) {
    # set seed
    set.seed(seeding)
    # Obtain needed data from se object
    count_matrix <- SummarizedExperiment::assays(se)[[count_matrix]]
    condition <- SummarizedExperiment::colData(se)[[condition]]
    condition <- as.factor(condition)

    num_samples <- dim(count_matrix)[2]

    # Ensure the number of genes is greater than the desired number for sampling
    if (dim(count_matrix)[1] < num_genes) {
        num_genes <- dim(count_matrix)[1]
    }

    # Down sample
    if (dim(count_matrix)[1] > num_genes) {
        sampled <- sample(row.names(count_matrix), num_genes)
        col_names_prior <- colnames(count_matrix)
        count_matrix <- count_matrix[sampled, ]
        #rownames(count_matrix) <- sampled
        #colnames(count_matrix) <- col_names_prior
    }
    conditions_df <- NULL
    formula_for_DeSeq <- ""

    if (!is.null(other_variables)) {
        for (i in seq_len(length(other_variables))) {
            conditions_df <- DataFrame(c(conditions_df,
                SummarizedExperiment::colData(se)[[other_variables[i]]]))
            formula_for_DeSeq <- paste0(formula_for_DeSeq,
                " + ",
                other_variables[i])
        }
    }

    colnames(conditions_df) <- other_variables

    for (i in seq_len(length(colnames(conditions_df)))){
        conditions_df[, i] <- as.factor(conditions_df[, i])
    }

    if (num_samples < 20) {
        # Use DESeq2 to fit the NB model
        if (is.null(other_variables)) {
            dds <- DESeqDataSetFromMatrix(count_matrix,
                S4Vectors::DataFrame(condition), ~ condition)
        }else {
            dds <- DESeqDataSetFromMatrix(count_matrix,
                S4Vectors::DataFrame(condition, conditions_df),
                as.formula(paste0("~ condition", formula_for_DeSeq)))
        }

        dds <- DESeq(dds)

        # The size parameters estimated by DESeq2 for each gene
        size <- 1 / dispersions(dds)

        # The mu parameters estimated by DESeq2 for each count
        mu_matrix <- assays(dds)[["mu"]]

        # Count the number of levels in condition
        unique_conditions <- unique(condition)
        num_unique_conditions <- length(unique_conditions)

        # For each condition level, get goodness-of-fit p-values for each genes
        all_pvalues <- sapply(seq_len(length(unique_conditions)), function(j) {
            index_j <- which(condition == unique_conditions[j])
            # For one condition level, calculate the goodness-of-fit p-values
            pvalues_level <-  sapply(seq_len(length(size)), function(i) {
                mu_gene <- mu_matrix[i, index_j]
                count_condition <- count_matrix[i, index_j]
                pvalue <- counts2pvalue(counts = count_condition,
                    size = size[i],
                    mu = mu_gene)
                return(pvalue)
            })
            return(pvalues_level)
        })

        all_pvalues <- as.data.frame(all_pvalues, row.names =
                row.names(count_matrix))
        colnames(all_pvalues) <- unique_conditions
        recommendation <- nb_proportion(all_pvalues, 0.01, 0.42, num_samples)
        res_histogram <- nb_histogram(all_pvalues)
        reference <- paste0("Adapted for small sample sizes from: Li, Y., ",
        "Ge, X., Peng, F. et al. Exaggerated false positives by popular ",
        "differential expression methods when analyzing human population ",
        "samples. Genome Biol 23, 79 (2022). ",
        "https://doi.org/10.1186/s13059-022-02648-4")
    }else {
        conditions_perm <- sample(condition)
        # Do DE analysis on permuted data
        if (is.null(other_variables)) {
            dds <- DESeqDataSetFromMatrix(count_matrix,
                DataFrame(conditions_perm), ~ conditions_perm)
        }else {
            dds <- DESeqDataSetFromMatrix(count_matrix,
                DataFrame(conditions_perm, conditions_df),
                as.formula(paste0("~ conditions_perm", formula_for_DeSeq)))
        }
        dds <- DESeq(dds)
        res <- results(dds)
        # count the number of DEGs
        num_DEGs <- sum(res$padj <= 0.05)

        all_padj_values <- NULL
        all_pvalues <- NULL
        for (i in 2:length(resultsNames(dds))){
            padj_values <- as.data.frame(results(dds,
                name = resultsNames(dds)[i])$padj, row.names = sampled)
            all_padj_values <- as.data.frame(c(all_padj_values, padj_values))
            p_values <- as.data.frame(results(dds,
                name = resultsNames(dds)[i])$pvalue, row.names = sampled)
            all_pvalues <- as.data.frame(c(all_pvalues, p_values))
        }
        rownames(all_padj_values) <- sampled
        rownames(all_pvalues) <- sampled
        all_padj_values <- stats::na.omit(all_padj_values)
        all_pvalues <- stats::na.omit(all_pvalues)
        num_genes <- dim(count_matrix)[1]

        colnames(all_padj_values) <- resultsNames(dds)[2:
                length(resultsNames(dds))]
        colnames(all_pvalues) <- resultsNames(dds)[2:length(resultsNames(dds))]
        levels_of_condition <- length(levels(condition))

        pvals_condition <- as.data.frame(
            all_padj_values[, seq_len(levels_of_condition - 1)])
        colnames(pvals_condition) <- resultsNames(dds)[2:levels_of_condition]
        rownames(pvals_condition) <- rownames(all_padj_values)

        adj_pvals_condition <- as.data.frame(
            all_padj_values[, seq_len(levels_of_condition - 1)])
        colnames(adj_pvals_condition) <-
            resultsNames(dds)[2:levels_of_condition]
        rownames(adj_pvals_condition) <- rownames(all_padj_values)
        threshold <- floor(0.001 * num_genes)
        recommendation <- nb_proportion(adj_pvals_condition,
            pvals_condition,
            0.05,
            threshold,
            num_samples)
        res_histogram <- nb_histogram(all_padj_values)
        reference <- paste0("Paper Reference: Li, Y., ",
        "Ge, X., Peng, F. et al. Exaggerated false positives by popular ",
        "differential expression methods when analyzing human population ",
        "samples. Genome Biol 23, 79 (2022). ",
        "https://doi.org/10.1186/s13059-022-02648-4")
        }
    return(list(recommendation = recommendation, res_histogram = res_histogram,
        reference = reference))
}

#' This function creates a histogram from the negative binomial goodness-of-fit
#' adjusted pvalues.
#' @import tibble
#' @import tidyr
#' @import ggplot2
#' @param adj_p_val_table table of adjusted p-values from the nb test
#' @return a histogram of the number of genes within a p-value range

nb_histogram <- function(adj_p_val_table) {
    # tidy the data so there is a gene, condition and pval column
    adj_p_val_table <- tibble::rownames_to_column(adj_p_val_table, "features")
    adj_p_val_table <- tidyr::pivot_longer(adj_p_val_table,
        cols = 2:length(colnames(adj_p_val_table)),
        names_to = "condition",
        values_to = "p_val")

    nb_histogram <- ggplot2::ggplot(adj_p_val_table, aes_string(x = "p_val")) +
        xlab("adjusted p-value (FDR)") +
        ggplot2::geom_histogram() +
        ggplot2::facet_grid(condition ~ .)

    return(nb_histogram)
}

#' This function determines the proportion of p-values below a specific value
#' and compares to the previously determined threshold of 0.42 for extreme low
#' values.
#' @import tibble
#' @import tidyr
#' @import ggplot2
#' @param adj_p_val_table table of adjusted p-values from the nb test
#' @param p_val_table table of p-values from the nb test
#' @param low_pval value of the p-value cut off to use in proportion
#' @param threshold the value to compare the proportion of p-values to for data
#'   sets less than 20, default is 0.42
#' @param num_samples the number of samples in the analysis
#' @return a statement about whether DESeq2 is appropriate to use for analysis

nb_proportion <- function(adj_p_val_table, p_val_table, low_pval = 0.01,
    threshold = 0.42, num_samples) {
    if (num_samples < 20) {
        proportion_below_value <- mean(adj_p_val_table < low_pval, na.rm = TRUE)
        nb_fit <- proportion_below_value < threshold

        if (nb_fit) {
            recommendation <- "may use DESeq2 for your analysis."
        }else {
            recommendation <- "should not use DESeq2 for your analysis."
        }

        commentary <- paste0("With an adjusted FDR cut off of ", low_pval, ", ",
            (round(proportion_below_value, 2) * 100),
            "% of your features are below the cutoff. ",
            "Thus based on a threshold of ",
            threshold, ", you ", recommendation)
    }else {
        ngenes <- nrow(adj_p_val_table)
        threshold <- ngenes * 1 / 1000

        count_below_value <- 0
        for (i in seq_len(nrow(adj_p_val_table))){
            if (min(adj_p_val_table[i, ]) < low_pval) {
                count_below_value <- count_below_value + 1
            }
        }

        ngene_pval <- nrow(p_val_table)
        threshold_pval <- ngene_pval * 0.05

        count_below_value_pval <- 0
        for (i in seq_len(nrow(p_val_table))){
            if (min(p_val_table[i, ]) < 0.05) {
                count_below_value_pval <- count_below_value_pval + 1
            }
        }

        nb_fit <- count_below_value < threshold
        nb_fit_pval <- count_below_value_pval < threshold_pval

        if (nb_fit & nb_fit_pval) {
            if (count_below_value == 0 && count_below_value_pval == 0) {
                recommendation <- "you may use DESeq2 for your analysis."
            }else {
                recommendation <- paste0("you should be cautious about using ",
                "DESeq2 for your analysis. You have significant features,",
                " and thus you are at risk of receiving false results.")
            }
        }else {
            recommendation <- paste0(
            "therefore, we do not recommend that you should use DESeq2 for ",
            "your analysis.")
        }

        commentary <- paste0("With an adjusted FDR cut off of ", low_pval, ", ",
            count_below_value,
            " of your condition variable features are below the cutoff. ",
            "If DESeq's assumptions are met, we would not expect to find any",
            " significant features. Since ",
            count_below_value, " features have a significant FDR, and ",
            count_below_value_pval,
            " features have a significant pvalue (<0.05), ",
            recommendation)
    }

    return(commentary)
}