R/TIEC.R

Defines functions createTIEC runMocks createMocks

Documented in createMocks createTIEC runMocks

#' @title createMocks
#'
#' @export
#' @description
#' Given the number of samples of the dataset from which the mocks should be
#' created, this function produces a \code{data.frame} object with as many rows
#' as the number of mocks and as many columns as the number of samples. If an
#' odd number of samples is given, the lower even integer will be considered in
#' order to obtain a balanced design for the mocks.
#' @param nsamples an integer representing the total number of samples.
#' @param N number of mock comparison to generate.
#'
#' @return a \code{data.frame} containing \code{N} rows and \code{nsamples}
#' columns (if even). Each cell of the data frame contains the "grp1" or "grp2"
#' characters which represent the mock groups pattern.
#'
#' @examples
#' # Generate the pattern for 100 mock comparisons for an experiment with 30
#' # samples
#' mocks <- createMocks(nsamples = 30, N = 100)
#' head(mocks)
createMocks <- function(nsamples, N = 1000) {
    sample_num <- nsamples %/% 2 * 2 # Balanced design sample numerosity
    mock_df <- matrix(NA, nrow = N, ncol = sample_num)
    for (i in seq_len(N)) { # N random balanced relabellings
        grps <- rep("grp1", sample_num)
        grps[sample(seq_len(sample_num), size = sample_num / 2)] <- "grp2"
        mock_df[i, ] <- grps
    }
    rownames(mock_df) <- paste0("Comparison", seq_len(N))
    return(mock_df)
} # END - function: createMocks

#' @title runMocks
#'
#' @export
#' @import BiocParallel
#' @importFrom phyloseq sample_data
#' @description
#' Run the differential abundance detection methods on mock datasets.
#'
#' @param mocks a \code{data.frame} containing \code{N} rows and \code{nsamples}
#' columns (if even). Each cell of the data frame contains the "grp1" or "grp2"
#' characters which represent the mock groups pattern. Produced by the
#' \code{\link{createMocks}} function.
#' @inheritParams get_counts_metadata
#' @inheritParams runDA
#' @inheritParams BiocParallel::bpmapply
#'
#' @return A named list containing the results for each method.
#'
#' @examples
#' # Load some data
#' data(ps_stool_16S)
#'
#' # Generate the pattern for 10 mock comparisons
#' # (N = 1000 is suggested)
#' mocks <- createMocks(nsamples = phyloseq::nsamples(ps_stool_16S), N = 10)
#' head(mocks)
#'
#' # Add some normalization/scaling factors to the phyloseq object
#' my_norm <- setNormalizations(fun = c("norm_edgeR", "norm_CSS"),
#'     method = c("TMM", "CSS"))
#' ps_stool_16S <- runNormalizations(normalization_list = my_norm,
#'     object = ps_stool_16S)
#'
#' # Initialize some limma based methods
#' my_limma <- set_limma(design = ~ group, coef = 2, norm = c("TMM", "CSS"))
#'
#' # Run methods on mock datasets
#' results <- runMocks(mocks = mocks, method_list = my_limma,
#'     object = ps_stool_16S)
runMocks <- function(mocks, method_list, object, weights = NULL, 
    verbose = TRUE, BPPARAM = BiocParallel::SerialParam()){
    is_phyloseq <- ifelse(is(object, "phyloseq"), TRUE, FALSE)
    index <- seq_len(nrow(mocks))
    mocks_list <- as.list(as.data.frame(t(mocks)))
    out <- BiocParallel::bpmapply(mocks_list, index, 
        FUN = function(x, i) {
        if(is_phyloseq){
            phyloseq::sample_data(object)[, "group"] <- factor(x)
        } else {
            SummarizedExperiment::colData(object)[, "group"] <- factor(x)
        }
        if(verbose)
            message("  - Comparison", i, "\n")
        runDA(method_list = method_list, object = object,
              weights = weights, verbose = verbose)
        }, SIMPLIFY = FALSE, BPPARAM = BPPARAM)
    return(out)
}

#' @title createTIEC
#'
#' @export
#' @importFrom plyr ldply ddply
#' @importFrom stats ks.test
#' @importFrom reshape2 melt
#' @description
#' Extract the list of p-values from the outputs of the differential abundance
#' detection methods to compute several statistics to study the ability to
#' control the type I error and the p-values distribution.
#'
#' @param object Output of the differential abundance tests on mock comparisons.
#' Must follow a specific structure with comparison, method, matrix of
#' p-values, and method's name (See vignette for detailed information).
#'
#' @return A \code{list} of \code{data.frame}s:
#' \itemize{
#'     \item{\code{df_pval}}{ 5 columns per number_of_features x methods x
#'     comparisons rows data.frame. The four columns are called Comparison,
#'     Method, variable (containing the feature names), pval, and padj;}
#'     \item{\code{df_FPR}}{ 5 columns per methods x comparisons rows 
#'     data.frame. For each set of method and comparison, the proportion of 
#'     false positives, considering 3 thresholds (0.01, 0.05, 0.1) are 
#'     reported;}
#'     \item{\code{df_FDR}}{ 4 columns per methods rows data.frame. For each 
#'     method, the average proportion of mock comparisons where false positives
#'     are found, considering 3 thresholds (0.01, 0.05, 0.1), are reported. 
#'     Each value is an estimate of the nominal False Discovery Rate (FDR);}
#'     \item{\code{df_QQ}}{ contains the coordinates to draw the QQ-plot to
#'     compare the mean observed p-value distribution across comparisons, with
#'     the theoretical uniform distribution;}
#'     \item{\code{df_KS}}{ 5 columns and methods x comparisons rows data.frame.
#'     For each set of method and comparison, the Kolmogorov-Smirnov test
#'     statistics and p-values are reported in KS and KS_pval columns
#'     respectively.}}
#'
#' @seealso \code{\link{createMocks}}
#'
#' @examples
#' # Load some data
#' data(ps_stool_16S)
#'
#' # Generate the patterns for 10 mock comparison for an experiment
#' # (N = 1000 is suggested)
#' mocks <- createMocks(nsamples = phyloseq::nsamples(ps_stool_16S), N = 10)
#' head(mocks)
#'
#' # Add some normalization/scaling factors to the phyloseq object
#' my_norm <- setNormalizations(fun = c("norm_edgeR", "norm_CSS"),
#'     method = c("TMM", "CSS"))
#' ps_stool_16S <- runNormalizations(normalization_list = my_norm,
#'     object = ps_stool_16S)
#'
#' # Initialize some limma based methods
#' my_limma <- set_limma(design = ~ group, coef = 2,
#'     norm = c("TMM", "CSS"))
#'
#' # Run methods on mock datasets
#' results <- runMocks(mocks = mocks, method_list = my_limma,
#'     object = ps_stool_16S)
#'
#' # Prepare results for Type I Error Control
#' TIEC_summary <- createTIEC(results)
#'
#' # Plot the results
#' plotFPR(df_FPR = TIEC_summary$df_FPR)
#' plotFDR(df_FDR = TIEC_summary$df_FDR)
#' plotQQ(df_QQ = TIEC_summary$df_QQ, zoom = c(0, 0.1))
#' plotKS(df_KS = TIEC_summary$df_KS)
#' plotLogP(df_QQ = TIEC_summary$df_QQ)
createTIEC <- function(object) {
    # Create a list of data frames with two columns: pval and method name
    # One data frame for each comparison
    message("1. Extracting statistics")
    df_list_pval <- lapply(X = object, FUN = function(Comparison) {
        pval_df <- reshape2::melt(
            plyr::ldply(
                extractStatistics(object = Comparison,
                    slot = "pValMat", colName = "rawP", type = "pvalue",
                    direction = NULL, verbose = FALSE
                ), .id = "Method"
            ), value.name = "pval"
        )
        padj_df <- reshape2::melt(
            plyr::ldply(
                extractStatistics(object = Comparison,
                    slot = "pValMat", colName = "adjP", type = "pvalue",
                    direction = NULL, verbose = FALSE
                ), .id = "Method"
            ), value.name = "padj"
        )
        return(data.frame(pval_df, "padj" = padj_df[ ,"padj"]))
    })
    # Melt down to a single data frame with the Comparison column added
    df_pval <- plyr::ldply(df_list_pval, .id = "Comparison")
    ### FPR ###
    message("2. Counting p-values lower than some thresholds")
    # Count the p-values which are lower than a selected threshold
    df_FPR <- plyr::ddply(.data = df_pval, .variables = ~ Comparison +
        Method, .fun = function(x) {
        # index for not NA p-values
        k_pval <- !is.na(x[, "pval"])
        FPR_obs001 <- sum(x[, "pval"] < 0.01, na.rm = TRUE) / sum(k_pval)
        FPR_obs005 <- sum(x[, "pval"] < 0.05, na.rm = TRUE) / sum(k_pval)
        FPR_obs01 <- sum(x[, "pval"] < 0.1, na.rm = TRUE) / sum(k_pval)
        return(data.frame(FPR_obs001, FPR_obs005, FPR_obs01))
    })
    ### FDR ###
    # Check if a Method is able to find 0 DA feature after p-value correction
    # When the #DA is 0, the FDR will be 0 by construction
    # When the #DA is >0, the FDR will be 1
    # FDR = FP / (FP + TP), TP is always 0 by construction
    message("3. Counting adjusted p-values lower than some thresholds")
    # Count the adjusted p-values which are lower than a selected threshold
    df_pval_FDR <- plyr::ddply(.data = df_pval, .variables = ~ Comparison +
        Method, .fun = function(x) {
        # index for not NA p-values
        k_pval <- !is.na(x[, "padj"])
        FDR_obs001 <- sum(x[, "padj"] < 0.01, na.rm = TRUE) / sum(k_pval)
        FDR_obs005 <- sum(x[, "padj"] < 0.05, na.rm = TRUE) / sum(k_pval)
        FDR_obs01 <- sum(x[, "padj"] < 0.1, na.rm = TRUE) / sum(k_pval)
        return(data.frame(FDR_obs001, FDR_obs005, FDR_obs01))
    })
    # Compute the mean for each threshold
    df_FDR <- plyr::ddply(.data = df_pval_FDR, .variables = ~ Method, 
        .fun = function(x) {
            colMeans(x[, c("FDR_obs001", "FDR_obs005", "FDR_obs01")] > 0)
        })
    ### QQ and KS ###
    message("4. Computing KS statistics")
    # Create data frame for empirical and theoretical p-values
    # Kolmogorov-Smirnov test
    df_QQ_KS <- plyr::ddply(.data = df_pval, .variables = ~ Comparison + Method,
        .fun = function(x) {
        # Ordered list of p-values
        x <- x[order(x[, "pval"]), ]
        # Index of not NA p-values
        k_pval <- !is.na(x[, "pval"])
        # Theoretical uniform distribution
        x$pval_theoretical[k_pval] <- (seq_len(sum(k_pval))) / (sum(k_pval))
        x$pval_theoretical_rounded[k_pval] <- round((seq_len(sum(k_pval))) /
            (sum(k_pval)), digits = 2)
        x[c("KS","KS_pval")] <- stats::ks.test(
            x = x$pval_theoretical[k_pval],
            y = x$pval[k_pval])[c("statistic","p.value")]
        return(x)
    })
    ### QQ ###
    message("5. Ordering quantiles")
    # For each theoretical p-value, the mean of observed one is computed
    df_QQ <- plyr::ddply(.data = df_QQ_KS, .variables = ~ Method +
        pval_theoretical_rounded, .fun = function(x) {
        pval_mean <- mean(x$pval)
        return(pval_mean)
    })
    names(df_QQ) <- c("Method", "pval_theoretical", "pval_observed")
    ### KS ###
    # Compute the mean for KS statistics and p-values
    df_KS <- plyr::ddply(
        .data = df_QQ_KS, .variables = ~ Comparison + Method,
        .fun = function(x) {
            colMeans(x[, c("KS", "KS_pval")])
        }
    )
    return(list(
        df_pval = df_pval, df_FPR = df_FPR, df_FDR = df_FDR, df_QQ = df_QQ, 
        df_KS = df_KS
    ))
} # END - function: createTIEC
mcalgaro93/benchdamic documentation built on Nov. 28, 2024, 2:16 p.m.