
#' Comparing differential abundance/expression methods by empirical power and false discovery rate
#'
#' Calculates empirical power, false discovery rate (FDR), false positive rate (FPR) and AUC (area under the receiver operating characteristic (ROC) curve) for various differential abundance and expression methods
#' 
#' mva is excluded by default, as it is slow.
#' @param data Either a data.frame with counts/abundances OR a \code{phyloseq} object. If a data.frame is provided, rows should be taxa/genes/proteins and columns samples, and it must have rownames
#' @param predictor The predictor of interest. Either a factor or numeric vector, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation marks. If the \code{predictor} is numeric it will be treated as a quantitative variable in the analyses
#' @param paired For paired/blocked experimental designs. Either a factor with subject/block IDs for running a paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation marks
#' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}
#' @param R Integer. Number of times to run the tests. Default 20
#' @param tests Character. Which tests to include. Default all
#' @param relative Logical. TRUE (default) for compositional data. FALSE for absolute abundances or pre-normalized data.
#' @param effectSize Numeric. The effect size for the spike-ins. Default 5
#' @param k Vector of length 3. Number of features to spike in each abundance tertile (lower, mid, upper). E.g. \code{k = c(5,10,15)}: 5 features spiked in the low abundance tertile, 10 in the mid tertile and 15 in the high tertile. Default NULL, which spikes 2 percent of the total number of features in each tertile (6 percent in total), but at least \code{c(5,5,5)}
#' @param cores Integer. Number of cores to use for parallel computing. Default one less than available. Set to 1 for sequential computing.
#' @param p.adj Character. Method for p-value adjustment. See \code{p.adjust} for details. Default "fdr"
#' @param args List. A named list of lists with extra arguments passed to the individual methods; names must match the test abbreviations (see the examples below)
#' @param out.all If TRUE linear models will output results and p-values from \code{anova}/\code{drop1}, ds2/ds2x will run an LRT instead of a Wald test, erq and erq2 will produce one p-value for the predictor, and limma will run F-tests. If FALSE, results are output for the second level of the \code{predictor}. If NULL (default) it is set to TRUE for multi-class predictors and FALSE otherwise
#' @param alpha q-value threshold for determining significance for \code{Power}. Default 0.1
#' @param core.check If TRUE, an interactive check will ask whether the specified number of cores is intended. Only triggered if \code{cores > 20}. This is to ensure that the function does not inadvertently overload a server with workers.
#' @param verbose If TRUE will print informative messages
#' @return An object of class \code{DA}, which contains a list of results:
#' \itemize{
#'  \item table - AUC, FPR, FDR and Power (spike detection rate) for each method in each run
#'  \item results - A complete list of output from all the methods. Example: get the Wilcoxon results from the second run with \code{$results[[2]]["wil"]}
#'  \item details - A dataframe with details from the run
#'  \item run.times - A dataframe with average run times of the different methods
#' }
#' @examples 
#' # Creating random count_table and predictor
#' set.seed(5)
#' mat <- matrix(rnbinom(1000, size = 0.5, mu = 500), nrow = 50, ncol = 20)
#' rownames(mat) <- 1:50
#' pred <- c(rep("Control", 10), rep("Treatment", 10))
#' 
#' # Running testDA to find the best method
#' # This example only repeats the test once (R = 1);
#' # R should be increased to at least 20 for a real comparison.
#' # It also uses a single core (cores = 1);
#' # remove this argument to use all available cores minus one (the default), which is faster.
#' res <- testDA(data = mat, predictor = pred, cores = 1, R = 1)
#' summary(res)
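#' 
#' # The per-run performance metrics (AUC, FPR, FDR, Power) are in the 'table' element
#' res$table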
#' 
#' \donttest{
#' # Include a paired variable for dependent/blocked samples
#' subject <- rep(1:10, 2)
#' res <- testDA(data = mat, predictor = pred, paired = subject, cores = 1, R = 1)
#' 
#' # Include covariates
#' covar1 <- rnorm(20)
#' covar2 <- rep(c("A","B"), 10)
#' res <- testDA(data = mat, predictor = pred, 
#'               covars = list(FirstCovar = covar1, CallItWhatYouWant = covar2), cores = 1, R = 1)
#' 
#' # Data is absolute abundance
#' res <- testDA(data = mat, predictor = pred, relative = FALSE, cores = 1, R = 1)
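#' 
#' # Run only a subset of the methods
#' res <- testDA(data = mat, predictor = pred, tests = c("ttt", "wil", "lim"),
#'               cores = 1, R = 1)
#' 
#' # Pass extra arguments to individual methods through 'args' (a named list of lists,
#' # named by test). Illustrative sketch: assumes DA.ttt forwards 'var.equal' to t.test.
#' res <- testDA(data = mat, predictor = pred, tests = c("ttt", "wil", "lim"),
#'               args = list(ttt = list(var.equal = TRUE)), cores = 1, R = 1)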
#' }
#' 
#' @import stats snow doSNOW foreach utils doParallel
#' @importFrom parallel detectCores
#' @importFrom pROC roc
#' @export

testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20,
                   tests = c("bay","ds2","ds2x","per","adx","znb","zpo","msf","zig",
                             "erq","erq2","neb","qpo","poi","sam",
                             "lrm","llm","llm2","lma","lmc",
                             "ere","ere2","pea","spe",
                             "wil","kru","qua","fri","abc",
                             "ttt","ltt","ltt2","tta","ttc","ttr",
                             "aov","lao","lao2","aoa","aoc",
                             "vli","lim","lli","lli2","lia","lic"),
                   relative = TRUE, effectSize = 5, k = NULL, cores = (detectCores()-1),
                   p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){

  stopifnot(exists("data"),exists("predictor"))
  # Check for servers
  if(core.check){
    if(cores > 20){
      ANSWER <- readline(paste("You are about to run testDA using",cores,"cores. Enter y to proceed "))
      if(ANSWER != "y") stop("Process aborted")
    }
  }
  # Time taking
  t1 <- proc.time()
  
  # Extract from phyloseq
  if(is(data, "phyloseq")){
    DAdata <- DA.phyloseq(data, predictor, paired, covars)
    count_table <- DAdata$count_table
    predictor <- DAdata$predictor
    paired <- DAdata$paired
    covars <- DAdata$covars
  } else {
    count_table <- data
  }
  if(!is.null(covars)){
    for(i in seq_along(covars)){
      assign(names(covars)[i], covars[[i]])
    }
  }
  
  # Coerce data
  if(!is.null(paired)) paired <- as.factor(paired)
  count_table <- as.matrix(count_table)
  
  # Checks
  if(relative) if(!isTRUE(all(unlist(count_table) == floor(unlist(count_table))))) stop("count_table must only contain integer values when relative=TRUE")
  if(min(count_table) < 0) stop("count_table contains negative values")
  if(sum(colSums(count_table) == 0) > 0) stop("Some samples are empty")
  if(ncol(count_table) != length(predictor)) stop("Number of samples in count_table does not match length of predictor")
  if(length(unique(predictor)) < 2) stop("predictor should have at least two levels")
  
  # Remove Features not present in any samples
  if(sum(rowSums(count_table) == 0) != 0 && verbose) message(paste(sum(rowSums(count_table) == 0),"empty features removed"))
  count_table <- count_table[rowSums(count_table) > 0,]
  if(nrow(count_table) <= 15) warning("Dataset contains very few features") 
  
  # Prune tests argument
  decimal <- zeroes <- FALSE
  if(!isTRUE(all(unlist(count_table) == floor(unlist(count_table))))) decimal <- TRUE
  if(any(count_table == 0)) zeroes <- TRUE
  tests <- unique(tests)
  if(!"zzz" %in% tests) tests <- pruneTests(tests, predictor, paired, covars, relative, decimal, zeroes)
  tests.par <- paste0(unlist(lapply(seq_len(R), function(x) rep(x,length(tests)))),"_",rep(tests,R))
  if(length(tests) == 0) stop("No tests to run!")
  
  # Run time warnings
  if(verbose){
    if("neb" %in% tests & !is.null(paired)){
      message("As 'neb' is included and a 'paired' variable is supplied, this might take a long time")
    }
  }

  if(verbose) message(paste("Running on",cores,"cores"))

  # Number of features to spike in each abundance tertile
  if(is.null(k)){
    k <- rep(round(nrow(count_table)*0.02),3)
    if(sum(k) < 15){
      k <- c(5,5,5)
    } 
  } 
  if(sum(k) == nrow(count_table)) stop("Set to spike all features. Can't calculate FPR or AUC. Change k argument")
  if(sum(k) > nrow(count_table)) stop("Set to spike more features than are present in the data. Change k argument")
  if(sum(k) < 15 & sum(k) >= 10 & R <= 10) warning("Few features spiked. Increase 'k' or set 'R' to more than 10 to ensure proper estimation of AUC and FPR")
  if(sum(k) < 10 & sum(k) >= 5 & R <= 20) warning("Few features spiked. Increase 'k' or set 'R' to more than 20 to ensure proper estimation of AUC and FPR")
  if(sum(k) < 5 & R <= 50) warning("Very few features spiked. Increase 'k' or set 'R' to more than 50 to ensure proper estimation of AUC and FPR")
  if(sum(k) > nrow(count_table)/2) warning("Set to spike more than half of the dataset, which might give unreliable estimates. Consider changing the k argument")
                                 
  # predictor
  if(any(is.na(predictor))) warning("Predictor contains NAs!")
  if(is.numeric(predictor[1])){
    num.pred <- TRUE
    if(verbose) message(paste("predictor is assumed to be a quantitative variable, ranging from",min(predictor, na.rm = TRUE),"to",max(predictor, na.rm = TRUE)))
    if(length(levels(as.factor(predictor))) == 2){
      ANSWER <- readline("The predictor is quantitative, but only contains 2 unique values. Are you sure this is correct? Enter y to proceed ")
      if(ANSWER != "y") stop("Wrap the predictor with as.factor(predictor) to treat it is a categorical variable")
    }
  } else {
    num.pred <- FALSE
    if(length(levels(as.factor(predictor))) > length(unique(predictor))) stop("predictor has more levels than unique values!")
    if(verbose) message(paste("predictor is assumed to be a categorical variable with",length(unique(predictor)),"levels:",paste(levels(as.factor(predictor)),collapse = ", ")))
  }
  if(!is.null(paired)){
    if(verbose) message(paste("The paired variable has",length(unique(paired)),"levels"))
    if(length(unique(paired)) < 5) warning("The paired variable has fewer than 5 levels. Mixed-effect models are excluded")
  }
  
  # out.all
  if(is.null(out.all)){
    if(length(unique(predictor)) == 2) out.all <- FALSE
    if(length(unique(predictor)) > 2) out.all <- TRUE
    if(num.pred) out.all <- FALSE
  }
  
  # Covars
  if(!is.null(covars)){
    for(i in seq_along(covars)){
      if(verbose) if(any(is.na(covars[[i]]))) warning(names(covars)[i], " contains NAs!")
      if(is.numeric(covars[[i]][1])){
        if(verbose) message(paste(names(covars)[i],"is assumed to be a quantitative variable, ranging from",min(covars[[i]], na.rm = TRUE),"to",max(covars[[i]], na.rm = TRUE)))
      } else {
        if(verbose) message(paste(names(covars)[i],"is assumed to be a categorical variable with",length(unique(covars[[i]])),"levels:",paste(levels(as.factor(covars[[i]])),collapse = ", ")))
      }
    }
  }
  
  if(verbose) cat("Spikeing...\n")
  # Shuffle predictor
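  # The predictor is shuffled (within paired blocks, if any) so that, apart from
  # the spike-ins added below, no feature is truly associated with it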
  if(is.null(paired)){
    rands <- lapply(seq_len(R),function(x) sample(predictor))
  } else {
    rands <- lapply(seq_len(R),function(x) unsplit(lapply(split(predictor,paired), sample), paired))
  }
  
  # Spikeins
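  # For each run, spikein() returns a list: the first element is the count table
  # with the k chosen features spiked according to 'effectSize', the second element
  # holds the names of the spiked features (used below as the ground truth)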
  spikeds <- lapply(seq_len(R),function(x) spikein(count_table, rands[[x]], effectSize,  k, num.pred, relative))
  count_tables <- lapply(seq_len(R),function(x) spikeds[[x]][[1]])
  
  # Extract test arguments
  if(!all(names(args) %in% tests)) stop("One or more names in the list with additional arguments do not match the names of the tests")
  argsL <- list()
  for(a in seq_along(tests)){
    argsL[tests[a]] <- args[tests[a]]
  }
  
  ### Run tests
  if(verbose) cat(paste("Testing", length(tests),"methods", R,"times each...\n"))
  # Progress bar
  pb <- txtProgressBar(max = length(tests.par), style = 3)
  progress <- function(n) setTxtProgressBar(pb, n)
  opts <- list(progress = progress)
  
  # Start parallel
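  # With cores == 1 the foreach loop runs sequentially; otherwise a SNOW cluster
  # is registered and stopped again when the function exits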
  if(cores == 1) {
    registerDoSEQ() 
  } else {
    cl <- parallel::makeCluster(cores)
    registerDoSNOW(cl)
    on.exit(stopCluster(cl))
  }

  # Run the tests in parallel
  results <- foreach(i = tests.par, .options.snow = opts) %dopar% {
    
    t1.sub <- proc.time()
    
    # Extract run info
    run.no <- as.numeric(gsub("_.*","",i))
    i <- gsub(".*_","",i)

    # zzz tests
    if(!is.na(pmatch("zzz",i))){
      j <- i
      i <- "zzz"
    }
    
    # Run tests
    res.sub <- tryCatch(switch(i,
                               zzz = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars), argsL[[j]])),
                               mva = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,relative, p.adj), argsL[[i]])),
                               wil = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, relative, p.adj), argsL[[i]])),
                               ttt = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, relative, p.adj), argsL[[i]])),
                               ttr = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, relative, p.adj), argsL[[i]])),
                               ltt = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,relative, p.adj), argsL[[i]])),
                               tta = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, p.adj), argsL[[i]])),
                               ttc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, p.adj), argsL[[i]])),
                               ltt2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, p.adj), argsL[[i]])),
                               neb = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,relative,out.all, p.adj), argsL[[i]])),
                               erq = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj), argsL[[i]])),
                               ere = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]], p.adj), argsL[[i]])),
                               erq2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj), argsL[[i]])),
                               ere2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]], p.adj), argsL[[i]])),
                               msf = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]], p.adj), argsL[[i]])),
                               zig = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars, p.adj), argsL[[i]])),
                               abc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars,out.all, p.adj), argsL[[i]])),
                               ds2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj), argsL[[i]])),
                               ds2x = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj), argsL[[i]])),
                               per = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, relative, p.adj), argsL[[i]])),
                               bay = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]]), argsL[[i]])),
                               adx = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]]), argsL[[i]])),
                               lim = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,relative,out.all, p.adj), argsL[[i]])),
                               lli = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,relative,out.all, p.adj), argsL[[i]])),
                               lia = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj), argsL[[i]])),
                               lic = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj), argsL[[i]])),
                               lli2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj), argsL[[i]])),
                               kru = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]], relative, p.adj), argsL[[i]])),
                               aov = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars, relative, p.adj), argsL[[i]])),
                               lao = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars,relative, p.adj), argsL[[i]])),
                               aoa = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars, p.adj), argsL[[i]])),
                               aoc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars, p.adj), argsL[[i]])),
                               lao2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars, p.adj), argsL[[i]])),
                               lrm = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars, relative,out.all, p.adj), argsL[[i]])),
                               llm = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,relative,out.all, p.adj), argsL[[i]])),
                               lma = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj), argsL[[i]])),
                               lmc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj), argsL[[i]])),
                               llm2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj), argsL[[i]])),
                               spe = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],relative, p.adj), argsL[[i]])),
                               pea = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],relative, p.adj), argsL[[i]])),
                               poi = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,relative,out.all, p.adj), argsL[[i]])),
                               qpo = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars,relative,out.all, p.adj), argsL[[i]])),
                               vli = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj), argsL[[i]])),
                               zpo = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars,relative,out.all, p.adj), argsL[[i]])),
                               znb = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars,relative,out.all, p.adj), argsL[[i]])),
                               fri = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,relative, p.adj), argsL[[i]])),
                               qua = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,relative, p.adj), argsL[[i]])),
                               sam = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,fdr.output = alpha), argsL[[i]]))),
                        error = function(e) NULL)
    
    if(!is.null(res.sub) & (!i %in% c("sam","adx"))){
      res.sub[is.na(res.sub$pval),"pval"] <- 1
      res.sub[is.na(res.sub$pval.adj),"pval.adj"] <- 1
    }

    run.time.sub <- (proc.time()-t1.sub)[3]
    return(list(res.sub,run.time.sub))
    
  }
  run.times <- lapply(results, function(x) x[[2]])
  results <- lapply(results, function(x) x[[1]])
  
  names(results) <- tests.par
  names(run.times) <- tests.par

  # Handle failed tests
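  # Runs that returned NULL are dropped; a method is excluded from the final
  # output only if it failed in every one of the R runs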
  results <- results[!sapply(results,is.null)]
  
  cat("\n")
  if(length(unique(gsub(".*_","",names(results)))) != length(tests)){
    if(length(tests) - length(unique(gsub(".*_","",names(results)))) == 1){
      if(verbose) message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"was excluded due to failure"))
    } else {
      if(verbose) message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"were excluded due to failure"))
    }

    # Produce informative messages
    if(all(tests[!tests %in% unique(gsub(".*_","",names(results)))] == "sam")){
      if(verbose) message("sam usually fails if some samples has too many zeroes")
    }
    if(all(c("sam","ere2","erq2","ds2x") %in% tests[!tests %in% unique(gsub(".*_","",names(results)))])){
      if(verbose) message("sam, ere2, erq2 and ds2x usually fails if all features contain at least one zero")
    }
      
    tests <- unique(gsub(".*_","",names(results)))
  }
  
  r <- NULL
  final.results <- foreach(r = seq_len(R)) %do% {

    res.sub <- results[names(results)[gsub("_.*","",names(results)) == r]]
    
    # Split ALDEx2 results in t.test and wilcoxon
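    # The adx output contains both sets of p-values: columns 8-9 hold the raw and
    # adjusted p-values of the t-test, columns 10-11 those of the Wilcoxon test;
    # they are split into two separate pseudo-methods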
    if("adx" %in% gsub(".*_","",names(res.sub))){
      adx.t <- as.data.frame(res.sub[paste0(r,"_","adx")])[,c(1:7,12:13)]
      adx.w <- as.data.frame(res.sub[paste0(r,"_","adx")])[,c(1:7,12:13)]
      colnames(adx.t) <- gsub(".*_adx.","",colnames(adx.t))
      colnames(adx.w) <- colnames(adx.t)
      adx.t$pval <- as.numeric(as.data.frame(res.sub[paste0(r,"_","adx")])[,8])
      adx.w$pval <- as.numeric(as.data.frame(res.sub[paste0(r,"_","adx")])[,10])
      adx.t$pval.adj <- as.numeric(as.data.frame(res.sub[paste0(r,"_","adx")])[,9])
      adx.w$pval.adj <- as.numeric(as.data.frame(res.sub[paste0(r,"_","adx")])[,11])
      adx.t$Method <- "ALDEx2 t-test (adx)"
      adx.w$Method <- "ALDEx2 wilcox (adx)"
      res.sub[paste0(r,"_","adx")] <- NULL
      res.names <- names(res.sub)
      res.sub <- c(res.sub,list(adx.t),list(adx.w))
      names(res.sub) <- c(res.names,paste0(r,"_","adx.t"),paste0(r,"_","adx.w"))
    }

    # Make pseudo-pval for SAMseq
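    # SAMseq reports a score rather than p-values; a pseudo p-value is derived from
    # the rank of the score (higher score -> smaller p-value) so the AUC can still be
    # computed, and pval.adj is set to 0 for significant features and 1 otherwise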
    if("sam" %in% gsub(".*_","",names(res.sub))){
      samdf <- as.data.frame(res.sub[paste0(r,"_","sam")])
      colnames(samdf) <- gsub(".*_sam.","",colnames(samdf))
      
      samdf$pval <- 1/rank(samdf$Score)
      samdf$pval.adj <- 1
      samdf[samdf$Sig == "Yes","pval.adj"] <- 0 
      
      res.sub[paste0(r,"_","sam")] <- NULL
      res.names <- names(res.sub)
      res.sub <- c(res.sub,list(samdf))
      names(res.sub) <- c(res.names,paste0(r,"_","sam"))
    }
    
    # Insert spiked column
    newnames <- gsub(".*_","",names(res.sub))
    rsp <- NULL
    res.sub <- foreach(rsp = seq_along(res.sub)) %do% {
      temp <- res.sub[[rsp]]
      temp$Spiked <- "No"
      temp[temp$Feature %in% spikeds[[r]][[2]],"Spiked"] <- "Yes"
      return(temp)
    }
    names(res.sub) <- newnames
    
    # Confusion matrix
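    # Raw p-values are thresholded at 0.05; because the predictor was shuffled, only
    # the spiked features count as true positives. With effectSize == 1 nothing is
    # truly spiked, so all positives are treated as false positives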
    totalPos <- sapply(res.sub,function(x) nrow(x[x$pval <= 0.05,]))
    totalNeg <- sapply(res.sub,function(x) nrow(x[x$pval > 0.05,])) 
    trueNeg <- totalNeg  #if effectSize == 1
    truePos <- 0  #if effectSize == 1
    falseNeg <- 0 #if effectSize == 1
    if(effectSize != 1){
      truePos <- sapply(res.sub, function(x) sum(x[x$pval <= 0.05,"Feature"] %in% spikeds[[r]][[2]]))
      falseNeg <- sapply(res.sub, function(x) sum(x[x$pval > 0.05,"Feature"] %in% spikeds[[r]][[2]]))
    }
    falsePos <- totalPos - truePos
    trueNeg <- totalNeg - falseNeg
    
    # FPR 
    fprs <- sapply(seq_along(res.sub), function(x) {
      if((falsePos[x] + trueNeg[x]) != 0){
        falsePos[x] / (falsePos[x] + trueNeg[x])
      } else {0}})
    
    
    # Spike detection rate
    # True positive for adjusted p-values
    truePos.adj <- 0  #if effectSize == 1
    if(effectSize != 1){
      truePos.adj <- sapply(res.sub, function(x) sum(x[x$pval.adj <= alpha,"Feature"] %in% spikeds[[r]][[2]]))
    }
    sdrs <- sapply(seq_along(res.sub), function(x) truePos.adj[x] / sum(k))
    
    # AUC
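    # ROC curve with spiked status as response and the raw p-values as predictor;
    # direction = ">" because spiked features should get the smaller p-values.
    # 0.5 (chance level) is returned if the ROC fails or effectSize == 1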
    aucs <- sapply(seq_along(res.sub), function(x) {
      if(effectSize != 1){
        test_roc <- NULL
        tryCatch(
          test_roc <- pROC::roc(as.numeric(res.sub[[x]]$Feature %in% spikeds[[r]][[2]]) ~ res.sub[[x]]$pval, auc=TRUE, direction = ">", quiet=TRUE),
          error = function(e) NULL)
        if(!is.null(test_roc)){
          as.numeric(test_roc$auc) 
        } else {
          0.5
        }
      } else {
        0.5
      }
    })
    
    # Confusion matrix adjusted
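    # FDR: among features with adjusted p-value <= alpha, the fraction that was not
    # spiked (0 if no features are called significant)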
    totalPos.adj <- sapply(res.sub, function(x) nrow(x[x$pval.adj <= alpha,]))
    truePos.adj <- sapply(res.sub, function(x) sum(x[x$pval.adj <= alpha,"Feature"] %in% spikeds[[r]][[2]]))
    falsePos.adj <- totalPos.adj - truePos.adj
    
    fdrs <- sapply(seq_along(res.sub), function(x) {
      if(totalPos.adj[x] != 0){
        falsePos.adj[x] / totalPos.adj[x]
      } else {0}})
    
    # Combine and return
    df.combined <- data.frame(Method = sapply(res.sub, function(x) x$Method[1]),
                              AUC = aucs,
                              FPR = fprs,
                              FDR = fdrs,
                              Power = sdrs,
                              Run = r)
    rownames(df.combined) <- NULL

    # SAMseq FPRs not possible
    if("sam" %in% newnames){
      df.combined[df.combined$Method == "SAMseq (sam)","FPR"] <- NA
    }
        return(list(df.combined,res.sub))
  }
  
  output.results <- do.call(rbind,lapply(final.results, function(x) x[[1]]))
  output.all.results <- lapply(final.results, function(x) x[[2]])
  
  if(num.pred){
    pred.det <- "Quantitative"
    pred.ord <- paste(min(predictor,na.rm=TRUE),"to",max(predictor,na.rm=TRUE))
  } else {
    if(length(levels(as.factor(predictor))) == 2){
      pred.det <- "Two-class"
      pred.ord <- paste(levels(as.factor(predictor))[1],"<",levels(as.factor(predictor))[2])
    } else {
      pred.det <- "Multi-class" 
      pred.ord <- paste(levels(as.factor(predictor)), collapse = ", ")
    }
  }
  
  if(is.null(paired)) pair.det <- "No" else pair.det <- "Yes"

  run.secs <- (proc.time() - t1)[3]

  if((run.secs)/60/60 > 1){
    run.time <- paste(round((run.secs)/60/60,2),"Hours")
   } else {
    run.time <- paste(round((run.secs)/60,2),"Minutes")
   }

  output.details <- data.frame(Features = nrow(count_table),
                               Samples = ncol(count_table),
                               Predictor = pred.det,
                               Ordering = pred.ord,
                               Paired = pair.det,
                               Covars = paste(names(covars), collapse = ", "),
                               RunTime = run.time,
                               Relative = relative,
                               EffectSize = effectSize,
                               Spiked = paste(paste0(c("Low:","Mid:","High:"),k), collapse = ", "),
                               OutAll = out.all)
  rownames(output.details) <- ""
  output.details <- as.data.frame(t(output.details))
  colnames(output.details) <- ""
  
  # Run times
  run.times.all <- foreach(i = unique(gsub(".*_","",names(results))),.combine = rbind) %do% {
    round(mean(as.numeric(run.times[gsub(".*_","",names(run.times)) == i]))/60,4)
  }
  run.times.all <- as.data.frame(run.times.all)
  rownames(run.times.all) <- unique(gsub(".*_","",names(results)))
  colnames(run.times.all) <- "Minutes"
  
  out <- list(table = output.results, results = output.all.results, details = output.details, run.times = run.times.all)
  class(out) <- "DA"
  
  return(out)
}