R/clr_effect.r

Defines functions aldex.effect

Documented in aldex.effect

#' Calculate effect sizes and differences between conditions
#'
#' Determines the median clr abundance of the feature in all samples and in groups.
#' Determines the median difference between the two groups.
#' Determines the median variation within each two group.
#' Determines the effect size, which is the median of the ratio of the 
#'  between-group difference and the larger of the variance within groups.
#'
#' @param clr \code{clr} is the data output of \code{aldex.clr}.
#' @param verbose Print diagnostic information while running. Useful only for debugging if fails on large datasets.
#' @param include.sample.summary Include median clr values for each sample, defaults to FALSE.
#' @param useMC Use multicore by default (FALSE).
#' @param CI Give effect 95\% confidence intervals, defaults to FALSE.
#' @param glm.conds Give effect for glm contrasts, note: saved as list.
#' @param paired.test Calculate effect size for paired samples, defaults to FALSE.
#' 
#' @details An explicit example for two conditions is shown in the `Examples` below.
#' 
#' @return Returns a dataframe with the following information:
#'  \item{rab.all}{a vector containing the median clr value for each feature.}
#'  \item{rab.win.conditionA}{a vector containing the median clr value for each feature in condition A.}
#'  \item{rab.win.conditionB}{a vector containing the median clr value for each feature in condition B.}
#'  \item{diff.btw}{a vector containing the per-feature median difference between condition A and B.}
#'  \item{diff.win}{a vector containing the per-feature maximum median difference between 
#'    Dirichlet instances within conditions.}
#'  \item{effect}{a vector containing the per-feature effect size.}
#'  \item{overlap}{a vector containing the per-feature proportion of effect size that is 0 or less.}
#' 
#' @references Please use the citation given by \code{citation(package="ALDEx")}.
#' 
#' @author Greg Gloor, Andrew Fernandes, Matt Links
#' 
#' @seealso \code{\link{aldex.clr}}, \code{\link{aldex.ttest}}, \code{\link{aldex.glm}}, 
#'   \code{\link{aldex.glm.effect}}, \code{\link{selex}}
#' 
#' @examples
#' # x is the output of the \code{x <- clr(data, mc.samples)} function
#' # conditions is a description of the data
#' # for the selex dataset, conditions <- c(rep("N", 7), rep("S", 7))
#' data(selex)
#' # subset for efficiency
#' selex <- selex[1201:1600,]
#' conds <- c(rep("NS", 7), rep("S", 7))
#' x <- aldex.clr(selex, conds, mc.samples=2, denom="all")
#' effect.test <- aldex.effect(x)
#' 
#' @export
aldex.effect <- function(clr, verbose = TRUE, include.sample.summary = FALSE, useMC = FALSE,
                         CI = FALSE, glm.conds = NULL, paired.test = FALSE) {


# returns the median differences in abundance between 2 conditions
# returns the median effect size and proportion of effect that overlaps 0
# data is returned in a data frame
# requires multicore
# this uses Rfast

  # Use clr conditions slot instead of input
     if (is.vector(clr@conds)) {
       conditions <- clr@conds
     } else if (is.factor(clr@conds)) {
     	if (length(levels(clr@conds) == 2)) {
     	  conditions <- clr@conds
     	}
     } else if (is.matrix(clr@conds)){
       if(is.null(glm.conds)) stop("please provide a binary condition vector")
       conditions <- glm.conds
     } else {
       stop("please check that the conditions parameter for aldex.clr is correct.")
     }

    is.multicore = FALSE

    if ("BiocParallel" %in% rownames(installed.packages()) & useMC==TRUE){
        message("multicore environment is OK -- using the BiocParallel package")
        #require(BiocParallel)
        is.multicore = TRUE
    }
    else {
        if (verbose == TRUE) message("operating in serial mode")
    }

    nr <- numFeatures(clr) # number of features
    rn <- getFeatureNames(clr) # feature names
    # ---------------------------------------------------------------------

    # sanity check to ensure only two conditons passed to this function
    conditions <- as.factor( conditions )
    levels     <- levels( conditions )

    if ( length( conditions ) !=  numConditions(clr) ) stop("mismatch btw 'length(conditions)' and 'ncol(reads)'")

    if ( length( levels ) != 2 ) stop("only two condition levels are currently supported")

    levels <- vector( "list", length( levels ) )
    names( levels ) <- levels( conditions )

    for ( l in levels( conditions ) ) {
        levels[[l]] <- which( conditions == l )
        if ( length( levels[[l]] ) < 2 ) stop("condition level '",l,"' has less than two replicates")
    }

    # end sanity check
if (verbose == TRUE) message("sanity check complete")

    # Summarize the relative abundance (rab) win and all groups

    rab <- vector( "list", 3 )
    names(rab) <- c( "all", "win", "spl" )
    rab$win <- list()

    #this is the median value across all monte carlo replicates
    # for loops replaced with do.call
    cl2p <- NULL
    cl2p <- do.call(cbind, getMonteCarloInstances(clr))
    # this is a 2X speedup
    rab$all <- Rfast::rowMedians(cl2p)
    names(rab$all) <- rownames(cl2p)
    rm(cl2p)

 if (verbose == TRUE) message("rab.all  complete")

    for(level in levels(conditions)){
      cl2p <- NULL
      cl2p <- do.call(cbind, getMonteCarloInstances(clr)[levels[[level]]] )
      rab$win[[level]] <-  Rfast::rowMedians(cl2p)
      rm(cl2p)
 }

 if (verbose == TRUE) message("rab.win  complete")

    if (is.multicore == TRUE)  rab$spl <- bplapply( getMonteCarloInstances(clr), function(m) { t(apply( m, 1, median )) } )
    #RMV if (is.multicore == FALSE) rab$spl <- lapply( getMonteCarloInstances(clr), function(m) { t(apply( m, 1, median )) } )
    if (is.multicore == FALSE) rab$spl <- lapply( getMonteCarloInstances(clr), function(m) { Rfast::rowMedians(m) } )
if (verbose == TRUE) message("rab of samples complete")

    # ---------------------------------------------------------------------
    # Compute diffs btw and win groups

if (paired.test == FALSE ){
    l2d <- vector( "list", 2 )
    names( l2d ) <- c( "btw", "win" )
    l2d$win <- list()

    # abs( win-conditions diff ), btw smps
    #this generates a linear sample of the values rather than an exhaustive sample
    for ( level in levels(conditions) ) {
        concat <- NULL
        for ( l1 in sort( levels[[level]] ) ) {
            concat <- cbind(  getMonteCarloReplicate(clr,l1),concat )

        }

        #if the sample is huge, only sample 10000
        if ( ncol(concat) < 10000 ){
            sampl1 <- t(apply(concat, 1, function(x){sample(x, ncol(concat))}))
            sampl2 <- t(apply(concat, 1, function(x){sample(x, ncol(concat))}))
        } else {
            sampl1 <- t(apply(concat, 1, function(x){sample(x, 10000)}))
            sampl2 <- t(apply(concat, 1, function(x){sample(x, 10000)}))
        }
        l2d$win[[level]] <- cbind( l2d$win[[level]] , abs( sampl1 - sampl2 ) )
        rm(sampl1)
        rm(sampl2)
    }
if (verbose == TRUE) message("within sample difference calculated")

    # Handle the case when the groups have different spl sizes
    # get the minimum number of win spl comparisons
    ncol.wanted <- min( sapply( l2d$win, ncol ) )
    # apply multicore paradigm ML
    if (is.multicore == TRUE) l2d$win  <- bplapply( l2d$win, function(arg) { arg[,1:ncol.wanted] } )
    if (is.multicore == FALSE) l2d$win  <- lapply( l2d$win, function(arg) { arg[,1:ncol.wanted] } )

    # btw condition diff (signed)
    #get the btw condition as a random sample rather than exhaustive search
    concatl1 <- NULL
    concatl2 <- NULL
    for( l1 in levels[[1]] ) concatl1 <- cbind( getMonteCarloReplicate(clr,l1),concatl1 )
    for( l2 in levels[[2]] ) concatl2 <- cbind( getMonteCarloReplicate(clr,l2),concatl2 )

    sample.size <- min(ncol(concatl1), ncol(concatl2))

    if ( sample.size < 10000 ){
        smpl1 <- t(apply(concatl1, 1, function(x){sample(x, sample.size)}))
        smpl2 <- t(apply(concatl2, 1, function(x){sample(x, sample.size)}))
    } else {
        smpl1 <- t(apply(concatl1, 1, function(x){sample(x, 10000)}))
        smpl2 <- t(apply(concatl2, 1, function(x){sample(x, 10000)}))
    }
    l2d$btw <- smpl2 - smpl1

    rm(smpl1)
    rm(smpl2)

if (verbose == TRUE) message("between group difference calculated")

    win.max <- matrix( 0 , nrow=nr , ncol=ncol.wanted )
    l2d$effect <- matrix( 0 , nrow=nr , ncol=ncol(l2d$btw) )
    rownames(l2d$effect) <- rn

###the number of elements in l2d$btw and l2d$win may leave a remainder when
  #recycling these random vectors. Warnings are suppressed because this is not an issue
  #for this calculation. In fact, any attempt to get rid of this error would
  #decrease our power as one or both vectors would need to be truncated gg 20/06/2013

    options(warn=-1)

    for ( i in 1:nr ) {
        #mat <- rbind( l2d$win[[1]][i,] , l2d$win[[2]][i,] )
        #win.max[i,] <- apply( mat , 2 , max )
        # this is a 2x speedup
        win.max[i,] <- pmax( l2d$win[[1]][i,] , l2d$win[[2]][i,] )
        l2d$effect[i,] <- l2d$btw[i,] / win.max[i,]
        l2d$effect[i,][is.na(l2d$effect[i,])] <- 0
    }

    options(warn=0)

    rownames(win.max)   <- rn
    attr(l2d$win,"max") <- win.max
    rm(win.max)

    # ---------------------------------------------------------------------
    # Summarize diffs

    l2s <- vector( "list", 2 )
    names( l2s ) <- c( "btw", "win" )
    l2s$win <- list()

    #RMV l2s$btw <- t(apply( l2d$btw, 1, median ))
    l2s$btw <- Rfast::rowMedians(l2d$btw)
    #RMV l2s$win  <- t(apply( attr(l2d$win,"max"), 1, median ))
    l2s$win  <- Rfast::rowMedians( attr(l2d$win,"max"))
if (verbose == TRUE) message("group summaries calculated")

    if(CI == FALSE) {
    #  effect  <- t(apply( l2d$effect, 1, function(row){row[is.na(row)] <- 0 ; median(row) }))
      effect <- Rfast::rowMedians(l2d$effect)
    } else {
      effectlow <- as.vector(t(apply( l2d$effect, 1, function(x) {x[is.na(x)] <- 0 ;
          quantile( x, probs=0.025, names=FALSE)} )))
      effecthigh <- as.vector(t(apply( l2d$effect, 1, function(x) {x[is.na(x)] <- 0 ;
          quantile( x, probs=0.975, names=FALSE)} )))
      effect <- Rfast::rowMedians(l2d$effect)
    }
    overlap <- apply( l2d$effect, 1, function(row) { if(all(is.na(row))) warning("NAs in effect, ignore if using ALR");
                row[is.na(row)] <- 0 ;
                min( aitchison.mean( c( sum( row < 0 ) , sum( row > 0 ) ) + 0.5 ) ) } )
if (verbose == TRUE) message("unpaired effect size calculated")
} else if (paired.test == TRUE) {
  l2s <- vector( "list",2 )
  names( l2s ) <- c( "btw", "win" )

  # generate the comparison sets from the condition levels
  
  sets <- names(levels)
  setA <- which(conditions == sets[1])
  setB <- which(conditions == sets[2])
  
  diff <- NULL
  for(i in 1:length(setA)){
    jnk1 <- getMonteCarloReplicate(clr,setA[i])
    jnk2 <- getMonteCarloReplicate(clr,setB[i])
    diff <- cbind(diff, jnk2-jnk1)
  }
  
  overlap <- apply( diff, 1, function(row) { if(all(is.na(row))) warning("NAs in effect, ignore if using ALR");
                row[is.na(row)] <- 0 ;
                min( aitchison.mean( c( sum( row < 0 ) , sum( row > 0 ) ) + 0.5 ) ) } )

  l2s$btw <- apply(diff, 1, median) # robust estimator
  l2s$win <- apply(diff, 1, function(x) mad(x, constant=1.428))
  
  # get the 95% CI bounds on the difference 
  effect.low <- apply(diff, 1, function(x) quantile(x, probs=0.025))
  effect.high <- apply(diff, 1, function(x) quantile(x, probs=0.975))
  
  effectlow <- as.vector(effect.low/l2s$win)
  effecthigh <- as.vector(effect.high/l2s$win)
  
  effect <- l2s$btw/l2s$win
  if (verbose == TRUE) message("paired effect size calculated")

}

# make and fill in the data table
# i know this is inefficient, but it works and is not a bottleneck
   if(CI == FALSE) {
    rv <- list(
        rab = rab,
        diff = l2s,
        effect = effect,
        overlap = overlap
    )
    } else {
    rv <- list(
        rab = rab,
        diff = l2s,
        effect = effect,
        effectlow = effectlow,
        effecthigh = effecthigh,
        overlap = overlap
     )
    }

if (verbose == TRUE) message("summarizing output")

   y.rv <- data.frame(rv$rab$all)
   colnames(y.rv) <- c("rab.all")
   for(i in names(rv$rab$win)){
       nm <- paste("rab.win", i, sep=".")
       y.rv[,nm] <- data.frame(rv$rab$win[[i]])
   }
   if (include.sample.summary == TRUE){
    for(i in names(rv$rab$spl)){
       nm <- paste("rab.sample", i, sep=".")
       if (is.multicore == TRUE) y.rv[,nm] <- data.frame(t(rv$rab$spl[[i]]))
       if (is.multicore == FALSE) y.rv[,nm] <- data.frame(rv$rab$spl[[i]])
   }

   }
   for(i in names(rv$diff)){
       nm <- paste("diff", i, sep=".")
       y.rv[,nm] <- data.frame(rv$diff[[i]])
   }
   if(CI == FALSE) {
     y.rv[,"effect"] <- data.frame(rv$effect)
     y.rv[,"overlap"] <- data.frame(rv$overlap)
   } else {
     y.rv[,"effect"] <- data.frame(rv$effect)
     y.rv[,"effect.low"] <- data.frame(rv$effectlow)
     y.rv[,"effect.high"] <- data.frame(rv$effecthigh)
     y.rv[,"overlap"] <- data.frame(rv$overlap)
   }
    return(y.rv)

}
ggloor/ALDEx_bioc documentation built on Oct. 31, 2023, 1:13 a.m.