R/lefser.R

Defines functions lefser trunc filterKruskal .numeric01 contastWithinClassesOrFewPerClass createUniqueValues fillPmatZmat

Documented in lefser

fillPmatZmat <- function(group,
                         block,
                         expr_sub,
                         p.threshold)
{
  # creates a list of boolean vectors, each vector indicates
  # existance (TRUE) or absence (FALSE) of a class/sub-class combination
  combos <- apply(
    expand.grid(levels(group), levels(block)), 1L, paste0, collapse = "")
  combined <- paste0(as.character(group), as.character(block))
  logilist <- lapply(setNames(nm = sort(combos)), `==`, combined)

  ## uses Wilcoxon rank-sum test to test for significant differential abundances between
  ## subclasses of one class against subclasses of all othe classes; results are saved in
  ## "pval_mat" and "z_mat" matrices
  whichlist <- lapply(logilist, which)
  sblock <- seq_along(levels(block))
  texp_sub <- t(expr_sub)
  iters <- expand.grid(sblock, sblock + length(sblock))
  group_formats <- apply(iters, 1L, function(x) {
    ind <- unlist(whichlist[x])
    apply(texp_sub, 2L, function(g) {
      wx <- suppressWarnings(coin::wilcox_test(g ~ group, subset = ind))
      cbind.data.frame(
          p.value = coin::pvalue(wx), statistic = coin::statistic(wx)
      )
    })
  })

  res <- lapply(group_formats, function(x) do.call(rbind, x))
  pval_mat <- do.call(cbind, lapply(res, `[[`, "p.value"))
  z_mat <- do.call(cbind, lapply(res, `[[`, "statistic"))

  rownames(pval_mat) <- rownames(expr_sub)
  rownames(z_mat) <- rownames(expr_sub)

  ## converts "pval_mat" into boolean matrix "logical_pval_mat" where
  ## p-values <= wilcoxon.threshold
  logical_pval_mat <- pval_mat <= p.threshold * 2.0
  logical_pval_mat[is.na(logical_pval_mat)] <- FALSE

  ## determines which rows (features) have all p-values<=0.05
  ## and selects such rows from the matrix of z-statistics
  sub <- apply(logical_pval_mat, 1L, all)
  z_mat_sub <- z_mat[sub, , drop = FALSE]
    # confirms that z-statistics of a row all have the same sign
  sub <- abs(rowSums(z_mat_sub)) == rowSums(abs(z_mat_sub))
  expr_sub[names(sub[sub]), , drop = FALSE]
}

## ensures that more than half of the values in each for each feature are unique
## if that is not the case then a count value is altered by adding it to a small value
## generated via normal distribution with mean=0 and sd=5% of the count value
createUniqueValues <- function(df, group){
  orderedrows <- rownames(df)
  splitdf <- split(df, group)
  maxim <- vapply(table(group), function(x) max(x * 0.5, 4), numeric(1L))
  for (i in seq_along(splitdf)) {
    sdat <- splitdf[[i]]
    splitdf[[i]][] <- lapply(sdat, function(cols) {
        if (length(unique(cols)) > maxim[i])
            cols
        else
            abs(cols + rnorm(
                length(cols), mean = 0, sd = max(cols * 0.05, 0.01))
            )
    })
  }
  df <- do.call(rbind, unname(splitdf))
  df[match(orderedrows, rownames(df)),, drop = FALSE]
}

contastWithinClassesOrFewPerClass <-
  function(expr_sub_t_df, rand_s, min_cl, ncl, groups) {
    cols <- expr_sub_t_df[rand_s, , drop = FALSE]
    cls <- expr_sub_t_df$class[rand_s]
    # if the number of classes is less than the actual number (typically two)
    # of classes in the dataframe then return TRUE
    if (length(unique(cls)) < ncl) {
      return (TRUE)
    }
    # detect if for each class there are not fewer than the minimum (min_cl) number of samples
    if (any(table(cls) < min_cl)) {
      return (TRUE)
    }
    # separate the randomly selected samples (cols) into a list of the two classes
    drops <- c("class")
    by_class <-
      lapply(seq_along(groups), function(x) {
        cols[cols[, "class"] == groups[x],!(names(cols) %in% drops)]
      })

    # makes sure that within each class all features have at least min_cl unique count values
    for (i in seq_along(groups)) {
      unique_counts_per_microb = apply(by_class[[i]], 2, function(x) {
        length(unique(x))
      })
      if ((any(unique_counts_per_microb <= min_cl) &
           min_cl > 1) |
          (min_cl == 1 & any(unique_counts_per_microb <= 1))) {
        return (TRUE)
      }
    }
    return (FALSE)

  }

ldaFunction <- function (data, lfk, rfk, min_cl, ncl, groups) {
  # test 1000 samples for contrast within classes per feature
  # and that there is at least a minimum number of samples per class
  for (j in 1:1000) {
    rand_s <- sample(seq_len(lfk), rfk, replace = TRUE)
    if (!contastWithinClassesOrFewPerClass(data, rand_s, min_cl, ncl, groups)) {
      break
    }
  }
  # lda with rfk number of samples
  lda.fit <- lda(class ~ ., data = data, subset = rand_s)
  # coefficients that transform observations to discriminants
  w <- lda.fit$scaling[, 1]
  # scaling of lda coefficients
  w.unit <- w / sqrt(sum(w ^ 2))
  sub_d <- data[rand_s,]
  ss <- sub_d[,-match("class", colnames(sub_d))]
  xy.matrix <- as.matrix(ss)
  # the original matrix is transformed
  LD <- xy.matrix %*% w.unit
  # effect size is calculated as difference between averaged disciminants
  # of two classes
  effect_size <-
    abs(mean(LD[sub_d[, "class"] == 0]) - mean(LD[sub_d[, "class"] == 1]))
  # scaling lda coefficients by the efect size
  scal <- w.unit * effect_size
  # mean count values per fclass per feature
  rres <- lda.fit$means
  rowns <- rownames(rres)
  lenc <- length(colnames(rres))

  coeff <- vector("numeric", length(scal))
  for (v in seq_along(scal)) {
    if (!is.na(scal[v])) {
      coeff[v] <- abs(scal[v])
    } else{
      coeff[v] <- 0
    }

  }
  # count value differences between means of two classes for each feature
  lda.means.diff <- (lda.fit$means[2,] - lda.fit$means[1,])
  # difference between a feature's class means and effect size adjusted lda coefficient
  # are averaged for each feature
  (lda.means.diff + coeff) / 2
}

.numeric01 <- function(x) {
    x <- as.factor(x)
    uvals <- levels(x)
    ifelse(x == uvals[1L], 0L, 1L)
}

filterKruskal <- function(expr, group, p.value) {
  # applies "kruskal.test.alt" function to each row (feature) of expr
  # to detect differential abundance between classes, 0 and 1
  kw.res <- apply(expr, 1L, function(x) {
    kruskal.test(x ~ group)[["p.value"]]
  })
  # selects p-values less than or equal to kw.threshold
  kw.sub <- kw.res <= p.value
  
  # eliminates NAs
  kw.sub[is.na(kw.sub)] <- FALSE
  
  # extracts features with statistically significant differential abundance
  # from "expr" matrix
  expr[kw.sub,]
}

trunc <- function(scores_df, trim.names){
  Names <- droplevels(scores_df)$Names
  if(trim.names){
    listNames <- strsplit(Names, "\\||\\.")
    Names <- vapply(listNames, tail, character(1L), 1L)
  }
    
  scores_df$Names <- Names
  return(scores_df)
  
}

#' R implementation of the LEfSe method
#'
#' Perform a LEfSe analysis: the function carries out differential analysis
#' between two sample groups for multiple microorganisms and uses linear discirminant analysis
#' to establish their effect sizes. Subclass information for each class can be incorporated
#' into the analysis (see examples). Microorganisms with large differences between two sample groups
#' are identified as biomarkers.
#'
#' @param expr A \code{\linkS4class{SummarizedExperiment}} with expression data.
#' @param kruskal.threshold numeric(1) The p-value for the Kruskal-Wallis Rank
#' Sum Test (default 0.05).
#' @param wilcox.threshold numeric(1) The p-value for the Wilcoxon Rank-Sum Test
#' when 'blockCol' is present (default 0.05).
#' @param lda.threshold numeric(1) The effect size threshold (default 2.0).
#' @param groupCol character(1) Column name in `colData(expr)` indicating
#' groups, usually a factor with two levels (e.g., `c("cases", "controls")`;
#' default "GROUP").
#' @param blockCol character(1) Optional column name in `colData(expr)`
#' indicating the blocks, usually a factor with two levels (e.g.,
#' `c("adult", "senior")`; default NULL).
#' @param assay The i-th assay matrix in the `SummarizedExperiment` ('expr';
#' default 1).
#' @param trim.names If `TRUE` extracts the most specific taxonomic rank of organism.
#' @return
#' The function returns a dataframe with two columns, which are
#' names of microorganisms and their LDA scores.
#'
#' @importFrom stats kruskal.test reorder rnorm
#' @importFrom coin pvalue statistic wilcox_test
#' @importFrom MASS lda
#' @importFrom methods as is
#' @importFrom stats setNames
#' @importFrom utils tail
#' @import SummarizedExperiment
#'
#' @examples
#'     # (1) Using classes only
#'     data(zeller14)
#'     # exclude 'adenoma'
#'     zeller14 <- zeller14[, zeller14$study_condition != "adenoma"]
#'     res_group <- lefser(zeller14, groupCol = "study_condition")
#'     head(res_group)
#'
#'     # (2) Using classes and sublasses
#'     data(zeller14)
#'     # exclude 'adenoma'
#'     zeller14 <- zeller14[, zeller14$study_condition != "adenoma"]
#'     res_block <- lefser(
#'          zeller14, groupCol = "study_condition", blockCol = "age_category"
#'     )
#'     head(res_block)
#' @export
lefser <-
  function(expr,
           kruskal.threshold = 0.05,
           wilcox.threshold = 0.05,
           lda.threshold = 2.0,
           groupCol = "GROUP",
           blockCol = NULL,
           assay = 1L,
           trim.names = FALSE)
  {
    groupf <- colData(expr)[[groupCol]]
    if (is.null(groupf))
        stop("A valid group assignment 'groupCol' must be provided")
    groupf <- as.factor(groupf)
    groupsf <- levels(groupf)
    if (length(groupsf) != 2L)
      stop(
        "Group classification is not dichotomous:\n",
        "Found (", paste(groupsf, collapse = ", "), ")"
      )
    group <- .numeric01(groupf)
    groups <- 0:1
    expr_data <- assay(expr, i = assay)
    expr_sub <- filterKruskal(expr_data, group, kruskal.threshold)
    
    if (!is.null(blockCol)) {
        block <- as.factor(colData(expr)[[blockCol]])
        expr_sub <- fillPmatZmat(groupf, block, expr_sub, wilcox.threshold)
    }

    # transposes matrix and add a "class" (i.e., group) column
    # matrix converted to dataframe
    expr_sub_t <- t(expr_sub)
    expr_sub_t_df <- as.data.frame(expr_sub_t)
    expr_sub_t_df <- createUniqueValues(expr_sub_t_df, groupf)
    expr_sub_t_df <- cbind(expr_sub_t_df, class = group)

    # number of samples (i.e., subjects) in the dataframe
    lfk <- nrow(expr_sub_t_df)
    # rfk is the number of subject that will be used in linear discriminant analysis
    rfk <- floor(lfk * 2 / 3)
    # number of classes (two)
    ncl <- length(groups)
    # count samples in each class of the dataframe, select the number from the class with a smaller
    # count of samples and multiply that number by 2/*2/3*0.5
    min_cl <-
      as.integer(min(table(expr_sub_t_df$class)) * 2 / 3 * 2 / 3 *
                   0.5)
    # if min_cl is less than 1, then make it equal to 1
    min_cl <- max(min_cl, 1)

    # lda_fn repeated 30 times, producing a matrix of 30 scores per feature
    eff_size_mat <-
      replicate(30, suppressWarnings(ldaFunction(
        expr_sub_t_df, lfk, rfk, min_cl, ncl, groups
      )), simplify = TRUE)

    # mean of 30 scores per feature
    raw_lda_scores <- rowMeans(eff_size_mat)

    # processing of score
    processed_scores <-
      sign(raw_lda_scores) * log((1 + abs(raw_lda_scores)), 10)

    # sorting of scores
    processed_sorted_scores <- sort(processed_scores)
    scores_df <- data.frame(Names = names(processed_sorted_scores), 
                            scores = as.vector(processed_sorted_scores),
                            stringsAsFactors = FALSE)

    scores_df <- trunc(scores_df, trim.names)
    
    threshold_scores <- abs(scores_df$scores) >= lda.threshold
    scores_df <- scores_df[threshold_scores, ]
    return(scores_df)
  }

Try the lefser package in your browser

Any scripts or data that you put into this service are public.

lefser documentation built on Nov. 8, 2020, 8:01 p.m.