R/gv_PERMANOVA.R

Defines functions .one_permanova run_PERMANOVA

Documented in run_PERMANOVA

#' @title Permutational multivariate analysis of variance (PERMANOVA)
#'
#' @description
#' The `run_PERMANOVA` function is used to identify the association between
#' the community and environmental variables.
#'
#' @details
#' The `run_PERMANOVA` function is used to identify the association between
#' the community and environmental variables, applying the distance in profile
#' and calculating the F statistic between community and variable by permutation
#' test to determine the significance. It can be applied to both
#' [`phyloseq::phyloseq-class`] and [`SummarizedExperiment::SummarizedExperiment`] object.
#'
#' @references Anderson, Marti J. "Permutational multivariate analysis of
#' variance."Department of Statistics, University of Auckland,
#' Auckland 26 (2005): 32-46.
#'
#' @author Created by Hua Zou (5/14/2022 Shenzhen China)
#'
#' @param object (Required). a [`phyloseq::phyloseq-class`] or
#' [`SummarizedExperiment::SummarizedExperiment`] object.
#' @param level (Optional). character. Summarization
#' level (from \code{rank_names(pseq)}, default: NULL).
#' @param variables (Optional). vector. variables for test (default: all).
#' @param method (Optional). character. Provide one of the currently supported
#' options. See `distanceMethodList` for a detailed list of the supported options
#' and links to accompanying documentation. Options include:
#'  * "bray": bray crutis distance.
#'  * "unifrac": unweighted UniFrac distance.
#'  * "wunifrac": weighted-UniFrac distance.
#'  * "GUniFrac": The variance-adjusted weighted UniFrac distances (default: alpha=0.5).
#'  * "dpcoa": sample-wise distance used in Double Principle Coordinate Analysis.
#'  * "jsd": Jensen-Shannon Divergence.
#'  Alternatively, you can provide a character string that defines a custom
#'  distance method, if it has the form described in `designdist` (default: "bray").
#' @param mode (Optional). character. Since there are missing values
#' in some columns, providing two test model:
#'  * "one": test on one variable per time.
#'  * "all": test on all variables once.
#' (default: "one").
#' @param seedNum (Optional). numeric. specify seeds for reproduction (default: 123).
#' @param alpha (Optional). numeric. the parameter for "GUniFrac" controlling
#' weight on abundant lineages (default: 0.5).
#' @param p_adjust (Optional). character. method to adjust p-values by.
#' Options include "holm", "hochberg", "hommel",
#' "bonferroni", "BH", "BY", "fdr", "none". See [`stats::p.adjust()`]
#' for more details (default: "BH").
#'
#' @return
#'   A data.frame of PERMANOVA result.
#'
#' @importFrom vegan adonis vegdist
#' @importFrom tibble column_to_rownames column_to_rownames
#' @importFrom SummarizedExperiment colData assay
#' @importFrom stats setNames p.adjust
#'
#' @usage run_PERMANOVA(
#'    object,
#'    level = c(NULL, "Kingdom", "Phylum", "Class",
#'            "Order", "Family", "Genus",
#'            "Species", "Strain", "unique"),
#'    variables = "all",
#'    method = c("unifrac", "wunifrac", "GUniFrac", "bray", "dpcoa", "jsd"),
#'    mode = c("one", "all"),
#'    seedNum = 123,
#'    alpha = 0.5,
#'    p_adjust = c("none", "fdr", "bonferroni", "holm",
#'        "hochberg", "hommel", "BH", "BY"))
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' # phyloseq object
#' data("Zeybel_2022_gut")
#' run_PERMANOVA(Zeybel_2022_gut,
#'           variables = c("LiverFatClass", "Liver_fat", "Gender"),
#'           mode = "one",
#'           method = "bray")
#'
#' # SummarizedExperiment object
#' data("Zeybel_2022_protein")
#' se_impute <- impute_abundance(
#'   object = Zeybel_2022_protein,
#'   group = "LiverFatClass",
#'   ZerosAsNA = TRUE,
#'   RemoveNA = TRUE,
#'   cutoff = 20,
#'   method = "knn")
#' run_PERMANOVA(se_impute,
#'           variables = c("LiverFatClass", "Liver_fat"),
#'           mode = "all",
#'           method = "bray")
#' }
#'
run_PERMANOVA <- function(
    object,
    level = NULL,
    variables = "all",
    method = c("bray", "unifrac", "wunifrac",
               "GUniFrac", "dpcoa", "jsd"),
    mode = "one",
    seedNum = 123,
    alpha = 0.5,
    p_adjust = "BH") {

  # data("Zeybel_2022_gut")
  # object = Zeybel_2022_gut
  # level = "Phylum"
  # variables = c("LiverFatClass", "Liver_fat", "Gender")
  # mode = "one"
  # method = "bray"
  # seedNum = 123
  # alpha = 0.5
  # p_adjust = "BH"

  method <- match.arg(
    method, c("bray", "unifrac", "wunifrac",
              "GUniFrac", "dpcoa", "jsd")
  )

  p_adjust <- match.arg(
    p_adjust, c("none", "fdr", "bonferroni", "holm",
                "hochberg", "hommel", "BH", "BY")
    )

  mode <- match.arg(
    mode, c("one", "all")
  )

  # phyloseq object
  if (all(!is.null(object), inherits(object, "phyloseq"))) {
    ps <- check_sample_names(object = object)

    # taxa level
    if (!is.null(level)) {
      ps <- aggregate_taxa(x = ps, level = level)
    } else {
      ps <- ps
    }

    if (!is.null(ps@phy_tree) & (method %in%
                                 c("unifrac", "wunifrac", "GUniFrac"))) {
      method <- match.arg(
        method,
        c("unifrac", "wunifrac", "GUniFrac")
      )
    } else if (method %in% c("unifrac", "wunifrac", "GUniFrac")) {
      message("It enforces to use Bray-Curtis because no phy_tree")
      method <- "bray"
    }
    ## sample table & profile table
    sam_tab <- phyloseq::sample_data(ps) %>%
      data.frame() %>%
      tibble::rownames_to_column("TempRowNames")

    if (phyloseq::taxa_are_rows(ps)) {
      prf_tab <- phyloseq::otu_table(phyloseq::t(ps)) %>%
        data.frame()
    } else {
      prf_tab <- phyloseq::otu_table(ps) %>% data.frame()
    }

  }  else if (all(!is.null(object), inherits(object, "SummarizedExperiment"))) {
    # sample table & profile table
    sam_tab <- SummarizedExperiment::colData(object) %>%
      data.frame() %>%
      tibble::rownames_to_column("TempRowNames")
    prf_tab <- SummarizedExperiment::assay(object) %>%
      data.frame() %>%
      t()
  }

  # distance
  tryCatch(
    expr = {
      disMatrix <- run_distance(
        object = object,
        method = method,
        alpha = alpha)
    },
    error = function(e){
      message('run_distance Caught an error!')
      print(e)
    }
  )
  # variables for test
  if (all(length(variables) == 1, variables == "all")) {
    sam_tab <- sam_tab
  } else {
    sam_tab <- sam_tab %>%
      dplyr::select(dplyr::all_of(c("TempRowNames", variables)))
  }

  # set seed
  if (!is.null(seedNum)) {
    set.seed(seedNum)
  }

  if (mode == "one") {
    tryCatch(
      expr = {
        res <- .one_permanova(x = sam_tab,
                              y = disMatrix,
                              z = prf_tab,
                              m = method,
                              padjust = p_adjust)
      },
      error = function(e){
        message('.one_permanova Caught an error!')
        print(e)
      }
    )
  } else {
    tryCatch(
      expr = {
        res <- .all_permanova(x = sam_tab,
                              y = prf_tab,
                              m = method,
                              padjust = p_adjust)
      },
      error = function(e){
        message('.all_permanova Caught an error!')
        print(e)
      }
    )
  }

  return(res)
}

#' @keywords internal
#' @noRd
.one_permanova <- function(x, y, z, m, padjust) {

  # x = sam_tab
  # y = disMatrix
  # z = prf_tab
  # m = method

  dat_x <- x %>%
    dplyr::select(-TempRowNames)

  res <- data.frame()
  for (i in 1:ncol(dat_x)) {
    dat <- data.frame(value = dat_x[, i], z)
    na_index <- which(is.na(dat$value))

    if (!length(na_index) == 0) {
      # missing values
      datphe <- dat$value[-na_index]

      if (length(datphe) == 0 | length(unique(datphe)) == 1) {

        res <- c(length(datphe), rep(NA, 6))
      }
      if (length(unique(datphe)) < 6) {
        datphe <- as.factor(datphe)
      }

      dat_cln <- dat[-na_index, ]
      datprf <- dat_cln[, -1, F]
      dis <- vegan::vegdist(datprf, method = m)

    } else {

      datphe <- dat$value

      if (length(datphe) == 0 | length(unique(datphe)) == 1) {
        res <- c(length(datphe), rep(NA, 6))
      }

      if (length(unique(datphe)) < 6) {
        datphe <- as.factor(datphe)
      }

      dis <- y
    }

    # https://github.com/joey711/phyloseq/issues/1457
    ad <- vegan::adonis(unname(dis) ~ datphe, permutations = 999)
    tmp <- as.data.frame(ad$aov.tab) %>% dplyr::slice(1)
    out <- c(length(datphe), as.numeric(tmp[, c(1:6)])) %>%
      data.frame() %>%
      t()

    res <- rbind(res, out)
  }

  colnames(res) <- c("SumsOfSample", "Df", "SumsOfSqs",
                     "MeanSqs", "F.Model", "R2", "Pr(>F)")
  res$AdjustedPvalue <- stats::p.adjust(res$`Pr(>F)`, method = padjust)
  rownames(res) <- colnames(dat_x)

  return(res)
}

#' @keywords internal
#' @noRd
.all_permanova <- function(x, y, m, padjust) {

  # x = sam_tab
  # y = prf_tab
  # m = method

  # remove missing values of all columns
  phen <- na.omit(x)

  if (nrow(phen) != nrow(x)) {
    # profile
    prof <- y[rownames(y) %in% rownames(phen), , ]
  } else {
    prof <- y
  }

  # remove TempRowNames
  dat_x <- phen %>%
    dplyr::select(-TempRowNames)

  # formula
  fma <- formula(paste("prof",
                       paste0(colnames(dat_x), collapse = "+"),
                       sep = "~"))
  print(fma)

  fit <- vegan::adonis2(formula = fma,
                        data = dat_x,
                        permutations = 999,
                        method = m,
                        by = "margin")

  print(fit)

  temp_res <- data.frame(fit)[1:ncol(dat_x), , F]
  colnames(temp_res) <- c("Df", "SumsOfSqs",
                     "R2", "F.Model", "Pr(>F)")
  temp_res$SumsOfSample <- nrow(dat_x)
  temp_res$AdjustedPvalue <- stats::p.adjust(temp_res$`Pr(>F)`, method = padjust)

  res <- temp_res %>%
    dplyr::select(SumsOfSample, everything())

  return(res)
}
HuaZou/MicrobiomeAnalysis documentation built on May 13, 2024, 11:10 a.m.