R/process_normalization.R

Defines functions scale_Range scale_Pareto scale_Zscore scale_MeanCenter scale_variables normalize_feature get_norm_factors set_nf norm_cpm trans_clr gm_mean norm_clr norm_tmm norm_rle norm_css norm_tss norm_rarefy

Documented in norm_clr norm_cpm norm_css norm_rarefy norm_rle norm_tmm norm_tss scale_variables

#' @title Normalize the microbial abundance data
#'
#' @description
#' It is critical to normalize the feature table to eliminate any bias due to
#' differences in the sampling sequencing depth.This function implements six
#' widely-used normalization methods for microbial compositional data.
#'
#' @author Created by Yang Cao
#'
#' @param object a matrix, data.frame, [phyloseq::phyloseq-class] or
#'   [phyloseq::otu_table-class] object.
#' @param method the methods used to normalize the microbial abundance data.
#'   Options includes:
#'   * "none": do not normalize.
#'   * "rarefy": random subsampling counts to the smallest library size in the
#'     data set.
#'   * "TSS": total sum scaling, also referred to as "relative abundance", the
#'     abundances were normalized by dividing the corresponding sample library
#'     size.
#'   * "TMM": trimmed mean of m-values. First, a sample is chosen as reference.
#'     The scaling factor is then derived using a weighted trimmed mean over
#'     the differences of the log-transformed gene-count fold-change between
#'     the sample and the reference.
#'   * "RLE", relative log expression, RLE uses a pseudo-reference calculated
#'     using the geometric mean of the gene-specific abundances over all
#'     samples. The scaling factors are then calculated as the median of the
#'     gene counts ratios between the samples and the reference.
#'   * "CSS": cumulative sum scaling, calculates scaling factors as the
#'     cumulative sum of gene abundances up to a data-derived threshold.
#'   * "CLR": centered log-ratio normalization.
#'   * "CPM": pre-sample normalization of the sum of the values to 1e+06.
#' @param ... other arguments passed to the corresponding normalization
#'   methods.
#'
#' @seealso [edgeR::calcNormFactors()],[DESeq2::estimateSizeFactorsForMatrix()],
#' [metagenomeSeq::cumNorm()]
#'
#' @importMethodsFrom BiocGenerics normalize
#' @importFrom phyloseq sample_data<-
#' @exportMethod normalize
#'
#' @aliases normalize,phyloseq-method
#' @rdname normalize-methods
#'
#' @return the same class with `object`.
#'
#' @examples
#' data(caporaso)
#' normalize(caporaso, "TSS")
#'
setMethod(
  "normalize", "phyloseq",
  function(object,
           method = "TSS",
           ...) {
    otu <- otu_table(object)
    otu_normed <- normalize(otu, method = method, ...)

    # extract norm_factor attributes and prepend to the sample_data
    nf <- attr(otu_normed, "norm_factor")
    if (!is.null(nf)) {
      sample_data(object) <- cbind(
        sample_data(object),
        norm_factor = nf
      )
    }

    # please note otu_table<- function will drop the norm_factor attribute
    otu_table(object) <- otu_normed

    object
  }
)

#' @importMethodsFrom BiocGenerics normalize
#' @aliases normalize,otu_table-method normalize
#' @rdname normalize-methods
#'
setMethod(
  "normalize", "otu_table",
  function(object,
           method = "TSS",
           ...) {
    methods <- c("none", "rarefy", "TSS", "TMM", "RLE", "CSS", "CLR", "CPM")
    if (method %in% methods) {
      object_normed <- switch(method,
                              none = object,
                              rarefy = norm_rarefy(object, ...),
                              TSS = norm_tss(object),
                              TMM = norm_tmm(object, ...),
                              RLE = norm_rle(object, ...),
                              CSS = norm_css(object, ...),
                              CLR = norm_clr(object),
                              CPM = norm_cpm(object)
      )
    } else {
      stop(
        "`method` must be one of none, rarefy, TSS,",
        " TMM, RLE, CSS, CLR, or CPM",
        call. = FALSE
      )
    }

    object_normed
  }
)

#' @importMethodsFrom BiocGenerics normalize
#' @aliases normalize,data.frame-method normalize
#' @rdname normalize-methods
#'
setMethod(
  "normalize", "data.frame",
  function(object,
           method = "TSS",
           ...) {
    otu <- otu_table(object, taxa_are_rows = TRUE)
    otu_norm <- normalize(otu, method, ...)
    nf <- attr(otu_norm, "norm_factor")

    res <- as.data.frame(otu_norm)
    if (!is.null(nf)) {
      attr(res, "norm_factor") <- nf
    }

    res
  }
)

#' @importMethodsFrom BiocGenerics normalize
#' @aliases normalize,matrix-method normalize
#' @rdname normalize-methods
setMethod(
  "normalize", "matrix",
  function(object,
           method = "TSS",
           ...) {
    otu <- as.data.frame(object)
    otu_norm <- normalize(otu, method, ...)
    nf <- attr(otu_norm, "norm_factor")

    res <- as.matrix(otu_norm)
    if (!is.null(nf)) {
      attr(res, "norm_factor") <- nf
    }

    res
  }
)

## Four normalization methods do not save the norm factor: value, rarefy,
## clr and tss; where three methods save the norm factor: css, rle, tmm.

#' Normalize feature table by rafefying such that all samples have the same
#' number of total counts (library size).
#'
#' For rarefying, reads in the different samples are randomly removed until
#' the same predefined number has been reached, to assure all samples have the
#' same library size. Rarefying normalization method is the standard in
#' microbial ecology. Please note that the authors of phyloseq do not advocate
#' using this rarefying a normalization procedure, despite its recent
#' popularity
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#' object.
#' @param size,rng_seed,replace,trim_otus,verbose extra arguments passed to
#'   [`phyloseq::rarefy_even_depth()`].
#' @export
#' @rdname normalize-methods
#' @aliases norm_rarefy
#' @importFrom phyloseq rarefy_even_depth sample_sums
#' @seealso [`phyloseq::rarefy_even_depth()`]
norm_rarefy <- function(object,
                        size = min(sample_sums(object)),
                        rng_seed = FALSE,
                        replace = TRUE,
                        trim_otus = TRUE,
                        verbose = TRUE) {
  object_rarefied <- rarefy_even_depth(
    object,
    sample.size = size,
    rngseed = rng_seed,
    replace = replace,
    trimOTUs = trim_otus,
    verbose = verbose
  )

  # do not save the norm_factor
  # the norm factors can be calculated in the subsequently differential
  # analysis method, e.g. edgeR, DESeq
  object_rarefied
}

#' Total-Sum Scaling (TSS) method
#'
#' TSS simply transforms the feature table into relative abundance by dividing
#' the number of total reads of each sample.
#'
#' @param object object a [phyloseq::phyloseq-class] or
#'   [phyloseq::otu_table-class] object
#' @export
#' @rdname normalize-methods
#' @aliases norm_tss
#' @importFrom phyloseq otu_table<-
norm_tss <- function(object) {
  otu <- otu_table(object)
  size <- colSums(otu)
  otu_normed <- sweep(otu, MARGIN = 2, STATS = size, FUN = "/")
  otu_table(object) <- otu_table(
    otu_normed,
    taxa_are_rows = taxa_are_rows(object)
  )

  # do not save the norm_factor, the norm factors are calculated based on the
  # subsequently differential analysis method, e.g. edgeR, DESeq
  object
}

#' Cumulative-Sum Scaling (CSS) method
#'
#' CSS is based on the assumption that the count distributions in each sample
#' are equivalent for low abundant genes up to a certain threshold.  Only the
#' segment of each sample’s count distribution that is relatively invariant
#' across samples is scaled by CSS
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#'   object.
#' @param sl The value to scale.
#' @importFrom phyloseq sample_data<-
#' @importFrom metagenomeSeq newMRexperiment cumNorm cumNormStatFast MRcounts
#' @seealso [metagenomeSeq::calcNormFactors()]
#' @export
#' @rdname normalize-methods
#' @aliases norm_css
norm_css <- function(object, sl = 1000) {
  if (inherits(object, "phyloseq")) {
    object_mgs <- phyloseq2metagenomeSeq(object)
  } else if (inherits(object, "otu_table")) {
    object_mgs <- otu_table2metagenomeSeq(object)
  }

  # cumNormStatFast requires counts of all samples at least have two
  # non zero features. Thus, if there are samples with only one non-zer
  # features, cumNormStat is taken to compute the pth quantile.
  count <- as(otu_table(object), "matrix")
  fun_p <- select_quantile_func(count)
  nf <- metagenomeSeq::calcNormFactors(object_mgs, p = fun_p(object_mgs))
  nf <- unlist(nf) / sl
  object_nf <- set_nf(object, nf)

  object_nf
}

#' Relative log expression (RLE) normalization
#'
#' RLE assumes most features are not differential and uses the relative
#' abundances to calculate the normalization factor.
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#'   object
#' @param locfunc a function to compute a location for a sample. By default,
#'   the median is used.
#' @param type method for estimation: either "ratio"or "poscounts" (recommend).
#' @param geo_means default `NULL`, which means the geometric means of the
#'   counts are used. A vector of geometric means from another count matrix can
#'   be provided for a "frozen" size factor calculation.
#' @param control_genes default `NULL`, which means all taxa are used for size
#'   factor estimation, numeric or logical index vector specifying the taxa
#'   used for size factor estimation (e.g. core taxa).
#' @seealso [DESeq2::estimateSizeFactorsForMatrix()]
#' @export
#' @rdname normalize-methods
#' @aliases norm_rle
norm_rle <- function(object,
                     locfunc = stats::median,
                     type = c("poscounts", "ratio"),
                     geo_means = NULL,
                     control_genes = NULL) {
  stopifnot(class(object) %in% c("phyloseq", "otu_table"))
  type <- match.arg(type, c("poscounts", "ratio"))

  # use substitute() to create missing argument
  geo_means <- ifelse(is.null(geo_means), substitute(), geo_means)
  control_genes <- ifelse(is.null(control_genes), substitute(), control_genes)

  otu <- as(otu_table(object), "matrix")
  nf <- estimateSizeFactorsForMatrix(
    otu,
    locfunc = locfunc,
    geoMeans = geo_means,
    controlGenes = control_genes,
    type = type
  )
  object_nf <- set_nf(object, nf)

  object_nf
}

# https://github.com/biobakery/Maaslin2/blob/master/R/utility_scripts.R
#
#' TMM (trimmed mean of m-values) normalization
#'
#' TMM calculates the normalization factor using a robust statistics based on
#' the assumption that most features are not differential and should, in
#' average, be equal between the samples. The TMM scaling factor is  calculated
#' as the weighted mean of log-ratios between each pair of samples, after
#' excluding the highest count OTUs and OTUs with the largest log-fold change.
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#'   object
#' @param ref_column column to use as reference
#' @param logratio_trim amount of trim to use on log-ratios
#' @param sum_trim amount of trim to use on the combined absolute levels
#'   ("A" values)
#' @param do_weighting whether to compute the weights or not
#' @param Acutoff cutoff on "A" values to use before trimming
#' @seealso [edgeR::calcNormFactors()]
#' @export
#' @rdname normalize-methods
#' @aliases norm_tmm
norm_tmm <- function(object,
                     ref_column = NULL,
                     logratio_trim = 0.3,
                     sum_trim = 0.05,
                     do_weighting = TRUE,
                     Acutoff = -1e10) {
  otu <- as(otu_table(object), "matrix")
  nf <- edgeR::calcNormFactors(
    otu,
    method = "TMM",
    refcolumn = ref_column,
    logratioTrim = logratio_trim,
    sumTrim = sum_trim,
    doWeighting = do_weighting,
    Acutoff = Acutoff
  )
  object_nf <- set_nf(object, nf)

  object_nf
}

#' CLR (centered log-ratio) normalization
#'
#' In CLR, the log-ratios are computed relative to the geometric mean of all
#' features.
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#'   object
#' @export
#' @rdname normalize-methods
#' @aliases norm_clr
norm_clr <- function(object) {
  otu <- as(otu_table(object), "matrix")
  otu_norm <- apply(otu, 2, trans_clr)

  otu_table(object) <- otu_table(
    otu_norm,
    taxa_are_rows = taxa_are_rows(object)
  )

  # do not save the norm_factor, the norm factors are calculated based on the
  # subsequently differential analysis method, e.g. edgeR, DESeq
  object
}

# from joey711/shiny-phyloseq/blob/master/panels/paneldoc/Transform.md
gm_mean <- function(x, na.rm = TRUE) {
  # The geometric mean, with some error-protection bits.
  exp(sum(log(x[x > 0 & !is.na(x)]), na.rm = na.rm) / length(x))
}

trans_clr <- function(x, base = exp(1)) {
  x <- log((x / gm_mean(x)), base)
  x[!is.finite(x) | is.na(x)] <- 0.0
  return(x)
}

#' Normalize the sum of values of each sample to million (counts per million)
#'
#' `norm_cpm`: This normalization method is from the original LEfSe algorithm,
#' recommended when very low values are present (as shown in the LEfSe galaxy).
#'
#' @param object a [phyloseq::phyloseq-class] or [phyloseq::otu_table-class]
#' @export
#' @rdname normalize-methods
#' @aliases norm_cpm
#' @importFrom phyloseq transform_sample_counts
norm_cpm <- function(object) {
  otu <- as(otu_table(object), "matrix") %>%
    as.data.frame()

  # whether the object is summarized
  hie <- check_tax_summarize(object)
  if (hie) {
    features <- row.names(otu)
    features_split <- strsplit(features, "|", fixed = TRUE)
    single_indx <- which(lengths(features_split) < 2)

    ## keep the counts of a sample identical with `normalization`
    ## if we norm the counts in two steps:
    ## 1. calculate scale size: norm_coef = normalization/lib_size;
    ## 2. multiple the scale size value * norm_coef
    ## the counts of a sample colSums(otu) may not equal to the argument
    ## normalization.
    ## e.g. normalization = 1e6, colSums(otu) = 999999
    ## Finally, the kruskal test may be inaccurate,
    ## e.g. https://github.com/yiluheihei/microbiomeMarker/issues/13
    ps_normed <- transform_sample_counts(
      object,
      function(x) x * 1e+06 / sum(x[single_indx])
    )
  } else {
    ps_normed <- transform_sample_counts(
      object,
      function(x) x * 1e+06 / sum(x)
    )
  }

  otu_normed <- data.frame(otu_table(ps_normed))
  otu_normed <- purrr::map_df(
    otu_normed,
    function(x) {
      if (mean(x) && stats::sd(x) / mean(x) < 1e-10) {
        return(round(x * 1e6) / 1e6)
      } else {
        return(x)
      }
    }
  )

  otu_normed <- as.data.frame(otu_normed)
  row.names(otu_normed) <- row.names(otu)
  colnames(otu_normed) <- colnames(otu)
  otu_table(object) <- otu_table(otu_normed, taxa_are_rows = TRUE)

  # do not save the norm_factor, the norm factors are calculated based on the
  # subsequently differential analysis method, e.g. edgeR, DESeq
  object
}

# set the norm factors of the object, if object is in phyloseq-class, add a
# var `norm_factor` in the sample_data to save the norm factors of each sample;
# if object in otu_table-class, add a attributes `norm_factor` to the object
# to save the norm factors.
#' @importFrom phyloseq sample_data<-
#' @noRd
set_nf <- function(object, nf) {
  # ensure norm factors from sample_data and attributes of otu_table are
  # identical
  names(nf) <- NULL

  if (inherits(object, "phyloseq")) {
    sample_data(object) <- cbind(sample_data(object), norm_factor = nf)
    # to keep accordance with otu_table,
    # we also add the attributes `norm_factor` to otu_table
    ot <- otu_table(object)
    attr(ot, "norm_factor") <- nf
    otu_table(object) <- ot
  } else if (inherits(object, "otu_table")) {
    attr(object, "norm_factor") <- nf
  } else {
    stop("object must be a `phloseq` or `otu_table` object")
  }

  object
}


#' Extract the normalization factors
#'
#' This function will be used to extract the normalization factors. After
#' dividing the observed feature table by normalization factors (eliminate
#' sequencing biases), we will obtain the normalized feature table.
#'
#' @param object a [`phyloseq::phyloseq-class`], [phyloseq::otu_table-class]
#'   object.
#' @return a numeric vector with the length equal to the number of samples, or
#'  `NULL` if the `object` has not been normalized.
#' @noRd
get_norm_factors <- function(object) {
  if (inherits(object, "phyloseq")) {
    nf <- sample_data(object)$norm_factor
  } else {
    nf <- attr(object, "norm_factor")
  }

  nf
}


# Deprecated functions ----------------------------------------------------

# This function is deprecated
#' normalize the summarized feature
#' @param feature otu table or data.frame
#' @param normalization  set the normalization value
#' @noRd
normalize_feature <- function(feature, normalization) {
  if (inherits(feature, "otu_table")) {
    if (!taxa_are_rows(feature)) {
      feature <- t(feature)
    }
    feature <- feature@.Data %>% data.frame()
  }
  if (is.null(normalization)) {
    return(feature)
  }

  feature_split <- strsplit(row.names(feature), "\\|")
  hie <- ifelse(any(lengths(feature_split) > 1), TRUE, FALSE)
  if (hie) {
    single_indx <- which(lengths(feature_split) < 2)
    abd <- purrr::map_dbl(feature, ~ sum(.x[single_indx]))
  } else {
    abd <- purrr::map_dbl(feature, sum)
  }
  normed_coef <- normalization / abd
  normed_feature <- purrr::map2_df(
    feature, normed_coef,
    function(x, y) {
      res <- x * y
      if (mean(res) && stats::sd(res) / mean(res) < 1e-10) {
        res <- round(res * 1e6) / 1e6
      }
      res
    }
  )

  # for row names setting, phyloseq requires otu_table and tax_table has the
  # same taxa
  normed_feature <- as.data.frame(normed_feature)
  row.names(normed_feature) <- row.names(feature)

  otu_table(normed_feature, taxa_are_rows = TRUE)
}


#' @title Data scaling adjusts each variable/feature
#'
#' @description
#' Data Normalization not only normalizes data by samples, but also
#' scales data by variable/feature. Data scaling adjusts each variable/feature
#' by a scaling factor computed based on the dispersion of the variable. The
#' former is to change data distribution per sample and the latter is to put
#' variable/feature into same distribution.
#'
#' @author Created by Hua Zou (12/02/2022 Shenzhen China)
#'
#' @param object (Required). a [`phyloseq::phyloseq-class`] or
#' [`SummarizedExperiment::SummarizedExperiment-class`] object.
#' @param level (Optional). character. Summarization
#' level (from \code{rank_names(pseq)}, default: NULL).
#' @param method (Optional). character. scaling methods. Options are:
#'  * "none", return the original data
#'  * "mean_center": values minus mean statistic.
#'  * "zscore": mean-centered and divided by the
#'  standard deviation of each variable.
#'  * "pareto": mean-centered and divided by the square root of
#'  the standard deviation of each variable.
#'  * "range": mean-centered and divided by the range of each variable.
#'  (default: "none").
#'
#' @usage scale_variables(
#'    object,
#'    level = c(NULL, "Kingdom", "Phylum", "Class",
#'            "Order", "Family", "Genus",
#'            "Species", "Strain", "unique"),
#'    method = c("none", "mean_center", "zscore",
#'             "pareto", "range")
#'    )
#'
#' @export
#'
#' @return A [`phyloseq::phyloseq-class`] or
#' [`SummarizedExperiment::SummarizedExperiment-class`] object with cleaned data.
#'
#' @importFrom dplyr %>%
#'
#' @examples
#'
#' \dontrun{
#' # phyloseq object
#' data("enterotypes_arumugam")
#' scale_variables(
#'   object = enterotypes_arumugam,
#'   level = "Phylum",
#'   method = "mean_center")
#'
#' # SummarizedExperiment object
#' data("Zeybel_2022_protein")
#' scale_variables(
#'   Zeybel_2022_protein,
#'   method = "zscore")
#' }
#'
scale_variables <- function(
    object,
    level = NULL,
    method = c("none", "mean_center", "zscore",
               "pareto", "range")) {

  # object = enterotypes_arumugam
  # level = "Genus"
  # method = "zscore"

  # object = Zeybel_2022_protein
  # level = NULL
  # method = "zscore"

  method <- match.arg(
    method, c("none", "mean_center", "zscore",
                 "pareto", "range")
  )

  if (any(inherits(object, "environment"), inherits(object, "phyloseq"))) {

    # taxa level
    if (!is.null(level)) {
      object <- aggregate_taxa(x = object, level = level)
    } else {
      object <- object
    }

    # profile: Row->features; Column->samples
    otu <- as(otu_table(object), "matrix")
  } else if (inherits(object, "SummarizedExperiment")) {
    # profile: Row->features; Column->samples (features-by-samples matrix)
    otu <- SummarizedExperiment::assay(object) %>%
      as.data.frame()
  } else {
    otu <- as.matrix(object)
  }

  if (method == "none") {
    abd <- otu
  } else if (method == "mean_center") {
    abd <- apply(otu, 1, scale_MeanCenter) %>%
      t()
  } else if (method == "zscore") {
    abd <- apply(otu, 1, scale_Zscore) %>%
      t()
  } else if (method == "pareto") {
    abd <- apply(otu, 1, scale_Pareto) %>%
      t()
  } else if (method == "range") {
    abd <- apply(otu, 1, scale_Range) %>%
      t()
  }

  if (any(inherits(object, "environment"), inherits(object, "phyloseq"))) {
    phyloseq::otu_table(object) <- phyloseq::otu_table(abd, taxa_are_rows = taxa_are_rows(object))
  } else if (inherits(object, "SummarizedExperiment")) {

    if (nrow(abd) != nrow(otu)) {
      rdata <- SummarizedExperiment::rowData(object)
      cdata <- SummarizedExperiment::colData(object)
      if (length(object@metadata) == 0) {
        mdata <- NULL
      } else {
        mdata <- object@metadata
      }
      res <- import_SE(object = abd,
                       rowdata = rdata,
                       coldata = cdata,
                       metadata = mdata)

      # res <- base::subset(res, colnames(prf_tab) %in% colnames(depurdata))
    } else {
      SummarizedExperiment::assay(object) <- abd
    }

  } else {
    object <- abd
  }

  return(object)
}

#' values minus mean statistic
#' @keywords internal
#' @noRd
scale_MeanCenter <- function(x) {
  return(x - mean(x, na.rm = TRUE)) # mean(x, na.rm = TRUE)
}

#' mean-centered and divided by the standard deviation of each variable
#' @keywords internal
#' @noRd
scale_Zscore <- function(x) {
  return((x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE))
}

#' mean-centered and divided by the square root of the standard deviation
#' of each variable.
#' @keywords internal
#' @noRd
scale_Pareto <- function(x) {
  return((x - mean(x, na.rm = TRUE)) / sqrt(sd(x, na.rm = TRUE)))
}

#' mean-centered and divided by the range of each variable.
#' @keywords internal
#' @noRd
scale_Range <- function(x) {
  if (max(x, na.rm = TRUE) == min(x, na.rm = TRUE)) {
    return(x)
  } else {
    return((x - mean(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
  }
}
HuaZou/MicrobiomeAnalysis documentation built on May 13, 2024, 11:10 a.m.