R/addAlpha.R

Defines functions .calculate_alpha .alpha_rarefaction .get_indices

#' Estimate alpha diversity indices
#'
#' These functions estimates alpha diversity indices optionally using
#' rarefaction.
#'
#' @inheritParams calculateDMN
#'
#' @param x a \code{\link[SummarizedExperiment]{SummarizedExperiment}} object.
#'
#' @param assay.type \code{Character scalar}. Specifies the name of assay
#'   used in calculation. (Default: \code{"counts"})
#'
#' @param index \code{Character vector}. Specifies the alpha diversity
#'   indices to be calculated.
#'
#' @param name \code{Character vector}. A name for the column of the
#'   \code{colData} where results will be stored. (Default: \code{index})
#'
#' @param niter \code{Integer scalar}. Specifies the number of
#'   rarefaction rounds. Rarefaction is not applied when \code{niter=NULL}
#'   (see Details section). (Default: \code{NULL})
#'
#' @param ... optional arguments:
#' \itemize{
#'   \item \code{sample}: \code{Integer scalar}. Specifies the rarefaction
#'   depth i.e. the number of counts drawn from each sample.
#'   (Default: \code{min(colSums2(assay(x, assay.type)))})
#'
#'   \item \code{tree.name}: \code{Character scalar}. Specifies which rowTree
#'    will be used. ( Faith's index). (Default: \code{"phylo"})
#'
#'   \item \code{node.label}: \code{Character vector} or \code{NULL} Specifies
#'   the links between rows and node labels of phylogeny tree specified
#'   by \code{tree.name}. If a certain row is not linked with the tree, missing
#'   instance should be noted as NA. When \code{NULL}, all the rownames should
#'   be found from the tree. (Faith's index). (Default: \code{NULL})
#'
#'   \item \code{only.tips}: (Faith's index). \code{Logical scalar}. Specifies
#'   whether to remove internal nodes when Faith's index is calculated.
#'   When \code{only.tips=TRUE}, those rows that are not tips of tree are
#'   removed. (Default: \code{FALSE})
#'
#'   \item \code{threshold}: (Coverage and all evenness indices).
#'   \code{Numeric scalar}.
#'   From \code{0 to 1}, determines the threshold for coverage and evenness
#'   indices. When evenness indices are calculated values under or equal to
#'   this threshold are denoted as zeroes. For coverage index, see details.
#'   (Default: \code{0.5} for coverage, \code{0} for evenness indices)
#'
#'   \item \code{quantile}: (log modulo skewness index). \code{Numeric scalar}.
#'   Arithmetic abundance classes are evenly cut up to to this quantile of the
#'   data. The assumption is that abundances higher than this are not common,
#'   and they are classified in their own group. (Default: \code{0.5})
#'
#'   \item \code{nclasses}: (log modulo skewness index). \code{Integer scalar}.
#'   The number of arithmetic abundance classes from zero to the quantile cutoff
#'   indicated by \code{quantile}. (Default: \code{50})
#'
#'   \item \code{ntaxa}: (absolute and relative indices). \code{Integer scalar}.
#'   The n-th position of the dominant taxa to consider. (Default: \code{1})
#'
#'   \item \code{aggregate}: (absolute, dbp, dmn, and relative indices).
#'   \code{Logical scalar}. Aggregate the values for top members selected by
#'   \code{ntaxa} or not. If \code{TRUE}, then the sum of relative abundances
#'   is returned. Otherwise the relative abundance is returned for the single
#'   taxa with the indicated rank (default: \code{aggregate = TRUE}).
#'
#'   \item \code{detection}: (observed index). \code{Numeric scalar} Selects
#'   detection threshold for the abundances (Default: \code{0})
#' }
#'
#' @return
#' \code{getAlpha} returns a \code{DataFrame}.
#' \code{addAlpha} returns a \code{x} with additional \code{colData} column(s)
#' named \code{name}.
#'
#' @details
#'
#' ## Diversity
#'
#' Alpha diversity is a joint quantity that combines elements or community
#' richness and evenness. Diversity increases, in general, when species richness
#' or evenness increase.
#'
#' The following diversity indices are available:
#'
#' \itemize{
#'
#' \item 'coverage': Number of species needed to cover a given fraction of
#' the ecosystem (50 percent by default). Tune this with the threshold
#' argument.
#'
#' \item 'faith': Faith's phylogenetic alpha diversity index measures how
#' long the taxonomic distance is between taxa that are present in the sample.
#' Larger values represent higher diversity. Using this index requires
#' rowTree. (Faith 1992)
#'
#' If the data includes features that are not in tree's tips but in
#' internal nodes, there are two options. First, you can keep those features,
#' and prune the tree to match features so that each tip can be found from
#' the features. Other option is to remove all features that are not tips.
#' (See \code{only.tips} parameter)
#'
#' \item 'fisher': Fisher's alpha; as implemented in
#' \code{\link[vegan:diversity]{vegan::fisher.alpha}}. (Fisher et al. 1943)
#'
#' \item 'gini_simpson': Gini-Simpson diversity i.e. \eqn{1 - lambda},
#' where \eqn{lambda} is the
#' Simpson index, calculated as the sum of squared relative abundances.
#' This corresponds to the diversity index
#' 'simpson' in \code{\link[vegan:diversity]{vegan::diversity}}.
#' This is also called Gibbs–Martin, or Blau index in sociology,
#' psychology and management studies. The Gini-Simpson index (1-lambda)
#' should not be
#' confused with Simpson's dominance (lambda), Gini index, or
#' inverse Simpson index (1/lambda).
#'
#' \item 'inverse_simpson': Inverse Simpson diversity:
#' \eqn{1/lambda} where \eqn{lambda=sum(p^2)} and p refers to relative
#' abundances.
#' This corresponds to the diversity index
#' 'invsimpson' in vegan::diversity. Don't confuse this with the
#' closely related Gini-Simpson index
#'
#' \item 'log_modulo_skewness': The rarity index characterizes the
#' concentration of species at low abundance. Here, we use the skewness of
#' the frequency
#' distribution of arithmetic abundance classes (see Magurran & McGill 2011).
#' These are typically right-skewed; to avoid taking log of occasional
#' negative skews, we follow Locey & Lennon (2016) and use the log-modulo
#' transformation that adds a value of one to each measure of skewness to
#' allow logarithmization.
#'
#' \item 'shannon': Shannon diversity (entropy).
#'
#' }
#'
#' ## Dominance
#'
#' A dominance index quantifies the dominance of one or few species in a
#' community. Greater values indicate higher dominance.
#'
#' Dominance indices are in general negatively correlated with alpha diversity
#' indices (species richness, evenness, diversity, rarity). More dominant
#' communities are less diverse.
#'
#' The following community dominance indices are available:
#'
#' \itemize{
#'
#' \item 'absolute': Absolute index equals to the absolute abundance of the
#' most dominant n species of the sample (specify the number with the argument
#' \code{ntaxa}). Index gives positive integer values.
#'
#' \item 'dbp': Berger-Parker index (See Berger & Parker 1970) calculation
#' is a special case of the 'relative' index. dbp is the relative abundance of
#' the most
#' abundant species of the sample. Index gives values in interval 0 to 1,
#' where bigger value represent greater dominance.
#'
#' \deqn{dbp = \frac{N_1}{N_{tot}}}{%
#' dbp = N_1/N_tot} where \eqn{N_1} is the absolute abundance of the most
#' dominant species and \eqn{N_{tot}} is the sum of absolute abundances of all
#' species.
#'
#' \item 'core_abundance': Core abundance index is related to core species.
#' Core species are species that are most abundant in all samples, i.e., in
#' whole data set. Core species are defined as those species that have
#' prevalence over 50\%. It means that in order to belong to core species,
#' species must be prevalent in 50\% of samples. Core species are used to
#' calculate the core abundance index. Core abundance index is sum of relative
#' abundances of core species in the sample. Index gives values in interval
#' 0 to 1, where bigger value represent greater dominance.
#'
#' \deqn{core_abundance = \frac{N_{core}}{N_{tot}}}{%
#' core_abundance = N_core/N_tot} where \eqn{N_{core}} is the sum of absolute
#' abundance of the core species and \eqn{N_{tot}} is the sum of absolute
#' abundances of all species.
#'
#' \item 'gini':  Gini index is probably best-known from socio-economic
#' contexts (Gini 1921). In economics, it is used to measure, for example, how
#' unevenly income is distributed among population. Here, Gini index is used
#' similarly, but income is replaced with abundance.
#'
#' If there is small group of species
#' that represent large portion of total abundance of microbes, the inequality
#' is large and Gini index closer to 1. If all species has equally large
#' abundances, the equality is perfect and Gini index equals 0. This index
#' should not be confused with Gini-Simpson index, which quantifies diversity.
#'
#' \item 'dmn': McNaughton’s index is the sum of relative abundances of the two
#' most abundant species of the sample (McNaughton & Wolf, 1970). Index gives
#' values in the unit interval:
#'
#' \deqn{dmn = (N_1 + N_2)/N_tot}
#'
#' where \eqn{N_1} and \eqn{N_2} are the absolute
#' abundances of the two most dominant species and \eqn{N_{tot}} is the sum of
#' absolute abundances of all species.
#'
#' \item 'relative': Relative index equals to the relative abundance of the
#' most dominant n species of the sample (specify the number with the
#' argument \code{ntaxa}).
#' This index gives values in interval 0 to 1.
#'
#' \deqn{relative = N_1/N_tot}
#'
#' where \eqn{N_1} is the absolute abundance of the most
#' dominant species and \eqn{N_{tot}} is the sum of absolute abundances of all
#' species.
#'
#' \item 'simpson_lambda': Simpson's (dominance) index or Simpson's lambda is
#' the sum of squared relative abundances. This index gives values in the unit
#' interval. This value equals the probability that two randomly chosen
#' individuals belongs to the
#' same species. The higher the probability, the greater the dominance (See
#' e.g. Simpson 1949).
#'
#' \deqn{lambda = \sum(p^2)}
#'
#' where p refers to relative abundances.
#'
#' There is also a more advanced Simpson dominance index (Simpson 1949).
#' However, this is not provided and the simpler squared sum of relative
#' abundances is used instead as the alternative index is not in the unit
#' interval and it is highly
#' correlated with the simpler variant implemented here.
#'
#' }
#'
#' ## Evenness
#'
#' Evenness is a standard index in community ecology, and it quantifies how
#' evenly the abundances of different species are distributed. The following
#' evenness indices are provided:
#'
#' By default, this function returns all indices.
#'
#' The available evenness indices include the following (all in lowercase):
#' \itemize{
#'   \item 'camargo': Camargo's evenness (Camargo 1992)
#'   \item 'simpson_evenness': Simpson’s evenness is calculated as inverse
#'   Simpson diversity (1/lambda) divided by
#'   observed species richness S: (1/lambda)/S.
#'   \item 'pielou': Pielou's evenness (Pielou, 1966), also known as Shannon or
#'   Shannon-Weaver/Wiener/Weiner
#'   evenness; H/ln(S). The Shannon-Weaver is the preferred term; see
#'   Spellerberg and Fedor (2003).
#'   \item 'evar': Smith and Wilson’s Evar index (Smith & Wilson 1996).
#'   \item 'bulla': Bulla’s index (O) (Bulla 1994).
#' }
#'
#' Desirable statistical evenness metrics avoid strong bias towards very
#' large or very small abundances; are independent of richness; and range
#' within the unit interval with increasing evenness (Smith & Wilson 1996).
#' Evenness metrics that fulfill these criteria include at least camargo,
#' simpson, smith-wilson, and bulla. Also see Magurran & McGill (2011)
#' and Beisel et al. (2003) for further details.
#'
#' ## Richness
#'
#' The richness is calculated per sample. This is a standard index in community
#' ecology, and it provides an estimate of the number of unique species in the
#' community. This is often not directly observed for the whole community but
#' only for a limited sample from the community. This has led to alternative
#' richness indices that provide different ways to estimate the species
#' richness.
#'
#' Richness index differs from the concept of species diversity or evenness in
#' that it ignores species abundance, and focuses on the binary presence/absence
#' values that indicate simply whether the species was detected.
#'
#' The function takes all index names in full lowercase. The user can provide
#' the desired spelling through the argument \code{\link{name}} (see examples).
#'
#' The following richness indices are provided.
#'
#' \itemize{
#'
#'   \item 'ace': Abundance-based coverage estimator (ACE) is another
#'   nonparametric richness
#'   index that uses sample coverage, defined based on the sum of the
#'   probabilities
#'   of the observed species. This method divides the species into abundant
#'   (more than 10
#'   reads or observations) and rare groups
#'   in a sample and tends to underestimate the real number of species. The
#'   ACE index
#'   ignores the abundance information for the abundant species,
#'   based on the assumption that the abundant species are observed regardless
#'   of their
#'   exact abundance. We use here the bias-corrected version
#'   (O'Hara 2005, Chiu et al. 2014) implemented in
#'   \code{\link[vegan:specpool]{estimateR}}.
#'   For an exact formulation, see \code{\link[vegan:specpool]{estimateR}}.
#'   Note that this index comes with an additional column with standard
#'   error information.
#'
#'   \item 'chao1': This is a nonparametric estimator of species richness. It
#'   assumes that rare species carry information about the (unknown) number
#'   of unobserved species. We use here the bias-corrected version
#'   (O'Hara 2005, Chiu et al. 2014) implemented in
#'   \code{\link[vegan:specpool]{estimateR}}. This index implicitly
#'   assumes that every taxa has equal probability of being observed. Note
#'   that it gives a lower bound to species richness. The bias-corrected
#'   for an exact formulation, see \code{\link[vegan:specpool]{estimateR}}.
#'   This estimator uses only the singleton and doubleton counts, and
#'   hence it gives more weight to the low abundance species.
#'   Note that this index comes with an additional column with standard
#'   error information.
#'
#'   \item 'hill': Effective species richness aka Hill index
#'   (see e.g. Chao et al. 2016).
#'   Currently only the case 1D is implemented. This corresponds to the exponent
#'   of Shannon diversity. Intuitively, the effective richness indicates the
#'   number of
#'   species whose even distribution would lead to the same diversity than the
#'   observed
#'   community, where the species abundances are unevenly distributed.
#'
#'   \item 'observed': The _observed richness_ gives the number of species that
#'   is detected above a given \code{detection} threshold in the observed sample
#'   (default 0). This is conceptually the simplest richness index. The
#'   corresponding index in the \pkg{vegan} package is "richness".
#'
#' }
#'
#' @references
#'
#' Beisel J-N. et al. (2003)
#' A Comparative Analysis of Diversity Index Sensitivity.
#' _Internal Rev. Hydrobiol._ 88(1):3-15.
#' \url{https://portais.ufg.br/up/202/o/2003-comparative_evennes_index.pdf}
#'
#' Berger WH & Parker FL (1970)
#' Diversity of Planktonic Foraminifera in Deep-Sea Sediments.
#' _Science_ 168(3937):1345-1347. doi: 10.1126/science.168.3937.1345
#'
#' Bulla L. (1994)
#' An  index of diversity and its associated diversity measure.
#' _Oikos_ 70:167--171
#'
#' Camargo, JA. (1992)
#' New diversity index for assessing structural alterations in aquatic
#' communities.
#' _Bull. Environ. Contam. Toxicol._ 48:428--434.
#'
#' Chao A. (1984)
#' Non-parametric estimation of the number of classes in a population.
#' _Scand J Stat._ 11:265–270.
#'
#' Chao A, Chun-Huo C, Jost L (2016).
#' Phylogenetic Diversity Measures and Their Decomposition:
#' A Framework Based on Hill Numbers. Biodiversity Conservation and
#' Phylogenetic Systematics,
#' Springer International Publishing, pp. 141–172,
#' doi:10.1007/978-3-319-22461-9_8.
#'
#' Chiu, C.H., Wang, Y.T., Walther, B.A. & Chao, A. (2014).
#' Improved nonparametric lower bound of species richness via a modified
#' Good-Turing frequency formula.
#' _Biometrics_ 70, 671-682.
#'
#' Faith D.P. (1992)
#' Conservation evaluation and phylogenetic diversity.
#' _Biological Conservation_ 61(1):1-10.
#'
#' Fisher R.A., Corbet, A.S. & Williams, C.B. (1943)
#' The relation between the number of species and the number of individuals in
#' a random sample of animal population.
#' _Journal of Animal Ecology_ *12*, 42-58.
#'
#' Gini C (1921)
#' Measurement of Inequality of Incomes.
#' _The Economic Journal_ 31(121): 124-126. doi: 10.2307/2223319
#'
#' Locey KJ and Lennon JT. (2016)
#' Scaling laws predict global microbial diversity.
#' _PNAS_ 113(21):5970-5975; doi:10.1073/pnas.1521291113.
#'
#' Magurran AE, McGill BJ, eds (2011)
#' Biological Diversity: Frontiers in Measurement and Assessment
#' (Oxford Univ Press, Oxford), Vol 12.
#'
#' McNaughton, SJ and Wolf LL. (1970).
#' Dominance and the niche in ecological systems.
#' _Science_ 167:13, 1--139
#'
#' O'Hara, R.B. (2005).
#' Species richness estimators: how many species can dance on the head of a pin?
#' _J. Anim. Ecol._ 74, 375-386.
#'
#' Pielou, EC. (1966)
#' The measurement of diversity in different types of
#' biological collections. _J Theoretical Biology_ 13:131--144.
#'
#' Simpson EH (1949)
#' Measurement of Diversity.
#' _Nature_ 163(688). doi: 10.1038/163688a0
#'
#' Smith B and Wilson JB. (1996)
#' A Consumer's Guide to Evenness Indices.
#' _Oikos_ 76(1):70-82.
#'
#' Spellerberg and Fedor (2003).
#' A tribute to Claude Shannon (1916 –2001) and a plea for more rigorous use of
#' species richness, species diversity and the ‘Shannon–Wiener’ Index.
#' _Alpha Ecology & Biogeography_ 12, 177–197.
#'
#' @seealso
#' \itemize{
#'   \item \code{\link[scater:plotColData]{plotColData}}
#'   \item \code{\link[vegan:specpool]{estimateR}}
#'   \item \code{\link[vegan:diversity]{diversity}}
#' }
#'
#' @examples
#'
#' data("GlobalPatterns")
#' tse <- GlobalPatterns
#'
#' # Calculate the default Shannon index with no rarefaction
#' tse <- addAlpha(tse, index = "shannon")
#'
#' # Shows the estimated Shannon index
#' tse$shannon
#'
#' # Calculate observed richness with 10 rarefaction rounds
#' tse <- addAlpha(tse,
#'    assay.type = "counts",
#'    index = "observed_richness",
#'    sample = min(colSums(assay(tse, "counts")), na.rm = TRUE),
#'    niter=10)
#'
#' # Shows the estimated observed richness
#' tse$observed_richness
#'
#' # One can also calculate the indices and get the results without adding
#' # them to colData
#' res <- getAlpha(tse, index = "shannon")
#' res |> head()
#'
#' @name addAlpha
#' @export
NULL

#' @rdname addAlpha
#' @export
setMethod("addAlpha", signature = c(x = "SummarizedExperiment"),
    function(x, ...){
        # Hiddenly support altExp
        x <- .check_and_get_altExp(x, ...)
        # Calculate indices
        args <- c(list(x = x), list(...))
        args <- args[ !names(args) %in% c("altexp") ]
        res <- do.call(getAlpha, args)
        # Add them to colData
        name <- colnames(res)
        res <- as.list(res)
        res <- unname(res)
        x <- .add_values_to_colData(x, res, name)
        return(x)
    }
)

#' @rdname addAlpha
#' @importFrom BiocParallel bplapply
#' @export
setMethod("getAlpha", signature = c(x = "SummarizedExperiment"),
    function(
        x, assay.type = "counts",
        index = c(
            "coverage_diversity", "fisher_diversity", "faith_diversity",
            "gini_simpson_diversity", "inverse_simpson_diversity",
            "log_modulo_skewness_diversity", "shannon_diversity",
            "absolute_dominance", "dbp_dominance",
            "core_abundance_dominance", "gini_dominance",
            "dmn_dominance", "relative_dominance",
            "simpson_lambda_dominance", "camargo_evenness",
            "pielou_evenness", "simpson_evenness",
            "evar_evenness", "bulla_evenness", "ace_richness",
            "chao1_richness", "hill_richness", "observed_richness"),
        name = index, niter = NULL, BPPARAM = SerialParam(), ...){
        ############################## Input check #############################
        # Support altExp hiddenly
        x <- .check_and_get_altExp(x, ...)
        # Check assay.type
        .check_assay_present(assay.type, x)
        # Check that index is a character vector
        if( !.is_non_empty_character(index) ){
            stop("'index' should be a character vector.", call. = FALSE)
        }
        # if multiple indices to be estimated, name should a vector of
        # same length
        if( !.is_non_empty_character(name) || length(name) != length(index) ){
            stop(
                "'name' must be a non-empty character value and have the ",
                "same length than 'index'.",
                call. = FALSE)
        }
        # Check niter
        if( !(is.null(niter) || (.is_an_integer(niter) && niter >= 0)) ){
            stop("'niter' must be NULL or an integer.", call. = FALSE)
        }
        # Check if index exists. For each index input, detect it and get
        # information (e.g. internal function) to calculate the index.
        index <- .get_indices(index, name, x, assay.type, ...)
        ############################ Input check end ###########################
        # Looping over the data.frame of indices to be estimated
        res <- bplapply(index, function(ind){
            # Performing rarefaction if sample is specified
            if( !is.null(niter) && niter > 0 ){
                temp <- .alpha_rarefaction(
                    x, assay.type, ind[[1]], ind[[6]], niter, BPPARAM, ...)
            } else {
                # Estimate index without rarefaction
                temp <- .calculate_alpha(x, assay.type, ind[[1]], ind[[6]], ...)
            }
            return(temp)

        }, BPPARAM = BPPARAM)
        # Combine results into single DF
        res <- do.call(cbind, res)
        res <- DataFrame(res)
        return(res)
    }
)

################################ HELP FUNCTIONS ################################

# Search alpha diversity index that user wants to calculate.
.get_indices <- function(index, name, x, assay.type, tree = NULL, ...){
    # Initialize list for supported indices
    supported <- list()
    # Supported diversity indices
    temp <- c(
        "coverage", "faith", "fisher", "gini_simpson", "inverse_simpson",
        "log_modulo_skewness", "shannon")
    temp <- data.frame(index = temp)
    temp[["measure"]] <- "diversity"
    temp[["index_long"]] <- paste0(temp[["index"]], "_", temp[["measure"]])
    temp[["non_neg"]] <- c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE)
    supported[["diversity"]] <- temp
    # Supported dominance indices
    temp <- c(
        "absolute", "dbp", "core_abundance", "gini", "dmn", "relative",
        "simpson_lambda")
    temp <- data.frame(index = temp)
    temp[["measure"]] <- "dominance"
    temp[["index_long"]] <- paste0(temp[["index"]], "_", temp[["measure"]])
    temp[["non_neg"]] <- c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE)
    supported[["dominance"]] <- temp
    # Supported evenness indices
    temp <- c(
        "camargo", "pielou", "simpson", "evar", "bulla")
    temp <- data.frame(index = temp)
    temp[["measure"]] <- "evenness"
    temp[["index_long"]] <- paste0(temp[["index"]], "_", temp[["measure"]])
    temp[["non_neg"]] <- c(FALSE, FALSE, FALSE, FALSE, FALSE)
    supported[["eveness"]] <- temp
    # Supported richness indices
    temp <- c(
        "ace", "chao1", "hill", "observed")
    temp <- data.frame(index = temp)
    temp[["measure"]] <- "richness"
    temp[["index_long"]] <- paste0(temp[["index"]], "_", temp[["measure"]])
    temp[["non_neg"]] <- c(FALSE, FALSE, FALSE, FALSE)
    supported[["richness"]] <- temp
    # Combine
    supported <- do.call(rbind, supported)
    # Find the index that user wants to calculate
    ind <- match(tolower(index), supported[["index_long"]])
    ind_short <- match(tolower(index), supported[["index"]])
    ind[ is.na(ind) ] <- ind_short[ is.na(ind) ]
    detected <- supported[ind, ]
    # Add the index that was searched
    detected[["search"]] <- index
    # Add names that user wants to use when storing results to colData
    detected[["name"]] <- name
    # Check if there are indices that were not detected
    if( any(is.na(detected[["index"]])) ){
        not_detected <- paste0(
            detected[is.na(detected[["index"]]), "search"], collapse = "', '")
        not_detected <- paste0("'", not_detected, "'")
        stop(
            "'index' is corresponding to none of the alpha diversity ",
            "indices. The following 'index' was not detected: ", not_detected,
            call. = FALSE)
    }
    # Faith index is available only for TreeSE with rowTree
    tse_with_tree <- (is(x, "TreeSummarizedExperiment") &&
        !is.null(rowTree(x))) || !is.null(tree)
    if( "faith" %in% detected[["index"]] && !tse_with_tree ){
        # Drop faith index from indices being calculated
        detected <- detected[!detected[["index"]] %in% c("faith"), ]
        # If there are still other indices being calculated, give warning.
        # Otherwise, give error if faith was the only index that user wants to
        # calculate.
        FUN <- if( nrow(detected) == 0 ) stop else warning
        FUN("'faith' index can be calculated only for TreeSE with rowTree(x) ",
            "populated or with 'tree' provided separately.", call. = FALSE)
    }
    # Check for unsupported values (negative values)
    if( any(assay(x, assay.type) < 0 & !is.na(assay(x, assay.type))) ){
        ind <- detected[["non_neg"]]
        index_rm <- detected[!ind, "index"]
        detected <- detected[ind, ]
        if( length(index_rm) > 0 ){
            FUN <- if (nrow(detected) == 0) stop else warning
            FUN("The following indices cannot be calculated due to unsupported
                values (negative values): ", paste(index_rm, collapse = ", "),
                call. = FALSE)
        }
    }
    # The result is a data.frame where each column represent single index to be
    # estimated. Rows include the name of index and other information.
    detected <- as.data.frame( t(detected) )
    return(detected)
}

# This function rarifies the data niter of times and calculates index for the
# rarified data. The result is a mean of the iterations.
#' @importFrom DelayedMatrixStats rowMeans2
#' @importFrom BiocParallel bplapply
.alpha_rarefaction <- function(x, assay.type, index, name, niter, BPPARAM, ...){
    # Subsample the data and then calculate index based on the rarified data
    temp_assay <- "temporary_assay_name"
    res <- bplapply(seq(niter), function(i){
        args <- c(list(x = x, assay.type = assay.type), list(...))
        args[["name"]] <- temp_assay
        args[["verbose"]] <- FALSE
        x_sub <- do.call(rarefyAssay, args)
        temp <- .calculate_alpha(x_sub, temp_assay, index, name, ...)
        return(temp)
    }, BPPARAM = BPPARAM)
    # Combine list of matrices from multiple iterations
    res <- do.call(cbind, res)
    cnames <- unique(colnames(res))
    # There might be multiple indices; for instance chao1 has 2 values. For
    # every index calculate mean for each sample.
    res <- lapply(cnames, function(cname){
        rowMeans2(res[, cname, drop = FALSE])
    })
    names(res) <- cnames
    res <- do.call(cbind, res)
    # Give warning about missing samples. Same might have been dropped during
    # rarefaction leading to missing values for dropped samples.
    if( !all(colnames(x) %in% rownames(res)) ){
        warning(
            "Some samples were dropped during rarefaction leading to missing ",
            "diversity values. Consider lower 'sample'.", call. = FALSE)
    }
    # It might be that certain samples were dropped off if they have lower
    # abundance than rarefaction  depth --> order so that data includes all the
    # samples
    res <- res[match(colnames(x), rownames(res)), , drop = FALSE]
    return(res)
}

# This function is helper function that calls the specific functions that
# calculate indices.
.calculate_alpha <- function(x, assay.type, index, name, ...){
    # Get abundance matrix
    mat <- assay(x, assay.type)
    # Get correct function based on index
    FUN <- switch(index,
        shannon = .calc_shannon,
        gini_simpson = .calc_gini_simpson,
        inverse_simpson = .calc_inverse_simpson,
        coverage = .calc_coverage,
        fisher = .calc_fisher,
        faith = .estimate_faith,
        log_modulo_skewness = .calc_log_modulo_skewness,
        #
        simpson_lambda = .simpson_lambda,
        core_abundance = .calc_core_dominance,
        gini = .calc_gini_dominance,
        absolute = .calc_dominance,
        relative = .calc_dominance,
        dbp = .calc_dominance,
        dmn = .calc_dominance,
        #
        camargo = .calc_camargo_evenness,
        pielou = .calc_pielou_evenness,
        simpson = .calc_simpson_evenness,
        evar = .calc_evar_evenness,
        bulla = .calc_bulla_evenness,
        #
        observed = .calc_observed,
        chao1 = .calc_chao1,
        ace = .calc_ace,
        hill = .calc_hill
    )
    # Calculate values, create matrix where column represent index and rows
    # samples, and add name for the index
    res <- FUN(x = x, mat = mat, index = index, ...)
    res <- as.matrix(res)
    colnames(res) <- paste0(name, colnames(res))
    return(res)
}
microbiome/mia documentation built on Jan. 30, 2025, 12:32 a.m.