R/process_aggregate_taxa.R

Defines functions aggregate_taxa

Documented in aggregate_taxa

## Note: This function is copied from package microbiome

#' @title Aggregate Taxa
#'
#' @description Summarize phyloseq data into a higher phylogenetic level.
#'
#' @details This provides a convenient way to aggregate phyloseq OTUs
#' (or other taxa) when the phylogenetic tree is missing. Calculates the
#' sum of OTU abundances over all OTUs that map to the same higher-level
#' group. Removes ambiguous levels from the taxonomy table. Returns a
#' phyloseq object with the summarized abundances.
#'
#' @param x \code{\link{phyloseq-class}} object
#' @param level Summarization level (from \code{rank_names(pseq)})
#' @param verbose verbose
#'
#' @return Summarized phyloseq object
#'
#' @examples
#' data(caporaso)
#' caporaso_phylum <- aggregate_taxa(caporaso, "Phylum")
#'
#' @importFrom phyloseq taxa_are_rows<- merge_phyloseq
#'
#' @export
#'
#' @references See citation('microbiome')
#'
#' @author Contact: Leo Lahti \email{microbiome-admin@@googlegroups.com}
#'
#' @keywords utilities
#'
aggregate_taxa <- function(x, level, verbose = FALSE) {

    if (!level %in% rank_names(x)) {
        stop(
            "The level argument should be one of the options
            given by rank_names(x): ",
            paste(rank_names(x), collapse = " / ")
        )
    }

    # Check if the object is already at the given level
    inds <- all(tax_table(x)[, level] == tax_table(x)[, ncol(tax_table(x))])
    inds <- which(inds)
    check1 <- length(inds) > 0
    check2 <- !any(duplicated(tax_table(x)[, level]))
    if (check1 && check2) {
        return(x)
    }

    # Sanity checks for a phyloseq object. Required with some methods.
    if (!taxa_are_rows(x)) {
        x@otu_table <- otu_table(t(otu_table(x)), taxa_are_rows = TRUE)
        taxa_are_rows(x) <- TRUE
    }

    fill_na_taxa <- "Unknown"

    if (verbose) {
        print("Remove taxonomic information below the target level")
    }
    M <- as.matrix(tax_table(x))
    inds2 <- match(level, colnames(M))
    M <- M[, seq_len(inds2)]
    M[is.na(M)] <- fill_na_taxa
    # Ensure that the filled entries are unique
    inds <- which(M[, level] == fill_na_taxa)
    M[inds, seq_len(inds2)] <- fill_na_taxa

    unique <- apply(M, 1, function(x) {
        paste(x, collapse = "_")
    })
    M <- cbind(M, unique = unique)
    x@tax_table <- tax_table(M)

    if (!nrow(tax_table(x)) == nrow(otu_table(x))) {
        stop("Taxonomic table and OTU table dimensions do not match.")
    }

    if (verbose) {
        print("Mark the potentially ambiguous taxa")
    }
    # Some genera for instance belong to multiple Phyla and perhaps these
    # are different
    # genera. For instance there is genus Clostridium in Tenericutes
    # and Firmicutes.
    # (GlobalPatterns data set) and even more families.

    tt <- tax_table(x)
    if (verbose) {
        print("-- split")
    }
    otus <- split(rownames(tt), as.character(tt[, "unique"]))

    ab <- matrix(NA, nrow = length(otus), ncol = nsamples(x))
    rownames(ab) <- names(otus)
    colnames(ab) <- sample_names(x)

    if (verbose) {
        print("-- sum")
    }
    d <- abundances(x)

    ab <- t(vapply(otus, function(taxa) {
        as.numeric(colSums(matrix(d[taxa, ], ncol = nsamples(x)), na.rm = TRUE))
    }, FUN.VALUE = unname(as.numeric(d[1, ]))))

    colnames(ab) <- colnames(d)
    rownames(ab) <- names(otus)

    if (verbose) {
        print("Create phyloseq object")
    }
    OTU <- otu_table(ab, taxa_are_rows = TRUE)
    x2 <- phyloseq(OTU)

    if (verbose) {
        print("Remove ambiguous levels")
    }
    ## First remove NA entries from the target level
    inds3 <- match(level, colnames(tt@.Data))
    inds4 <- match("unique", colnames(tt@.Data))
    taxtab <- tt@.Data[
        which(!is.na(tt@.Data[, level])),
        c(seq_len(inds3), inds4)
    ]

    if (verbose) {
        print("-- unique")
    }
    tax <- unique(taxtab)
    if (verbose) {
        print("-- Rename the lowest level")
    }
    tax <- as.data.frame(tax)
    if (verbose) {
        print("-- rownames")
    }
    rownames(tax) <- tax[, "unique"]

    if (verbose) {
        print("-- taxa")
    }
    tax <- as.matrix(tax)

    if (verbose) {
        print("Convert to taxonomy table")
    }
    TAX <- tax_table(tax)

    if (verbose) {
        print("Combine OTU and Taxon matrix into Phyloseq object")
    }
    x2 <- merge_phyloseq(x2, TAX)

    # Then keep short names for those taxa where short names are unique
    tt <- tax_table(x2)

    uni <- names(which(table(as.vector(tt[, level])) == 1))
    inds <- which(tt[, level] %in% uni)
    taxa <- tt[inds, level]
    tt[inds, "unique"] <- taxa
    rownames(tt)[inds] <- taxa
    ab <- abundances(x2)
    rnams <- rownames(ab)
    rnams[inds] <- taxa
    rownames(ab) <- rnams

    x2 <- phyloseq(
        otu_table(ab, taxa_are_rows = TRUE),
        tax_table(tt)
    )

    if (verbose) {
        print("Add the metadata as is")
    }
    if (!is.null(x@sam_data)) {
        x2 <- phyloseq(
            otu_table(ab, taxa_are_rows = TRUE),
            tax_table(tt),
            sample_data(x)
        )
    }

    return(x2)
}
HuaZou/MicrobiomeAnalysis documentation built on May 13, 2024, 11:10 a.m.