#' Test for differential abundance: method 'treeAGG-DA-edgeR'
#'
#' Test differential abundance of entities using functions from the
#' \code{\link{edgeR}} (Robinson et al. 2010, \emph{Bioinformatics}; McCarthy et
#' al. 2012, \emph{Nucleic Acids Research}) to fit models and calculate
#' moderated test for each entity. We have used
#' \code{\link[edgeR]{estimateDisp}} to estimate the dispersion. The
#' statistical methods implemented in the \code{edgeR} package were originally
#' designed for the analysis of gene expression data such as RNA-sequencing
#' counts. Here, we apply these methods to counts that might be from microbes or
#' cells.
#'
#' The experimental design must be specified using a design matrix. The
#' customized design matrix could be given by \code{design}.
#'
#' Normalization for samples is automatically performed by \code{edgeR} package.
#' More details about the calculation of normalization factor could be found
#' from \code{\link[edgeR]{calcNormFactors}}. A sample might include entities
#' corresponding to leaf nodes and internal nodes of tree. Only entities
#' corresponding to leaf nodes are used to calculate the library size of each
#' sample. The reason is that the abundance of an entity, corresponding to an
#' internal node, is calculated by taking sum of the abundance from its
#' descendant leaf nodes.
#'
#' @param obj A treeSummarizedExperiment object.
#' @param design A numeric matrix. It must be of full column rank. Defaults to
#' use all columns of \code{colData} to create design matrix. Note: Users should
#' check whether the default created design matrix is exactly what they want or
#' create their own design matrix using \code{\link[stats]{model.matrix}}.
#' @param contrast numeric vector specifying one contrast of
#' the linear model coefficients to be tested equal to zero. Its length
#' must equal to the number of columns of design. If NULL, the last
#' coefficient will be tested equal to zero.
#' @param normalize A logical value, TRUE or FALSE. The default is TRUE.
#' @param method Normalization method to be used. See
#' \code{\link[edgeR]{calcNormFactors}} for more details.
#' @param prior.count average prior count to be added to observation to shrink
#' the estimated log-fold-changes towards zero. See \code{prior.count} in
#' \code{\link[edgeR]{glmFit}}
#' @param use.assays A numeric vector. It specifies which matrix-like elements
#' in assays will be used to do analysis.
#' @param adjust.method A character string stating the method used to adjust
#' p-values for multiple testing, passed on to \code{\link[stats]{p.adjust}}.
#' It could be "bonferroni", "holm", "hochberg", "hommel", "BH", or "BY".
#'
#'
#' @importFrom S4Vectors DataFrame
#' @importFrom edgeR DGEList calcNormFactors estimateDisp glmFit glmLRT
#' topTags
#' @export
#' @return A treeSummarizedExperiment
#' \item{assays}{A list of tables}
#' \item{rowData}{It stores the information of rows in \code{assays}, and the
#' tables extracted from a \code{DGELRT} object that is generated by
#' \code{\link[edgeR:glmFit]{glmLRT}}. The later is stored as the internal part
#' of the \code{rowData}. More details or example could be found in the vignette
#' \code{Example of data analysis}}
#' \item{colData}{NULL}
#' \item{metadata}{
#' \itemize{
#' \item \code{use.assays} which elements in the \code{assays} have been
#' used to run differential abundance analysis.
#' \item \code{design} the design matrix as input.
#' \item \code{contrast} the contrast vector as input.
#' \item \code{output_glmFit} the output from \code{\link[edgeR]{glmFit}}. A
#' object of \code{\link[edgeR]{DGEGLM-class}}
#' }
#' }
#' @examples
#'
#' library(S4Vectors)
#' set.seed(1)
#' y <- matrix(rnbinom(300,size=1,mu=10),nrow=10)
#' colnames(y) <- paste(rep(LETTERS[1:3], each = 10), rep(1:10,3), sep = "_")
#' rownames(y) <- tinyTree$tip.label
#'
#' rowInf <- DataFrame(nodeLab = rownames(y),
#' var1 = sample(letters[1:3], 10, replace = TRUE),
#' var2 = sample(c(TRUE, FALSE), 10, replace = TRUE))
#' colInf <- DataFrame(gg = factor(sample(1:3, 30, replace = TRUE)),
#' group = rep(LETTERS[1:3], each = 10))
#' toy_lse <- leafSummarizedExperiment(tree = tinyTree, rowData = rowInf,
#' colData = colInf,
#' assays = list(y, (2*y), 3*y))
#'
#' toy_tse <- nodeValue(data = toy_lse, fun = sum, tree = tinyTree,
#' message = TRUE)
#'
#' # build the model
#' contrastList <- list(contrast1 = c(0, 0, 0, -1, 1),
#' contrast2 = c(0, -1, 1, 0, 0))
#' mod <- runEdgeR(obj = toy_tse, contrast = contrastList)
#' # show results gained from the second element of the assasy
#' # sort by PValue
#' topNodes(mod, sort.by = "PValue", use.assays = 2)
runEdgeR <- function(obj, design = NULL, contrast = NULL,
normalize = TRUE, method = "TMM",
adjust.method = "BH",
prior.count = 0.125,
use.assays = NULL){
# which elements from assays will be used for analysis.
if (is.null(use.assays)) {
use.assays <- seq_along(assays(obj))
} else {
use.assays <- use.assays
}
# calculate library size
linkD <- linkData(obj)
objL <- obj[linkD$isLeaf, ]
tableL <- assays(objL, withDimnames = FALSE)[use.assays]
libSize <- lapply(seq_along(tableL), FUN = function(x) {
rs <- rowsum(tableL[[x]], group = rep(1, nrow(tableL[[x]])))
rs[1, ]
})
# extract elements from assays for analysis
tableA <- assays(obj, use.nodeLab = TRUE,
withDimnames = TRUE)[use.assays]
# create DEGList
y <- lapply(seq_along(tableA), FUN = function(x) {
yy <- DGEList(tableA[[x]], remove.zeros = TRUE)
yy$samples$lib.size <- libSize[[x]]
yy
} )
# normalisation
if (normalize) {
y <- lapply(seq_along(tableA), FUN = function(x) {
calcNormFactors(y[[x]], method = method)
})
} else {
y <- y
}
# default design matrix
if (is.null(design)) {
design <- .designMatrix(data = colData(obj))
} else { design <- design }
# estimate dispersion
y <- lapply(seq_along(y), FUN = function(x) {
y <- estimateDisp(y[[x]], design = design)
return(y)})
# build model
fit <- lapply(seq_along(y), FUN = function(x) {
glmFit(y[[x]], design = design, prior.count = prior.count)})
tt1 <- lapply(seq_along(fit), FUN = function(x) {
# make sure a list is provided for the contrast
if (is.null(contrast)) {
lrt <- glmLRT(fit[[x]], contrast = contrast)
tabx <- topTags(lrt, n = Inf, adjust.method = adjust.method,
sort.by = "none")$table
xx <- list(tabx)
} else {
xx <- lapply(contrast, FUN = function(y) {
lrt <- glmLRT(fit[[x]], contrast = y)
tabx <- topTags(lrt, n = Inf,
adjust.method = adjust.method,
sort.by = "none")$table
return(tabx)
})
}
names(xx) <- names(contrast)
return(xx) })
tt2 <- lapply(seq_along(tt1), function(x) {
# add and rearrange rows so that the output is also row-wise
# corresponding to the assays data.
res <- lapply(tt1[[x]], function(y) {
# find rows deleted
idx <- rownames(y)
od <- linkD$nodeLab_alias
if (is.null(od)) {
od <- linkD$nodeLab
}
idc <- setdiff(od, idx)
# add deleted rows in the table
df <- DataFrame(y)
dm <- matrix(NA, nrow = length(idc), ncol = ncol(df),
dimnames = list(idc, colnames(df)))
dfc <- DataFrame(dm)
dfA <- rbind(df, dfc)
# sort rows
dfA <- dfA[od,]
return(dfA) })
return(res)
})
# reshape: convert a list into a dataFrame
tt3 <- lapply(seq_along(tt2), FUN = function(j) {
x <- tt2[[j]]
#dx <- x[[1]][, 0]
dx <- x[[1]][, 0]
for (i in seq_along(x)) {
xi <- x[[i]]
nxi <- names(x)[i]
if (is.null(nxi)) { nxi <- "contrastNULL"}
dx[[nxi]] <- xi
}
return(dx)
})
names(tt3) <- paste("result_assay", use.assays, sep = "" )
# reshape to a dataFrame: one column for a result from one assay table
tt4 <- tt3[[1]][, 0]
for (i in seq_along(tt3)) {
nxi <- names(tt3)[i]
tt4[[nxi]] <- tt3[[i]]
}
rowData(obj)[["Results_internal_treeAGG"]] <- as(tt4, "internal_rowData")
# contrast should have the same length as the sub-element of tt3
if (is.null(contrast)) {
contrast <- lapply(seq_along(tt3[[1]]), function(x) {NULL})
names(contrast) <- "contrastNULL"
}
## put the analysis result in the metadata
metadata(obj)$output_glmFit <- fit
metadata(obj)$use.assays <- use.assays
metadata(obj)$design <- design
metadata(obj)$contrast <- contrast
# return(outL)
return(obj)
}
#' Create design matrix
#'
#' \code{.designMatrix} creates a design matrix by expanding factors to a set of
#' dummay variables and epanding interactions similarly.
#'
#'
#' \code{.designMatrix} creates a design matrix using the data extracted from
#' \code{data}. \code{cols} specifies the columns to extract.
#' \code{.designMatrix} is built on \code{\link[stats]{model.matrix}} with
#' \code{contrasts.arg = NULL}.
#'
#' @param data A \code{data.frame} or \code{DataFrame}.
#' @param cols A numeric vector. Specify columns to include in the design of
#' model matrix. Default is to include all columns.
#'
#' @importFrom stats model.matrix as.formula
#' @keywords internal
#' @return a matrix.
.designMatrix <- function(data, cols = NULL) {
stopifnot(class(data) %in% c("data.frame", "DataFrame"))
# if cols is null, use all columns.
if (is.null(cols)) {
cols <- seq_len(ncol(data))
}
# create design matrix
terms <- colnames(data)[cols]
formula <- as.formula(paste("~", paste(terms, collapse = " + ")))
design <- model.matrix(formula, data = data)
return(design)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.