#' Quickly perform pseudo-bulk DE analyses
#'
#' A wrapper function around \pkg{edgeR}'s quasi-likelihood methods
#' to conveniently perform differential expression analyses on pseudo-bulk profiles,
#' allowing detection of cell type-specific changes between conditions in replicated studies.
#'
#' @param x A numeric matrix of counts where rows are genes and columns are pseudo-bulk profiles.
#' Alternatively, a SummarizedExperiment object containing such a matrix in its assays.
#' @param col.data A data.frame or \linkS4class{DataFrame} containing metadata for each column of \code{x}.
#' @param label A vector of factor of length equal to \code{ncol(x)},
#' specifying the cluster or cell type assignment for each column of \code{x}.
#' @param design A formula to be used to construct a design matrix from variables in \code{col.data}.
#' Alternatively, a function that accepts a data.frame with the same fields as \code{col.data} and returns a design matrix.
#' @param condition A vector or factor of length equal to \code{ncol(x)},
#' specifying the experimental condition for each column of \code{x}.
#' Only used for abundance-based filtering of genes.
#' @param coef String or character vector containing the coefficients to drop from the design matrix to form the null hypothesis.
#' Can also be an integer scalar or vector specifying the indices of the relevant columns.
#' @param contrast Numeric vector or matrix containing the contrast of interest.
#' Alternatively, a character vector to be passed to \code{\link{makeContrasts}} to create this numeric vector/matrix.
#' If specified, this takes precedence over \code{coef}.
#' @param lfc Numeric scalar specifying the log-fold change threshold to use in \code{\link{glmTreat}} or \code{\link{treat}}.
#' @param assay.type String or integer scalar specifying the assay to use from \code{x}.
#' @param include.intermediates Logical scalar indicating whether the intermediate \pkg{edgeR} objects should be returned.
#' @param row.data A \linkS4class{DataFrame} containing additional row metadata for each gene in \code{x},
#' to be included in each of the output DataFrames.
#' This should have the same number and order of rows as \code{x}.
#' @param ... For the generic, additional arguments to pass to individual methods.
#'
#' For the SummarizedExperiment method, additional arguments to pass to the ANY method.
#' @param method String specifying the DE analysis framework to use.
#' @param robust Logical scalar indicating whether robust empirical Bayes shrinkage should be performed.
#' @param qualities Logical scalar indicating whether quality weighting should be used when \code{method="voom"},
#' see \code{\link{voomWithQualityWeights}} for more details.
#' @param sorted Logical scalar indicating whether the output tables should be sorted by p-value.
#' @param sample Deprecated.
#'
#' @return
#' A \linkS4class{List} with one \linkS4class{DataFrame} of DE results per unique (non-failed) level of \code{cluster}.
#' This contains columns from \code{\link{topTags}} if \code{method="edgeR"} or \code{\link{topTable}} if \code{method="voom"}.
#' All DataFrames have row names equal to \code{rownames(x)}.
#'
#' The \code{\link{metadata}} of the List contains \code{failed},
#' a character vector with the names of the labels for which the comparison could not be performed - see Details.
#'
#' The \code{\link{metadata}} of the individual DataFrames contains \code{design}, the final design matrix for that label.
#' If \code{include.intermediates}, the \code{\link{metadata}} will also contain
#' \code{y}, the DGEList used for the analysis; and \code{fit}, the DGEGLM object after GLM fitting.
#'
#' @details
#' In replicated multi-condition scRNA-seq experiments,
#' we often have clusters comprised of cells from different samples of different experimental conditions.
#' It is often desirable to check for differential expression between conditions within each cluster,
#' allowing us to identify cell-type-specific responses to the experimental perturbation.
#'
#' Given a set of pseudo-bulk profiles (usually generated by \code{\link{sumCountsAcrossCells}}),
#' this function loops over the labels and uses \pkg{edgeR} or \code{\link{voom}} to detect DE genes between conditions.
#' The DE analysis for each label is largely the same as a standard analysis for bulk RNA-seq data,
#' using \code{design} and \code{coef} or \code{contrast} as described in the \pkg{edgeR} or \pkg{limma} user guides.
#' Generally speaking, \pkg{edgeR} handles low counts better via its count-based model
#' but \code{method="voom"} supports variable sample precision when \code{quality=TRUE}.
#'
#' Performing pseudo-bulk DGE enables us to reuse well-tested methods developed for bulk RNA-seq data analysis.
#' Each pseudo-bulk profile can be treated as an \emph{in silico} mimicry of a real bulk RNA-seq sample
#' (though in practice, it tends to be much more variable due to the lower numbers of cells).
#' This also models the relevant variability between experimental replicates (i.e., across samples)
#' rather than that between cells in the same sample, without resorting to expensive mixed-effects models.
#'
#' The DE analysis for each label is independent of that for any other label.
#' This aims to minimize problems due to differences in abundance and variance between labels,
#' at the cost of losing the ability to share information across labels.
#'
#' In some cases, it will be impossible to perform a DE analysis for a label.
#' The most obvious reason is if there are no residual degrees of freedom;
#' other explanations include impossible contrasts or a failure to construct an appropriate design matrix
#' (e.g., if a cell type only exists in one condition).
#'
#' Note that we assume that \code{x} has already been filtered to remove unstable pseudo-bulk profiles generated from few cells.
#'
#' @section Comments on abundance filtering:
#' For each label, abundance filtering is performed using \code{\link{filterByExpr}} prior to further analysis.
#' Genes that are filtered out will still show up in the DataFrame for that label, but with all statistics set to \code{NA}.
#' As this is done separately for each label, a different set of genes may be filtered out for each label,
#' which is largely to be expected if there is any label-specific expression.
#'
#' By default, the minimum group size for \code{filterByExpr} is determined using the design matrix.
#' However, this may not be optimal if the design matrix contains additional terms (e.g., blocking factors)
#' in which case it is not easy to determine the minimum size of the groups relevant to the comparison of interest.
#' To overcome this, users can specify \code{condition.field} to specify the group to which each sample belongs,
#' which is used by \code{filterByExpr} to obtain a more appropriate minimum group size.
#'
#' @author Aaron Lun
#'
#' @references
#' Tung P-Y et al. (2017).
#' Batch effects and the effective design of single-cell gene expression studies.
#' \emph{Sci. Rep.} 7, 39921
#'
#' Lun ATL and Marioni JC (2017).
#' Overcoming confounding plate effects in differential expression analyses of single-cell RNA-seq data.
#' \emph{Biostatistics} 18, 451-464
#'
#' Crowell HL et al. (2019).
#' On the discovery of population-specific state transitions from multi-sample multi-condition single-cell RNA sequencing data.
#' \emph{biorXiv}
#'
#' @examples
#' set.seed(10000)
#' library(scuttle)
#' sce <- mockSCE(ncells=1000)
#' sce$samples <- gl(8, 125) # Pretending we have 8 samples.
#'
#' # Making up some clusters.
#' sce <- logNormCounts(sce)
#' clusters <- kmeans(t(logcounts(sce)), centers=3)$cluster
#'
#' # Creating a set of pseudo-bulk profiles:
#' info <- DataFrame(sample=sce$samples, cluster=clusters)
#' pseudo <- sumCountsAcrossCells(sce, info)
#'
#' # Making up an experimental design for our 8 samples.
#' pseudo$DRUG <- gl(2,4)[pseudo$sample]
#'
#' # DGE analysis:
#' out <- pseudoBulkDGE(pseudo,
#' label=pseudo$cluster,
#' condition=pseudo$DRUG,
#' design=~DRUG,
#' coef="DRUG2"
#' )
#' out[[1]]
#' metadata(out[[1]])$design
#' @seealso
#' \code{\link{sumCountsAcrossCells}}, to easily generate the pseudo-bulk count matrix.
#'
#' \code{\link{decideTestsPerLabel}}, to generate a summary of the DE results across all labels.
#'
#' \code{\link{pseudoBulkSpecific}}, to look for label-specific DE genes.
#'
#' \code{pbDS} from the \pkg{muscat} package, which uses a similar approach.
#' @name pseudoBulkDGE
NULL
.pseudo_bulk_master <- function(x, col.data, label, design, coef, contrast=NULL,
condition=NULL, lfc=0, include.intermediates=TRUE, row.data=NULL, sorted=FALSE,
method=c("edgeR", "voom"), qualities=TRUE, robust=TRUE, sample=NULL)
{
if (!is.null(sample)) {
.Deprecated(msg="'sample=' is deprecated and will be ignored")
}
if (is.matrix(design)) {
.Defunct(msg="matrix 'design=' is defunct, use a formula or function instead")
}
.pseudo_bulk_dge(x=x, col.data=col.data, label=label, condition=condition,
design=design, coef=coef, contrast=contrast, lfc=lfc, row.data=row.data,
sorted=sorted, include.intermediates=include.intermediates,
method=match.arg(method), qualities=qualities, robust=robust)
}
#' @importFrom edgeR DGEList
#' @importFrom S4Vectors DataFrame SimpleList metadata metadata<-
.pseudo_bulk_dge <- function(x, col.data, label, design, coef, contrast=NULL,
condition=NULL, lfc=0, null.lfc.list=NULL, row.data=NULL, sorted=FALSE, include.intermediates=FALSE,
method=c("edgeR", "voom"), qualities=TRUE, robust=TRUE)
{
de.results <- list()
failed <- character(0)
label <- as.character(label)
method <- match.arg(method)
# Avoid requiring 'coef' if 'contrast' is specified.
if (!is.null(contrast)) {
coef <- NULL
}
for (i in sort(unique(label))) {
chosen <- i==label
curx <- x[,chosen,drop=FALSE]
curdata <- col.data[chosen,,drop=FALSE]
y <- DGEList(curx, samples=as.data.frame(curdata))
curcond <- condition[chosen]
curdesign <- try({
if (is.function(design)) {
design(curdata)
} else {
model.matrix(design, data=curdata)
}
}, silent=TRUE)
if (is(curdesign, "try-error")) {
failed <- c(failed, i)
next
} else {
args <- list(y, row.names=rownames(x), curdesign=curdesign, curcond=curcond,
coef=coef, contrast=contrast, lfc=lfc, null.lfc=null.lfc.list[[i]],
robust=robust, include.intermediates=include.intermediates)
if (method=="edgeR") {
stuff <- do.call(.pseudo_bulk_edgeR, args)
pval.field <- "PValue"
} else {
stuff <- do.call(.pseudo_bulk_voom, c(args, list(qualities=qualities)))
pval.field <- "p.value"
}
if (is.null(stuff)) {
failed <- c(failed, i)
next
}
}
if (!is.null(row.data)) {
# Adding stuff[,0] to preserve the DF's metadata and row names.
stuff <- cbind(stuff[,0], row.data, stuff)
}
if (sorted) {
o <- order(stuff[,pval.field])
stuff <- stuff[o,,drop=FALSE]
}
de.results[[i]] <- stuff
}
output <- SimpleList(de.results)
metadata(output)$failed <- failed
output
}
#' @importFrom S4Vectors DataFrame metadata metadata<-
#' @importFrom edgeR estimateDisp glmQLFit glmQLFTest getOffset scaleOffset
#' calcNormFactors filterByExpr topTags glmLRT glmFit glmTreat
#' @importFrom limma makeContrasts
.pseudo_bulk_edgeR <- function(y, row.names, curdesign, curcond, coef, contrast,
lfc, null.lfc, include.intermediates, robust=TRUE)
{
ngenes <- nrow(y)
gkeep <- filterByExpr(y, design=curdesign, group=curcond)
y <- y[gkeep,]
y <- calcNormFactors(y)
rank <- qr(curdesign)$rank
if (rank == nrow(curdesign) || rank < ncol(curdesign)) {
return(NULL)
}
if (is.character(contrast)) {
contrast <- makeContrasts(contrasts=contrast, levels=curdesign)
}
lfc.out <- .compute_offsets_by_lfc(design=curdesign, coef=coef,
contrast=contrast, filtered=gkeep, null.lfc=null.lfc)
if (!is.null(lfc.out)) {
offsets <- t(t(lfc.out)/log2(exp(1)) + getOffset(y))
y <- scaleOffset(y, offsets)
}
y <- estimateDisp(y, curdesign)
fit <- glmQLFit(y, curdesign, robust=robust)
if (lfc==0) {
res <- glmQLFTest(fit, coef=coef, contrast=contrast)
} else {
res <- glmTreat(fit, lfc=lfc, coef=coef, contrast=contrast)
}
tab <- topTags(res, n=Inf, sort.by="none")
expander <- match(seq_len(ngenes), which(gkeep))
tab <- DataFrame(tab$table[expander,,drop=FALSE])
rownames(tab) <- row.names
metadata(tab)$design <- curdesign
if (include.intermediates) {
metadata(tab)$y <- y
metadata(tab)$fit <- fit
}
tab
}
#' @importFrom limma contrastAsCoef
.compute_offsets_by_lfc <- function(design, coef, contrast, filtered, null.lfc) {
if (is.null(null.lfc) || all(null.lfc==0)) {
return(NULL)
}
if (!is.null(contrast)) {
out <- contrastAsCoef(design, contrast)
design <- out$design
coef <- out$coef
}
stopifnot(length(coef)==1)
null.lfc <- rep(null.lfc, length.out=length(filtered))
null.lfc <- null.lfc[filtered]
outer(null.lfc, design[,coef])
}
#' @importFrom S4Vectors DataFrame metadata metadata<-
#' @importFrom edgeR calcNormFactors filterByExpr
#' @importFrom limma voom voomWithQualityWeights lmFit
#' contrasts.fit eBayes treat topTable makeContrasts
.pseudo_bulk_voom <- function(y, row.names, curdesign, curcond, coef, contrast,
lfc, null.lfc, include.intermediates, qualities=TRUE, robust=TRUE)
{
ngenes <- nrow(y)
gkeep <- filterByExpr(y, design=curdesign, group=curcond)
y <- y[gkeep,]
y <- calcNormFactors(y)
rank <- qr(curdesign)$rank
if (rank == nrow(curdesign) || rank < ncol(curdesign)) {
return(NULL)
}
if (qualities) {
v <- voomWithQualityWeights(y, curdesign)
} else {
v <- voom(y, curdesign)
}
if (is.character(contrast)) {
contrast <- makeContrasts(contrasts=contrast, levels=curdesign)
}
lfc.out <- .compute_offsets_by_lfc(design=curdesign, coef=coef,
contrast=contrast, filtered=gkeep, null.lfc=null.lfc)
if (!is.null(lfc.out)) {
v$E <- v$E - lfc.out
}
fit <- lmFit(v)
if (!is.null(contrast)) {
fit <- contrasts.fit(fit, contrast)
coef <- 1
}
if (lfc==0) {
res <- eBayes(fit, robust=robust)
} else {
res <- treat(fit, lfc=lfc, robust=robust)
}
tab <- topTable(res, coef=coef, number=Inf, sort.by="none")
expander <- match(seq_len(ngenes), which(gkeep))
tab <- DataFrame(tab[expander,,drop=FALSE])
rownames(tab) <- row.names
metadata(tab)$design <- curdesign
if (include.intermediates) {
metadata(tab)$y <- y
metadata(tab)$v <- v
metadata(tab)$fit <- fit
}
tab
}
#' @export
#' @rdname pseudoBulkDGE
setGeneric("pseudoBulkDGE", function(x, ...) standardGeneric("pseudoBulkDGE"))
#' @export
#' @rdname pseudoBulkDGE
setMethod("pseudoBulkDGE", "ANY", .pseudo_bulk_master)
#' @export
#' @rdname pseudoBulkDGE
#' @importFrom SummarizedExperiment assay colData
setMethod("pseudoBulkDGE", "SummarizedExperiment", function(x, col.data=colData(x), ..., assay.type=1) {
.pseudo_bulk_master(assay(x, assay.type), col.data=col.data, ...)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.