#' Differential Expression
#'
#' We now generally recommend the ZINB-WaVE method over zingeR, since it is
#' faster, and has been show to be more sensitive for most single-cell RNA-seq
#' datasets.
#'
#' @section zinbwave:
#' We are currently using an epsilon setting of `1e12`, as recommended by the
#' ZINB-WaVE integration paper.
#'
#' @section zingeR:
#' We first perform a differential expression analysis using zingeR posterior
#' probabilities and focussing on the count component of the ZINB model. The
#' weights are estimated with the core function of zingeR, `zeroWeightsLS()`. It
#' is important to be consistent with the normalization procedure, i.e. if you
#' use TMM normalization for the analysis, you should also use it for estimating
#' the zingeR posterior probabilities.
#'
#' @section edgeR:
#' After estimation of the dispersions and posterior probabilities, the
#' `glmWeightedF()` function is used for statistical inference. This is an
#' adapted function from the `glmLRT()` function of edgeR. It uses an F-test for
#' which the denominator degrees of freedom are by default adjusted according to
#' the downweighting of excess zeros (`ZI = TRUE`). Also, independent filtering
#' can be performed on the obtained p-values (`independentFiltering = TRUE`). We
#' use the independent filtering strategy that was originally implemented in
#' DESeq2. By default, the average fitted values are used as a filter criterion.
#'
#' @section DESeq2:
#' We're providing preliminary support for DESeq2 as the differential expression
#' caller. It is currently considerably slower for large datasets than edgeR.
#'
#' @seealso Consult the following sources for more information:
#' - [zinbwave paper](https://doi.org/10.1186/s13059-018-1406-4).
#' - [zinbwave vignette](https://bit.ly/2wtDdpS).
#' - [zinbwave-DESeq2 workflow](https://github.com/mikelove/zinbwave-deseq2).
#' - [zingeR vignette](https://goo.gl/4rTK1w).
#'
#' @name diffExp
#' @family Differential Expression Functions
#' @author Michael Steinbaugh
#'
#' @inheritParams zingeR::zeroWeightsLS
#' @inheritParams general
#' @param numerator Group of cells to use in the numerator of the contrast
#' (e.g. treatment).
#' @param denominator Group of cells to use in the denominator of the contrast
#' (e.g. control).
#' @param zeroWeights Package to use for zero weight calculations. Defaults to
#' zinbwave but zingeR is also supported.
#' @param caller Package to use for differential expression calling. Defaults to
#' edgeR (faster for large datasets) but DESeq2 is also supported.
#' @param minCellsPerGene The minimum number of cells where a gene is expressed,
#' to pass low expression filtering. Set to `0` to disable (not recommended).
#' @param minCountsPerCell Minimum number of counts per cell for a gene to pass
#' low expression filtering. The number of cells is defined by
#' `minCellsPerGene`. Set to `0` to disable (not recommended).
#'
#' @seealso
#' - DESeq2: We're trying to follow the conventions used in DESeq2 for
#' contrasts, defining the name of the factor in the design formula,
#' numerator, and denominator level for the fold change calculations. See
#' [DESeq2::results()] for more information.
#' - Seurat: Note that Seurat currently uses the convention `cells.1` for the
#' numerator and `cells.2` for the denominator. See [Seurat::DiffExpTest()]
#' for additional information.
#'
#' @return
#' - `caller = "edgeR"`: `DEGLRT`.
#' - `caller = "DESeq2"`: Unshrunken `DESeqResults`. Use `lfcShrink()` if
#' shrunken results are desired.
#'
#' @examples
#' # seurat ====
#' # Expression in cluster 3 relative to cluster 2
#' object <- seurat_small
#' numerator <- Seurat::WhichCells(object, ident = 3L)
#' glimpse(numerator)
#' denominator <- Seurat::WhichCells(object, ident = 2L)
#' glimpse(denominator)
#' x <- diffExp(
#' object = object,
#' numerator = numerator,
#' denominator = denominator,
#' minCellsPerGene = 5L,
#' minCountsPerCell = 5L
#' )
NULL
.designFormula <- ~group
.zinbwave <- function(object) {
message("Running zinbwave")
requireNamespace("BiocParallel")
stopifnot(is(object, "SingleCellExperiment"))
object <- as(object, "SingleCellExperiment")
# zinbFit doesn't support `dgCMatrix``, so coerce counts to matrix
assays(object) <- list("counts" = as.matrix(counts(object)))
print(system.time({
zinb <- zinbwave::zinbwave(
Y = object,
K = 0L,
BPPARAM = BiocParallel::SerialParam(),
epsilon = 1e12
)
}))
stopifnot(is(zinb, "SingleCellExperiment"))
assert_is_factor(zinb[["group"]])
assert_is_matrix(metadata(zinb)[["design"]])
zinb
}
# Van De Berge and Perraudeau and others have shown the LRT may perform better
# for null hypothesis testing, so we use the LRT. In order to use the Wald test,
# it is recommended to set `useT = TRUE`.
#
# For UMI data, for which the expected counts may be very low, the likelihood
# ratio test implemented in nbinomLRT should be used.
#
# DESeq2 supports `weights` in assays automatically.
.zinbwave.DESeq2 <- function(object) { # nolint
stopifnot(is(object, "SingleCellExperiment"))
zinb <- .zinbwave(object)
# DESeq2 ===================================================================
message("Running DESeq2")
print(system.time({
dds <- DESeq2::DESeqDataSet(se = zinb, design = .designFormula)
dds <- DESeq2::DESeq(
object = dds,
test = "LRT",
reduced = ~ 1L,
sfType = "poscounts",
minmu = 1e-6,
minReplicatesForReplace = Inf
)
# We already performed low count filtering
res <- DESeq2::results(dds, independentFiltering = FALSE)
}))
res
}
.zinbwave.edgeR <- function(object) { # nolint
stopifnot(is(object, "SingleCellExperiment"))
zinb <- .zinbwave(object)
# edgeR ====================================================================
message("Running edgeR")
counts <- as.matrix(counts(zinb))
weights <- assay(zinb, "weights")
assert_is_matrix(weights)
design <- metadata(object)[["design"]]
assert_is_matrix(design)
group <- object[["group"]]
assert_is_factor(group)
print(system.time({
dge <- edgeR::DGEList(counts, group = group)
dge <- edgeR::calcNormFactors(dge)
dge[["weights"]] <- weights
dge <- edgeR::estimateDisp(dge, design = design)
fit <- edgeR::glmFit(dge, design = design)
# We already performed low count filtering
lrt <- zinbwave::glmWeightedF(
glmfit = fit,
coef = 2L,
independentFiltering = FALSE
)
}))
lrt
}
# Note that TMM needs to be consistently applied for both
# `calcNormFactors()` and `zeroWeightsLS()`.
.zingeR.edgeR <- function( # nolint
object,
maxit = 1000L
) {
stopifnot(is(object, "SingleCellExperiment"))
counts <- as.matrix(counts(object))
design <- metadata(object)[["design"]]
assert_is_matrix(design)
# zingeR ===================================================================
message("Running zingeR")
print(system.time({
weights <- zingeR::zeroWeightsLS(
counts = counts,
design = design,
maxit = maxit,
normalization = "TMM"
)
}))
# edgeR ====================================================================
message("Running edgeR")
group <- object[["group"]]
assert_is_factor(group)
print(system.time({
dge <- edgeR::DGEList(counts, group = group)
dge[["weights"]] <- weights
dge <- edgeR::calcNormFactors(dge, method = "TMM")
dge <- edgeR::estimateDisp(dge, design = design)
fit <- edgeR::glmFit(dge, design = design)
# We already performed low count filtering
lrt <- zingeR::glmWeightedF(
glmfit = fit,
coef = 2L,
independentFiltering = FALSE
)
}))
lrt
}
.zingeR.DESeq2 <- function( # nolint
object,
maxit = 1000L
) {
stopifnot(is(object, "SingleCellExperiment"))
counts <- as.matrix(counts(object))
se <- as(object, "RangedSummarizedExperiment")
assays(se) <- list(counts = counts)
# zingeR ===================================================================
message("Running zingeR")
design <- metadata(object)[["design"]]
assert_is_matrix(design)
print(system.time({
weights <- zingeR::zeroWeightsLS(
counts = counts,
design = design,
maxit = maxit,
normalization = "DESeq2_poscounts",
colData = colData(object),
designFormula = .designFormula
)
}))
# DESeq2 ===================================================================
message("Running DESeq2")
print(system.time({
dds <- DESeq2::DESeqDataSet(se = se, design = .designFormula)
assays(dds)[["weights"]] <- weights
dds <- DESeq2::estimateSizeFactors(dds, type = "poscounts")
dds <- DESeq2::estimateDispersions(dds)
# LRT is recommended over Wald for UMI counts.
# May want to switch to that approach here.
# See code in zinbwave method.
dds <- DESeq2::nbinomWaldTest(
object = dds,
betaPrior = TRUE,
useT = TRUE,
df = rowSums(weights) - 2L
)
# We already performed low count filtering
res <- DESeq2::results(dds, independentFiltering = FALSE)
}))
res
}
# Methods ======================================================================
#' @rdname diffExp
#' @export
setMethod(
"diffExp",
signature("SingleCellExperiment"),
function(
object,
numerator,
denominator,
zeroWeights = c("zinbwave", "zingeR"),
caller = c("edgeR", "DESeq2"),
minCellsPerGene = 25L,
minCountsPerCell = 5L
) {
object <- as(object, "SingleCellExperiment")
assert_is_character(numerator)
assert_is_character(denominator)
assert_are_disjoint_sets(numerator, denominator)
zeroWeights <- match.arg(zeroWeights)
if (zeroWeights == "zinbwave") {
requireNamespace(
package = "zinbwave",
versionCheck = list(
op = ">=",
version = package_version("1.0")
)
)
} else if (zeroWeights == "zingeR") {
requireNamespace(
package = "zingeR",
versionCheck = list(
op = ">=",
version = package_version("0.1")
)
)
}
caller <- match.arg(caller)
if (caller == "DESeq2") {
requireNamespace(
package = "DESeq2",
versionCheck = list(
op = ">=",
version = package_version("1.20")
)
)
} else if (zeroWeights == "edgeR") {
requireNamespace(
package = "edgeR",
versionCheck = list(
op = ">=",
version = package_version("3.22")
)
)
}
assertIsAnImplicitInteger(minCountsPerCell)
assertIsAnImplicitInteger(minCellsPerGene)
message(paste(
"Performing differential expression with",
paste(zeroWeights, caller, sep = "-")
))
# Subset the SCE object to contain the desired cells
cells <- c(numerator, denominator)
object <- object[, cells]
message(paste(
"Applying low gene count filter",
paste(
"Requiring at least",
minCellsPerGene,
"cells with counts of",
minCountsPerCell,
"or more per gene"
),
sep = "\n"
))
genes <- rowSums(counts(object) >= minCountsPerCell) >= minCellsPerGene
# Early return NULL if no genes pass
if (!any(genes)) {
warning("No genes passed the low count filter")
return(NULL)
}
object <- object[genes, ]
# Create a cell factor to define the group for `DGEList()`
numeratorFactor <- replicate(
n = length(numerator),
expr = "numerator"
) %>%
as.factor() %>%
set_names(numerator)
denominatorFactor <- replicate(
n = length(denominator),
expr = "denominator"
) %>%
as.factor() %>%
set_names(denominator)
group <- factor(c(
as.character(numeratorFactor),
as.character(denominatorFactor)
))
names(group) <- c(names(numeratorFactor), names(denominatorFactor))
# Ensure denominator is set as reference
group <- relevel(group, ref = "denominator")
object[["group"]] <- group
# Set up the design matrix
design <- model.matrix(~group)
metadata(object)[["design"]] <- design
fun <- get(paste("", zeroWeights, caller, sep = "."))
fun(object)
}
)
#' @rdname diffExp
#' @export
setMethod(
"diffExp",
signature("seurat"),
getMethod("diffExp", "SingleCellExperiment")
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.