#' @title Contamination estimation with decontX
#'
#' @description Identifies contamination from factors such as ambient RNA
#' in single cell genomic datasets.
#'
#' @name decontX
#'
#' @param x A numeric matrix of counts or a \linkS4class{SingleCellExperiment}
#' with the matrix located in the assay slot under \code{assayName}.
#' Cells in each batch will be subsetted and converted to a sparse matrix
#' of class \code{dgCMatrix} from package \link{Matrix} before analysis. This
#' object should only contain filtered cells after cell calling. Empty
#' cell barcodes (low expression droplets before cell calling) are not needed
#' to run DecontX.
#' @param assayName Character. Name of the assay to use if \code{x} is a
#' \linkS4class{SingleCellExperiment}.
#' @param z Numeric or character vector. Cell cluster labels. If NULL,
#' PCA will be used to reduce the dimensionality of the dataset initially,
#' '\link[uwot]{umap}' from the 'uwot' package
#' will be used to further reduce the dataset to 2 dimenions and
#' the '\link[dbscan]{dbscan}' function from the 'dbscan' package
#' will be used to identify clusters of broad cell types. Default NULL.
#' @param batch Numeric or character vector. Batch labels for cells.
#' If batch labels are supplied, DecontX is run on cells from each
#' batch separately. Cells run in different channels or assays
#' should be considered different batches. Default NULL.
#' @param background A numeric matrix of counts or a
#' \linkS4class{SingleCellExperiment} with the matrix located in the assay
#' slot under \code{assayName}. It should have the same data format as \code{x}
#' except it contains the empty droplets instead of cells. When supplied,
#' empirical distribution of transcripts from these empty droplets
#' will be used as the contamination distribution. Default NULL.
#' @param bgAssayName Character. Name of the assay to use if \code{background}
#' is a \linkS4class{SingleCellExperiment}. Default to same as
#' \code{assayName}.
#' @param bgBatch Numeric or character vector. Batch labels for
#' \code{background}. Its unique values should be the same as those in
#' \code{batch}, such that each batch of cells have their corresponding batch
#' of empty droplets as background, pointed by this parameter. Default to NULL.
#' @param maxIter Integer. Maximum iterations of the EM algorithm. Default 500.
#' @param convergence Numeric. The EM algorithm will be stopped if the maximum
#' difference in the contamination estimates between the previous and
#' current iterations is less than this. Default 0.001.
#' @param iterLogLik Integer. Calculate log likelihood every \code{iterLogLik}
#' iteration. Default 10.
#' @param delta Numeric Vector of length 2. Concentration parameters for
#' the Dirichlet prior for the contamination in each cell. The first element
#' is the prior for the native counts while the second element is the prior for
#' the contamination counts. These essentially act as pseudocounts for the
#' native and contamination in each cell. If \code{estimateDelta = TRUE},
#' this is only used to produce a random sample of proportions for an initial
#' value of contamination in each cell. Then
#' \code{\link[MCMCprecision]{fit_dirichlet}} is used to update
#' \code{delta} in each iteration.
#' If \code{estimateDelta = FALSE}, then \code{delta} is fixed with these
#' values for the entire inference procedure. Fixing \code{delta} and
#' setting a high number in the second element will force \code{decontX}
#' to be more aggressive and estimate higher levels of contamination at
#' the expense of potentially removing native expression.
#' Default \code{c(10, 10)}.
#' @param estimateDelta Boolean. Whether to update \code{delta} at each
#' iteration.
#' @param varGenes Integer. The number of variable genes to use in
#' dimensionality reduction before clustering. Variability is calcualted using
#' \code{\link[scran]{modelGeneVar}} function from the 'scran' package.
#' Used only when z is not provided. Default 5000.
#' @param dbscanEps Numeric. The clustering resolution parameter
#' used in '\link[dbscan]{dbscan}' to estimate broad cell clusters.
#' Used only when z is not provided. Default 1.
#' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility,
#' a default value of 12345 is used. If NULL, no calls to
#' \link[withr]{with_seed} are made.
#' @param logfile Character. Messages will be redirected to a file named
#' `logfile`. If NULL, messages will be printed to stdout. Default NULL.
#' @param verbose Logical. Whether to print log messages. Default TRUE.
#' @param ... For the generic, further arguments to pass to each method.
#'
#' @return If \code{x} is a matrix-like object, a list will be returned
#' with the following items:
#' \describe{
#' \item{\code{decontXcounts}:}{The decontaminated matrix. Values obtained
#' from the variational inference procedure may be non-integer. However,
#' integer counts can be obtained by rounding,
#' e.g. \code{round(decontXcounts)}.}
#' \item{\code{contamination}:}{Percentage of contamination in each cell.}
#' \item{\code{estimates}:}{List of estimated parameters for each batch. If z
#' was not supplied, then the UMAP coordinates used to generated cell
#' cluster labels will also be stored here.}
#' \item{\code{z}:}{Cell population/cluster labels used for analysis.}
#' \item{\code{runParams}:}{List of arguments used in the function call.}
#' }
#'
#' If \code{x} is a \linkS4class{SingleCellExperiment}, then the decontaminated
#' counts will be stored as an assay and can be accessed with
#' \code{decontXcounts(x)}. The contamination values and cluster labels
#' will be stored in \code{colData(x)}. \code{estimates} and \code{runParams}
#' will be stored in \code{metadata(x)$decontX}. The UMAPs used to generated
#' cell cluster labels will be stored in
#' \code{reducedDims} slot in \code{x}.
#'
#' @author Shiyi Yang, Yuan Yin, Joshua Campbell
#'
#' @examples
#' # Generate matrix with contamination
#' s <- simulateContamination(seed = 12345)
#'
#' library(SingleCellExperiment)
#' sce <- SingleCellExperiment(list(counts = s$observedCounts))
#' sce <- decontX(sce)
#'
#' # Plot contamination on UMAP
#' plotDecontXContamination(sce)
#'
#' # Plot decontX cluster labels
#' umap <- reducedDim(sce)
#' plotDimReduceCluster(x = sce$decontX_clusters,
#' dim1 = umap[, 1], dim2 = umap[, 2], )
#'
#' # Plot percentage of marker genes detected
#' # in each cell cluster before decontamination
#' s$markers
#' plotDecontXMarkerPercentage(sce, markers = s$markers, assayName = "counts")
#'
#' # Plot percentage of marker genes detected
#' # in each cell cluster after contamination
#' plotDecontXMarkerPercentage(sce, markers = s$markers,
#' assayName = "decontXcounts")
#'
#' # Plot percentage of marker genes detected in each cell
#' # comparing original and decontaminated counts side-by-side
#' plotDecontXMarkerPercentage(sce, markers = s$markers,
#' assayName = c("counts", "decontXcounts"))
#'
#' # Plot raw counts of indiviual markers genes before
#' # and after decontamination
#' plotDecontXMarkerExpression(sce, unlist(s$markers))
NULL
#' @export
#' @rdname decontX
setGeneric("decontX", function(x, ...) standardGeneric("decontX"))
#########################
# Setting up S4 methods #
#########################
#' @export
#' @rdname decontX
#' @importClassesFrom SingleCellExperiment SingleCellExperiment
#' @importClassesFrom Matrix dgCMatrix
setMethod("decontX", "SingleCellExperiment", function(x,
assayName = "counts",
z = NULL,
batch = NULL,
background = NULL,
bgAssayName = NULL,
bgBatch = NULL,
maxIter = 500,
delta = c(10, 10),
estimateDelta = TRUE,
convergence = 0.001,
iterLogLik = 10,
varGenes = 5000,
dbscanEps = 1,
seed = 12345,
logfile = NULL,
verbose = TRUE) {
countsBackground <- NULL
if (!is.null(background)) {
# Remove cells with the same ID between x and the background matrix
# Also update bgBatch when background is updated and bgBatch is not null
temp <- .checkBackground(x = x,
background = background,
bgBatch = bgBatch,
logfile = logfile,
verbose = verbose)
background <- temp$background
bgBatch <- temp$bgBatch
if (is.null(bgAssayName)) {
bgAssayName <- assayName
}
countsBackground <- SummarizedExperiment::assay(background, i = bgAssayName)
}
mat <- SummarizedExperiment::assay(x, i = assayName)
result <- .decontX(
counts = mat,
z = z,
batch = batch,
countsBackground = countsBackground,
batchBackground = bgBatch,
maxIter = maxIter,
convergence = convergence,
iterLogLik = iterLogLik,
delta = delta,
estimateDelta = estimateDelta,
varGenes = varGenes,
dbscanEps = dbscanEps,
seed = seed,
logfile = logfile,
verbose = verbose
)
## Add results into column annotation
SummarizedExperiment::colData(x)$decontX_contamination <- result$contamination
SummarizedExperiment::colData(x)$decontX_clusters <- as.factor(result$z)
## Put estimated UMAPs into SCE
batchIndex <- unique(result$runParams$batch)
if (length(batchIndex) > 1) {
for (i in batchIndex) {
## Each individual UMAP will only be for one batch so need
## to put NAs in for cells in other batches
tempUMAP <- matrix(NA, ncol = 2, nrow = ncol(mat))
tempUMAP[result$runParams$batch == i, ] <- result$estimates[[i]]$UMAP
colnames(tempUMAP) <- c("UMAP_1", "UMAP_2")
rownames(tempUMAP) <- colnames(mat)
SingleCellExperiment::reducedDim(
x,
paste("decontX", i, "UMAP", sep = "_")
) <- tempUMAP
}
} else {
SingleCellExperiment::reducedDim(x, "decontX_UMAP") <-
result$estimates[[batchIndex]]$UMAP
}
## Save the rest of the result object into metadata
decontXcounts(x) <- result$decontXcounts
result$decontXcounts <- NULL
S4Vectors::metadata(x)$decontX <- result
return(x)
})
#' @export
#' @rdname decontX
setMethod("decontX", "ANY", function(x,
z = NULL,
batch = NULL,
background = NULL,
bgBatch = NULL,
maxIter = 500,
delta = c(10, 10),
estimateDelta = TRUE,
convergence = 0.001,
iterLogLik = 10,
varGenes = 5000,
dbscanEps = 1,
seed = 12345,
logfile = NULL,
verbose = TRUE) {
countsBackground <- NULL
if (!is.null(background)) {
# Remove cells with the same ID between x and the background matrix
# Also update bgBatch when background is updated and bgBatch is not null
temp <- .checkBackground(x = x,
background = background,
bgBatch = bgBatch,
logfile = logfile,
verbose = verbose)
background <- temp$background
countsBackground <- background
bgBatch <- temp$bgBatch
}
.decontX(
counts = x,
z = z,
batch = batch,
countsBackground = countsBackground,
batchBackground = bgBatch,
maxIter = maxIter,
convergence = convergence,
iterLogLik = iterLogLik,
delta = delta,
estimateDelta = estimateDelta,
varGenes = varGenes,
dbscanEps = dbscanEps,
seed = seed,
logfile = logfile,
verbose = verbose
)
})
## Copied from SingleCellExperiment Package
GET_FUN <- function(exprs_values, ...) {
(exprs_values) # To ensure evaluation
function(object, ...) {
SummarizedExperiment::assay(object, i = exprs_values, ...)
}
}
SET_FUN <- function(exprs_values, ...) {
(exprs_values) # To ensure evaluation
function(object, ..., value) {
SummarizedExperiment::assay(object, i = exprs_values, ...) <- value
object
}
}
#' @title Get or set decontaminated counts matrix
#'
#' @description Gets or sets the decontaminated counts matrix from a
#' a \linkS4class{SingleCellExperiment} object.
#' @name decontXcounts
#' @param object A \linkS4class{SingleCellExperiment} object.
#' @param value A matrix to save as an assay called \code{decontXcounts}
#' @param ... For the generic, further arguments to pass to each method.
#' @return If getting, the assay from \code{object} with the name
#' \code{decontXcounts} will be returned. If setting, a
#' \linkS4class{SingleCellExperiment} object will be returned with
#' \code{decontXcounts} listed in the \code{assay} slot.
#' @seealso \code{\link{assay}} and \code{\link{assay<-}}
NULL
#' @export
#' @rdname decontXcounts
setGeneric("decontXcounts", function(object, ...) {
standardGeneric("decontXcounts")
})
#' @export
#' @rdname decontXcounts
setGeneric("decontXcounts<-", function(object, ..., value) {
standardGeneric("decontXcounts<-")
})
#' @export
#' @rdname decontXcounts
setMethod("decontXcounts", "SingleCellExperiment", GET_FUN("decontXcounts"))
#' @export
#' @rdname decontXcounts
setMethod(
"decontXcounts<-", c("SingleCellExperiment", "ANY"),
SET_FUN("decontXcounts")
)
##########################
# Core Decontx Functions #
##########################
.decontX <- function(counts,
z = NULL,
batch = NULL,
countsBackground = NULL,
batchBackground = NULL,
maxIter = 200,
convergence = 0.001,
iterLogLik = 10,
delta = c(10, 10),
estimateDelta = TRUE,
varGenes = NULL,
dbscanEps = NULL,
seed = 12345,
logfile = NULL,
verbose = TRUE) {
startTime <- Sys.time()
.logMessages(paste(rep("-", 50), collapse = ""),
logfile = logfile,
append = TRUE,
verbose = verbose
)
.logMessages("Starting DecontX",
logfile = logfile,
append = TRUE,
verbose = verbose
)
.logMessages(paste(rep("-", 50), collapse = ""),
logfile = logfile,
append = TRUE,
verbose = verbose
)
runParams <- list(
z = z,
batch = batch,
batchBackground = batchBackground,
maxIter = maxIter,
delta = delta,
estimateDelta = estimateDelta,
convergence = convergence,
varGenes = varGenes,
dbscanEps = dbscanEps,
logfile = logfile,
verbose = verbose
)
totalGenes <- nrow(counts)
totalCells <- ncol(counts)
geneNames <- rownames(counts)
nC <- ncol(counts)
allCellNames <- colnames(counts)
## Set up final decontaminated matrix
estRmat <- Matrix::Matrix(
data = 0,
ncol = totalCells,
nrow = totalGenes,
sparse = TRUE,
dimnames = list(geneNames, allCellNames)
)
## Generate batch labels if none were supplied
if (is.null(batch)) {
batch <- rep("all_cells", nC)
# If batch null, bgBatch has to be null
if (!is.null(batchBackground)) {
stop(
"When experiment default to no bacth, background should ",
"also default to no batch."
)
}
if (!is.null(countsBackground)) {
batchBackground <- rep("all_cells", ncol(countsBackground))
}
} else {
# If batch not null and countsBackground supplied,
# user has to supply batchBackground as well
if (!is.null(countsBackground) & is.null(batchBackground)) {
stop(
"Cell batch, and background are supplied. Please also ",
"supply background batch."
)
}
}
runParams$batch <- batch
runParams$batchBackground <- batchBackground
batchIndex <- unique(batch)
## Set result lists upfront for all cells from different batches
logLikelihood <- c()
estConp <- rep(NA, nC)
returnZ <- rep(NA, nC)
resBatch <- list()
## Cycle through each sample/batch and run DecontX
for (bat in batchIndex) {
if (length(batchIndex) == 1) {
.logMessages(
date(),
".. Analyzing all cells",
logfile = logfile,
append = TRUE,
verbose = verbose
)
} else {
.logMessages(
date(),
" .. Analyzing cells in batch '",
bat, "'",
sep = "",
logfile = logfile,
append = TRUE,
verbose = verbose
)
}
zBat <- NULL
countsBat <- counts[, batch == bat]
bgBat <- countsBackground[, batchBackground == bat]
## Convert to sparse matrix
if (!inherits(countsBat, "dgCMatrix")) {
.logMessages(
date(),
".... Converting to sparse matrix",
logfile = logfile,
append = TRUE,
verbose = verbose
)
countsBat <- methods::as(countsBat, "CsparseMatrix")
}
if (!is.null(bgBat)) {
if (!inherits(bgBat, "dgCMatrix")) {
bgBat <- methods::as(bgBat, "CsparseMatrix")
}
}
if (!is.null(z)) {
zBat <- z[batch == bat]
}
if (is.null(seed)) {
res <- .decontXoneBatch(
counts = countsBat,
z = zBat,
batch = bat,
countsBackground = bgBat,
maxIter = maxIter,
delta = delta,
estimateDelta = estimateDelta,
convergence = convergence,
iterLogLik = iterLogLik,
logfile = logfile,
verbose = verbose,
varGenes = varGenes,
dbscanEps = dbscanEps,
seed = seed
)
} else {
withr::with_seed(
seed,
res <- .decontXoneBatch(
counts = countsBat,
z = zBat,
batch = bat,
countsBackground = bgBat,
maxIter = maxIter,
delta = delta,
estimateDelta = estimateDelta,
convergence = convergence,
iterLogLik = iterLogLik,
logfile = logfile,
verbose = verbose,
varGenes = varGenes,
dbscanEps = dbscanEps,
seed = seed
)
)
}
## Try to convert class of new matrix to class of original matrix
.logMessages(
date(),
".. Calculating final decontaminated matrix",
logfile = logfile,
append = TRUE,
verbose = verbose
)
estRmat.temp <- calculateNativeMatrix(
counts = countsBat,
theta = res$theta,
eta = res$eta,
phi = res$phi,
z = as.integer(res$z),
pseudocount = 1e-20
)
# Speed up sparse matrix value assignment by cbind -> order recovery
allCol <- paste0("col_", seq_len(ncol(estRmat)))
colnames(estRmat) <- allCol
subCol <- paste0("col_", which(batch == bat))
colnames(estRmat.temp) <- subCol
estRmat <- estRmat[, !(allCol %in% subCol)]
estRmat <- cbind(estRmat, estRmat.temp)
# Recover order and set names
estRmat <- estRmat[, allCol]
dimnames(estRmat) <- list(geneNames, allCellNames)
resBatch[[bat]] <- list(
z = res$z,
phi = res$phi,
eta = res$eta,
delta = res$delta,
theta = res$theta,
contamination = res$contamination,
logLikelihood = res$logLikelihood,
UMAP = res$UMAP,
z = res$z,
iteration = res$iteration
)
estConp[batch == bat] <- res$contamination
if (length(batchIndex) > 1) {
returnZ[batch == bat] <- paste0(bat, "-", res$z)
} else {
returnZ[batch == bat] <- res$z
}
}
names(resBatch) <- batchIndex
returnResult <- list(
"runParams" = runParams,
"estimates" = resBatch,
"decontXcounts" = estRmat,
"contamination" = estConp,
"z" = returnZ
)
if (inherits(counts, c("DelayedMatrix", "DelayedArray"))) {
.logMessages(
date(),
".. Converting decontaminated matrix to", class(counts),
logfile = logfile,
append = TRUE,
verbose = verbose
)
## Determine class of seed in DelayedArray
seed.class <- unique(DelayedArray::seedApply(counts, class))[[1]]
if (seed.class == "HDF5ArraySeed") {
returnResult$decontXcounts <-
methods::as(returnResult$decontXcounts, "HDF5Matrix")
} else {
if (isTRUE(methods::canCoerce(returnResult$decontXcounts, seed.class))) {
returnResult$decontXcounts <-
methods::as(returnResult$decontXcounts, seed.class)
}
}
returnResult$decontXcounts <-
DelayedArray::DelayedArray(returnResult$decontXcounts)
} else {
try({
if (methods::canCoerce(returnResult$decontXcounts, class(counts))) {
returnResult$decontXcounts <-
methods::as(returnResult$decontXcounts, class(counts))
}
},
silent = TRUE
)
}
endTime <- Sys.time()
.logMessages(paste(rep("-", 50), collapse = ""),
logfile = logfile,
append = TRUE,
verbose = verbose
)
.logMessages("Completed DecontX. Total time:",
format(difftime(endTime, startTime)),
logfile = logfile,
append = TRUE,
verbose = verbose
)
.logMessages(paste(rep("-", 50), collapse = ""),
logfile = logfile,
append = TRUE,
verbose = verbose
)
return(returnResult)
}
# This function updates decontamination for one batch
# seed passed to this function is to be furhter passed to
# function .decontxInitializeZ()
.decontXoneBatch <- function(counts,
z = NULL,
batch = NULL,
countsBackground = NULL,
maxIter = 200,
delta = c(10, 10),
estimateDelta = TRUE,
convergence = 0.01,
iterLogLik = 10,
logfile = NULL,
verbose = TRUE,
varGenes = NULL,
dbscanEps = NULL,
seed = 12345) {
.checkCountsDecon(counts)
.checkDelta(delta)
# nG <- nrow(counts)
nC <- ncol(counts)
deconMethod <- "clustering"
## Generating UMAP and cell cluster labels if none are provided
umap <- NULL
if (is.null(z)) {
m <- ".... Generating UMAP and estimating cell types"
estimateCellTypes <- TRUE
} else {
m <- ".... Generating UMAP"
estimateCellTypes <- FALSE
}
.logMessages(
date(),
m,
logfile = logfile,
append = TRUE,
verbose = verbose
)
varGenes <- .processvarGenes(varGenes)
dbscanEps <- .processdbscanEps(dbscanEps)
celda.init <- .decontxInitializeZ(
object = counts,
varGenes = varGenes,
dbscanEps = dbscanEps,
estimateCellTypes = estimateCellTypes,
seed = seed
)
if (is.null(z)) {
z <- celda.init$z
}
umap <- celda.init$umap
colnames(umap) <- c(
"DecontX_UMAP_1",
"DecontX_UMAP_2"
)
rownames(umap) <- colnames(counts)
z <- .processCellLabels(z, numCells = nC)
K <- length(unique(z))
iter <- 1L
numIterWithoutImprovement <- 0L
stopIter <- 3L
.logMessages(
date(),
".... Estimating contamination",
logfile = logfile,
append = TRUE,
verbose = verbose
)
if (deconMethod == "clustering") {
## Initialization
theta <- stats::rbeta(
n = nC,
shape1 = delta[1],
shape2 = delta[2]
)
nextDecon <- decontXInitialize(
counts = counts,
theta = theta,
z = z,
pseudocount = 1e-20
)
phi <- nextDecon$phi
eta <- nextDecon$eta
# if countsBackground is not null, use empirical dist. to replace eta
if (!is.null(countsBackground)) {
# Add pseudocount to each gene in eta
eta_tilda <- Matrix::rowSums(countsBackground) + 1e-20
eta <- eta_tilda / sum(eta_tilda)
# Make eta a matrix same dimension as phi
eta <- matrix(eta, length(eta), dim(phi)[2])
}
ll <- c()
llRound <- decontXLogLik(
counts = counts,
z = z,
phi = phi,
eta = eta,
theta = theta,
pseudocount = 1e-20
)
## EM updates
theta.previous <- theta
converged <- FALSE
counts.colsums <- Matrix::colSums(counts)
while (iter <= maxIter & !isTRUE(converged) &
numIterWithoutImprovement <= stopIter) {
if (is.null(countsBackground)) {
nextDecon <- decontXEM(
counts = counts,
counts_colsums = counts.colsums,
phi = phi,
estimate_eta = TRUE,
eta = eta,
theta = theta,
z = z,
estimate_delta = isTRUE(estimateDelta),
delta = delta,
pseudocount = 1e-20
)
} else {
nextDecon <- decontXEM(
counts = counts,
counts_colsums = counts.colsums,
phi = phi,
estimate_eta = FALSE,
eta = eta,
theta = theta,
z = z,
estimate_delta = isTRUE(estimateDelta),
delta = delta,
pseudocount = 1e-20
)
}
theta <- nextDecon$theta
phi <- nextDecon$phi
eta <- nextDecon$eta
delta <- nextDecon$delta
max.divergence <- max(abs(theta.previous - theta))
if (max.divergence < convergence) {
converged <- TRUE
}
theta.previous <- theta
## Calculate likelihood and check for convergence
if (iter %% iterLogLik == 0 || converged) {
llTemp <- decontXLogLik(
counts = counts,
z = z,
phi = phi,
eta = eta,
theta = theta,
pseudocount = 1e-20
)
ll <- c(ll, llTemp)
.logMessages(date(),
"...... Completed iteration:",
iter,
"| converge:",
signif(max.divergence, 4),
logfile = logfile,
append = TRUE,
verbose = verbose
)
}
iter <- iter + 1L
}
}
resConp <- nextDecon$contamination
names(resConp) <- colnames(counts)
return(list(
"logLikelihood" = ll,
"contamination" = resConp,
"theta" = theta,
"delta" = delta,
"phi" = phi,
"eta" = eta,
"UMAP" = umap,
"iteration" = iter - 1L,
"z" = z
))
}
# This function calculates the log-likelihood
#
# counts Numeric/Integer matrix. Observed count matrix, rows represent features
# and columns represent cells
# z Integer vector. Cell population labels
# phi Numeric matrix. Rows represent features and columns represent cell
# populations
# eta Numeric matrix. Rows represent features and columns represent cell
# populations
# theta Numeric vector. Proportion of truely expressed transcripts
.deconCalcLL <- function(counts, z, phi, eta, theta) {
# ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] +
# 1e-20 ) ) # when dist_mat are K x G matrices
ll <- sum(Matrix::t(counts) * log(theta * t(phi)[z, ] +
(1 - theta) * t(eta)[z, ] + 1e-20))
return(ll)
}
# DEPRECATED. This is not used, but is kept as it might be useful in the future
# This function calculates the log-likelihood of background distribution
# decontamination
# bgDist Numeric matrix. Rows represent feature and columns are the times that
# the background-distribution has been replicated.
.bgCalcLL <- function(counts, globalZ, cbZ, phi, eta, theta) {
# ll <- sum(t(counts) * log(theta * t(cellDist) +
# (1 - theta) * t(bgDist) + 1e-20))
ll <- sum(t(counts) * log(theta * t(phi)[cbZ, ] +
(1 - theta) * t(eta)[globalZ, ] + 1e-20))
return(ll)
}
# This function updates decontamination
# phi Numeric matrix. Rows represent features and columns represent cell
# populations
# eta Numeric matrix. Rows represent features and columns represent cell
# populations
# theta Numeric vector. Proportion of truely expressed transctripts
#' @importFrom MCMCprecision fit_dirichlet
.cDCalcEMDecontamination <- function(counts,
phi,
eta,
theta,
z,
K,
delta) {
## Notes: use fix-point iteration to update prior for theta, no need
## to feed delta anymore
logPr <- log(t(phi)[z, ] + 1e-20) + log(theta + 1e-20)
logPc <- log(t(eta)[z, ] + 1e-20) + log(1 - theta + 1e-20)
Pr.e <- exp(logPr)
Pc.e <- exp(logPc)
Pr <- Pr.e / (Pr.e + Pc.e)
estRmat <- t(Pr) * counts
rnGByK <- .colSumByGroupNumeric(estRmat, z, K)
cnGByK <- rowSums(rnGByK) - rnGByK
counts.cs <- colSums(counts)
estRmat.cs <- colSums(estRmat)
estRmat.cs.n <- estRmat.cs / counts.cs
estCmat.cs.n <- 1 - estRmat.cs.n
temp <- cbind(estRmat.cs.n, estCmat.cs.n)
deltaV2 <- MCMCprecision::fit_dirichlet(temp)$alpha
## Update parameters
theta <-
(estRmat.cs + deltaV2[1]) / (counts.cs + sum(deltaV2))
phi <- normalizeCounts(rnGByK,
normalize = "proportion",
pseudocountNormalize = 1e-20
)
eta <- normalizeCounts(cnGByK,
normalize = "proportion",
pseudocountNormalize = 1e-20
)
return(list(
"estRmat" = estRmat,
"theta" = theta,
"phi" = phi,
"eta" = eta,
"delta" = deltaV2
))
}
# DEPRECATED. This is not used, but is kept as it might be useful in the
# feature.
# This function updates decontamination using background distribution
.cDCalcEMbgDecontamination <-
function(counts, globalZ, cbZ, trZ, phi, eta, theta) {
logPr <- log(t(phi)[cbZ, ] + 1e-20) + log(theta + 1e-20)
logPc <-
log(t(eta)[globalZ, ] + 1e-20) + log(1 - theta + 1e-20)
Pr <- exp(logPr) / (exp(logPr) + exp(logPc))
Pc <- 1 - Pr
deltaV2 <-
MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha
estRmat <- t(Pr) * counts
phiUnnormalized <-
.colSumByGroupNumeric(estRmat, cbZ, max(cbZ))
etaUnnormalized <-
rowSums(phiUnnormalized) - .colSumByGroupNumeric(
phiUnnormalized,
trZ, max(trZ)
)
## Update paramters
theta <-
(colSums(estRmat) + deltaV2[1]) / (colSums(counts) + sum(deltaV2))
phi <-
normalizeCounts(phiUnnormalized,
normalize = "proportion",
pseudocountNormalize = 1e-20
)
eta <-
normalizeCounts(etaUnnormalized,
normalize = "proportion",
pseudocountNormalize = 1e-20
)
return(list(
"estRmat" = estRmat,
"theta" = theta,
"phi" = phi,
"eta" = eta,
"delta" = deltaV2
))
}
## Make sure provided count matrix is the right type
.checkCountsDecon <- function(counts) {
if (sum(is.na(counts)) > 0) {
stop("Missing value in 'counts' matrix.")
}
if (is.null(dim(counts))) {
stop("At least 2 genes need to have non-zero expressions.")
}
}
## Make sure provided cell labels are the right type
#' @importFrom plyr mapvalues
.processCellLabels <- function(z, numCells) {
if (length(z) != numCells) {
stop(
"'z' must be of the same length as the number of cells in the",
" 'counts' matrix."
)
}
if (length(unique(z)) < 2) {
stop(
"No need to decontaminate when only one cluster",
" is in the dataset."
) # Even though
# everything runs smoothly when length(unique(z)) == 1, result is not
# trustful
}
if (!is.factor(z)) {
z <- plyr::mapvalues(z, unique(z), seq(length(unique(z))))
z <- as.factor(z)
}
return(z)
}
## Add two (veried-length) vectors of logLikelihood
addLogLikelihood <- function(llA, llB) {
lengthA <- length(llA)
lengthB <- length(llB)
if (lengthA >= lengthB) {
llB <- c(llB, rep(llB[lengthB], lengthA - lengthB))
ll <- llA + llB
} else {
llA <- c(llA, rep(llA[lengthA], lengthB - lengthA))
ll <- llA + llB
}
return(ll)
}
.decontxInitializeZ <- function(object,
varGenes = 2000,
dbscanEps = 1,
estimateCellTypes = TRUE,
seed = 12345) {
if (!is(object, "SingleCellExperiment")) {
sce <- SingleCellExperiment::SingleCellExperiment(
assays = list(counts = object)
)
}
sce <- scater::logNormCounts(sce, log = TRUE)
if (!is.null(seed)) {
with_seed(
seed,
resUmap <- scater::calculateUMAP(sce, ntop = varGenes,
n_threads = 1,
exprs_values = "logcounts")
)
} else {
resUmap <- scater::calculateUMAP(sce, ntop = varGenes,
n_threads = 1,
exprs_values = "logcounts")
}
z <- NULL
if (isTRUE(estimateCellTypes)) {
# Find clusters with dbSCAN
totalClusters <- 1
iter <- 1
while (totalClusters <= 1 & dbscanEps > 0 & iter < 10) {
resDbscan <- dbscan::dbscan(resUmap, dbscanEps)
dbscanEps <- dbscanEps - (0.25 * dbscanEps)
totalClusters <- length(unique(resDbscan$cluster))
iter <- iter + 1
}
# If dbscan was not able to get more than 2 clusters,
# use kmeans to force 2 clusters as a last resort
if (totalClusters == 1) {
cl <- stats::kmeans(t(SingleCellExperiment::logcounts(sce)), 2)
z <- cl$cluster
} else {
z <- resDbscan$cluster
}
}
return(list(
"z" = z,
"umap" = resUmap
))
}
## Initialization of cell labels for DecontX when they are not given
.decontxInitializeZ_prevous <-
function(object, # object is either a sce object or a count matrix
varGenes = 5000,
dbscanEps = 1.0,
verbose = TRUE,
seed = 12345,
logfile = NULL) {
if (!is(object, "SingleCellExperiment")) {
sce <- SingleCellExperiment::SingleCellExperiment(
assays =
list(counts = object)
)
}
sce <- sce[Matrix::rowSums(SingleCellExperiment::counts(sce)) > 0, ]
sce <- scater::logNormCounts(sce, log = TRUE)
# sce <- scater::normalize(sce)
if (nrow(sce) <= varGenes) {
topVariableGenes <- seq_len(nrow(sce))
} else if (nrow(sce) > varGenes) {
sce.var <- scran::modelGeneVar(sce)
topVariableGenes <- order(sce.var$bio,
decreasing = TRUE
)[seq(varGenes)]
}
countsFiltered <- as.matrix(SingleCellExperiment::counts(
sce[topVariableGenes, ]
))
storage.mode(countsFiltered) <- "integer"
.logMessages(
date(),
"...... Collapsing features into",
L,
"modules",
logfile = logfile,
append = TRUE,
verbose = verbose
)
## Celda clustering using recursive module splitting
L <- min(L, nrow(countsFiltered))
if (is.null(seed)) {
initialModuleSplit <- recursiveSplitModule(countsFiltered,
initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE
)
} else {
with_seed(seed,
initialModuleSplit <- recursiveSplitModule(countsFiltered,
initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE
))
}
initialModel <- subsetCeldaList(initialModuleSplit, list(L = L))
.logMessages(
date(),
"...... Reducing dimensionality with UMAP",
logfile = logfile,
append = TRUE,
verbose = verbose
)
## Louvan graph-based method to reduce dimension into 2 cluster
nNeighbors <- min(15, ncol(countsFiltered))
# resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors = nNeighbors,
# min_dist = 0.01, spread = 1)
# rm(fm)
resUmap <- celdaUmap(countsFiltered, initialModel,
minDist = 0.01, spread = 1, nNeighbors = nNeighbors, seed = seed
)
.logMessages(
date(),
" ...... Determining cell clusters with DBSCAN (Eps=",
dbscanEps,
")",
sep = "",
logfile = logfile,
append = TRUE,
verbose = verbose
)
# Use dbSCAN on the UMAP to identify broad cell types
totalClusters <- 1
while (totalClusters <= 1 & dbscanEps > 0) {
resDbscan <- dbscan::dbscan(resUmap, dbscanEps)
dbscanEps <- dbscanEps - (0.25 * dbscanEps)
totalClusters <- length(unique(resDbscan$cluster))
}
return(list(
"z" = resDbscan$cluster,
"umap" = resUmap
))
}
## process varGenes
.processvarGenes <- function(varGenes) {
if (is.null(varGenes)) {
varGenes <- 5000
} else {
if (varGenes < 2 | length(varGenes) > 1) {
stop("Parameter 'varGenes' must be an integer larger than 1.")
}
}
return(varGenes)
}
## process dbscanEps for resolusion threshold using DBSCAN
.processdbscanEps <- function(dbscanEps) {
if (is.null(dbscanEps)) {
dbscanEps <- 1
} else {
if (dbscanEps < 0) {
stop("Parameter 'dbscanEps' needs to be non-negative.")
}
}
return(dbscanEps)
}
.checkDelta <- function(delta) {
if (!is.numeric(delta) | length(delta) != 2 | any(delta < 0)) {
stop("'delta' needs to be a numeric vector of length 2",
" containing positive values.")
}
return(delta)
}
#########################
# Simulating Data #
#########################
#' @title Simulate contaminated count matrix
#' @description This function generates a list containing two count matrices --
#' one for real expression, the other one for contamination, as well as other
#' parameters used in the simulation which can be useful for running
#' decontamination.
#' @param C Integer. Number of cells to be simulated. Default \code{300}.
#' @param G Integer. Number of genes to be simulated. Default \code{100}.
#' @param K Integer. Number of cell populations to be simulated.
#' Default \code{3}.
#' @param NRange Integer vector. A vector of length 2 that specifies the lower
#' and upper bounds of the number of counts generated for each cell. Default
#' \code{c(500, 1000)}.
#' @param beta Numeric. Concentration parameter for Phi. Default \code{0.1}.
#' @param delta Numeric or Numeric vector. Concentration parameter for Theta.
#' If input as a single numeric value, symmetric values for beta
#' distribution are specified; if input as a vector of lenght 2, the two
#' values will be the shape1 and shape2 paramters of the beta distribution
#' respectively. Default \code{c(1, 5)}.
#' @param numMarkers Integer. Number of markers for each cell population.
#' Default \code{3}.
#' @param seed Integer. Passed to \code{\link[withr]{with_seed}}.
#' For reproducibility, a default value of 12345 is used. If NULL, no calls to
#' \code{\link[withr]{with_seed}} are made.
#' @return A list containing the \code{nativeMatirx} (real expression),
#' \code{observedMatrix} (real expression + contamination), as well as other
#' parameters used in the simulation.
#' @author Shiyi Yang, Yuan Yin, Joshua Campbell
#' @examples
#' contaminationSim <- simulateContamination(K = 3, delta = c(1, 10))
#' @export
simulateContamination <- function(C = 300,
G = 100,
K = 3,
NRange = c(500, 1000),
beta = 0.1,
delta = c(1, 10),
numMarkers = 3,
seed = 12345) {
if (is.null(seed)) {
res <- .simulateContaminatedMatrix(
C = C,
G = G,
K = K,
NRange = NRange,
beta = beta,
delta = delta,
numMarkers = numMarkers
)
} else {
with_seed(
seed,
res <- .simulateContaminatedMatrix(
C = C,
G = G,
K = K,
NRange = NRange,
beta = beta,
delta = delta,
numMarkers = numMarkers
)
)
}
return(res)
}
.simulateContaminatedMatrix <- function(C = 300,
G = 100,
K = 3,
NRange = c(500, 1000),
beta = 0.5,
delta = c(1, 2),
numMarkers = 3) {
if (length(delta) == 1) {
cpByC <- stats::rbeta(
n = C,
shape1 = delta,
shape2 = delta
)
} else {
cpByC <- stats::rbeta(
n = C,
shape1 = delta[1],
shape2 = delta[2]
)
}
z <- sample(seq(K), size = C, replace = TRUE)
if (length(unique(z)) < K) {
warning(
"Only ",
length(unique(z)),
" clusters are simulated. Try to increase numebr of cells 'C' if",
" more clusters are needed"
)
K <- length(unique(z))
z <- plyr::mapvalues(z, unique(z), seq(length(unique(z))))
}
NbyC <- sample(seq(min(NRange), max(NRange)),
size = C,
replace = TRUE
)
cNbyC <- vapply(seq(C), function(i) {
stats::rbinom(
n = 1,
size = NbyC[i],
p = cpByC[i]
)
}, integer(1))
rNbyC <- NbyC - cNbyC
phi <- .rdirichlet(K, rep(beta, G))
## Select random genes to be markers in each cell population
## by setting their values to zero.
if (K * numMarkers > G) {
stop("The number of markers ('numMarkers') times the number of cell",
" populations ('K') cannot be greater than the number of",
" genes ('G').")
}
markerKIndex <- rep(seq(K), each = numMarkers)
markerRowIndex <- sample(seq(G), numMarkers * K)
for (i in seq(K)) {
ix <- markerRowIndex[markerKIndex == i]
phi[i, ix] <- max(phi[i, ])
for (j in setdiff(seq(K), i)) {
phi[j, ix] <- 0
}
}
phi <- prop.table(phi, margin = 1)
## sample real expressed count matrix
cellRmat <- vapply(seq(C), function(i) {
stats::rmultinom(1, size = rNbyC[i], prob = phi[z[i], ])
}, integer(G))
rownames(cellRmat) <- paste0("Gene_", seq(G))
colnames(cellRmat) <- paste0("Cell_", seq(C))
## Get list of marker names
markerNames <- list()
for (i in seq(K)) {
markerNames[[i]] <- rownames(cellRmat)[markerRowIndex[markerKIndex == i]]
}
names(markerNames) <- paste0("CellType_", seq(K), "_Markers")
## sample contamination count matrix
nGByK <-
rowSums(cellRmat) - .colSumByGroup(cellRmat, group = z, K = K)
eta <- normalizeCounts(counts = nGByK, normalize = "proportion")
cellCmat <- vapply(seq(C), function(i) {
stats::rmultinom(1, size = cNbyC[i], prob = eta[, z[i]])
}, integer(G))
cellOmat <- cellRmat + cellCmat
contamination <- colSums(cellCmat) / colSums(cellOmat)
rownames(cellOmat) <- paste0("Gene_", seq(G))
colnames(cellOmat) <- paste0("Cell_", seq(C))
return(
list(
"nativeCounts" = cellRmat,
"observedCounts" = cellOmat,
"NByC" = NbyC,
"z" = z,
"eta" = eta,
"phi" = t(phi),
"markers" = markerNames,
"numMarkers" = numMarkers,
"contamination" = contamination
)
)
}
.checkBackground <- function(x, background, bgBatch,
logfile = NULL, verbose = FALSE) {
# Remove background barcodes that have already appeared in x
# If bgBatch param is supplied, also remove duplicate bgBatch
if (!is.null(colnames(background))) {
dupBarcode <- colnames(background) %in% colnames(x)
} else {
dupBarcode <- FALSE
warning("No column names were found for the 'background' matrix. ",
"No checking was performed between the ids in the 'backgroud' ",
"matrix and 'x'.",
" Please ensure that no true cells are included in the background ",
"matrix. Otherwise, results will be incorrect.")
}
if (any(dupBarcode)) {
.logMessages(
date(),
".. ",
sum(dupBarcode),
" cells in the background matrix were removed as they were found in",
" the filtered matrix.",
logfile = logfile,
append = TRUE,
verbose = verbose
)
background <- background[, !(dupBarcode), drop = FALSE]
if (!is.null(bgBatch)) {
if (length(bgBatch) != length(dupBarcode)) {
stop(
"Length of bgBatch must be equal to the number of columns",
"of background matrix."
)
}
bgBatch <- bgBatch[!(dupBarcode)]
}
}
re <- list(background = background,
bgBatch = bgBatch)
return(re)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.