#' @title BASiCS MCMC sampler
#'
#' @description MCMC sampler to perform Bayesian inference for single-cell
#' mRNA sequencing datasets using the model described in Vallejos et al (2015).
#'
#' @param Data A \code{\linkS4class{SingleCellExperiment}} object.
#' If \code{WithSpikes = TRUE}, this MUST be formatted to include
#' the spike-ins and/or batch information (see vignette).
#' @param N Total number of iterations for the MCMC sampler.
#' Use \code{N>=max(4,Thin)}, \code{N} being a multiple of \code{Thin}.
#' @param Thin Thining period for the MCMC sampler. Use \code{Thin>=2}.
#' @param Burn Burn-in period for the MCMC sampler. Use \code{Burn>=1},
#' \code{Burn<N}, \code{Burn} being a multiple of \code{Thin}.
#' @param Regression If \code{Regression = TRUE}, BASiCS exploits a joint prior
#' formulation for mean and over-dispersion parameters to estimate a measure of
#' residual over-dispersion is not confounded by mean expression. Recommended
#' setting is \code{Regression = TRUE}.
#' @param WithSpikes If \code{WithSpikes = TRUE}, BASiCS will use reads from
#' added spike-ins to estimate technical variability.
#' If \code{WithSpikess = FALSE},
#' BASiCS depends on replicated experiments (batches) to estimate
#' technical variability. In this case, please supply the BatchInfo vector
#' in \code{colData(Data)}. Default: \code{WithSpikes = TRUE}.
#' @param PriorParam List of prior parameters for BASiCS_MCMC.
#' Should be created using \code{\link{BASiCS_PriorParam}}.
#' @param FixNu Should the scaling normalisation factor \code{nu} be fixed
#' to the starting value when \code{WithSpikes=FALSE}?
#' These are set to scran scaling normalisation factors.
#' @param SubsetBy Character value specifying whether a divide and
#' conquer inference strategy should be used. When this is set to \code{"gene"},
#' inference is performed on batches of genes separately, and when it is set to
#' \code{"cell"}, inference is performed on batches of cells separately.
#' Posterior distributions are combined using posterior interval estimation
#' (see Li et al., 2016).
#' @param NSubsets If \code{SubsetBy="gene"} or
#' \code{SubsetBy="cell"}, \code{NSubsets} specifies the number of batches
#' to create and perform divide and conquer inference with.
#' @param CombineMethod The method used to combine
#' subposteriors if \code{SubsetBy} is set to \code{"gene"} or
#' \code{"cell"}. Options are \code{"pie"} corresponding to
#' posterior interval estimation (see Li et al., 2016) or
#' \code{"consensus"} (see Scott et al., 2016).
#' Both of these methods use a form of weighted average to
#' combine subposterior draws into the final posterior.
#' @param Weighting The weighting method used in the weighted
#' average chosen using \code{CombineMethod}. Available
#' options are \code{"naive"} (unweighted), \code{"n_weight"}
#' (weights are chosen based on the size of each partition)
#' and \code{"inverse_variance"} (subposteriors are weighted based
#' on the inverse of the variance of the subposterior for each
#' parameter).
#' @param Threads Integer specifying the number of threads to be used to
#' parallelise parameter updates. Default value is the globally set
#' \code{"Ncpus"} option, or 1 if this option is not set.
#' @param BPPARAM A \code{\link{BiocParallelParam}} instance,
#' used for divide and conquer inference.
#' @param ... Optional parameters.
#' \describe{
#' \item{
#' \code{AR}
#' }{
#' Optimal acceptance rate for adaptive Metropolis Hastings
#' updates. It must be a positive number between 0 and 1. Default
#' (and recommended): \code{AR = 0.44}.
#' }
#' \item{
#' \code{StopAdapt}
#' }{
#' Iteration at which adaptive proposals are not longer
#' adapted. Use \code{StopAdapt>=1}. Default: \code{StopAdapt = Burn}.
#' }
#' \item{
#' \code{StoreChains}
#' }{
#' If \code{StoreChains = TRUE}, the generated
#' \code{BASiCS_Chain} object is stored as a `.Rds` file (\code{RunName}
#' argument used to index the file name).
#' Default: \code{StoreChains = FALSE}.}
#' \item{
#' \code{StoreAdapt}
#' }{
#' If \code{StoreAdapt = TRUE}, trajectory of
#' adaptive proposal variances (in log-scale) for all parameters is
#' stored as a list in a `.Rds` file (\code{RunName} argument used to
#' index file name). Default: \code{StoreAdapt = FALSE}.
#' }
#' \item{
#' \code{StoreDir}
#' }{
#' Directory where output files are stored.
#' Only required if \code{StoreChains = TRUE} and/or
#' \code{StoreAdapt = TRUE}. Default: \code{StoreDir = getwd()}.
#' }
#' \item{
#' \code{RunName}
#' }{
#' String used to index `.Rds` files storing chains
#' and/or adaptive proposal variances.
#' }
#' \item{
#' \code{PrintProgress}
#' }{
#' If \code{PrintProgress = FALSE}, console-based
#' progress report is suppressed.}
#' \item{
#' \code{Start}
#' }{
#' Starting values for the MCMC sampler. We do not advise
#' to use this argument. Default options have been tuned to facilitate
#' convergence. If changed, it must be a list containing the following
#' elements:
#' \code{mu0}, \code{delta0}, \code{phi0}, \code{s0}, \code{nu0},
#' \code{theta0}, \code{ls.mu0}, \code{ls.delta0}, \code{ls.phi0},
#' \code{ls.nu0} and \code{ls.theta0}
#' }
#' \item{
#' \code{GeneExponent/CellExponent}
#' }{
#' Exponents applied to the prior for MCMC updates. Intended for use only
#' when performing divide & conquer MCMC strategies.
#' }
#' }
#' @return An object of class \code{\link[BASiCS]{BASiCS_Chain}}.
#'
#' @examples
#'
#' # Built-in simulated dataset
#' set.seed(1)
#' Data <- makeExampleBASiCS_Data()
#' # To analyse real data, please refer to the instructions in:
#' # https://github.com/catavallejos/BASiCS/wiki/2.-Input-preparation
#'
#' # Only a short run of the MCMC algorithm for illustration purposes
#' # Longer runs migth be required to reach convergence
#' Chain <- BASiCS_MCMC(Data, N = 50, Thin = 2, Burn = 10, Regression = FALSE,
#' PrintProgress = FALSE, WithSpikes = TRUE)
#'
#' # To run the regression version of BASiCS, use:
#' Chain <- BASiCS_MCMC(Data, N = 50, Thin = 2, Burn = 10, Regression = TRUE,
#' PrintProgress = FALSE, WithSpikes = TRUE)
#'
#' # To run the non-spike version BASiCS requires the data to contain at least
#' # 2 batches:
#' set.seed(2)
#' Data <- makeExampleBASiCS_Data(WithBatch = TRUE)
#' Chain <- BASiCS_MCMC(Data, N = 50, Thin = 2, Burn = 10, Regression = TRUE,
#' PrintProgress = FALSE, WithSpikes = FALSE)
#'
#' # For illustration purposes we load a built-in 'BASiCS_Chain' object
#' # (obtained using the 'BASiCS_MCMC' function)
#' data(ChainSC)
#'
#' # `displayChainBASiCS` can be used to extract information from this output.
#' # For example:
#' head(displayChainBASiCS(ChainSC, Param = 'mu'))
#'
#' # Traceplot (examples only)
#' plot(ChainSC, Param = 'mu', Gene = 1)
#' plot(ChainSC, Param = 'phi', Cell = 1)
#' plot(ChainSC, Param = 'theta', Batch = 1)
#'
#' # Calculating posterior medians and 95% HPD intervals
#' ChainSummary <- Summary(ChainSC)
#'
#' # `displaySummaryBASiCS` can be used to extract information from this output
#' # For example:
#' head(displaySummaryBASiCS(ChainSummary, Param = 'mu'))
#'
#' # Graphical display of posterior medians and 95% HPD intervals
#' # For example:
#' plot(ChainSummary, Param = 'mu', main = 'All genes')
#' plot(ChainSummary, Param = 'mu', Genes = 1:10, main = 'First 10 genes')
#' plot(ChainSummary, Param = 'phi', main = 'All cells')
#' plot(ChainSummary, Param = 'phi', Cells = 1:5, main = 'First 5 cells')
#' plot(ChainSummary, Param = 'theta')
#'
#' # To constrast posterior medians of cell-specific parameters
#' # For example:
#' par(mfrow = c(1,2))
#' plot(ChainSummary, Param = 'phi', Param2 = 's', SmoothPlot = FALSE)
#' # Recommended for large numbers of cells
#' plot(ChainSummary, Param = 'phi', Param2 = 's', SmoothPlot = TRUE)
#'
#' # To constrast posterior medians of gene-specific parameters
#' par(mfrow = c(1,2))
#' plot(ChainSummary, Param = 'mu', Param2 = 'delta', log = 'x',
#' SmoothPlot = FALSE)
#' # Recommended
#' plot(ChainSummary, Param = 'mu', Param2 = 'delta', log = 'x',
#' SmoothPlot = TRUE)
#'
#' # To obtain denoised rates / counts, see:
#' # help(BASiCS_DenoisedRates)
#' # and
#' # help(BASiCS_DenoisedCounts)
#'
#' # For examples of differential analyses between 2 populations of cells see:
#' # help(BASiCS_TestDE)
#'
#' @author Catalina A. Vallejos \email{cnvallej@@uc.cl}
#' @author Nils Eling \email{eling@@ebi.ac.uk}
#'
#' @references
#' Vallejos, Marioni and Richardson (2015). PLoS Computational Biology.
#'
#' Vallejos, Richardson and Marioni (2016). Genome Biology.
#'
#' Eling et al (2018). Cell Systems
#'
#' Simple, Scalable and Accurate Posterior Interval Estimation
#' Cheng Li and Sanvesh Srivastava and David B. Dunson
#' arXiv (2016)
#'
#' Bayes and Big Data: The Consensus Monte Carlo Algorithm
#' Steven L. Scott, Alexander W. Blocker, Fernando V. Bonassi,
#' Hugh A. Chipman, Edward I. George and Robert E. McCulloch
#' International Journal of Management Science and Engineering
#' Management (2016)
#' @export
BASiCS_MCMC <- function(
Data,
N,
Thin,
Burn,
Regression,
WithSpikes = TRUE,
PriorParam = BASiCS_PriorParam(Data, PriorMu = "EmpiricalBayes"),
FixNu = FALSE,
SubsetBy = c("none", "gene", "cell"),
NSubsets = 1,
CombineMethod = c("pie", "consensus"),
Weighting = c("naive", "n_weight", "inverse_variance"),
Threads = getOption("Ncpus", default = 1L),
BPPARAM = BiocParallel::bpparam(),
...) {
# Checks to ensure input arguments are valid
.BASiCS_MCMC_InputCheck(Data, N, Thin, Burn, Regression, WithSpikes, Threads,
NSubsets)
SubsetBy <- match.arg(SubsetBy)
if (SubsetBy != "none" && NSubsets != 1) {
if (SubsetBy == "cell") {
warning(
"Divide and conquer using cell-wise partitions is not recommended."
)
}
CombineMethod <- match.arg(CombineMethod)
Weighting <- match.arg(Weighting)
Chains <- BASiCS_DivideAndConquer(
Data,
N = N,
NSubsets = NSubsets,
Thin = Thin,
Burn = Burn,
Regression = Regression,
WithSpikes = WithSpikes,
SubsetBy = SubsetBy,
PriorParam = PriorParam,
BPPARAM = BPPARAM,
FixNu = FixNu,
...
)
Chain <- .combine_subposteriors(
Chains,
CombineMethod = CombineMethod,
SubsetBy = SubsetBy,
Weighting = Weighting,
GeneOrder = rownames(Data),
## can't reorder cells because of _Batch* renaming
# CellOrder = colnames(Data),
BPPARAM = BPPARAM
)
return(Chain)
}
# Some global values used throughout the MCMC algorithm and checks
# Numbers of cells/genes/batches + count table + design matrix for batch
GPar <- .BASiCS_MCMC_GlobalParams(Data)
# Total counts per cell and/or gene
sum.bycell.bio <- Matrix::rowSums(counts(Data))
sum.bygene.bio <- Matrix::colSums(counts(Data))
if (WithSpikes) {
sum.bycell.tech <- Matrix::rowSums(assay(altExp(Data)))
sum.bygene.tech <- Matrix::colSums(assay(altExp(Data)))
sum.bycell.all <- c(sum.bycell.bio, sum.bycell.tech)
sum.bygene.all <- sum.bygene.bio + sum.bygene.tech
} else {
sum.bycell.all <- sum.bycell.bio
sum.bygene.all <- sum.bygene.bio
}
# Assignment of optional parameters (default values if input not provided)
ArgsDef <- .BASiCS_MCMC_ExtraArgs(
Data = Data,
Burn = Burn,
GPar = GPar,
Regression = Regression,
WithSpikes = WithSpikes,
PriorParam = PriorParam,
...
)
# Starting values for the MCMC (parameters and adaptive variances)
# Loaded separately to simplify calling to the elements of its list
# Same for prior parameters
Start <- ArgsDef$Start
PriorParam <- ArgsDef$PriorParam
MCMCArgs <- list(
N = N,
Thin = Thin,
Burn = Burn,
Counts = GPar$BioCounts,
BatchDesign = GPar$BatchDesign,
mu0 = Start$mu0,
delta0 = Start$delta0,
s0 = Start$s0,
nu0 = Start$nu0,
theta0 = rep(Start$theta0, GPar$nBatch),
mu_mu = PriorParam$mu.mu,
s2mu = PriorParam$s2.mu,
as = PriorParam$a.s,
bs = PriorParam$b.s,
atheta = PriorParam$a.theta,
btheta = PriorParam$b.theta,
ar = ArgsDef$AR,
LSmu0 = Start$ls.mu0,
LSdelta0 = Start$ls.delta0,
LSnu0 = Start$ls.nu0,
LStheta0 = rep(Start$ls.theta0, GPar$nBatch),
sumByGeneAll = sum.bygene.all,
StoreAdapt = as.numeric(ArgsDef$StoreAdapt),
EndAdapt = ArgsDef$StopAdapt,
PrintProgress = as.numeric(ArgsDef$PrintProgress),
mintol_mu = ArgsDef$mintol_mu,
mintol_delta = ArgsDef$mintol_delta,
mintol_nu = ArgsDef$mintol_nu,
mintol_theta = ArgsDef$mintol_theta,
geneExponent = PriorParam$GeneExponent,
cellExponent = PriorParam$CellExponent,
threads = Threads
)
NonRegressionArgs <- list(
prior_delta = (PriorParam$PriorDelta == "log-normal") + 1,
s2delta = PriorParam$s2.delta,
adelta = PriorParam$a.delta,
bdelta = PriorParam$b.delta
)
RegressionArgs <- list(
k = PriorParam$k,
m0 = PriorParam$m,
V0 = PriorParam$V,
sigma2_a0 = PriorParam$a.sigma2,
sigma2_b0 = PriorParam$b.sigma2,
beta0 = Start$beta0,
sigma20 = Start$sigma20,
eta0 = PriorParam$eta,
lambda0 = Start$lambda0,
variance = PriorParam$variance,
FixLocations = PriorParam$FixLocations,
RBFMinMax = PriorParam$RBFMinMax,
RBFLocations = PriorParam$RBFLocations
)
NoSpikeArgs <- list(
sumByCellAll = sum.bycell.all,
SizeTimesConstrain = length(ArgsDef$ConstrainGene) * ArgsDef$Constrain,
Index = ArgsDef$Index,
RefGene = ArgsDef$RefGene,
RefGenes = ArgsDef$RefGenes,
ConstrainGene = ArgsDef$ConstrainGene,
NotConstrainGene = ArgsDef$NotConstrainGene,
fixNu = FixNu,
StochasticRef = as.numeric(PriorParam$StochasticRef)
)
# If spikes are available
if (WithSpikes) {
SpikeArgs <- list(
sumByCellBio = sum.bycell.bio,
sumByGeneBio = sum.bygene.bio,
phi0 = Start$phi0,
aphi = PriorParam$p.phi,
LSphi0 = Start$ls.phi0,
muSpikes = rowData(altExp(Data))[, 2]
)
MCMCArgs <- c(MCMCArgs, SpikeArgs)
# If regression case is chosen
if (Regression) {
message("Running with spikes BASiCS sampler (regression case) ... \n")
MCMCArgs <- c(MCMCArgs, RegressionArgs)
MCMCFun <- ".BASiCS_MCMCcppReg"
} else {
message("Running with spikes BASiCS sampler (no regression) ... \n")
MCMCArgs <- c(MCMCArgs, NonRegressionArgs)
MCMCFun <- ".BASiCS_MCMCcpp"
}
} else {
# If spikes are not available
MCMCArgs <- c(MCMCArgs, NoSpikeArgs)
if (Regression) {
message("Running no spikes BASiCS sampler (regression case) ... \n")
MCMCArgs <- c(MCMCArgs, RegressionArgs)
MCMCFun <- ".BASiCS_MCMCcppRegNoSpikes"
} else {
message("Running no spikes BASiCS sampler (no regression) ... \n")
MCMCArgs <- c(MCMCArgs, NonRegressionArgs)
MCMCFun <- ".BASiCS_MCMCcppNoSpikes"
}
}
Time <- system.time(Chain <- do.call(what = MCMCFun, args = MCMCArgs))
if (Regression) {
# Remove epsilons for genes that are not expressed in at least 2 cells
# Discuss this with John (potentially include an optional arg about this)
AtLeast2Cells <- Matrix::rowSums(GPar$BioCounts > 0) > 1
Chain$epsilon[, !AtLeast2Cells] <- NA
Chain$delta[, !AtLeast2Cells] <- NA
}
# Format column names of MCMC chains
colnames(Chain$mu) <- colnames(Chain$delta) <- rownames(GPar$BioCounts)
if (Regression) {
colnames(Chain$epsilon) <- colnames(Chain$mu)
Chain$lambda <- NULL # Remove to reduce storage
}
CellLabels <- paste0(colnames(counts(Data)), "_Batch", GPar$BatchInfo)
colnames(Chain$s) <- CellLabels
if (WithSpikes) {
colnames(Chain$phi) <- CellLabels
}
colnames(Chain$nu) <- CellLabels
colnames(Chain$theta) <- paste0("Batch", unique(GPar$BatchInfo))
message(
"-------------------------------------------------------------\n",
"MCMC running time \n",
"-------------------------------------------------------------\n",
"user: ", round(Time['user.self'], 3), "\n",
"system: ", round(Time['sys.self'], 3), "\n",
"elapsed: ", round(Time['elapsed'], 3), "\n"
)
message(
"-------------------------------------------------------------\n",
"Output \n",
"-------------------------------------------------------------\n"
)
# Convert output into a `BASiCS_Chain` object
ChainClass <- newBASiCS_Chain(parameters = Chain[!grepl("ls.", names(Chain))])
# Store chain and/or adaptive variances
.BASiCS_MCMC_OutputStore(
ChainClass = ChainClass,
Chain = Chain,
StoreChains = ArgsDef$StoreChains,
StoreAdapt = ArgsDef$StoreAdapt,
StoreDir = ArgsDef$StoreDir,
RunName = ArgsDef$RunName
)
# Store reference gene information (no spikes case only)
if (!WithSpikes) {
if (ArgsDef$StoreChains) {
.BASiCS_MCMC_RefFreqStore(
Data = Data,
Chain = Chain,
RefGene = ArgsDef$RefGene,
RefGenes = ArgsDef$RefGenes,
StoreDir = ArgsDef$StoreDir,
RunName = ArgsDef$RunName
)
}
} else {
message(
"-------------------------------------------------------------\n",
"BASiCS version ", packageVersion("BASiCS"), " : \n",
"vertical integration (spikes case) \n",
"-------------------------------------------------------------\n"
)
}
# Return `BASiCS_MCMC` object
return(ChainClass)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.