R/runBatchCorrection.R

Defines functions runZINBWaVE runSCMerge runSCANORAMA runMNNCorrect runLimmaBC runFastMNN runComBat runBBKNN

Documented in runBBKNN runComBat runFastMNN runLimmaBC runMNNCorrect runSCANORAMA runSCMerge runZINBWaVE

#' Apply BBKNN batch effect correction method to SingleCellExperiment object
#'
#' BBKNN, an extremely fast graph-based data integration algorithm. It modifies
#' the neighbourhood construction step to produce a graph that is balanced
#' across all batches of the data.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object. Required.
#' @param useAssay A single character indicating the name of the assay requiring
#' batch correction. Default \code{"logcounts"}.
#' @param batch A single character indicating a field in
#' \code{\link{colData}} that annotates the batches.
#' Default \code{"batch"}.
#' @param reducedDimName A single character. The name for the corrected
#' low-dimensional representation. Will be saved to \code{reducedDim(inSCE)}.
#' Default \code{"BBKNN"}.
#' @param nComponents An integer. Number of principle components or the
#' dimensionality, adopted in the pre-PCA-computation step, the BBKNN step (for
#' how many PCs the algorithm takes into account), and the final UMAP
#' combination step where the value represent the dimensionality of the updated
#' reducedDim. Default \code{50L}.
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{reducedDim(inSCE, reducedDimName)} updated.
#' @export
#' @references Krzysztof Polanski et al., 2020
#' @examples
#' \dontrun{
#' data('sceBatches', package = 'singleCellTK')
#' sceCorr <- runBBKNN(sceBatches)
#' }
runBBKNN <-function(inSCE, useAssay = 'logcounts', batch = 'batch',
                    reducedDimName = 'BBKNN', nComponents = 50L){
  ## Input check
  if(!inherits(inSCE, "SingleCellExperiment")){
    stop("\"inSCE\" should be a SingleCellExperiment Object.")
  }
  if(!reticulate::py_module_available(module = "bbknn")){
    warning("Cannot find python module 'bbknn', please install Conda and",
            " run sctkPythonInstallConda() or run ",
            "sctkPythonInstallVirtualEnv(). If one of these have been ",
            "previously run to install the modules, make sure to run ",
            "selectSCTKConda() or selectSCTKVirtualEnvironment(),",
            " respectively, if R has been restarted since the module ",
            "installation. Alternatively, bbknn can be installed on the local ",
            "machine with pip (e.g. pip install bbknn) and then the ",
            "'use_python()' function from the 'reticulate' package can be used",
            " to select the correct Python environment.")
    return(inSCE)
  }
  if(!batch %in% names(SummarizedExperiment::colData(inSCE))){
    stop(paste("\"batch\" name:", batch, "not found"))
  }
  reducedDimName <- gsub(' ', '_', reducedDimName)
  nComponents <- as.integer(nComponents)
  ## Run algorithm
  adata <- .sce2adata(inSCE, useAssay = useAssay)
  sc$tl$pca(adata, n_comps = nComponents)
  bbknn$bbknn(adata, batch_key = batch, n_pcs = nComponents)
  sc$tl$umap(adata, n_components = nComponents)
  bbknnUmap <- adata$obsm[["X_umap"]]
  SingleCellExperiment::reducedDim(inSCE, reducedDimName) <- bbknnUmap
  return(inSCE)
}

#' Apply ComBat batch effect correction method to SingleCellExperiment object
#'
#' The ComBat batch adjustment approach assumes that batch effects represent
#' non-biological but systematic shifts in the mean or variability of genomic
#' features for all samples within a processing batch. It uses either parametric
#' or non-parametric empirical Bayes frameworks for adjusting data for batch
#' effects.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object. Required.
#' @param useAssay A single character indicating the name of the assay requiring
#' batch correction. Default \code{"logcounts"}.
#' @param batch A single character indicating a field in
#' \code{\link{colData}} that annotates the batches.
#' Default \code{"batch"}.
#' @param par.prior A logical scalar. TRUE indicates parametric adjustments
#' will be used, FALSE indicates non-parametric adjustments will be used.
#' Default \code{TRUE}.
#' @param covariates List of other column names in colData to be added to the
#' ComBat model as covariates. Default \code{NULL}.
#' @param mean.only If TRUE ComBat only corrects the mean of the batch effect.
#' Default \code{FALSE}.
#' @param ref.batch If given, will use the selected batch as a reference for
#' batch adjustment. Default \code{NULL}.
#' @param assayName A single characeter. The name for the corrected assay. Will
#' be saved to \code{\link{assay}}. Default
#' \code{"ComBat"}.
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{assay(inSCE, assayName)} updated.
#' @examples
#' \dontrun{
#' data('sceBatches', package = 'singleCellTK')
#' # parametric adjustment
#' sceCorr <- runComBat(sceBatches)
#' # non-parametric adjustment, mean-only version
#' sceCorr <- runComBat(sceBatches, par.prior=FALSE, mean.only=TRUE)
#' # reference-batch version, with covariates
#' sceCorr <- runComBat(sceBatches, covariates = "cell_type", ref.batch = 'w')
#' }
#' @export
runComBat <- function(inSCE, useAssay = "logcounts", batch = 'batch',
                      par.prior = TRUE, covariates = NULL,
                      mean.only = FALSE, ref.batch = NULL,
                      assayName = "ComBat") {
  if(!inherits(inSCE, "SingleCellExperiment")){
    stop("\"inSCE\" should be a SingleCellExperiment Object.")
  }
  if(!useAssay %in% SummarizedExperiment::assayNames(inSCE)) {
    stop(paste("\"useAssay\" (assay) name: ", useAssay, " not found."))
  }
  if(any(!c(batch, covariates) %in%
         names(SummarizedExperiment::colData(inSCE)))){
    anns <- c(batch, covariates)
    notFound <- which(!anns %in% names(SummarizedExperiment::colData(inSCE)))
    notFound <- anns[notFound]
    stop("\"annotation\" name:", paste(notFound, collapse = ', '), "not found")
  }
  #prepare model matrix
  mod <- NULL
  if (!is.null(covariates)){
    mod <- stats::model.matrix(
      stats::as.formula(paste0("~", paste0(covariates, collapse = "+"))),
      data = data.frame(SummarizedExperiment::colData(inSCE)))
  }

  resassay <-
    sva::ComBat(dat = SummarizedExperiment::assay(inSCE, useAssay),
                batch = SummarizedExperiment::colData(inSCE)[[batch]],
                mod = mod, par.prior = par.prior,
                mean.only = mean.only, ref.batch = ref.batch)

  SummarizedExperiment::assay(inSCE, assayName) <- resassay
  return(inSCE)
}

#' Apply a fast version of the mutual nearest neighbors (MNN) batch effect
#' correction method to SingleCellExperiment object
#'
#' fastMNN is a variant of the classic MNN method, modified for speed and more
#' robust performance. For introduction of MNN, see \code{\link{runMNNCorrect}}.
#' @param inSCE  inherited object. Required.
#' @param useAssay A single character indicating the name of the assay requiring
#' batch correction. Default \code{"logcounts"}. Alternatively, see
#' \code{pcInput} parameter.
#' @param batch A single character indicating a field in
#' \code{\link{colData}} that annotates the batches.
#' Default \code{"batch"}.
#' @param reducedDimName A single character. The name for the corrected
#' low-dimensional representation. Will be saved to \code{reducedDim(inSCE)}.
#' Default \code{"fastMNN"}.
#' @param pcInput A logical scalar. Whether to use a low-dimension matrix for
#' batch effect correction. If \code{TRUE}, \code{useAssay} will be searched
#' from \code{reducedDimNames(inSCE)}. Default \code{FALSE}.
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{reducedDim(inSCE, reducedDimName)} updated.
#' @export
#' @references Lun ATL, et al., 2016
#' @examples
#' \dontrun{
#' data('sceBatches', package = 'singleCellTK')
#' sceCorr <- runFastMNN(sceBatches, useAssay = 'logcounts', pcInput = FALSE)
#' }
runFastMNN <- function(inSCE, useAssay = "logcounts",
                       reducedDimName = "fastMNN", batch = 'batch',
                       pcInput = FALSE){
  ## Input check
  if(!inherits(inSCE, "SingleCellExperiment")){
    stop("\"inSCE\" should be a SingleCellExperiment Object.")
  }
  if(isTRUE(pcInput)){
    if(!(useAssay %in% SingleCellExperiment::reducedDimNames(inSCE))) {
      stop(paste("\"useAssay\" (reducedDim) name: ", useAssay, " not found."))
    }
  } else {
    if(!(useAssay %in% SummarizedExperiment::assayNames(inSCE))) {
      stop(paste("\"useAssay\" (assay) name: ", useAssay, " not found."))
    }
  }

  if(!(batch %in% names(SummarizedExperiment::colData(inSCE)))){
    stop(paste("\"batch name:", batch, "not found."))
  }
  reducedDimName <- gsub(' ', '_', reducedDimName)

  ## Run algorithm
  batches <- list()
  batchCol <- SummarizedExperiment::colData(inSCE)[[batch]]
  batchFactor <- as.factor(batchCol)

  if(pcInput){
    mat <- SingleCellExperiment::reducedDim(inSCE, useAssay)
    redMNN <- batchelor::reducedMNN(mat, batch = batchFactor)
    newRedDim <- redMNN$corrected
  } else {
    mat <- SummarizedExperiment::assay(inSCE, useAssay)
    mnnSCE <- batchelor::fastMNN(mat, batch = batchFactor)
    newRedDim <- SingleCellExperiment::reducedDim(mnnSCE, 'corrected')
  }
  SingleCellExperiment::reducedDim(inSCE, reducedDimName) <- newRedDim
  return(inSCE)
}

# #' Apply Harmony batch effect correction method to SingleCellExperiment object
# #'
# #' Harmony is an algorithm that projects cells into a shared embedding in which
# #' cells group by cell type rather than dataset-specific conditions.
# #' @param inSCE \linkS4class{SingleCellExperiment} inherited object. Required.
# #' @param useAssay A single character indicating the name of the assay requiring
# #' batch correction. Default \code{"logcounts"}.
# #' @param batch A single character indicating a field in
# #' \code{\link{colData}} that annotates the batches.
# #' Default \code{"batch"}.
# #' @param reducedDimName A single character. The name for the corrected
# #' low-dimensional representation. Will be saved to \code{reducedDim(inSCE)}.
# #' Default \code{"HARMONY"}.
# #' @param pcInput A logical scalar. Whether to use a low-dimension matrix for
# #' batch effect correction. If \code{TRUE}, \code{useAssay} will be searched
# #' from \code{reducedDimNames(inSCE)}. Default \code{FALSE}.
# #' @param nComponents An integer. The number of principle components or
# #' dimensionality to generate in the resulting matrix. If \code{pcInput} is set
# #' to \code{TRUE}, the output dimension will follow the low-dimension matrix,
# #' so this argument will be ignored. Default \code{50L}.
# #' @param nIter An integer. The max number of iterations to perform. Default
# #' \code{10L}.
# #' @param theta A Numeric scalar. Diversity clustering penalty parameter,
# #' Larger value results in more diverse clusters. Default \code{5}
# #' @return The input \linkS4class{SingleCellExperiment} object with
# #' \code{reducedDim(inSCE, reducedDimName)} updated.
# #' @export
# #' @references Ilya Korsunsky, et al., 2019
# #' @examples
# #' data('sceBatches', package = 'singleCellTK')
# #' sceCorr <- runHarmony(sceBatches, nComponents = 10L)
# runHarmony <- function(inSCE, useAssay = "logcounts", pcInput = FALSE,
#                        batch = "batch", reducedDimName = "HARMONY",
#                        nComponents = 50L, theta = 5, nIter = 10L){
#   if (!requireNamespace("harmony", quietly = TRUE)) {
#     stop("The Harmony package is required to run this function. ",
#          "Install harmony with: ",
#          "devtools::install_github('joshua-d-campbell/harmony') ",
#          call. = FALSE)
#   }
#   ## Input check
#   if(!inherits(inSCE, "SingleCellExperiment")){
#     stop("\"inSCE\" should be a SingleCellExperiment Object.")
#   }
#   if(pcInput){
#     if(!useAssay %in% SingleCellExperiment::reducedDimNames(inSCE)) {
#       stop(paste("\"useAssay\" (reducedDim) name: ", useAssay, " not found."))
#     }
#   } else {
#     if(!useAssay %in% SummarizedExperiment::assayNames(inSCE)) {
#       stop(paste("\"useAssay\" (assay) name: ", useAssay, " not found."))
#     }
#   }
#   if(!batch %in% names(SummarizedExperiment::colData(inSCE))){
#     stop(paste("\"batch\" name:", batch, "not found"))
#   }
#   reducedDimName <- gsub(' ', '_', reducedDimName)
#   nComponents <- as.integer(nComponents)
#   nIter <- as.integer(nIter)
#   ## Run algorithm
#   batchCol <- SummarizedExperiment::colData(inSCE)[[batch]]
#   if(pcInput){
#     mat <- SingleCellExperiment::reducedDim(inSCE, useAssay)
#   } else{
#     sceTmp <- scater::runPCA(inSCE, exprs_values = useAssay,
#                              ncomponents = nComponents)
#     mat <- SingleCellExperiment::reducedDim(sceTmp, 'PCA')
#   }
#   h <- harmony::HarmonyMatrix(mat, batchCol, do_pca = FALSE,
#                               theta = theta, max.iter.harmony = nIter)
#   SingleCellExperiment::reducedDim(inSCE, reducedDimName) <- h
#   return(inSCE)
# }

# #' Apply LIGER batch effect correction method to SingleCellExperiment object
# #'
# #' LIGER relies on integrative non-negative matrix factorization to identify
# #' shared and dataset-specific factors.
# #' @param inSCE \linkS4class{SingleCellExperiment} inherited object. Required.
# #' @param useAssay A single character indicating the name of the assay requiring
# #' batch correction. Default \code{"logcounts"}.
# #' @param batch A single character indicating a field in
# #' \code{\link{colData}} that annotates the batches.
# #' Default \code{"batch"}.
# #' @param reducedDimName A single character. The name for the corrected
# #' low-dimensional representation. Will be saved to \code{reducedDim(inSCE)}.
# #' Default \code{"LIGER"}.
# #' @param nComponents An integer. The number of principle components or
# #' dimensionality to generate in the resulting matrix. Default \code{20L}.
# #' @param lambda A numeric scalar. Algorithmic parameter, the penalty
# #' parameter which limits the dataset-specific component of the factorization.
# #' Default \code{5.0}.
# #' @param resolution A numeric scalar. Algorithmic paramter, the clustering
# #' resolution, increasing this increases the number of communities detected.
# #' Default \code{1.0}
# #' @return The input \linkS4class{SingleCellExperiment} object with
# #' \code{reducedDim(inSCE, reducedDimName)} updated.
# #' @export
# #' @references Joshua Welch, et al., 2018
# #' @examples
# #' \dontrun{
# #' data('sceBatches', package = 'singleCellTK')
# #' sceCorr <- runLIGER(sceBatches)
# #' }
# runLIGER <- function(inSCE, useAssay = 'logcounts', batch = 'batch',
#                      reducedDimName = 'LIGER', nComponents = 20L, lambda = 5.0,
#                      resolution = 1.0){
#   if (!requireNamespace("liger", quietly = TRUE)) {
#     stop("The Liger package is required to run this function. ",
#          "Install liger with: ",
#          "devtools::install_github('joshua-d-campbell/liger') \n",
#          "NOTICE that the one on cran/BiocManager is not what we need.",
#          call. = FALSE)
#   }
#   if (!exists('createLiger', where=asNamespace('liger'), mode='function')) {
#     stop("You are using the wrong source of Liger, please reinstall with: ",
#          "devtools::install_github('joshua-d-campbell/liger') \n",
#          "NOTICE that the one on cran/BiocManager is not what we need.",
#          call. = FALSE)
#   }
#   ## Input check
#   if(!inherits(inSCE, "SingleCellExperiment")){
#     stop("\"inSCE\" should be a SingleCellExperiment Object.")
#   }
#   if(!useAssay %in% SummarizedExperiment::assayNames(inSCE)) {
#     stop(paste("\"useAssay\" (assay) name: ", useAssay, " not found"))
#   }
#   if(!batch %in% names(SummarizedExperiment::colData(inSCE))){
#     stop(paste("\"batch\" name:", batch, "not found"))
#   }
#   reducedDimName <- gsub(' ', '_', reducedDimName)

#   ## Run algorithm
#   batchCol <- SummarizedExperiment::colData(inSCE)[[batch]]
#   batches <- unique(batchCol)
#   batchMatrices <- list()
#   for(i in seq_along(batches)){
#     b <- batches[i]
#     batchMatrices[[b]] <-
#       SummarizedExperiment::assay(inSCE, useAssay)[,batchCol == b]
#   }
#   ligerex <- liger::createLiger(batchMatrices)
#   ligerex <- liger::normalize(ligerex)
#   ligerex <- liger::selectGenes(ligerex)
#   ligerex <- liger::scaleNotCenter(ligerex)
#   ligerex <- liger::optimizeALS(ligerex, k = nComponents, lambda = lambda,
#                                 resolution = resolution)
#   ligerex <- liger::quantile_norm(ligerex)
#   SingleCellExperiment::reducedDim(inSCE, reducedDimName) <- ligerex@H.norm
#   return(inSCE)
# }

#' Apply Limma's batch effect correction method to SingleCellExperiment object
#'
#' Limma's batch effect removal function fits a linear model to the data, then
#' removes the component due to the batch effects.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object. Required.
#' @param useAssay A single character indicating the name of the assay requiring
#' batch correction. Default \code{"logcounts"}.
#' @param batch A single character indicating a field in
#' \code{\link{colData}} that annotates the batches.
#' Default \code{"batch"}.
#' @param assayName A single characeter. The name for the corrected assay. Will
#' be saved to \code{\link{assay}}. Default
#' \code{"LIMMA"}.
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{assay(inSCE, assayName)} updated.
#' @export
#' @references Gordon K Smyth, et al., 2003
#' @examples
#' data('sceBatches', package = 'singleCellTK')
#' \dontrun{
#' sceCorr <- runLimmaBC(sceBatches)
#' }
runLimmaBC <- function(inSCE, useAssay = "logcounts", assayName = "LIMMA",
                       batch = "batch") {
  ## Input check
  if(!inherits(inSCE, "SingleCellExperiment")){
    stop("\"inSCE\" should be a SingleCellExperiment Object.")
  }
  if(!useAssay %in% SummarizedExperiment::assayNames(inSCE)) {
    stop(paste("\"useAssay\" (assay) name: ", useAssay, " not found."))
  }
  if(!batch %in% names(SummarizedExperiment::colData(inSCE))){
    stop(paste("\"batch\" name:", batch, "not found."))
  }

  assayName <- gsub(' ', '_', assayName)

  ## Run algorithm
  ## One more check for the batch names
  batchCol <- SummarizedExperiment::colData(inSCE)[[batch]]
  mat <- SummarizedExperiment::assay(inSCE, useAssay)
  newMat <- limma::removeBatchEffect(mat, batch = batchCol)
  SummarizedExperiment::assay(inSCE, assayName) <- newMat
  return(inSCE)
}

#' Apply the mutual nearest neighbors (MNN) batch effect correction method to
#' SingleCellExperiment object
#'
#' MNN is designed for batch correction of single-cell RNA-seq data where the
#' batches are partially confounded with biological conditions of interest. It
#' does so by identifying pairs of MNN in the high-dimensional log-expression
#' space. For each MNN pair, a pairwise correction vector is computed by
#' applying a Gaussian smoothing kernel with bandwidth `sigma`.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object. Required.
#' @param useAssay A single character indicating the name of the assay requiring
#' batch correction. Default \code{"logcounts"}.
#' @param batch A single character indicating a field in
#' \code{\link{colData}} that annotates the batches.
#' Default \code{"batch"}.
#' @param k An integer. Specifies the number of nearest neighbours to
#' consider when defining MNN pairs. This should be interpreted as the minimum
#' frequency of each cell type or state in each batch. Larger values will
#' improve the precision of the correction by increasing the number of MNN
#' pairs, at the cost of reducing accuracy by allowing MNN pairs to form between
#' cells of different type. Default \code{20L}.
#' @param sigma A Numeric scalar. Specifies how much information is
#' shared between MNN pairs when computing the batch effect. Larger values will
#' share more information, approaching a global correction for all cells in the
#' same batch. Smaller values allow the correction to vary across cell types,
#' which may be more accurate but comes at the cost of precision. Default
#' \code{0.1}.
#' @param assayName A single characeter. The name for the corrected assay. Will
#' be saved to \code{\link{assay}}. Default
#' \code{"MNN"}.
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{assay(inSCE, assayName)} updated.
#' @export
#' @references Lun ATL, et al., 2016 & 2018
#' @examples
#' data('sceBatches', package = 'singleCellTK')
#' \dontrun{
#' sceCorr <- runMNNCorrect(sceBatches)
#' }
runMNNCorrect <- function(inSCE, useAssay = 'logcounts', batch = 'batch',
                          assayName = 'MNN', k = 20L, sigma = 0.1){
  ## Input check
  if(!inherits(inSCE, "SingleCellExperiment")){
    stop("\"inSCE\" should be a SingleCellExperiment Object.")
  }
  if(!useAssay %in% SummarizedExperiment::assayNames(inSCE)) {
    stop(paste("\"useAssay\" (assay) name: ", useAssay, " not found."))
  }
  if(!batch %in% names(SummarizedExperiment::colData(inSCE))){
    stop(paste("\"batch name:", batch, "not found."))
  }
  assayName <- gsub(' ', '_', assayName)
  k <- as.integer(k)

  ## Run algorithm
  batchCol <- SummarizedExperiment::colData(inSCE)[[batch]]
  batchFactor <- as.factor(batchCol)
  mnnSCE <- batchelor::mnnCorrect(inSCE, assay.type = useAssay,
                                  batch = batchFactor, k = k, sigma = sigma)
  corrected <- SummarizedExperiment::assay(mnnSCE, 'corrected')
  SummarizedExperiment::assay(inSCE, assayName) <- corrected
  return(inSCE)
}

#' Apply the mutual nearest neighbors (MNN) batch effect correction method to
#' SingleCellExperiment object
#'
#' SCANORAMA is analogous to computer vision algorithms for panorama stitching
#' that identify images with overlapping content and merge these into a larger
#' panorama.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object. Required.
#' @param useAssay A single character indicating the name of the assay requiring
#' batch correction. Default \code{"logcounts"}.
#' @param batch A single character indicating a field in
#' \code{\link{colData}} that annotates the batches.
#' Default \code{"batch"}.
#' @param SIGMA A numeric scalar. Algorithmic parameter, correction smoothing
#' parameter on Gaussian kernel. Default \code{15}.
#' @param ALPHA A numeric scalar. Algorithmic parameter, alignment score
#' minimum cutoff. Default \code{0.1}.
#' @param KNN An integer. Algorithmic parameter, number of nearest neighbors to
#' use for matching. Default \code{20L}.
#' @param assayName A single characeter. The name for the corrected assay. Will
#' be saved to \code{\link{assay}}. Default
#' \code{"SCANORAMA"}.
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{assay(inSCE, assayName)} updated.
#' @export
#' @references Brian Hie et al, 2019
#' @examples
#' \dontrun{
#' data('sceBatches', package = 'singleCellTK')
#' sceCorr <- runSCANORAMA(sceBatches)
#' }
runSCANORAMA <- function(inSCE, useAssay = 'logcounts', batch = 'batch',
                         SIGMA = 15, ALPHA = 0.1, KNN = 20L,
                         assayName = 'SCANORAMA'){
  ## Input check
  if(!inherits(inSCE, "SingleCellExperiment")){
    stop("\"inSCE\" should be a SingleCellExperiment Object.")
  }
  if(!reticulate::py_module_available(module = "scanorama")){
    warning("Cannot find python module 'scanorama', please install Conda and",
            " run sctkPythonInstallConda() or run ",
            "sctkPythonInstallVirtualEnv(). If one of these have been ",
            "previously run to install the modules, make sure to run ",
            "selectSCTKConda() or selectSCTKVirtualEnvironment(), ",
            "respectively, if R has been restarted since the module ",
            "installation. Alternatively, scanorama can be installed on the ",
            "local machine with pip (e.g. pip install scanorama) and then the ",
            "'use_python()' function from the 'reticulate' package can be used",
            " to select the correct Python environment.")
    return(inSCE)
  }
  if(!useAssay %in% SummarizedExperiment::assayNames(inSCE)) {
    stop(paste("\"useAssay\" (assay) name: ", useAssay, " not found"))
  }
  if(!batch %in% names(SummarizedExperiment::colData(inSCE))){
    stop(paste("\"batch\" name:", batch, "not found"))
  }
  assayName <- gsub(' ', '_', assayName)
  adata <- .sce2adata(inSCE, useAssay)
  py <- reticulate::py
  py$adata <- adata
  py$batch <- batch
  py$sigma <- SIGMA
  py$alpha <- ALPHA
  py$KNN <- as.integer(KNN)
  reticulate::py_run_string('
import numpy as np
import scanorama
batches = list(set(adata.obs[batch]))
adatas = [adata[adata.obs[batch] == b,] for b in batches]
datasets_full = [a.X.toarray() for a in adatas]
genes_list = [a.var_names.to_list() for a in adatas]
corrected, genes = scanorama.correct(datasets_full, genes_list, sigma=sigma,
                                     alpha=alpha, knn=KNN)
corrected = [m.toarray() for m in corrected]
cellOrders = [adata.obs[batch] == b for b in batches]
integrated = np.zeros(adata.shape)
integrated[:,:] = np.NAN
for i in range(len(batches)):
    integrated[cellOrders[i],] = corrected[i]
geneidx = {gene: idx for idx, gene in enumerate(genes)}
orderIdx = []
for gene in adata.var_names:
    orderIdx.append(geneidx[gene])
integrated = integrated[:, orderIdx]
', convert = FALSE)
  mat <- t(py$integrated)
  rownames(mat) <- rownames(inSCE)
  colnames(mat) <- colnames(inSCE)
  SummarizedExperiment::assay(inSCE, assayName) <- mat
  return(inSCE)
}

#' Apply scMerge batch effect correction method to SingleCellExperiment object
#'
#' The scMerge method leverages factor analysis, stably expressed genes (SEGs)
#' and (pseudo-) replicates to remove unwanted variations and merge multiple
#' scRNA-Seq data.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object. Required.
#' @param useAssay A single character indicating the name of the assay requiring
#' batch correction. Default \code{"logcounts"}.
#' @param batch A single character indicating a field in
#' \code{\link{colData}} that annotates the batches.
#' Default \code{"batch"}.
#' @param kmeansK An integer vector. Indicating the kmeans' K-value for each
#' batch (i.e. how many subclusters in each batch should exist), in order to
#' construct pseudo-replicates. The length of code{kmeansK} needs to be the same
#' as the number of batches. Default \code{NULL}, and this value will be
#' auto-detected by default, depending on \code{cellType}.
#' @param cellType A single character. A string indicating a field in
#' \code{colData(inSCE)} that defines different cell types. Default
#' \code{'cell_type'}.
#' @param seg A vector of gene names or indices that specifies SEG (Stably
#' Expressed Genes) set as negative control. Pre-defined dataset with human and
#' mouse SEG lists is available to user by running \code{data('SEG')}. Default
#' \code{NULL}, and this value will be auto-detected by default with
#' \code{\link[scMerge]{scSEGIndex}}.
#' @param nCores An integer. The number of cores of processors to allocate for
#' the task. Default \code{1L}.
#' @param assayName A single characeter. The name for the corrected assay. Will
#' be saved to \code{\link{assay}}. Default
#' \code{"scMerge"}.
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{assay(inSCE, assayName)} updated.
#' @export
#' @references Hoa, et al., 2020
#' @examples
#' data('sceBatches', package = 'singleCellTK')
#' \dontrun{
#' sceCorr <- runSCMerge(sceBatches)
#' }
runSCMerge <- function(inSCE, useAssay = "logcounts", batch = 'batch',
                       assayName = "scMerge", seg = NULL, kmeansK = NULL,
                       cellType = 'cell_type',
                       nCores = 1L){
  ## Input check
  if(!inherits(inSCE, "SingleCellExperiment")){
    stop("\"inSCE\" should be a SingleCellExperiment Object.")
  }
  if(!useAssay %in% SummarizedExperiment::assayNames(inSCE)) {
    stop(paste("\"useAssay\" (assay) name: ", useAssay, " not found."))
  }
  if(!batch %in% names(SummarizedExperiment::colData(inSCE))){
    stop(paste("\"batch\" name:", batch, "not found"))
  }
  if(is.null(cellType) & is.null(kmeansK)){
    stop("\"cellType\" and \"kmeansK\" cannot be NULL at the same time")
  }
  if(!cellType %in% names(SummarizedExperiment::colData(inSCE))){
    # If NULL, scMerge still works
    stop(paste("\"cellType\" name:", cellType, "not found"))
  }

  nCores <- min(as.integer(nCores), parallel::detectCores())
  assayName <- gsub(' ', '_', assayName)

  ## Run algorithm

  batchCol <- SummarizedExperiment::colData(inSCE)[[batch]]
  uniqBatch <- unique(batchCol)

  # Infer parameters
  if(is.null(cellType)){
    cellTypeCol <- NULL
  } else {
    cellTypeCol <- SummarizedExperiment::colData(inSCE)[[cellType]]
  }
  ## kmeansK
  if(!is.null(cellType) && is.null(kmeansK)){
    # If kmeansK not given, detect by cell type.
    cellTypeCol <- SummarizedExperiment::colData(inSCE)[[cellType]]
    kmeansK <- c()
    for (i in seq_along(uniqBatch)){
      cellTypePerBatch <- cellTypeCol[batchCol == uniqBatch[i]]
      kmeansK <- c(kmeansK, length(unique(cellTypePerBatch)))
    }
    cat("Detected kmeansK:\n")
    print(t(data.frame(K = kmeansK, row.names = uniqBatch)))
  }
  ## SEG
  if(is.null(seg)){
    bpParam <- BiocParallel::MulticoreParam(workers = nCores)
    seg <- scMerge::scSEGIndex(SummarizedExperiment::assay(inSCE, useAssay),
                               cell_type = cellTypeCol,
                               BPPARAM = bpParam)
    ctl <- rownames(seg[order(seg$segIdx, decreasing = TRUE)[seq_len(1000)],])
  } else {
    ctl <- seg
  }

  # scMerge automatically search for the column called "batch"...
  colDataNames <- names(SummarizedExperiment::colData(inSCE))
  names(SummarizedExperiment::colData(inSCE))[colDataNames == batch] <- 'batch'
  bpParam <- BiocParallel::MulticoreParam(workers = nCores)
  inSCE <- scMerge::scMerge(sce_combine = inSCE, exprs = useAssay,
                            hvg_exprs = useAssay,
                            assay_name = assayName,
                            ctl = ctl, kmeansK = kmeansK,
                            #marker_list = topVarGenesPerBatch,
                            cell_type = cellTypeCol,
                            BPPARAM = bpParam)
  colDataNames <- names(SummarizedExperiment::colData(inSCE))
  names(SummarizedExperiment::colData(inSCE))[colDataNames == 'batch'] <- batch
  return(inSCE)
}

#' Apply ZINBWaVE Batch effect correction method to SingleCellExperiment object
#'
#' A general and flexible zero-inflated negative binomial model that can be
#' used to provide a low-dimensional representations of scRNAseq data. The
#' model accounts for zero inflation (dropouts), over-dispersion, and the count
#' nature of the data. The model also accounts for the difference in library
#' sizes and optionally for batch effects and/or other covariates.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object. Required.
#' @param useAssay A single character indicating the name of the assay requiring
#' batch correction. Note that ZINBWaVE works for counts (integer) input rather
#' than logcounts that other methods prefer. Default \code{"counts"}.
#' @param batch A single character indicating a field in
#' \code{\link{colData}} that annotates the batches.
#' Default \code{"batch"}.
#' @param nHVG An integer. Number of highly variable genes to use when fitting
#' the model. Default \code{1000L}.
#' @param nComponents An integer. The number of principle components or
#' dimensionality to generate in the resulting matrix. Default \code{50L}.
#' @param nIter An integer, The max number of iterations to perform. Default
#' \code{10L}.
#' @param epsilon An integer. Algorithmic parameter. Empirically, a high epsilon
#' is often required to obtained a good low-level representation. Default
#' \code{1000L}.
#' @param reducedDimName A single character. The name for the corrected
#' low-dimensional representation. Will be saved to \code{reducedDim(inSCE)}.
#' Default \code{"zinbwave"}.
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{reducedDim(inSCE, reducedDimName)} updated.
#' @export
#' @references Pollen, Alex A et al., 2014
#' @examples
#' data('sceBatches', package = 'singleCellTK')
#' \dontrun{
#'     sceCorr <- runZINBWaVE(sceBatches, nIter = 5)
#' }
runZINBWaVE <- function(inSCE, useAssay = 'counts', batch = 'batch',
                        nHVG = 1000L, nComponents = 50L, epsilon = 1000,
                        nIter = 10L, reducedDimName = 'zinbwave'){
  ## Input check
  if(!inherits(inSCE, "SingleCellExperiment")){
    stop("\"inSCE\" should be a SingleCellExperiment Object.")
  }
  if(!useAssay %in% SummarizedExperiment::assayNames(inSCE)) {
    stop(paste("\"useAssay\" (assay) name: ", useAssay, " not found"))
  }
  if(!batch %in% names(SummarizedExperiment::colData(inSCE))){
    stop(paste("\"batch name:", batch, "not found."))
  }
  reducedDimName <- gsub(' ', '_', reducedDimName)
  nHVG <- as.integer(nHVG)
  nComponents <- as.integer(nComponents)
  epsilon <- as.integer(epsilon)
  nIter <- as.integer(nIter)
  # Run algorithm
  ##ZINBWaVE tutorial style of HVG selection
  if(nHVG < nrow(inSCE)){
    logAssay <- log1p(SummarizedExperiment::assay(inSCE, useAssay))
    vars <- matrixStats::rowVars(logAssay)
    names(vars) <- rownames(inSCE)
    vars <- sort(vars, decreasing = TRUE)
    tmpSCE <- inSCE[names(vars)[seq_len(nHVG)],]
  } else {
    tmpSCE <- inSCE
  }
  epsilon <- min(nrow(tmpSCE), epsilon)
  tmpSCE <- zinbwave::zinbwave(tmpSCE, K = nComponents, epsilon = epsilon,
                               which_assay = useAssay,
                               X = paste('~', batch, sep = ''),
                               maxiter.optimize = nIter, verbose = TRUE)
  SingleCellExperiment::reducedDim(inSCE, reducedDimName) <-
    SingleCellExperiment::reducedDim(tmpSCE, 'zinbwave')
  return(inSCE)
}

Try the singleCellTK package in your browser

Any scripts or data that you put into this service are public.

singleCellTK documentation built on Nov. 8, 2020, 5:21 p.m.