R/plotDEAnalysis.R

Defines functions plotMASTThresholdGenes plotDEGVolcano plotDEGHeatmap getDEGTopTable plotDEGRegression plotDEGViolin .checkDiffExpResultExists

Documented in getDEGTopTable plotDEGHeatmap plotDEGRegression plotDEGViolin plotDEGVolcano plotMASTThresholdGenes

#' Check if the specified MAST result in SingleCellExperiment object is
#' complete. But does not garantee the biological correctness.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object. a
#' differential expression analysis function has to be run in advance.
#' @param useResult character. A string specifying the \code{analysisName}
#' used when running a differential expression analysis function.
#' @param labelBy A single character for a column of \code{rowData(inSCE)} as
#' where to search for the labeling text. Default \code{NULL}.
#' @return Stop point if found
#' @noRd
.checkDiffExpResultExists <- function(inSCE, useResult, labelBy = NULL){
  if(!inherits(inSCE, 'SingleCellExperiment')){
    stop('Given object is not a valid SingleCellExperiment object.')
  }
  if(!'diffExp' %in% names(S4Vectors::metadata(inSCE))){
    stop('"diffExp" not in metadata, please run runMAST() first.')
  }
  if(!useResult %in% names(S4Vectors::metadata(inSCE)$diffExp)){
    p <- paste0('"', useResult, '"', ' not in metadata(inSCE)$diffExp. ')
    stop(p,
         'Please check.')
  }
  result <- S4Vectors::metadata(inSCE)$diffExp[[useResult]]
  if(!all(c('groupNames', 'select', 'result', 'useAssay') %in% names(result))){
    p <- paste0('"', useResult, '"', ' result is not complete. ')
    stop(p,
         'You might need to rerun it.')
  }
  if(!is.null(labelBy)){
    if(!labelBy %in% names(SummarizedExperiment::rowData(inSCE))){
      stop("labelBy: '", labelBy, "' not found.")
    }
  }
}

#' Generate violin plot to show the expression of top DEGs
#' @details Any of the differential expression analysis method from SCTK should
#' be performed prior to using this function
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object.
#' @param useResult character. A string specifying the \code{analysisName}
#' used when running a differential expression analysis function.
#' @param threshP logical. Whether to plot threshold values from adaptive
#' thresholding, instead of using the assay used by \code{runMAST()}. Default
#' \code{FALSE}.
#' @param labelBy A single character for a column of \code{rowData(inSCE)} as
#' where to search for the labeling text. Default \code{NULL}.
#' @param nrow Integer. Number of rows in the plot grid. Default \code{6}.
#' @param ncol Integer. Number of columns in the plot grid. Default \code{6}.
#' @param defaultTheme Logical scalar. Whether to use default SCTK theme in
#' ggplot. Default \code{TRUE}.
#' @param isLogged Logical scalar. Whether the assay used for the analysis is
#' logged. If not, will do a \code{log(assay + 1)} transformation. Default
#' \code{TRUE}.
#' @param check_sanity Logical scalar. Whether to perform MAST's sanity check
#' to see if the counts are logged. Default \code{TRUE}
#' @return A ggplot object of violin plot
#' @export
#' @examples
#' data("sceBatches")
#' logcounts(sceBatches) <- log1p(counts(sceBatches))
#' sce.w <- subsetSCECols(sceBatches, colData = "batch == 'w'")
#' sce.w <- runWilcox(sce.w, class = "cell_type",
#'                    classGroup1 = "alpha", classGroup2 = "beta",
#'                    groupName1 = "w.alpha", groupName2 = "w.beta",
#'                    analysisName = "w.aVSb")
#' plotDEGViolin(sce.w, "w.aVSb")
plotDEGViolin <- function(inSCE, useResult, threshP = FALSE, labelBy = NULL,
                          nrow = 6, ncol = 6, defaultTheme = TRUE,
                          isLogged = TRUE, check_sanity = TRUE){
  #TODO: DO we split the up/down regulation too?
  # Check
  .checkDiffExpResultExists(inSCE, useResult, labelBy)
  # Extract
  result <- S4Vectors::metadata(inSCE)$diffExp[[useResult]]
  deg <- result$result
  useAssay <- result$useAssay
  useReducedDim <- result$useReducedDim
  deg <- deg[order(deg$FDR),]
  geneToPlot <- deg[seq_len(min(nrow(deg), nrow*ncol)), "Gene"]
  groupName1 <- result$groupNames[1]
  ix1 <- result$select$ix1
  cells1 <- colnames(inSCE)[ix1]
  groupName2 <- result$groupNames[2]
  ix2 <- result$select$ix2
  cells2 <- colnames(inSCE)[ix2]
  if (!is.null(useAssay)) {
    if(!is.null(labelBy)){
      replGeneName <- SummarizedExperiment::rowData(inSCE[geneToPlot,])[[labelBy]]
    } else {
      replGeneName <- geneToPlot
    }
    expres <- expData(inSCE[geneToPlot, c(cells1, cells2)],
                      useAssay)
    useMat <- useAssay
  } else {
    if(!is.null(labelBy)){
      warning("Analysis performed on reducedDim. Cannot use rowData for ",
              "labelBy. Ignored.")
    }
    replGeneName <- geneToPlot
    expres <- t(expData(inSCE[, c(cells1, cells2)], useReducedDim))[geneToPlot,]
    useMat <- useReducedDim
  }

  if(!is.matrix(expres)){
    expres <- as.matrix(expres)
  }
  rownames(expres) <- replGeneName
  # Format
  cdat <- data.frame(wellKey = colnames(expres),
                     condition = factor(c(rep(groupName1, length(cells1)),
                                          rep(groupName2, length(cells2))),
                                        levels = result$groupNames),
                     ngeneson = rep("", (length(cells1) + length(cells2))),
                     stringsAsFactors = FALSE)
  if (!isTRUE(isLogged)) {
    expres <- log(expres + 1)
  }
  sca <- suppressMessages(MAST::FromMatrix(expres, cdat,
                                           check_sanity = check_sanity))
  if(threshP){
    #TODO: if nrow*ncol < `min_per_bin`` below, there would be an error.
    invisible(utils::capture.output(thres <-
                                      MAST::thresholdSCRNACountMatrix(expres, nbins = 20,
                                                                      min_per_bin = 30)))
    SummarizedExperiment::assay(sca) <- thres$counts_threshold
  }
  flatDat <- methods::as(sca, "data.table")
  flatDat$primerid <- factor(flatDat$primerid, levels = replGeneName)
  names(flatDat)[5] <- useMat
  # Plot
  violinplot <- ggplot2::ggplot(flatDat,
                                ggplot2::aes_string(x = 'condition',
                                                    y = useMat,
                                                    color = 'condition')) +
    ggplot2::geom_jitter() +
    ggplot2::facet_wrap(~primerid, scale = "free_y",
                        ncol = ncol) +
    ggplot2::geom_violin() +
    ggplot2::ggtitle(paste0("Violin Plot for ", useResult))
  if(isTRUE(defaultTheme)){
    violinplot <- .ggSCTKTheme(violinplot)
  }
  return(violinplot)
}

#' Create linear regression plot to show the expression the of top DEGs
#' @details Any of the differential expression analysis method from SCTK should
#' be performed prior to using this function
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object.
#' @param useResult character. A string specifying the \code{analysisName}
#' used when running a differential expression analysis function.
#' @param threshP logical. Whether to plot threshold values from adaptive
#' thresholding, instead of using the assay used by when performing DE analysis.
#' Default \code{FALSE}.
#' @param labelBy A single character for a column of \code{rowData(inSCE)} as
#' where to search for the labeling text. Default \code{NULL}.
#' @param nrow Integer. Number of rows in the plot grid. Default \code{6}.
#' @param ncol Integer. Number of columns in the plot grid. Default \code{6}.
#' @param defaultTheme Logical scalar. Whether to use default SCTK theme in
#' ggplot. Default \code{TRUE}.
#' @param isLogged Logical scalar. Whether the assay used for the analysis is
#' logged. If not, will do a \code{log(assay + 1)} transformation. Default
#' \code{TRUE}.
#' @param check_sanity Logical scalar. Whether to perform MAST's sanity check
#' to see if the counts are logged. Default \code{TRUE}
#' @return A ggplot object of linear regression
#' @export
#' @examples
#' data("sceBatches")
#' logcounts(sceBatches) <- log1p(counts(sceBatches))
#' sce.w <- subsetSCECols(sceBatches, colData = "batch == 'w'")
#' sce.w <- runWilcox(sce.w, class = "cell_type",
#'                    classGroup1 = "alpha", classGroup2 = "beta",
#'                    groupName1 = "w.alpha", groupName2 = "w.beta",
#'                    analysisName = "w.aVSb")
#' plotDEGRegression(sce.w, "w.aVSb")
plotDEGRegression <- function(inSCE, useResult, threshP = FALSE, labelBy = NULL,
                              nrow = 6, ncol = 6, defaultTheme = TRUE,
                              isLogged = TRUE, check_sanity = TRUE){
  #TODO: DO we split the up/down regulation too?
  # Check
  .checkDiffExpResultExists(inSCE, useResult, labelBy)
  # Extract
  result <- S4Vectors::metadata(inSCE)$diffExp[[useResult]]
  deg <- result$result
  useReducedDim <- result$useReducedDim
  useAssay <- result$useAssay
  geneToPlot <- deg[seq_len(min(nrow(deg), nrow*ncol)), "Gene"]
  groupName1 <- result$groupNames[1]
  ix1 <- result$select$ix1
  cells1 <- colnames(inSCE)[ix1]
  groupName2 <- result$groupNames[2]
  ix2 <- result$select$ix2
  cells2 <- colnames(inSCE)[ix2]
  if (!is.null(useAssay)) {
    if(!is.null(labelBy)){
      replGeneName <- SummarizedExperiment::rowData(inSCE[geneToPlot,])[[labelBy]]
    } else {
      replGeneName <- geneToPlot
    }
    expres <- expData(inSCE[geneToPlot, c(cells1, cells2)],
                      useAssay)
    useMat <- useAssay
  } else {
    if (!is.null(labelBy)) {
      warning("Analysis performed on reducedDim. Cannot use rowData for ",
              "labelBy. Ignored.")
    }
    replGeneName <- geneToPlot
    expres <- t(expData(inSCE[,c(cells1, cells2)], useReducedDim))[geneToPlot,]
    useMat <- useReducedDim
  }
  if(!is.matrix(expres)){
    expres <- as.matrix(expres)
  }
  rownames(expres) <- replGeneName
  # Format
  cdat <- data.frame(wellKey = colnames(expres),
                     condition = factor(c(rep(groupName1, length(cells1)),
                                          rep(groupName2, length(cells2))),
                                        levels = result$groupNames),
                     ngeneson = rep("", (length(cells1) + length(cells2))),
                     stringsAsFactors = FALSE)
  if (!isTRUE(isLogged)) {
    expres <- log(expres + 1)
  }
  sca <- suppressMessages(MAST::FromMatrix(expres, cData = cdat,
                                           check_sanity = check_sanity))
  cdr2 <- colSums(SummarizedExperiment::assay(sca) > 0)
  SummarizedExperiment::colData(sca)$cngeneson <- scale(cdr2)
  if(length(unique(SummarizedExperiment::colData(sca)$cngeneson)) < 2){
    stop("Standardized cellular detection rate not various, unable to plot.")
  }
  if(threshP){
    #TODO: if nrow*ncol < `min_per_bin`` below, there would be an error.
    invisible(utils::capture.output(thres <-
                                      MAST::thresholdSCRNACountMatrix(expres,
                                                                      nbins = 20,
                                                                      min_per_bin = 30)))
    SummarizedExperiment::assay(sca) <- thres$counts_threshold
  }
  flatDat <- methods::as(sca, "data.table")
  flatDat$primerid <- factor(flatDat$primerid, levels = replGeneName)
  names(flatDat)[6] <- useMat
  # Calculate
  resData <- NULL
  for (i in unique(flatDat$primerid)){
    resdf <- flatDat[flatDat$primerid == i, ]
    resdf$lmPred <- stats::lm(
      stats::as.formula(paste0(useMat, "~cngeneson+", 'condition')),
      data = flatDat[flatDat$primerid == i, ])$fitted
    if (is.null(resData)){
      resData <- resdf
    } else {
      resData <- rbind(resData, resdf)
    }
  }
  # Plot
  ggbase <- ggplot2::ggplot(resData, ggplot2::aes_string(
    x = 'condition',
    y = useMat,
    color = 'condition')) +
    ggplot2::geom_jitter() +
    ggplot2::facet_wrap(~primerid, scale = "free_y",
                        ncol = ncol)
  regressionplot <- ggbase +
    ggplot2::aes_string(x = "cngeneson") +
    ggplot2::geom_line(ggplot2::aes_string(y = "lmPred"),
                       lty = 1) +
    ggplot2::xlab("Standardized Cellular Detection Rate") +
    ggplot2::ggtitle(paste0("Linear Model Plot for ",
                            useResult))
  if(isTRUE(defaultTheme)){
    regressionplot <- .ggSCTKTheme(regressionplot)
  }
  return(regressionplot)
}

#' Get Top Table of a DEG analysis
#' @description Users have to run \code{runDEAnalysis()} first, any of the
#' wrapped functions of this generic function. Users can set further filters on
#' the result. A \code{data.frame} object, with variables of \code{Gene},
#' \code{Log2_FC}, \code{Pvalue}, and \code{FDR}, will be returned.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object, with of the
#' singleCellTK DEG method performed in advance.
#' @param useResult character. A string specifying the \code{analysisName}
#' used when running a differential expression analysis function.
#' @param labelBy A single character for a column of \code{rowData(inSCE)} as
#' where to search for the labeling text. Leave \code{NULL} for \code{rownames}.
#' Default \code{metadata(inSCE)$featureDisplay} (see
#' \code{\link{setSCTKDisplayRow}}).
#' @param onlyPos logical. Whether to only fetch DEG with positive log2_FC
#' value. Default \code{FALSE}.
#' @param log2fcThreshold numeric. Only fetch DEGs with the absolute values of
#' log2FC larger than this value. Default \code{0.25}.
#' @param fdrThreshold numeric. Only fetch DEGs with FDR value smaller than this
#' value. Default \code{0.05}.
#' @param minGroup1MeanExp numeric. Only fetch DEGs with mean expression in
#' group1 greater then this value. Default \code{NULL}.
#' @param maxGroup2MeanExp numeric. Only fetch DEGs with mean expression in
#' group2 less then this value. Default \code{NULL}.
#' @param minGroup1ExprPerc numeric. Only fetch DEGs expressed in greater then
#' this fraction of cells in group1. Default \code{NULL}.
#' @param maxGroup2ExprPerc numeric. Only fetch DEGs expressed in less then this
#' fraction of cells in group2. Default \code{NULL}.
#' @return A \code{data.frame} object of the top DEGs, with variables of
#' \code{Gene}, \code{Log2_FC}, \code{Pvalue}, and \code{FDR}.
#' @export
#' @examples
#' data("sceBatches")
#' sceBatches <- scaterlogNormCounts(sceBatches, "logcounts")
#' sce.w <- subsetSCECols(sceBatches, colData = "batch == 'w'")
#' sce.w <- runWilcox(sce.w, class = "cell_type",
#'                    classGroup1 = "alpha", classGroup2 = "beta",
#'                    groupName1 = "w.alpha", groupName2 = "w.beta",
#'                    analysisName = "w.aVSb")
#' getDEGTopTable(sce.w, "w.aVSb")
getDEGTopTable <- function(inSCE, useResult,
                           labelBy = S4Vectors::metadata(inSCE)$featureDisplay,
                           onlyPos = FALSE, log2fcThreshold = 0.25,
                           fdrThreshold = 0.05, minGroup1MeanExp = NULL,
                           maxGroup2MeanExp = NULL, minGroup1ExprPerc = NULL,
                           maxGroup2ExprPerc = NULL){
  # Check
  .checkDiffExpResultExists(inSCE, useResult, labelBy)
  # Extract
  result <- S4Vectors::metadata(inSCE)$diffExp[[useResult]]$result
  if (!is.null(labelBy)) {
    genes <- result$Gene
    geneIdx <- featureIndex(genes, inSCE)
    result$Gene <- SummarizedExperiment::rowData(inSCE)[[labelBy]][geneIdx]
  }
  # Filter
  result <- .filterDETable(result, onlyPos, log2fcThreshold, fdrThreshold,
                           minGroup1MeanExp, maxGroup2MeanExp,
                           minGroup1ExprPerc, maxGroup2ExprPerc)
  return(result)
}

#' Heatmap visualization of DEG result
#'
#' @details A differential expression analysis function has to be run in advance
#' so that information is stored in the metadata of the input SCE object. This
#' function wraps \code{\link{plotSCEHeatmap}}.
#' A feature annotation basing on the log2FC level called \code{"regulation"}
#' will be automatically added. A cell annotation basing on the condition
#' selection while running the analysis called \code{"condition"}, and the
#' annotations used from \code{colData(inSCE)} while setting the condition and
#' covariates will also be added.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object.
#' @param useResult character. A string specifying the \code{analysisName}
#' used when running a differential expression analysis function.
#' @param doLog Logical scalar. Whether to do \code{log(assay + 1)}
#' transformation on the assay used for the analysis. Default \code{FALSE}.
#' @param onlyPos logical. Whether to only plot DEG with positive log2_FC
#' value. Default \code{FALSE}.
#' @param log2fcThreshold numeric. Only plot DEGs with the absolute values of
#' log2FC larger than this value. Default \code{0.25}.
#' @param fdrThreshold numeric. Only plot DEGs with FDR value smaller than this
#' value. Default \code{0.05}.
#' @param minGroup1MeanExp numeric. Only plot DEGs with mean expression in
#' group1 greater then this value. Default \code{NULL}.
#' @param maxGroup2MeanExp numeric. Only plot DEGs with mean expression in
#' group2 less then this value. Default \code{NULL}.
#' @param minGroup1ExprPerc numeric. Only plot DEGs expressed in greater then
#' this fraction of cells in group1. Default \code{NULL}.
#' @param maxGroup2ExprPerc numeric. Only plot DEGs expressed in less then this
#' fraction of cells in group2. Default \code{NULL}.
#' @param useAssay character. A string specifying an assay of expression value
#' to plot. By default the assay used for \code{runMAST()} will be used.
#' Default \code{NULL}.
#' @param featureAnnotations \code{data.frame}, with \code{rownames} containing
#' all the features going to be plotted. Character columns should be factors.
#' Default \code{NULL}.
#' @param cellAnnotations \code{data.frame}, with \code{rownames} containing
#' all the cells going to be plotted. Character columns should be factors.
#' Default \code{NULL}.
#' @param featureAnnotationColor A named list. Customized color settings for
#' feature labeling. Should match the entries in the \code{featureAnnotations}
#' or \code{rowDataName}. For each entry, there should be a list/vector of
#' colors named with categories. Default \code{NULL}.
#' @param cellAnnotationColor A named list. Customized color settings for
#' cell labeling. Should match the entries in the \code{cellAnnotations} or
#' \code{colDataName}. For each entry, there should be a list/vector of colors
#' named with categories. Default \code{NULL}.
#' @param rowDataName character. The column name(s) in \code{rowData} that need
#' to be added to the annotation. Default \code{NULL}.
#' @param colDataName character. The column name(s) in \code{colData} that need
#' to be added to the annotation. Default \code{NULL}.
#' @param rowSplitBy character. Do semi-heatmap based on the grouping of
#' this(these) annotation(s). Should exist in either \code{rowDataName} or
#' \code{names(featureAnnotations)}. Default \code{"regulation"}.
#' @param colSplitBy character. Do semi-heatmap based on the grouping of
#' this(these) annotation(s). Should exist in either \code{colDataName} or
#' \code{names(cellAnnotations)}. Default \code{"condition"}.
#' @param rowLabel \code{FALSE} for not displaying; a variable in \code{rowData}
#' to display feature identifiers stored there; if have run
#' \code{\link{setSCTKDisplayRow}}, display the specified feature name;
#' \code{TRUE} for the \code{rownames} of \code{inSCE}; \code{NULL} for
#' auto-display \code{rownames} when the number of filtered feature is less
#' than 60. Default looks for \code{\link{setSCTKDisplayRow}} information.
#' @param title character. Main title of the heatmap. Default
#' \code{"DE Analysis: <useResult>"}.
#' @param ... Other arguments passed to \code{\link{plotSCEHeatmap}}
#' @examples
#' data("sceBatches")
#' logcounts(sceBatches) <- log1p(counts(sceBatches))
#' sce.w <- subsetSCECols(sceBatches, colData = "batch == 'w'")
#' sce.w <- runWilcox(sce.w, class = "cell_type",
#'                    classGroup1 = "alpha", classGroup2 = "beta",
#'                    groupName1 = "w.alpha", groupName2 = "w.beta",
#'                    analysisName = "w.aVSb")
#' plotDEGHeatmap(sce.w, "w.aVSb")
#' @return A \code{\link[ggplot2]{ggplot}} object
#' @export
#' @author Yichen Wang
plotDEGHeatmap <- function(inSCE, useResult, onlyPos = FALSE,
                           log2fcThreshold = 0.25, fdrThreshold = 0.05,
                           minGroup1MeanExp = NULL, maxGroup2MeanExp = NULL,
                           minGroup1ExprPerc = NULL, maxGroup2ExprPerc = NULL,
                           useAssay = NULL, doLog = FALSE,
                           featureAnnotations = NULL,
                           cellAnnotations = NULL,
                           featureAnnotationColor = NULL,
                           cellAnnotationColor = NULL,
                           rowDataName = NULL, colDataName = NULL,
                           colSplitBy = 'condition', rowSplitBy = 'regulation',
                           rowLabel = S4Vectors::metadata(inSCE)$featureDisplay,
                           title = paste0("DE Analysis: ", useResult), ...){
  # Check
  .checkDiffExpResultExists(inSCE, useResult)
  extraArgs <- list(...)
  warnArgs <- c('featureIndex', 'cellIndex')
  if(any(warnArgs %in% names(extraArgs))){
    warning('"', paste(warnArgs[warnArgs %in% names(extraArgs)],
                       collapse = ', '), '" are not allowed at this point.')
    extraArgs[c('cellIndex', 'featureIndex')] <- NULL
  }
  # Extract
  result <- S4Vectors::metadata(inSCE)$diffExp[[useResult]]
  if (!is.null(result$useReducedDim)) {
    # Analysis performed with reducedDim, cannot set useAssay in plotDEGHeatmap
    if (!is.null(useAssay)) {
      warning("Analysis performed on reducedDim, cannot set `useAssay`, ",
              "ignored.")
    }
    useAssay <- NULL
    useReducedDim <- result$useReducedDim
  } else {
    if(is.null(useAssay)){
      useAssay <- result$useAssay
    }
    useReducedDim <- NULL
  }

  ix1 <- result$select$ix1
  ix2 <- result$select$ix2
  deg.filtered <- getDEGTopTable(inSCE, useResult = useResult, labelBy = NULL,
                                 onlyPos = onlyPos,
                                 log2fcThreshold = log2fcThreshold,
                                 fdrThreshold = fdrThreshold,
                                 minGroup1MeanExp, maxGroup2MeanExp,
                                 minGroup1ExprPerc, maxGroup2ExprPerc)
  if(dim(deg.filtered)[1] <= 1){
    stop('Too few genes that pass filtration, unable to plot')
  }
  # Not directly using deg.filtered$Gene because rownames might be deduplicated
  # to avoid error when performing DEG.
  if (!is.null(useReducedDim)) {
    mat <- t(expData(inSCE, useReducedDim))
    assayList <- list(mat)
    names(assayList) <- useReducedDim
    tmpSCE <- SingleCellExperiment::SingleCellExperiment(assays = assayList)
    SummarizedExperiment::colData(tmpSCE) <- SummarizedExperiment::colData(inSCE)
    assayName <- useReducedDim
  } else {
    tmpSCE <- inSCE
    assayName <- useAssay
  }
  gene.ix <- rownames(tmpSCE) %in% deg.filtered$Gene
  cell.ix <- which(ix1 | ix2)
  allGenes <- rownames(tmpSCE)[gene.ix]
  allCells <- colnames(tmpSCE)[cell.ix]

  # Annotation organization
  ## Cells
  group <- rep(NA, ncol(tmpSCE))
  group[ix1] <- result$groupNames[1]
  group[ix2] <- result$groupNames[2]
  group <- factor(group[cell.ix], levels = result$groupNames)
  if(!is.null(cellAnnotations)){
    if(!all(allCells %in% rownames(cellAnnotations))){
      stop('Not all cells involved in comparison found in given ',
           '`cellAnnotations`. ')
    }
    cellAnnotations <- cellAnnotations[allCells, , drop = FALSE]
    cellAnnotations <- data.frame(cellAnnotations, condition = group)
  } else {
    cellAnnotations <- data.frame(condition = group,
                                  row.names = allCells)
  }
  kCol <- celda::distinctColors(2)
  names(kCol) <- result$groupNames
  if(!is.null(cellAnnotationColor)){
    if(!"condition" %in% names(cellAnnotationColor)){
      cellAnnotationColor <- c(list(condition = kCol),
                               cellAnnotationColor)
    }
  } else {
    cellAnnotationColor <- list(condition = kCol)
  }
  if(!is.null(colDataName)){
    if (length(which(colDataName %in% result$annotation)) > 0) {
      colDataName <- colDataName[-which(colDataName %in% result$annotation)]
    }
    colDataName <- c(colDataName, result$annotation)
  } else {
    colDataName <- result$annotation
  }
  ## Genes
  regulation <- vector()
  genes.up <- deg.filtered[deg.filtered$Log2_FC > 0, "Gene"]
  genes.down <- deg.filtered[deg.filtered$Log2_FC < 0, "Gene"]
  regulation[rownames(tmpSCE) %in% genes.up] <- 'up'
  regulation[rownames(tmpSCE) %in% genes.down] <- 'down'
  regulation <- factor(regulation[gene.ix], levels = c('up', 'down'))
  if(!is.null(featureAnnotations)){
    if(!all(allGenes %in% rownames(featureAnnotations))){
      stop('Not all genes involved in comparison found in given ',
           '`featureAnnotations`. ')
    }
    featureAnnotations <- featureAnnotations[allGenes, , drop = FALSE]
    featureAnnotations <- data.frame(featureAnnotations,
                                     regulation = regulation)
  } else {
    featureAnnotations <- data.frame(regulation = regulation,
                                     row.names = allGenes)
  }
  lCol <- celda::distinctColors(2)
  names(lCol) <- c('up', 'down')
  if(!is.null(featureAnnotationColor)){
    if(!"regulation" %in% names(featureAnnotationColor)){
      featureAnnotationColor <- c(list(regulation = lCol),
                                  featureAnnotationColor)
    }
  } else {
    featureAnnotationColor <- list(regulation = lCol)
  }
  if (is.null(rowLabel) & nrow(deg.filtered) < 60) {
    rowLabel <- TRUE
  }
  # Plot
  hm <- plotSCEHeatmap(inSCE = tmpSCE, useAssay = assayName, doLog = doLog,
                       featureIndex = gene.ix, cellIndex = cell.ix,
                       featureAnnotations = featureAnnotations,
                       cellAnnotations = cellAnnotations,
                       rowDataName = rowDataName,
                       colDataName = colDataName,
                       featureAnnotationColor = featureAnnotationColor,
                       cellAnnotationColor = cellAnnotationColor,
                       rowSplitBy = rowSplitBy, colSplitBy = colSplitBy,
                       title = title, rowLabel = rowLabel, ...)
  return(hm)
}

#' Generate volcano plot for DEGs
#' @details Any of the differential expression analysis method from SCTK should
#' be performed prior to using this function to generate volcano plots.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object.
#' @param useResult character. A string specifying the \code{analysisName}
#' used when running a differential expression analysis function.
#' @param labelTopN Integer, label this number of top DEGs that pass the
#' filters. \code{FALSE} for not labeling. Default \code{10}.
#' @param log2fcThreshold numeric. Label genes with the absolute values of
#' log2FC greater than this value as regulated. Default \code{0.25}.
#' @param fdrThreshold numeric. Label genes with FDR value less than this
#' value as regulated. Default \code{0.05}.
#' @param featureDisplay A character string to indicate a variable in
#' \code{rowData(inSCE)} for feature labeling. \code{NULL} for using
#' \code{rownames}. Default \code{metadata(inSCE)$featureDisplay} (see
#' \code{\link{setSCTKDisplayRow}})
#' @return A \code{ggplot} object of volcano plot
#' @seealso \code{\link{runDEAnalysis}}, \code{\link{plotDEGHeatmap}}
#' @export
#' @examples
#' data("sceBatches")
#' sceBatches <- scaterlogNormCounts(sceBatches, "logcounts")
#' sce.w <- subsetSCECols(sceBatches, colData = "batch == 'w'")
#' sce.w <- runWilcox(sce.w, class = "cell_type",
#'                    classGroup1 = "alpha", classGroup2 = "beta",
#'                    groupName1 = "w.alpha", groupName2 = "w.beta",
#'                    analysisName = "w.aVSb")
#' plotDEGVolcano(sce.w, "w.aVSb")
plotDEGVolcano <- function(
  inSCE,
  useResult,
  labelTopN = 10,
  log2fcThreshold = 0.25,
  fdrThreshold = 0.05,
  featureDisplay = S4Vectors::metadata(inSCE)$featureDisplay
) {
  .checkDiffExpResultExists(inSCE, useResult, featureDisplay)
  deg <- S4Vectors::metadata(inSCE)$diffExp[[useResult]]$result
  deg <- deg[order(abs(deg$Log2_FC), decreasing = TRUE),]
  if (!is.null(featureDisplay)) {
    geneIdx <- featureIndex(deg$Gene, inSCE)
    deg$Gene <- SummarizedExperiment::rowData(inSCE)[[featureDisplay]][geneIdx]
  }
  rownames(deg) <- deg$Gene
  groupNames <- S4Vectors::metadata(inSCE)$diffExp[[useResult]]$groupNames
  # Prepare for coloring that shows the filtering
  deg$Regulation <- NA
  deg$Regulation[deg$Log2_FC > 0] <- "Up"
  deg$Regulation[deg$Log2_FC < 0] <- "Down"
  if (!is.null(log2fcThreshold)) {
    deg$Regulation[abs(deg$Log2_FC) <= log2fcThreshold] <- "No"
  }
  if (!is.null(fdrThreshold)) {
    deg$Regulation[deg$FDR >= fdrThreshold] <- "No"
  }
  # Prepare for Top DEG text labeling
  passIdx <- deg$Regulation != "No"
  deg$label <- NA
  if (!is.null(labelTopN) & !isFALSE(labelTopN)) {
    labelTopN <- min(labelTopN, length(which(passIdx)))
    if (labelTopN > 0) {
      deg.pass <- deg[passIdx,]
      label.origTable.idx <- deg$Gene %in% deg.pass$Gene[seq(labelTopN)]
      deg$label[label.origTable.idx] <- deg$Gene[label.origTable.idx]
    }
  }
  # Prepare for lines that mark the cutoffs
  vlineLab <- data.frame(
    X = c(-log2fcThreshold, log2fcThreshold),
    text = c(paste("lower log2FC cutoff:", -log2fcThreshold),
             paste("upper log2FC cutoff:", log2fcThreshold)),
    h = c(1.01, -0.01)
  )
  hlineLab <- data.frame(
    Y = c(-log10(fdrThreshold)),
    text = paste("FDR cutoff:", fdrThreshold)
  )
  # Plot
  ggplot2::ggplot() +
    ggplot2::geom_point(data = deg,
                        ggplot2::aes_string(x = "Log2_FC", y = "-log10(FDR)",
                                            col = "Regulation")) +
    ggplot2::scale_color_manual(values = c("Down" = "#619cff",
                                           "No" = "light grey",
                                           "Up" = "#f8766d")) +
    ggrepel::geom_text_repel(data = deg,
                             ggplot2::aes_string(x = "Log2_FC",
                                                 y = "-log10(FDR)",
                                                 label = "label"),
                             colour = "black", na.rm = TRUE) +
    ggplot2::geom_vline(data = vlineLab,
                        ggplot2::aes_string(xintercept = "X"),
                        linetype = "longdash") +
    ggplot2::geom_text(data = vlineLab,
                       ggplot2::aes_string(x = "X", y = 0, label = "text",
                                           hjust = "h"),
                       size = 3, vjust = 1) +
    ggplot2::geom_hline(data = hlineLab,
                        ggplot2::aes_string(yintercept = "Y"),
                        linetype = "longdash") +
    ggplot2::geom_text(data = hlineLab,
                       ggplot2::aes_string(x = -Inf, y = "Y", label = "text"),
                       size = 3, vjust = -0.5, hjust = -.03) +
    ggplot2::xlab("Fold Change (log2)") +
    ggplot2::ylab("FDR (-Log10 q-value)") +
    ggplot2::theme(panel.grid.major = ggplot2::element_blank(),
                   panel.grid.minor = ggplot2::element_blank(),
                   panel.background = ggplot2::element_blank(),
                   axis.line = ggplot2::element_line(colour = "black")) +
    ggplot2::xlim(-max(abs(deg$Log2_FC)), max(abs(deg$Log2_FC))) +
    ggplot2::ggtitle(paste("DEG between", groupNames[1],
                           "and", groupNames[2]))
}

#' MAST Identify adaptive thresholds
#'
#' Calculate and produce a list of thresholded counts (on natural scale),
#' thresholds, bins, densities estimated on each bin, and the original data from
#' \code{\link[MAST]{thresholdSCRNACountMatrix}}
#' @param inSCE SingleCellExperiment object
#' @param useAssay character, default \code{"logcounts"}
#' @param doPlot Logical scalar. Whether to directly plot in the plotting area.
#' If \code{FALSE}, will return a graphical object which can be visualized with
#' \code{grid.draw()}. Default \code{TRUE}.
#' @param isLogged Logical scalar. Whether the assay used for the analysis is
#' logged. If not, will do a \code{log(assay + 1)} transformation. Default
#' \code{TRUE}.
#' @param check_sanity Logical scalar. Whether to perform MAST's sanity check
#' to see if the counts are logged. Default \code{TRUE}
#' @return Plot the thresholding onto the plotting region if \code{plot == TRUE}
#' or a graphical object if \code{plot == FALSE}.
#' @export
#' @examples
#' data("mouseBrainSubsetSCE")
#' plotMASTThresholdGenes(mouseBrainSubsetSCE)
plotMASTThresholdGenes <- function(inSCE, useAssay="logcounts", doPlot = TRUE,
                                   isLogged = TRUE, check_sanity = TRUE){
  # data preparation
  expres <- expData(inSCE, useAssay)
  if(!is.matrix(expres)){
    expres <- as.matrix(expres)
  }
  expres <- dedupRowNames(expres)
  fdata <- data.frame(Gene = rownames(expres))
  rownames(fdata) <- fdata$Gene
  SCENew <- MAST::FromMatrix(expres, SingleCellExperiment::colData(inSCE),
                             fdata, check_sanity = check_sanity)
  SCENew <- SCENew[which(MAST::freq(SCENew) > 0), ]
  invisible(utils::capture.output(thres <- MAST::thresholdSCRNACountMatrix(
    SummarizedExperiment::assay(SCENew), nbins = 20, min_per_bin = 30,
    data_log = isLogged)))
  # plotting
  plotNRow <- ceiling(length(thres$valleys) / 4)
  thres.grob <- ggplotify::as.grob(function(){
    graphics::par(mfrow = c(plotNRow, 4), mar = c(3, 3, 2, 1),
        mgp = c(2, 0.7, 0), tck = -0.01, new = TRUE)
    plot(thres)
  })
  if (isTRUE(doPlot)) {
    grid::grid.draw(thres.grob)
  } else {
    return(thres.grob)
  }
}
compbiomed/singleCellTK documentation built on Oct. 27, 2024, 3:26 a.m.