R/plotting_functions.R

Defines functions PlotCoverage relative_location PlotUTRLengthShift PlotRelativeExpressionViolin PlotRelativeExpressionBox PlotRelativeExpressionUMAP PlotRelativeExpressionTSNE do_arrow_plot get_relative_expression_sce get_relative_expression_seurat GetRelativeExpression

Documented in do_arrow_plot GetRelativeExpression get_relative_expression_sce get_relative_expression_seurat PlotCoverage PlotRelativeExpressionBox PlotRelativeExpressionTSNE PlotRelativeExpressionUMAP PlotRelativeExpressionViolin PlotUTRLengthShift relative_location

##########################################################
#'
#' Calculate relative expression between two or more peaks
#'
#' Calculate a relative expression between two or more peaks by dividing
#' the expression of each peak by the mean of the peak expression for that gene -
#' or set of provided peaks
#'
#' @param peaks.object Seurat object
#' @param peak.set set of peaks
#' @param gene.name gene name for retrieving a set of peaks
#' @param feature.type features to consider. 3'UTR and exon by default.
#' @param p.count Pseudo count 
#'
#' @return a matrix of relative expression
#'
#' @examples
#' 
#' ## Load example data for two peaks from the Cxcl12 gene
#' extdata_path <- system.file("extdata",package = "Sierra")
#' load(paste0(extdata_path, "/Cxcl12_example.RData"))
#' load(paste0(extdata_path, "/TIP_cell_info.RData"))
#' 
#' ## Create an seurat object holding the peak data
#'                         
#'  peaks.seurat <- NewPeakSeurat(peak.data = peak.counts, 
#'                         annot.info = peak.annotations, 
#'                         cell.idents = tip.populations, 
#'                         tsne.coords = tip.tsne.coordinates,
#'                         min.cells = 0, min.peaks = 0)
#'                         
#' ## Plot relative expression of example peaks on t-SNE coordinates
#' relative.exp <- GetRelativeExpression(peaks.object = peaks.seurat, 
#'                  peak.set = c("Cxcl12:6:117174603-117175050:1", "Cxcl12:6:117180974-117181367:1"))
#'
#' @export
#'
GetRelativeExpression <- function(peaks.object, 
                                  peak.set = NULL, 
                                  gene.name = NULL,
                                  feature.type = c("UTR3", "exon"),
                                  p.count = 1) {

  if (class(peaks.object) == "Seurat") {
    relative.expression.data <- get_relative_expression_seurat(peaks.seurat.object = peaks.object,
                                                               peak.set = peak.set, gene.name = gene.name,
                                                               feature.type = feature.type,
                                                               p.count = p.count)
    return(relative.expression.data)
  } else if (class(peaks.object) == "SingleCellExperiment") {
    relative.expression.data <- get_relative_expression_sce(peaks.sce.object = peaks.object,
                                                            peak.set = peak.set, gene.name = gene.name,
                                                            feature.type = feature.type,
                                                            p.count = p.count)
    return(relative.expression.data)
  }

}


##########################################################
#'
#' Calculate relative expression between two or more peaks
#'
#' Calculate a relative expression between two or more peaks by dividing
#' the expression of each peak by the mean of the peak expression for that gene -
#' or set of provided peaks
#'
#' @param peaks.seurat.object Seurat object
#' @param peak.set set of peaks
#' @param gene.name gene name for retrieving a set of peaks
#' @param feature.type features to consider. 3'UTR and exon by default.
#' @param p.count Pseudo count 
#'
#' @return a matrix of relative expression
#'
#' @examples
#' \dontrun{
#' get_relative_expression_seurat(peaks.seurat.object, gene.name = "Cxcl12")
#' }
get_relative_expression_seurat <- function(peaks.seurat.object, 
                                           peak.set = NULL, 
                                           gene.name = NULL,
                                           feature.type = c("UTR3", "exon"),
                                           p.count = 1) {

  ## make sure either a gene or peak set has been provided
  if (is.null(gene.name) & is.null(peak.set)) {
    stop("Please provide a gene or set of peaks")
  }

  ## if no peaks are provided, use the gene name to select peaks
  if (is.null(peak.set)) {
    peak.set <- SelectGenePeaks(peaks.seurat.object, gene.name, feature.type = feature.type)
  }

  ## Check that peaks correspond to the same gene
  peak.gene.names = sub("(.*).*:.*:.*-.*:.*", "\\1", peak.set)
  if (length(unique(peak.gene.names)) > 1) {
    stop("Multiple genes detected in peak set - please ensure input peaks are from one gene")
  }

  ## access expression data for this set of peaks
  expression.data <- Seurat::GetAssayData(peaks.seurat.object)[peak.set, ]

  if (length(peak.set) == 1) {
    return(expression.data)
  }

  cell.names <- colnames(peaks.seurat.object)

  population.names <- Seurat::Idents(peaks.seurat.object)

  ## Calculate population-level gene-mean and relative peak expression values
  population.names <- names(table(Seurat::Idents(peaks.seurat.object)))
  population.relative.usage <- c()
  gene.population.means <- c()
  for (cl in population.names) {
    cell.set <- colnames(peaks.seurat.object)[which(Seurat::Idents(peaks.seurat.object) == cl)]
    expression.set <- expression.data[, cell.set]

    ## Calculate relative usage of each peak
    peak.means <- apply(expression.set, 1, function(x){mean(exp(x) - 1)})
    if (mean(peak.means) == 0) {
      relative.usage <- peak.means
    } else {
      relative.usage <- peak.means / mean(peak.means)
    }

    population.relative.usage <- cbind(population.relative.usage, relative.usage)

    ## Calculate mean expression across the gene
    gene.mean <- mean(exp(as.matrix(expression.set)) - 1)
    gene.population.means <- append(gene.population.means, gene.mean)
  }
  colnames(population.relative.usage) <- population.names
  names(gene.population.means) <- population.names

  ### Divide peak expression for each cell by cell-type expression average
  relative.expression.data <- c()
  population.names <- names(table(Seurat::Idents(peaks.seurat.object)))
  for (cl in population.names) {
    cell.set <- colnames(peaks.seurat.object)[which(Seurat::Idents(peaks.seurat.object) == cl)]
    expression.set <- expression.data[, cell.set]
    this.mean <- gene.population.means[cl]
    rel.values <- population.relative.usage[, cl]
    relative.exp.values <- ( (exp(expression.set) - 1) / (this.mean + p.count) ) * rel.values
    relative.expression.data <- cbind(relative.expression.data, relative.exp.values)
  }
  relative.expression.data <- relative.expression.data[, colnames(peaks.seurat.object)]

  relative.expression.data <- log2(relative.expression.data + 1)
  relative.expression.data <- as(relative.expression.data, "sparseMatrix")

  return(relative.expression.data)
}

##########################################################
#'
#' Calculate relative expression between two or more peaks
#'
#' Calculate a relative expression between two or more peaks by dividing
#' the expression of each peak by the mean of the peak expression for that gene -
#' or set of provided peaks
#'
#' @param peaks.sce.object Seurat object
#' @param peak.set set of peaks
#' @param gene.name gene name for retrieving a set of peaks
#' @param feature.type features to consider. 3'UTR and exon by default.
#' @param p.count Pseudo-count 
#'
#' @return a matrix of relative expression
#'
#' @examples
#' \dontrun{
#' get_relative_expression(peaks.seurat, gene.name = "Cxcl12")
#' }
get_relative_expression_sce <- function(peaks.sce.object, 
                                        peak.set = NULL, 
                                        gene.name = NULL,
                                        feature.type = c("UTR3", "exon"),
                                        p.count = 1) {

  ## make sure either a gene or peak set has been provided
  if (is.null(gene.name) & is.null(peak.set)) {
    print("Please provide a gene or set of peaks")
  }

  ## if no peaks are provided, use the gene name to select peaks
  if (is.null(peak.set)) {
    peak.set <- SelectGenePeaks(peaks.sce.object, gene.name, feature.type = feature.type)
  }

  ## Check that peaks correspond to the same gene
  peak.gene.names = sub("(.*).*:.*:.*-.*:.*", "\\1", peak.set)
  if (length(unique(peak.gene.names)) > 1) {
    stop("Multiple genes detected in peak set - please ensure input peaks are from one gene")
  }

  ## access expression data for this set of peaks
  expression.data <- SingleCellExperiment::logcounts(peaks.sce.object)[peak.set, ]

  if (length(peak.set) == 1) {
    return(expression.data)
  }

  cell.names <- colnames(peaks.sce.object)

  ## Calculate population-level gene-mean and relative peak expression values
  population.names <- names(table(colData(peaks.sce.object)$CellIdent))
  population.relative.usage <- c()
  gene.population.means <- c()
  for (cl in population.names) {
    cell.set <- colnames(peaks.sce.object)[which(colData(peaks.sce.object)$CellIdent == cl)]
    expression.set <- expression.data[, cell.set]

    ## Calculate relative usage of each peak
    peak.means <- apply(expression.set, 1, function(x){mean(exp(x) - 1)})
    if (mean(peak.means) == 0) {
      relative.usage <- peak.means
    } else {
      relative.usage <- peak.means / mean(peak.means)
    }

    population.relative.usage <- cbind(population.relative.usage, relative.usage)

    ## Calculate mean expression across the gene
    gene.mean <- mean(exp(as.matrix(expression.set)) - 1)
    gene.population.means <- append(gene.population.means, gene.mean)
  }
  colnames(population.relative.usage) <- population.names
  names(gene.population.means) <- population.names

  ### Divide peak expression for each cell by cell-type expression average
  relative.expression.data <- c()
  population.names <- names(table(colData(peaks.sce.object)$CellIdent))
  for (cl in population.names) {
    cell.set <- colnames(peaks.sce.object)[which(colData(peaks.sce.object)$CellIdent == cl)]
    expression.set <- expression.data[, cell.set]
    this.mean <- gene.population.means[cl]
    rel.values <- population.relative.usage[, cl]
    relative.exp.values <- ( (exp(expression.set) - 1) / (this.mean + p.count) ) * rel.values
    relative.expression.data <- cbind(relative.expression.data, relative.exp.values)
  }
  relative.expression.data <- relative.expression.data[, colnames(peaks.sce.object)]

  relative.expression.data <- log2(relative.expression.data + 1)
  relative.expression.data <- as(relative.expression.data, "sparseMatrix")

  return(relative.expression.data)
}

#########################################################
#'
#' Produce an arrow plot of peak expression
#'
#' Produce an arrow plot of peak expression, utlising the gggenes package.
#'
#' @param peaks.seurat.object a Seurat object containing t-SNE coordinates and cluster ID's in @ident slot
#' @param gene_name optional plot title
#' @param peaks.use whether to print the plot to output (default: TRUE).
#' @param population.ids size of the point (default: 0.75)
#' @param return.plot whether to return the ggplot object (default: FALSE)
#' @return NULL by default. Returns a ggplot2 object if return.plot = TRUE
#' @examples
#' \dontrun{
#' do_arrow_plot(peaks.seurat.object, gene_name = Favouritegene1)
#' }
#' @import ggplot2
#'
do_arrow_plot <- function(peaks.seurat.object, gene_name, peaks.use = NULL, population.ids = NULL,
                          return.plot = FALSE) {

  if (!'gggenes' %in% rownames(x = installed.packages())) {
    stop("Please install the gggenes package (dev. version) before using this function
         (https://github.com/wilkox/gggenes)")
  }

  peak.data = subset(Tool(peaks.seurat.object, "Sierra"), Gene_name == gene_name)
  if (!is.null(peaks.use)) peak.data = subset(peak.data, rownames(peak.data) %in% peaks.use)
  n.peaks = nrow(peak.data)

  if (is.null(population.ids)) population.ids = names(table(Seurat::Idents(peaks.seurat.object)))

  ave.expression = Seurat::AverageExpression(peaks.seurat.object, features = rownames(peak.data), verbose = FALSE)
  ave.expression = t(as.matrix(ave.expression$RNA))
  ave.expression = ave.expression[population.ids, ]
  ave.expression = log2(ave.expression + 1)

  peak.info = c()
  for (this.peak in rownames(peak.data)) {
    this.info = data.frame(Peak = rep(this.peak, nrow(ave.expression)),
                           start = rep(peak.data[this.peak, "start"], nrow(ave.expression)),
                           end = rep(peak.data[this.peak, "end"], nrow(ave.expression)),
                           strand = rep(peak.data[this.peak, "strand"], nrow(ave.expression)),
                           direction = rep(peak.data[this.peak, "strand"], nrow(ave.expression)))
    peak.info = rbind(peak.info, this.info)
  }
  peak.info$strand = plyr::mapvalues(peak.info$strand, from = c("+", "-"), to = c("forward", "reverse"))
  peak.info$direction = plyr::mapvalues(peak.info$direction, from = c("+", "-"), to = c("1", "-1"))

  gggenesData = data.frame(Cluster = rep(rownames(ave.expression), n.peaks),
                           Expression = as.vector(ave.expression))
  gggenesData = cbind(gggenesData, peak.info)

  pl <- ggplot(gggenesData, aes(xmin = start, xmax = end, y = Cluster, fill = Expression)) +
    gggenes::geom_gene_arrow() + ggtitle(paste0(gene_name, " peak-specific expression")) +
    facet_wrap(~ Cluster, scales = "free", ncol = 1) +
    gggenes::theme_genes() + scale_fill_gradient2(low="#d9d9d9", mid="red", high="brown",
    midpoint=min(gggenesData$Expression) + (max(gggenesData$Expression)-min(gggenesData$Expression))/2,
    name="Expression (log2)") + theme(legend.position = "bottom", legend.box = "horizontal") +
    guides(fill = guide_colourbar(barwidth = 10))
  print(pl)

  if (return.plot) return(pl)
}

##########################################################
#'
#' Generate a t-SNE plot using relative expression
#'
#' Given two or more peaks to plot, a relative expression score and
#' plot on t-SNE coordinates
#'
#' @param peaks.object peak object either Seurat or SingleCellExperiment class
#' @param peaks.to.plot Set of peaks to plot
#' @param do.plot Whether to plot to output (TRUE by default)
#' @param figure.title Optional figure title
#' @param return.plot Boolean of whether to return plot. Default is TRUE.
#' @param pt.size Size of the points on the t-SNE plot. Default 0.5
#' @param txt.size Size of text. Default 14
#' @param legend.position position of the legend (right, left, bottom or top)
#' @param use.facet Whether to plot peaks using ggplot facets. If set to FALSE will use cowplot to plot each peak 
#' @param p.count Pseudo-count  
#'
#' @return a ggplot2 object
#'
#' @examples
#' 
#' ## Load example data for two peaks from the Cxcl12 gene
#' extdata_path <- system.file("extdata",package = "Sierra")
#' load(paste0(extdata_path, "/Cxcl12_example.RData"))
#' load(paste0(extdata_path, "/TIP_cell_info.RData"))
#' 
#' ## Create an SCE object holding the peak data
#' peaks.sce <- NewPeakSCE(peak.data = peak.counts, 
#'                         annot.info = peak.annotations, 
#'                         cell.idents = tip.populations, 
#'                         tsne.coords = tip.tsne.coordinates,
#'                         min.cells = 0, min.peaks = 0)
#'                         
#' ## Plot relative expression of example peaks on t-SNE coordinates
#' PlotRelativeExpressionTSNE(peaks.object = peaks.sce, 
#'       peaks.to.plot = c("Cxcl12:6:117174603-117175050:1", "Cxcl12:6:117180974-117181367:1"))
#'  
#' @import ggplot2
#'
#' @export
#'
PlotRelativeExpressionTSNE <- function(peaks.object, 
                                       peaks.to.plot, 
                                       do.plot=FALSE, 
                                       figure.title=NULL,
                                       return.plot = TRUE, 
                                       pt.size = 0.5, 
                                       txt.size = 14,
                                       legend.position = "right",
                                       use.facet = TRUE,
                                       p.count = 1) {

  ## Check multiple peaks have been provided
  if (length(peaks.to.plot) < 2) {
    stop("Please provide at least two peaks for plotting relative expression")
  }

  relative.exp.data <- GetRelativeExpression(peaks.object, 
                                             peak.set = peaks.to.plot, p.count = p.count)

  ggData <- data.frame(Expression = as.vector(t(as.matrix(relative.exp.data))),
                       Peak = unlist(lapply(peaks.to.plot, function(x) rep(x, ncol(relative.exp.data)))))

  # Pull out the t-SNE coordinates
  if (class(peaks.object) == "Seurat") {
    peaks.object.tsne1 <- peaks.object@reductions$tsne@cell.embeddings[, 1]
    peaks.object.tsne2 <- peaks.object@reductions$tsne@cell.embeddings[, 2]
  } else if (class(peaks.object) == "SingleCellExperiment") {
    peaks.object.tsne1 <- SingleCellExperiment::reducedDims(peaks.object)$tsne[, 1]
    peaks.object.tsne2 <- SingleCellExperiment::reducedDims(peaks.object)$tsne[, 2]
  }

  names(peaks.object.tsne1) <- colnames(peaks.object)
  names(peaks.object.tsne2) <- colnames(peaks.object)

  ggData$tSNE_1 = rep(peaks.object.tsne1, length(peaks.to.plot))
  ggData$tSNE_2 = rep(peaks.object.tsne2, length(peaks.to.plot))

  ggData$Cell_ID = rep(colnames(peaks.object), length(peaks.to.plot))

  ggData$Peak <- factor(ggData$Peak, levels = peaks.to.plot)
  if (use.facet) {
    pl <- ggplot(ggData, aes(tSNE_1, tSNE_2, color=Expression)) + geom_point(size=pt.size) + 
      xlab("t-SNE 1") + ylab("t-SNE 2") +
      scale_color_gradient2(low="#d9d9d9", mid="red", high="brown", midpoint=min(ggData$Expression) +
                              (max(ggData$Expression)-min(ggData$Expression))/2, name="") +
      theme_classic(base_size = txt.size) + 
      theme(strip.background = element_blank(), legend.position = legend.position) +
      theme(strip.text.x = element_text(size = txt.size)) + facet_wrap(~ Peak)
    
    if (!is.null(figure.title)) {
      pl <- pl + ggtitle(figure.title)
    }
    
    if (do.plot) {
      plot(pl)
    }
  } else {
    plot.list <- list()
    for (i in seq(1:length(peaks.to.plot))) {
      
      plot.list[[i]] <- local({
        this.peak <- peaks.to.plot[i]
        ggDataSubset <- subset(ggData, Peak == this.peak)
        
        this.plot <- ggplot(ggDataSubset, aes(tSNE_1, tSNE_2, color=Expression)) + geom_point(size=pt.size) + 
          xlab("t-SNE 1") + ylab("t-SNE 2") +
          scale_color_gradient2(low="#d9d9d9", mid="red", high="brown", midpoint=min(ggDataSubset$Expression) +
                                  (max(ggDataSubset$Expression)-min(ggDataSubset$Expression))/2, name="") +
          theme_classic(base_size = txt.size) + 
          theme(strip.background = element_blank(), legend.position = legend.position) +
          theme(strip.text.x = element_text(size = txt.size)) + ggtitle(this.peak)
        return(this.plot)
      })
    }
    pl <- cowplot::plot_grid(plotlist = plot.list)
    
    if (do.plot)
      plot(pl)
  } 

  if (return.plot) {
    return(pl)
  }
}


##########################################################
#'
#' Generate a t-SNE plot using relative expression
#'
#' Given two or more peaks to plot, calculate a relative expression score and
#' plot on UMAP coordinates
#'
#' @param peaks.object Seurat object
#' @param peaks.to.plot Set of peaks to plot
#' @param do.plot Whether to plot to output (TRUE by default)
#' @param figure.title Optional figure title
#' @param return.plot Boolean of whether to return plot (default TRUE)
#' @param pt.size size of the points on the t-SNE plot. Default 0.5
#' @param txt.size size of text. Default 14
#' @param legend.position position of the legend (right, left, bottom or top)
#' @param use.facet Whether to plot peaks using ggplot facets. If set to FALSE will use cowplot to plot each peak 
#' @param p.count Pseudo count  
#'
#' @return a ggplot2 object
#'
#' @examples
#' 
#' ## Load example data for two peaks from the Cxcl12 gene
#' extdata_path <- system.file("extdata",package = "Sierra")
#' load(paste0(extdata_path, "/Cxcl12_example.RData"))
#' load(paste0(extdata_path, "/TIP_cell_info.RData"))
#' 
#' ## Create an SCE object holding the peak data
#' ## Note, for this example we are recycling t-SNE coordinates to demonstrate running of the function
#' peaks.sce <- NewPeakSCE(peak.data = peak.counts, 
#'                         annot.info = peak.annotations, 
#'                         cell.idents = tip.populations, 
#'                         umap.coords = tip.tsne.coordinates,
#'                         min.cells = 0, min.peaks = 0)
#'                         
#' ## Plot relative expression of example peaks on t-SNE coordinates
#' PlotRelativeExpressionUMAP(peaks.object = peaks.sce, 
#'       peaks.to.plot = c("Cxcl12:6:117174603-117175050:1", "Cxcl12:6:117180974-117181367:1"))
#' 
#' @import ggplot2
#'
#' @export
#'
PlotRelativeExpressionUMAP <- function(peaks.object, 
                                       peaks.to.plot, 
                                       do.plot=FALSE, 
                                       figure.title=NULL,
                                       return.plot = TRUE, 
                                       pt.size = 0.5, 
                                       txt.size = 14,
                                       legend.position = "right",
                                       use.facet = TRUE,
                                       p.count = 1) {

  ## Check multiple peaks have been provided
  if (length(peaks.to.plot) < 2) {
    stop("Please provide at least two peaks for plotting relative expression")
  }

  relative.exp.data <- GetRelativeExpression(peaks.object, 
                                             peak.set = peaks.to.plot, p.count = p.count)

  ggData <- data.frame(Expression = as.vector(t(as.matrix(relative.exp.data))),
                       Peak = unlist(lapply(peaks.to.plot, function(x) rep(x, ncol(relative.exp.data)))))

  # Pull out the UMAP coordinates
  if (class(peaks.object) == "Seurat") {
    peaks.object.umap1 <- peaks.object@reductions$umap@cell.embeddings[, 1]
    peaks.object.umap2 <- peaks.object@reductions$umap@cell.embeddings[, 2]
  } else if (class(peaks.object) == "SingleCellExperiment") {
    peaks.object.umap1 <- SingleCellExperiment::reducedDims(peaks.object)$umap[, 1]
    peaks.object.umap2 <- SingleCellExperiment::reducedDims(peaks.object)$umap[, 2]
  }

  names(peaks.object.umap1) <- colnames(peaks.object)
  names(peaks.object.umap2) <- colnames(peaks.object)

  ggData$UMAP_1 = rep(peaks.object.umap1, length(peaks.to.plot))
  ggData$UMAP_2 = rep(peaks.object.umap2, length(peaks.to.plot))

  ggData$Cell_ID = rep(colnames(peaks.object), length(peaks.to.plot))

  ggData$Peak <- factor(ggData$Peak, levels = peaks.to.plot)
  
  if (use.facet) {
    pl <- ggplot(ggData, aes(UMAP_1, UMAP_2, color=Expression)) + geom_point(size=pt.size) + xlab("UMAP 1") + ylab("UMAP 2") +
      scale_color_gradient2(low="#d9d9d9", mid="red", high="brown", midpoint=min(ggData$Expression) +
                              (max(ggData$Expression)-min(ggData$Expression))/2, name="") +
      theme_classic(base_size = txt.size) + 
      theme(strip.background = element_blank()) +
      theme(strip.text.x = element_text(size = txt.size)) + facet_wrap(~ Peak)
    
    if (!is.null(figure.title)) {
      pl <- pl + ggtitle(figure.title)
    }
    
    if (do.plot) {
      plot(pl)
    }
  } else {
    plot.list <- list()
    for (i in seq(1:length(peaks.to.plot))) {
      
      plot.list[[i]] <- local({
        this.peak <- peaks.to.plot[i]
        ggDataSubset <- subset(ggData, Peak == this.peak)
        
        this.plot <- ggplot(ggDataSubset, aes(UMAP_1, UMAP_2, color=Expression)) + geom_point(size=pt.size) + 
          xlab("UMAP 1") + ylab("UMAP 2") +
          scale_color_gradient2(low="#d9d9d9", mid="red", high="brown", midpoint=min(ggDataSubset$Expression) +
                                  (max(ggDataSubset$Expression)-min(ggDataSubset$Expression))/2, name="") +
          theme_classic(base_size = txt.size) + 
          theme(strip.background = element_blank(), legend.position = legend.position) +
          theme(strip.text.x = element_text(size = txt.size)) + ggtitle(this.peak)
        return(this.plot)
      })

    }
    pl <- cowplot::plot_grid(plotlist = plot.list)
    
    if (do.plot) {
      plot(pl)
    }
  }

  if (return.plot) {
    return(pl)
  }
}


##########################################################
#'
#' Generate a box plot plot using relative expression
#'
#' Given two or more peaks to plot, a relative expression score and
#' generate a box plot according to cell identities
#'
#' @param peaks.object Peak object of either Seurat or SCE class
#' @param peaks.to.plot Set of peaks to plot
#' @param do.plot Whether to plot to output (TRUE by default)
#' @param figure.title Optional figure title
#' @param return.plot Boolean (default True) identifying if plot should be returned.
#' @param pt.size Size of the points on the t-SNE plot (default 0.5)
#' @param col.set col set (default NULL)
#' @param txt.size sie of text (default 14)
#' @param p.count Pseudo count  
#'
#' @return a ggplot2 object
#'
#' @examples
#' 
#' ## Load example data for two peaks from the Cxcl12 gene
#' extdata_path <- system.file("extdata",package = "Sierra")
#' load(paste0(extdata_path, "/Cxcl12_example.RData"))
#' load(paste0(extdata_path, "/TIP_cell_info.RData"))
#' 
#' ## Create an SCE object holding the peak data
#' peaks.sce <- NewPeakSCE(peak.data = peak.counts, 
#'                         annot.info = peak.annotations, 
#'                         cell.idents = tip.populations, 
#'                         tsne.coords = tip.tsne.coordinates,
#'                         min.cells = 0, min.peaks = 0)
#'                         
#' ## Plot relative expression of example peaks on t-SNE coordinates
#' PlotRelativeExpressionBox(peaks.object = peaks.sce, 
#'       peaks.to.plot = c("Cxcl12:6:117174603-117175050:1", "Cxcl12:6:117180974-117181367:1"))
#' 
#'
#' @export
#'
PlotRelativeExpressionBox <- function(peaks.object, 
                                      peaks.to.plot, 
                                      do.plot=FALSE, 
                                      figure.title=NULL,
                                      return.plot = TRUE, 
                                      pt.size = 0.5, 
                                      col.set = NULL, 
                                      txt.size = 14,
                                      p.count = 1) {

  ## Check multiple peaks have been provided
  if (length(peaks.to.plot) < 2) {
    stop("Please provide at least two peaks for plotting relative expression")
  }

  relative.exp.data <- GetRelativeExpression(peaks.object, 
                                             peak.set = peaks.to.plot, p.count = p.count)

  ggData <- data.frame(Expression = as.vector(t(as.matrix(relative.exp.data))),
                       Peak = unlist(lapply(peaks.to.plot, function(x) rep(x, ncol(relative.exp.data)))))

  # Define the colour scale
  if (class(peaks.object) == "Seurat") {
    cell.idents <- Seurat::Idents(peaks.object)
    if (is.null(col.set)){
      col.set = scales::hue_pal()(length(table(Seurat::Idents(peaks.object))))
    }
  } else if (class(peaks.object) == "SingleCellExperiment") {
    cell.idents <- colData(peaks.object)$CellIdent
    if (is.null(col.set)){
      col.set = scales::hue_pal()(length(table(colData(peaks.object)$CellIdent)))
    }
  }

  ggData$Cell_ID = rep(colnames(peaks.object), length(peaks.to.plot))

  ggData$Peak <- factor(ggData$Peak, levels = peaks.to.plot)

  ## Add cell population identities. Order according to order of input
  ggData$Identity <- rep(cell.idents, length(peaks.to.plot))
  ggData$Identity = factor(ggData$Identity, levels = names(table(cell.idents)))

  pl <- ggplot(ggData, aes(y=Expression, x=Identity, fill=Identity)) + geom_boxplot(colour = "black", outlier.size = 0.75) +
    ylab("Relative expression") + scale_fill_manual(values=col.set) + theme_classic(base_size = 18) +
    theme(legend.position="none", text = element_text(size = txt.size), axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
    theme(strip.background = element_blank()) + theme(strip.text.x = element_text(size = txt.size)) +
    xlab("") + facet_wrap(~ Peak)

  if (!is.null(figure.title)) {
    pl <- pl + ggtitle(figure.title)
  }

  if (do.plot) {
    plot(pl)
  }

  if (return.plot) {
    return(pl)
  }
}

##########################################################
#'
#' Generate a violin plot plot using relative expression
#'
#' Given two or more peaks to plot, a relative expression score and
#' generate a violin plot according to cell identities
#'
#' @param peaks.object Peak object of either Seurat or SCE class
#' @param peaks.to.plot Set of peaks to plot
#' @param do.plot Whether to plot to output (TRUE by default)
#' @param figure.title Optional figure title
#' @param return.plot Boolean of whether to return plot (default TRUE)
#' @param pt.size size of the points on the t-SNE plot
#' @param col.set default NULL
#' @param txt.size size of text. Default 14
#' @param add.jitter whether to add a geom_jitter to the plot (default: TRUE)
#' @param jitter.pt.size size of point for geom_jitter (default = 0.25)
#' @param p.count Pseudo count  
#'
#' @return a ggplot2 object
#'
#' @examples
#' 
#' ## Load example data for two peaks from the Cxcl12 gene
#' extdata_path <- system.file("extdata",package = "Sierra")
#' load(paste0(extdata_path, "/Cxcl12_example.RData"))
#' load(paste0(extdata_path, "/TIP_cell_info.RData"))
#' 
#' ## Create an SCE object holding the peak data
#' peaks.sce <- NewPeakSCE(peak.data = peak.counts, 
#'                         annot.info = peak.annotations, 
#'                         cell.idents = tip.populations, 
#'                         tsne.coords = tip.tsne.coordinates,
#'                         min.cells = 0, min.peaks = 0)
#'                         
#' ## Plot relative expression of example peaks on t-SNE coordinates
#' PlotRelativeExpressionViolin(peaks.object = peaks.sce, 
#'       peaks.to.plot = c("Cxcl12:6:117174603-117175050:1", "Cxcl12:6:117180974-117181367:1"))
#' 
#' @import ggplot2
#'
#' @export
#'
PlotRelativeExpressionViolin <- function(peaks.object, 
                                         peaks.to.plot, 
                                         do.plot=FALSE, 
                                         figure.title=NULL,
                                         return.plot = TRUE, 
                                         pt.size = 0.5, 
                                         col.set = NULL, 
                                         txt.size = 14,
                                         add.jitter = TRUE, 
                                         jitter.pt.size = 0.25,
                                         p.count = 1) {

  ## Check multiple peaks have been provided
  if (length(peaks.to.plot) < 2) {
    stop("Please provide at least two peaks for plotting relative expression")
  }

  relative.exp.data <- GetRelativeExpression(peaks.object, 
                                             peak.set = peaks.to.plot, p.count = p.count)

  ggData <- data.frame(Expression = as.vector(t(as.matrix(relative.exp.data))),
                       Peak = unlist(lapply(peaks.to.plot, function(x) rep(x, ncol(relative.exp.data)))))

  # Define the colour codes if not provided
  if (class(peaks.object) == "Seurat") {
    cell.idents <- Seurat::Idents(peaks.object)
    if (is.null(col.set)){
      col.set = scales::hue_pal()(length(table(Seurat::Idents(peaks.object))))
    }
  } else if (class(peaks.object) == "SingleCellExperiment") {
    cell.idents <- colData(peaks.object)$CellIdent
    if (is.null(col.set)){
      col.set = scales::hue_pal()(length(table(colData(peaks.object)$CellIdent)))
    }
  }

  ggData$Cell_ID = rep(colnames(peaks.object), length(peaks.to.plot))

  ggData$Peak <- factor(ggData$Peak, levels = peaks.to.plot)

  ## Add cell population identities. Order according to order of input
  ggData$Identity <- rep(cell.idents, length(peaks.to.plot))
  ggData$Identity = factor(ggData$Identity, levels = names(table(cell.idents)))

  pl <- ggplot(ggData, aes(y=Expression, x=Identity, fill=Identity)) + geom_violin(colour = "black") +
    ylab("Relative expression") + scale_fill_manual(values=col.set) + theme_classic(base_size = 18) +
    theme(legend.position="none", text = element_text(size = txt.size), axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
    theme(strip.background = element_blank()) + theme(strip.text.x = element_text(size = txt.size)) +
    xlab("") + facet_wrap(~ Peak)

  if (add.jitter) {
    pl <- pl + geom_jitter(size = jitter.pt.size)
  }

  if (!is.null(figure.title)) {
    pl <- pl + ggtitle(figure.title)
  }

  if (do.plot) {
    plot(pl)
  }

  if (return.plot) {
    return(pl)
  }
}

##########################################################
#'
#' Plot global shifts in 3'UTR length
#'
#' Plot global shifts in 3'UTR lengths between cell populations.
#' Input is a table of results from the Detect3UTRLengthShift functions.
#' By default evaluates whether there is a significant shift in 3'UTR length
#' between upregulated and downregulated peaks using the Wilcoxon Rank-sum test.
#' 
#' @param results.table table produced by the DetectUTRLengthShift function
#' @param plot.title optional title
#' @param do.ranksum.test whether to perform a ranksum test on the shift in UTR usage
#' @param return.plot whether to return the ggplot2 object
#' @param do.plot whether to print the figure to output
#' 
#' 
#' @examples
#' 
#' extdata_path <- system.file("extdata",package = "Sierra")
#' results.file <- paste0(extdata_path,"/Cycling_vs_resting_fibro_UTR_length_res.RData")
#' load(results.file)
#' 
#' PlotUTRLengthShift(res.table)
#'
#' @import ggplot2
#' @import dplyr
#' @importFrom genefilter plot
#' 
#' @export
#'
PlotUTRLengthShift <- function(results.table,
                                plot.title = "Global shift in 3'UTR length",
                                do.ranksum.test = TRUE,
                                return.plot = TRUE,
                                do.plot = FALSE) {

  locations.res.table.up <- subset(results.table, FC_direction == "Up")
  pos.upreg <- apply(as.matrix(locations.res.table.up[, c("SiteLocation","NumSites")]), 1, 
                function(x) {relative_location(x[1], x[2])})
  
  locations.res.table.down <- subset(results.table, FC_direction == "Down")
  pos.downreg <- apply(as.matrix(locations.res.table.down[, c("SiteLocation","NumSites")]), 1, 
                function(x) {relative_location(x[1], x[2])})
  
  if (do.ranksum.test) {
    this.test <- wilcox.test(pos.upreg, pos.downreg)
    print("Wilcoxon Rank-sum test comparing relative peak locations for up- vs down-regulated peaks:")
    print(paste0("P-value = ", this.test$p.value)) 
  }
  
  ggData <- data.frame(Peak_location = c(pos.upreg, pos.downreg),
                       FC_direction = c(rep("Up", length(pos.upreg)), rep("Down", length(pos.downreg))))
  ggData$FC_direction <- factor(ggData$FC_direction, levels = c("Up", "Down"))
  
  pl.density <- ggplot(ggData, aes(Peak_location, stat(count), fill = FC_direction)) + 
    geom_density(alpha = 0.8) + ylab("") + theme_void(base_size = 18) + 
    theme(axis.text = element_blank(), axis.title=element_blank(), axis.ticks = element_blank()) +
    ggtitle(plot.title) + theme(legend.position = "none", plot.title = element_text(hjust = 0.5)) +
    scale_fill_brewer(palette="Set1")
  
  pl.histogram <- ggplot(ggData, aes(Peak_location, fill = FC_direction)) + 
    geom_histogram(position = position_dodge(), colour = "black", binwidth = 0.1, alpha = 0.8) +
    theme_classic(base_size = 18) + xlab("Relative peak location") + ylab("Peak count") +
    scale_fill_brewer(palette="Set1") + theme(legend.position = "right") +
    guides(fill = guide_legend(title = "Fold-change\ndirection", title.position = "top"))
    
  
  ### Combine the plots together
  pl.combined <- cowplot::plot_grid(pl.density, pl.histogram, ncol=1, 
                                    rel_heights = c(0.3, 0.8), axis = "lr", align = "v")
  
  if (do.plot) {
    plot(pl.combined)
  }
  
  if (return.plot) {
    return(pl.combined)
  }
}

####################################################
#'
#' Given a peak position in a 3'UTR out of some n number of peaks,
#' relative to the terminating exon, calculate the relative position
#' of the query peak location on a scale of 0 to 1, where 0 indicates
#' the most proximal location and 1 indicates most distal.
#'
#' @param location location
#' @param n number of locations
#'
#'
relative_location <- function(location, n) {
  make_range <- function(x){(x-min(x))/(max(x)-min(x))}
  relative.locations <- make_range( (1:n) / n )
  this.location <- relative.locations[location]
  return(this.location)
}

#####################################################################
##
#' PlotCoverage
#'
#' @description
#' Plots read coverage across a gene for a set of BAM files and/or wig data.
#'
#' @param genome_gr : genome granges object
#' @param geneSymbol : Name of gene symbol
#' @param wig_data can be a data frame or a genomic ranges object. Must be stranded.
#' @param bamfiles : BAM filenames that are to be displayed as data tracks
#' @param peaks.annot an optionally named vector of peaks to annotate on the plot. 
#' @param label.transcripts if set to TRUE, adds transcript identifiers to the gene model 
#' @param wig_same_strand Display same strand or opposing strand of wig data (compared to reference gene)
#' @param genome  : genome object
#' @param bamfile.tracknames : BAM track display names. Assumed to be in same order as bamfiles.
#' @param wig_data.tracknames : WIG track display names. Assumed to be in same order as wig_data.
#' @param pdf_output : If true will create output pdf files
#' @param output_file_name : Used if pdf_output is true. Location of where files will be placed.
#' @param zoom_3UTR : If TRUE will create a second figure which will zoom in on 3'UTR.
#' @param annotation.fontsize font size for optional peak and transcript annotations 
#' @param axis.fontsize font size for the axis labels
#' @param ylims manually set the y-axis scale
#' @return NULL by default.
#' @examples
#' 
#' extdata_path <- system.file("extdata",package = "Sierra")
#' reference.file <- paste0(extdata_path,"/Vignette_cellranger_genes_subset.gtf")
#' gtf_gr <- rtracklayer::import(reference.file)
#' bam.files <- c(paste0(extdata_path,"/Vignette_example_TIP_mi.bam"),
#'                  paste0(extdata_path,"/Vignette_example_TIP_sham.bam"))
#' 
#' 
#' PlotCoverage(genome_gr = gtf_gr, geneSymbol = "Lrrc58", genome = "mm10", 
#'            bamfiles = bam.files, bamfile.tracknames=c("MI", "sham"))
#'            
#' ## Alternatively, plot with annotated peaks
#' peaks.annot <- c("Lrrc58:16:37888444-37888858:1", "Lrrc58:16:37883336-37883588:1")
#' names(peaks.annot) <- c("Peak 1", "Peak 2")
#' 
#' PlotCoverage(genome_gr = gtf_gr, geneSymbol = "Lrrc58", genome = "mm10", 
#'           peaks.annot = peaks.annot, bamfiles = bam.files, 
#'           bamfile.tracknames=c("MI", "sham"))
#'
#' @import Gviz
#' @export
PlotCoverage<-function(genome_gr, 
                       geneSymbol="", 
                       wig_data=NULL, 
                       bamfiles=NULL,
                       peaks.annot = NULL,
                       label.transcripts = FALSE,
                       wig_same_strand=TRUE, 
                       genome=NULL, 
                       pdf_output = FALSE, 
                       wig_data.tracknames=NULL, 
                       bamfile.tracknames=NULL,
                       output_file_name='', 
                       zoom_3UTR=FALSE,
                       annotation.fontsize=NULL,
                       axis.fontsize=NULL,
                       ylims = NULL
                       )
{
  # Check that gene_name field exists
  GenomeInfoDb::seqlevelsStyle(genome_gr) <- "UCSC"
  idx <-which(genome_gr$gene_name == geneSymbol)
  if (length(idx) == 0) { 
    warning("Could not find gene name. Please check spelling (and case)")
    return(NULL)
  }
  # Work out the genomic range to extract from
  genome_gr <- genome_gr[idx]
  start <- min(IRanges::start(IRanges::ranges(genome_gr)))
  end <- max(IRanges::end(IRanges::ranges(genome_gr)))
  # should I check that all returned chromosomes and strands are the same? They should be the same
  # Currently just grabbing first entry
  chrom <- as.character(GenomicRanges::seqnames(genome_gr))[1]
  gene_strand <- as.character(BiocGenerics::strand(genome_gr))[1]
  toExtract_gr <- GenomicRanges::GRanges(seqnames=chrom, ranges=IRanges::IRanges(start-1 , width=end-start+3), strand=gene_strand)

  # Assemble gene annotation track
  gene_gr <- IRanges::subsetByOverlaps(genome_gr, toExtract_gr)
  GenomeInfoDb::seqlevelsStyle(gene_gr) <- "UCSC"
  GenomeInfoDb::seqlevels(gene_gr) <- chrom

  # Following lines is a hack to get gene symbol to be name of transcripts.
  gene_name_idx <- which(names(GenomicRanges::elementMetadata(gene_gr)) == "gene_name")
  gene_id_idx <- which(names(GenomicRanges::elementMetadata(gene_gr)) == "gene_id")
  names(GenomicRanges::elementMetadata(gene_gr))[gene_id_idx] <- "ensemble_id"
  names(GenomicRanges::elementMetadata(gene_gr))[gene_name_idx] <- "gene_id"

  gene_txdb <- GenomicFeatures::makeTxDbFromGRanges(gene_gr)

  if (label.transcripts) {
    transcript.fontsize = annotation.fontsize
  } else {transcript.fontsize = 0}
  gtrack <- Gviz::GeneRegionTrack(gene_txdb, 
                                  start = start, 
                                  end = end, 
                                  chromosome=chrom, 
                                  name= geneSymbol,
                                  just.group = "above",
                                  transcriptAnnotation = "symbol",
                                  showId = FALSE,
                                  fontsize.group = transcript.fontsize)

  ### Add peaks annotations if provided
  if (!is.null(peaks.annot)) {
    start.sites <- as.numeric(sub(".*:.*:(.*)-.*:.*", "\\1", peaks.annot))
    end.sites <- as.numeric(sub(".*:.*:.*-(.*):.*", "\\1", peaks.annot))
    peak.widths <- end.sites - start.sites
    peak.names <- names(peaks.annot)
    if (is.null(peak.names)) peak.names <- peaks.annot
    
    atrack <- Gviz::AnnotationTrack(start=start.sites, 
                                    width=peak.widths, 
                                    chromosome=chrom, 
                                    strand="*",
                                    name="Peak",
                                    group=peak.names, 
                                    genome=genome, 
                                    just.group="above",
                                    showId = TRUE,
                                    fontsize.group = annotation.fontsize,
                                    rotation.title=90)
    gtrack <- c(gtrack, atrack)
  }
  
  ##### Assemble data track(s)
  dtrack <- list()  # Add data tracks assembled on this list

  ## Assemble wig data tracks
  wig_tracks <- list()
  if (! is.null(wig_data)) {
    if (typeof(wig_data) != "S4")  # assume a dataframe or list which we can create several df.
    { nc <- ncol(wig_data)  # 4 columns are chrom, start, end, strand. Thereafter are sample data
    wig_data <-  GenomicRanges::makeGRangesFromDataFrame(wig_data,keep.extra.columns=TRUE)
    }
    GenomeInfoDb::seqlevelsStyle(wig_data) <- "UCSC"
    
    if (! wig_same_strand) {  
      toExtract_gr <- GenomicRanges::invertStrand(toExtract_gr)  }
    dtrack_gr <- IRanges::subsetByOverlaps(wig_data, toExtract_gr)
    
    
    GenomeInfoDb::seqlevels(dtrack_gr) <- chrom
    sample_col_idx <- 1: ncol(S4Vectors::mcols(wig_data))
    
    # Now assemble coverage plots
    for(i in sample_col_idx)  {
      tmp_gr <- dtrack_gr
      S4Vectors::mcols(tmp_gr) <- S4Vectors::mcols(tmp_gr)[i]
      dtrack_name <- names(S4Vectors::mcols(tmp_gr))
      wig_tracks[[length(wig_tracks)+1]] <- Gviz::DataTrack(tmp_gr, name=dtrack_name, type = "histogram", genome=genome)
    }
    
  }


  ## Load BAM files onto dtrack
  if (length(bamfiles) > 0) { 
    # Set track naming
    if (length(bamfile.tracknames) > 0) {  # Defined track names has been passed to function.
      if (length(bamfile.tracknames) == length(bamfiles)) { 
        names(bamfile.tracknames) <- bamfiles 
      } else { warning("BAM track names does not match number of bam files passed. 
              Replacing with filenames.") 
        bamfile.tracknames <- bamfiles
        names(bamfile.tracknames) <- bamfiles
      }
    } else { # Default is to use bamfile names.
      bamfile.tracknames <- bamfiles
      names(bamfile.tracknames) <- bamfiles
    }
    
    # Extend gene window 50nt in both directions    
    toExtract_gr <- GenomicRanges::GRanges(seqnames=chrom, ranges=IRanges::IRanges(start-50 , width=end-start+50), strand=gene_strand)
    
    
    for(i in bamfiles) {
      bamHeader <- Rsamtools::scanBamHeader(i)
      if (length(grep(pattern = chrom,x = names(bamHeader[[i]]$targets))) == 0) {   
        GenomeInfoDb::seqlevelsStyle(toExtract_gr) <- "NCBI" 
      } else {   
        GenomeInfoDb::seqlevelsStyle(toExtract_gr) <- "UCSC" }
      
      param <- Rsamtools::ScanBamParam(which = toExtract_gr)
      bf <-Rsamtools::BamFile(i)
      
      open(bf)
      chunk0 <- GenomicAlignments::readGAlignments(bf,param=param)
      GenomeInfoDb::seqlevelsStyle(chunk0) <- "UCSC"
      close(bf)
      idx <- which(as.character(BiocGenerics::strand(chunk0)) == gene_strand)
      if (length(idx) == 0) {        
        next; }
      tmp <-GenomicRanges::coverage(chunk0[idx])
      
      gr <- GenomicRanges::GRanges(seqnames=chrom, ranges=IRanges::IRanges(start:end, width=1), strand=gene_strand)
      S4Vectors::mcols(gr) <- as.numeric(tmp[[chrom]])[start:end]
      dtrack[[length(dtrack)+1]] <- Gviz::DataTrack(gr, name=bamfile.tracknames[i], type = "histogram", genome=genome)
      
    }
  }


  if (length(wig_tracks) > 0)
  {
    dtrack <- c(wig_tracks, dtrack)
  }

  toPlot <- c(gtrack, dtrack)

  if (pdf_output)
  {
    if (output_file_name == '')
    { warning("No file name provided")
      pdf_output = FALSE
    }
    else
    {  pdf(file=output_file_name,width = 24,height = 18)
    }
  }
  extra.space = round( (end - start) * 0.02 )
  Gviz::plotTracks(toPlot, 
                   from = start,
                   to = end,
                   extend.left = extra.space,
                   extend.right = extra.space,
                   chromosome= chrom, 
                   transcriptAnnotation = "transcript",
                   showId = TRUE,
                   fontsize = axis.fontsize,
                   ylim = ylims)

  if (zoom_3UTR)
  {
    idx <- which(genome_gr$type == 'three_prime_utr')
    # Work out the genomic range of UTR
    start <- min(IRanges::start(IRanges::ranges(genome_gr[idx])))
    end <- max(IRanges::end(IRanges::ranges(genome_gr[idx])))
    extra.space = round( (end - start) * 0.02 )
    Gviz::plotTracks(toPlot, 
                     from = start, 
                     to = end, 
                     extend.left = extra.space,
                     extend.right = extra.space,
                     chromosome= chrom, 
                     transcriptAnnotation = "transcript",
                     showId = FALSE,
                     fontsize = axis.fontsize,
                     ylim = ylims)
  }



  if (pdf_output)
  {  dev.off() }

}
VCCRI/Sierra documentation built on July 3, 2023, 6:39 a.m.