R/visualization.R

Defines functions plotAnnoScoreDist regionQuantileByTargetVar rsScoreHeatmap signalAlongAxis

Documented in plotAnnoScoreDist regionQuantileByTargetVar rsScoreHeatmap signalAlongAxis

################################################################################
# functions to visualize results of COCOA, relevant regions, 
# and variation in the dataset
###############################################################################
# Some imported functions:
# data.table: copy, :=, setorder
# MIRA: dtToGr
# ComplexHeatmap
###

# functions to visualize results of COCOA, relevant regions, 
# and variation in the dataset
# 

# color schemes: red/blue, yellow/red, red/grey, skyblue/coral, skyblue/yellow

#' Visualize how genomic signal in a region set changes 
#' along a given axis
#' 
#' Look at genomic signal (e.g., DNA methylation values) in regions of 
#' interest across samples, 
#' with samples ordered according to a variable of interest (e.g. PC score). 
#' The ComplexHeatmap package
#' is used and additional parameters for the ComplexHeatmap::Heatmap function
#' may be passed to this function to modify the heatmap.   
#' 
#' @templateVar requireSampleNames TRUE
#' @template genomicSignal
#' @template signalCoord
#' @templateVar refGenomeWarning TRUE
#' @templateVar rsVisualization TRUE
#' @template regionSet
#' @param sampleScores A matrix. Must contain a column for the 
#' variable of interest/target variable.
#' E.g. The variable of interest could be 
#' the principal component scores for the samples. 
#' `sampleScores` must have sample names/IDs as row names,
#' These same sample names must be column names of genomicSignal.
#' @param orderByCol A character object. A variable to order samples by
#' (order rows of heatmap by variable, from high to low value).
#' Must be the name of a column in sampleScores. For instance, if doing
#' unsupervised COCOA with PCA, orderByCol might be the name of one of the PCs
#' (e.g. "PC1"). If doing supervised COCOA, orderByCol might be the name
#' of the target variable of the supervised analysis. 
#' @param topXVariables Numeric. The number of variables from genomicSignal
#' to plot. The variables with the highest scores according to 
#' variableScores will be plotted. Can help to reduce the size of the plot.
#' @param variableScores Numeric. A vector that has a numeric score for
#' each variable in genomicSignal (length(variableScores) should equal
#' nrow(genomicSignal)). Only used if topXVariables is given. The highest
#' `topXVariables` will be plotted.
#' @param decreasing Logical. Whether samples should be sorted in 
#' decreasing order of `orderByCol` or not (FALSE is increasing order).
#' @param cluster_rows Logical. Whether rows should be clustered. 
#' This should be kept as FALSE to keep the correct ranking of 
#' samples/observations according to their target variable score.
#' @param cluster_columns Logical. Whether to cluster columns 
#' (the genomic signal, e.g. DNA methylation values for each CpG).
#' @param row_title Character object, row title
#' @param column_title Character object, column title
#' @param column_title_side Character object, where to put the column title:
#' "top" or "bottom"
#' @param name Character object, legend title
#' @param col A vector of colors or a color mapping function which
#' will be passed to the ComplexHeatmap::Heatmap() function. See ?Heatmap
#' (the "col" parameter) for more details. "#EEEEEE" is the code for a
#' color similar to white.
#' @param ... Optional parameters for ComplexHeatmap::Heatmap()
#' @return A heatmap of genomic signal values (eg DNA methylation levels) 
#' in regions of interest (regionSet), with rows ordered by the
#' column of sampleScores given with `orderByCol`.
#' Each row is a patient/sample and 
#' each column is an individual genomic signal value. 
#'
#' @examples data("brcaMethylData1")
#' data("brcaMCoord1")
#' data("esr1_chr1")
#' data("brcaPCScores")
#' signalHM <- signalAlongAxis(genomicSignal=brcaMethylData1,
#'                              signalCoord=brcaMCoord1,
#'                              regionSet=esr1_chr1,
#'                              sampleScores=brcaPCScores,
#'                              orderByCol="PC1", cluster_columns=TRUE)
#' 
#' @export
# previously called rsMethylHeatmap
signalAlongAxis <- function(genomicSignal, signalCoord, regionSet, 
                          sampleScores, orderByCol="PC1", topXVariables=NULL, 
                          variableScores=NULL,
                          decreasing=TRUE,
                          cluster_columns = FALSE, 
                          cluster_rows = FALSE, 
                          row_title = "Sample",
                          column_title = "Genomic Signal", 
                          column_title_side = "bottom",
                          name = "Genomic Signal Value",
                          col = c("blue", "#EEEEEE", "red"), ...) {
    
    
    if (!(is(genomicSignal, "matrix") || is(genomicSignal, "data.frame"))) {
        stop("genomicSignal should be a matrix or data.frame. Check object class.")
    }
    
    # test for appropriateness of inputs/right format
    if (is(signalCoord, "GRanges")) {
        coordGR <- signalCoord
    } else if (is(signalCoord, "data.frame")) {
        # UPDATE: does the work on data.frames that are not data.tables?
        coordGR <- dtToGr(signalCoord)
    } else {
        stop("signalCoord should be a data.frame or GRanges object.")
    }
    
    if (!(is(sampleScores, "matrix") || is(sampleScores, "data.frame"))) {
        stop("sampleScores should be a matrix or data.frame.")
    }
    
    
    # PCA object must have subject_ID as row.names (corresponding 
    # to column names of genomicSignal)
    if (sum(row.names(sampleScores) %in% colnames(genomicSignal)) < 2) {
        stop(cleanws("Sample names on pca data (row names) 
                     must match sample names on methylation
                     (column names)"))
    }
    
    
    if (!is(regionSet, "GRanges")) {
        stop("regionSet should be a GRanges object. Check object class.")
    }
    
    if (!is.null(topXVariables)) {
        if (is.null(variableScores)) {
            stop("To plot the topXVariables, variableScores must be given.")
        }
        if (!(length(variableScores) == nrow(genomicSignal))) {
            stop("length(variableScores) should equal nrow(genomicSignal)")
        }
        
    }
    
    
    
    # coordGR =
    olList <- findOverlaps(regionSet, coordGR)
    # regionHitInd <- sort(unique(queryHits(olList)))
    cytosineHitInd <- sort(unique(subjectHits(olList)))
    thisRSMData <- t(genomicSignal[cytosineHitInd, , drop=FALSE])
    nRegion = length(unique(queryHits(olList)))
    # get top variables
    if (!is.null(topXVariables)) {
        if (nRegion > topXVariables) {
            # select top variables
            thisRSMData <- thisRSMData[, order(variableScores[cytosineHitInd], decreasing = TRUE)][, 1:topXVariables]
        }
    }
    subject_ID <- row.names(thisRSMData)
    # centeredPCAMeth <- t(apply(t(genomicSignal), 1, 
    #                            function(x) x - pcaData$center)) #center first 
    # reducedValsPCA <- centeredPCAMeth %*% pcaData$rotation
    # reducedValsPCA <- pcaData$x
    # pcaData must have subject_ID as row name
    # Error in thisRSMData[names(sort(sampleScores[, orderByCol], decreasing = decreasing)),  : 
    #                          subscript out of bounds 
    thisRSMData <- thisRSMData[names(sort(sampleScores[, orderByCol], 
                                          decreasing = decreasing)), ]
    
    # message(paste0("Number of epigenetic features: ", length(cytosineHitInd)))
    # message(paste0("Number of region set regions: ", nRegion))
    
    
    
    ComplexHeatmap::Heatmap(thisRSMData, 
                            col = col,
                            row_title = row_title,
                            column_title = column_title,
                            column_title_side = column_title_side,
                            cluster_rows = cluster_rows, 
                            cluster_columns = cluster_columns, 
                            name = name, ...)
}


#' Heatmap of region set scores
#' 
#' Heatmap of the ranking of region set scores across target variables.
#' A visualization of the rank of region sets in each target variable, 
#' allowing the
#' user to see if a region set is ranked highly for all target variables 
#' or only a subset.
#' Region sets will be ranked from highest scoring to lowest based on 
#' their score for `orderByCol`.
#' The ComplexHeatmap package
#' is used and additional parameters for the ComplexHeatmap::Heatmap function
#' may be passed to this function to modify the heatmap.  
#' 
#' @template rsScores
#' @templateVar usesRSScores TRUE
#' @template signalCol
#' @param orderByCol A character object. Target variable to order by in heatmap 
#' (arranged in decreasing order for scores so p values should 
#' be -log transformed). Must be the name of a column in rsScores.
#' @param rsNameCol Character. Name of the column in rsScores that has the
#' names/identifiers for the region sets so these can be included 
#' in the plot as row names.
#' @param topX Number of top region sets to include in the heatmap
#' @param row_title Character object, row title
#' @param column_title Character object, column title
#' @param column_title_side Character object, where to put the column title:
#' "top" or "bottom"
#' @param cluster_rows Logical object, whether rows should be clustered. 
#' This should be kept as FALSE to keep the correct ranking of region sets.
#' @param cluster_columns Logical object, whether to cluster columns. 
#' It is recommended
#' to keep this as FALSE so it will be easier to compare target variables
#' that are ordered (such as principal components). 
#' With cluster_columns = FALSE, they will be in the same specified
#' order in different heatmaps.
#' @param show_row_names Logical object, display row names (ie region set names)
#' @param row_names_max_width "unit" object. The amount of room to 
#' allocate for row names. See ?grid::unit for object type.
#' @param name Character object, legend title
#' @param col A vector of colors or a color mapping function which
#' will be passed to the ComplexHeatmap::Heatmap() function. See ?Heatmap
#' (the "col" parameter) for more details. "#EEEEEE" is the code for a
#' color similar to white.
#' @param ... Optional parameters for ComplexHeatmap::Heatmap().
#' @return A heatmap of region set scores across. Each row is a region set,
#' each column is a target variable. 
#' The color corresponds to the relative rank of a 
#' region set's score for a given target variable out of all tested region sets.
#'
#' @examples data("rsScores")
#' scoreHeatmap <- rsScoreHeatmap(rsScores, 
#'           signalCol=paste0("PC", 1:2), orderByCol = "PC2")
#' @export

rsScoreHeatmap <- function(rsScores, signalCol=paste0("PC", 1:5),
                            orderByCol="PC1", rsNameCol = "rsName", topX = 20, 
                           col=c("red", "#EEEEEE", "blue"),
                           row_title = "Region Set", column_title = "Principal Component",
                           column_title_side = "bottom",
                           cluster_rows = FALSE, cluster_columns = FALSE,
                           show_row_names = TRUE, 
                           row_names_max_width = unit(10000, "mm"),
                           name="Rank within PC", ...) {
    
    
    if (!is(signalCol, "character")) {
        stop("signalCol should be a character object (eg 'PC1').")
    }
    
    # function is not meant for making a heatmap of a single PC but should work
    if (is(rsScores, "numeric")) {
        warning("rsScores should be a data.frame")
        rsScores <- data.frame(rsScores)
        colnames(rsScores) <- orderByCol
    }
    if (!is(rsScores, "data.frame")) {
        stop("rsScores should be a data.frame.")
    }
    
    if (!(orderByCol %in% colnames(rsScores))) {
        stop(cleanws(paste0("orderByCol parameter:", orderByCol, 
                            ", was not a column of rsScores")))
    }
    if (!(rsNameCol %in% colnames(rsScores))) {
        stop(cleanws(paste0(rsNameCol, "was not a column of rsScores. 
                            Specify the column containing region set names 
                            with the rsNameCol parameter.")))
    }
    
    # so by reference operations will not affect original object
    rsScores <- as.data.table(data.table::copy(rsScores))
    
    
    # prevent indexing out of bounds later
    if (nrow(rsScores) < topX) {
        topX = nrow(rsScores)
    }
    
    
    # only ones you have data for
    signalCol <- signalCol[signalCol %in% colnames(rsScores)]
    if (length(signalCol) == 0) {
        stop("Please check format of PC names in signalCol.")
    }
     
    # apparently erases row names
    rsScores <- rsScores[, c(signalCol, rsNameCol), with=FALSE] 

    
    # how to deal with NA?
    
    # number of region sets tested
    rsNum <- nrow(rsScores)
    
    # convert to data.table to do some data.table operations
    rsScores = as.data.table(rsScores)
    
    for (i in seq_along(signalCol)) {
        # first convert to rank
        setorderv(rsScores, signalCol[i], order = -1L, na.last=TRUE) # descending order
        rsScores[, signalCol[i] := seq_len(rsNum)]
        
        # center around zero
        # rsScores[, signalCol[i] := ((rsNum + 1) / 2) - get(signalCol[i])]
    }
    
    # heatmap of the centered ranks
    # back to first order, -1L means decreasing order
    setorderv(rsScores, orderByCol, order = 1L) 
    rowNames <-  rsScores[, get(rsNameCol)] # redefined/reordered later
    row.names(rsScores) <- rowNames
    rsScores[, c(rsNameCol) := NULL]
    rsScores <- as.matrix(rsScores)
    row.names(rsScores) <- rowNames
    Heatmap(rsScores[seq_len(topX), ], 
            col = col,
            row_title = row_title, column_title = column_title, 
            cluster_rows = cluster_rows, 
            cluster_columns = cluster_columns, 
            column_title_side = column_title_side,
            show_row_names = show_row_names, 
            row_names_max_width = row_names_max_width, 
            name = name,
            heatmap_legend_param=, ...)
    
}


#' Visualize how individual regions are associated with target variable
#' 
#' Visualize how much each region in a region set 
#' is associated with each target variable. 
#' For each target variable (`signalCol`), the average (absolute) 
#' signal value is calculated for 
#' each region in the region set. Then for a given target variable, 
#' the average signal is converted to a percentile/quantile based 
#' on the distribution of all signal values 
#' for that target variable. These values are
#' plotted in a heatmap.
#' 
#' @template signal
#' @template signalCoord
#' @templateVar refGenomeWarning TRUE
#' @templateVar rsVisualization TRUE
#' @template regionSet
#' @param rsName Character. Name of the region set. 
#' For use as a title for the heatmap.
#' @template signalCol
#' @param maxRegionsToPlot How many top regions from region set to include
#' in heatmap. Including too many may slow down computation and increase memory
#' use. If regionSet has more regions than maxRegionsToPlot, a number of regions 
#' equal to maxRegionsToPlot will be randomly sampled from the region set and
#' these regions will be plotted. Clustering rows is a major limiting factor
#' on how long it takes to plot the regions so if you want to plot many regions, 
#' you can also set cluster_rows to FALSE.
#' @param row_title Character object, row title
#' @param column_title Character object, column title
#' @param column_title_side Character object, where to put the column title:
#' "top" or "bottom"
#' @param cluster_rows Logical object, whether to cluster rows or not (may 
#' increase computation time significantly for large number of rows)
#' @param cluster_columns Logical object, whether to cluster columns. 
#' It is recommended
#' to keep this as FALSE so it will be easier to compare target
#' variables that have a certain order such as PCs 
#' (with cluster_columns = FALSE, they will be in the same specified
#' order in different heatmaps)
#' @param name Character object, legend title
#' @param col A vector of colors or a color mapping function which
#' will be passed to the ComplexHeatmap::Heatmap() function. See ?Heatmap
#' (the "col" parameter) for more details.
#' @template absVal
#' @param ... Optional parameters for ComplexHeatmap::Heatmap()
#' @return A heatmap. Columns are signalCol's, rows are regions. 
#' This heatmap allows you to see if some regions are 
#' associated with certain target variables but not others. 
#' Also, you can see if a subset of 
#' regions in the region set are associated with 
#' target variables while another subset
#' are not associated with any target variables 
#' To color each region, first the (absolute) signal 
#' values within that region are
#' averaged. Then this average is compared to the distribution of all (absolute)
#' individual signal values for the given target variable to get 
#' a quantile/percentile 
#' for that region. Colors are based on this quantile/percentile. 
#' The output is a Heatmap object (ComplexHeatmap package).
#' 
#' @examples
#' data("brcaATACCoord1")
#' data("brcaATACData1")
#' data("esr1_chr1")
#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation
#' regionByPCHM <- regionQuantileByTargetVar(signal = featureContributionScores, 
#'                                    signalCoord = brcaATACCoord1, 
#'                                    regionSet = esr1_chr1, 
#'                                    rsName = "Estrogen Receptor Chr1", 
#'                                    signalCol=paste0("PC", 1:2),
#'                                    maxRegionsToPlot = 8000, 
#'                                    cluster_rows = TRUE, 
#'                                    cluster_columns = FALSE, 
#'                                    column_title = rsName, 
#'                                    name = "Percentile of Loading Scores in PC")
#'                                    
#' 
#' @export
regionQuantileByTargetVar <- function(signal, signalCoord, regionSet, 
                               rsName = "", signalCol=paste0("PC", 1:5),
                               maxRegionsToPlot = 8000, cluster_rows = TRUE, 
                               row_title = "Region", column_title = rsName,
                               column_title_side = "top",
                               cluster_columns = FALSE, 
                               name = "Percentile of Loading Scores in PC", 
                               col = c("skyblue", "yellow"),
                               absVal=TRUE, ...) {
    
    ################### checking inputs  #################################
    
    ########## check that inputs are the correct class
    checkConvertInputClasses(signal=signal,
                             signalCoord=signalCoord,
                             regionSet=regionSet,
                             signalCol = signalCol)
    
    ########## check that dimensions of inputs are consistent
    # length of signal coord = nrow of `signal`
    if (length(signalCoord) != nrow(signal)) {
        stop(cleanws("The number of coordinates in 
            signalCoord (length(signalCoord)) does not equal the number of 
                     rows in `signal`"))
    } 
    
    ######### check that appropriate columns are present
    # signalCol are column names of `signal`
    if (!all(signalCol %in% colnames(signal))) {
        missingCols = signalCol[!(signalCol %in% colnames(signal))]
        stop(cleanws(paste0("Some signalCol are not 
                            columns of `signal`: ", missingCols)))
    }
    
    #######
    # what happens if there are NAs or Inf in `signal`?
    
    #################################################################

    
    # if too many regions to plot, randomly subsample regions
    if (length(regionSet) > maxRegionsToPlot) {
        regionInd <- sample(x = seq_along(regionSet), size = maxRegionsToPlot, 
                            replace = FALSE)
        # get subset
        regionSet <- regionSet[regionInd]
    }
    
    
    
    rsRegionAverage <- averagePerRegion(signal = signal, 
                                       signalCoord =signalCoord, 
                                       regionSet = regionSet, 
                                       signalCol = signalCol,
                                       returnQuantile = TRUE,
                                       absVal=absVal)
    # ranking in terms of percentiles in case there were different 
    # distributions of loading scores for each PC
    
    # if no overlap, it will be null
    if (!is.null(rsRegionAverage)) {
        # the heatmap
        Heatmap(matrix = as.matrix(rsRegionAverage[, signalCol, with=FALSE]), 
                row_title = row_title,
                column_title = rsName,
                column_title_side = column_title_side,
                cluster_rows = cluster_rows,
                cluster_columns = cluster_columns, 
                name = name, 
                col = col, ...)
    }

}


#' Plot ranked region set scores, annotating groups of interest
#' 
#' Visualize the distribution of region set scores for region set groups of
#' interest.
#' 
#' If the same region set matches two patterns, the later group
#' will be assigned to that region set.
#' 
#' @param rsScores data.frame. Each row should be a region set. 
#' Columns should include score columns and a column that contains the name
#' of each region set.
#' @param colToPlot character. Name of the column with region set scores to plot.
#' @param pattern character. Region sets that match each pattern will be given 
#' the same color.
#' Multiple patterns can be given as character objects in a vector (each will
#' have a different color).
#' Regular expressions can be used (ignore.case=TRUE though). For example,
#' to search for ER or GATA3 and color with a single color, 
#' this pattern can be given "ER|GATA3".
#' @param patternName character. A name for each string in "pattern" that
#' will be used for the legend.
#' @param rsNameCol character. Column name of "rsScores" column that contains
#' the name or description of each region set. This column will be 
#' searched for the pattern given by the "pattern" parameter.
#' @param alpha numeric. Transparency of points. See ggplot documentation for
#' further details.
#' @param shape integer. Shape of the points. 
#' See ggplot documentation for options.
#' 
#' @return ggplot object that can be modified using ggplot syntax. E.g.
#' plot + ggplot_function
#' @examples 
#' data(rsScores)
#' rsScores$rsName <- c("ER", "GATA3", "ER", "GATA3", "AP1") 
#' plotAnnoScoreDist(rsScores, colToPlot="PC1", pattern="ER", alpha=1)
#' plotAnnoScoreDist(rsScores, colToPlot="PC2", pattern=c("ER", "GATA3"))
#' @export
plotAnnoScoreDist <- function(rsScores, colToPlot, pattern, patternName=pattern, 
                              rsNameCol="rsName", alpha=0.5, shape=3) {
    
    if (!(is(rsScores, "data.frame") | is(rsScores, "matrix"))) {
        stop("rsScores should be a data.frame")
    }
    if (length(colToPlot) > 1) {
        stop("colToPlot may only be a single column.")
    }
    
    if (!all(c(colToPlot, rsNameCol) %in% colnames(rsScores))) {
        stop("colToPlot and rsNameCol must be columns of rsScores.")    
    }
    
    rsScores$rank <- order(order(as.numeric(rsScores[, colToPlot]), decreasing = TRUE))
    
    rsGroup <- rep(0, nrow(rsScores))
    for (i in seq_along(pattern)) {
        thisPatternInd <- grepl(pattern = pattern[i], 
                               x = rsScores[, rsNameCol], 
                               ignore.case = TRUE)
        # earlier groups will be overwritten if they are not mutually exclusive
        # (e.g. if the same region set matches two patterns, the later group
        # will be assigned)
        rsGroup[thisPatternInd] <- i
    }
    rsGroup <- as.factor(rsGroup)
    # necessary so region set names will be in legend
    levels(rsGroup) <- c("Other", patternName)
    rsScores$Group <- rsGroup 
    
    
    rsCorP <- ggplot(data = rsScores, 
                    mapping = aes(x=rank, y=get(colToPlot), 
                                  col=Group)) +
        theme_classic() + theme(text = element_text(colour = "black"),
                                axis.text=element_text(colour = "black"),
                                axis.ticks = element_line(colour = "black")) +
        # alpha=Group
        # geom_point(alpha=0.0, shape=3) +
        ylab("Region set score") + xlab("Region set rank") +
        scale_color_discrete(drop = FALSE) + theme(aspect.ratio = 1)
    
    
    # add each group (Other and pattern) sequentially so they will be plotted on top
    # of other points ('Other' plotted first)
    
    for (i in 1:(length(pattern) + 1)) {
        rsCorP <- rsCorP + geom_point(data = rsScores[as.numeric(rsScores$Group) == i, ], alpha=alpha, shape=shape) 
    }

    
    return(rsCorP)
}

Try the COCOA package in your browser

Any scripts or data that you put into this service are public.

COCOA documentation built on Nov. 8, 2020, 5:42 p.m.