R/make.saturation.plots.r
In MOMA: Multi Omic Master Regulator Analysis

Documented in genomicPlotSmall getDataFrame getSubtypeEventTables makeCoverageDf makeSaturationPlots mergeData mergeDataBySubtype oncoprintPlot plotEvents

#' Main function to generate the summary plots of the analysis
#' @import ComplexHeatmap
#' @import circlize
#' @import grid
#' @param momaObj : momaObj that has already run the saturationCalculation function
#' @param clustering.solution : clustering vector with sample names and cluster designations
#' @param important.genes : vector of gene names to prioritize when plotting. 
#' Can be general genes of interest, oncogenes, tumor supressors etc
#' @param fCNV : vector of confirmed functional CNVs if calculated. Will filter for only those CNVs
#' @param max.events : maximum number of events to plot for the oncoplots
#' @return object with both types of summary plot for each subtype
#' @examples 
#' 
#' \dontrun{
#' makeSaturationPlots(momaObj, max.events = 20)
#' }
#' @export
makeSaturationPlots <- function(momaObj, clustering.solution = NULL, 
                                important.genes = NULL, fCNV = NULL, 
                                max.events = 30) {
  
  # check for required components of previous saturation analysis and set as local variables
  if (length(momaObj$checkpoints) == 0) {
    stop("Genomic Saturation analysis has not been done. Run momaObj$saturationCalculation() before continuing.
         Quitting...")
  }
  
  checkpoints <- momaObj$checkpoints
  genomic.saturation <- momaObj$genomic.saturation
  
  # get clustering solution to use for calculations. should be provided or previously saved in the momaObj
  if (is.null(clustering.solution)) {
    if (length(momaObj$sample.clustering) == 0) {
      stop("No clustering solution provided. Provide one as an argument or save one
                to the momaObj. Quitting...")
    } else {
      clustering.solution <- momaObj$sample.clustering
    }
  }
  
  # make gene to cytoband location mapping
  band2gene <- momaObj$gene.loc.mapping$Gene.Symbol
  names(band2gene) <- momaObj$gene.loc.mapping$Cytoband
  
  
  
  ###########################
  # Get raw input mutation/cnv/fusion matrices  
  # from the momaObj and prep for oncoprint plots   
  ##########################
  
  # Mutations
  snpmat <- momaObj$mut
  snpmat[snpmat == 1] <- "mut"
  snpmat[snpmat == 0] <- NA
  rownames(snpmat) <- mapEntrez(rownames(snpmat))
  
  # Fusions if they exist
  fusions.mat <- NULL
  if(!is.na(momaObj$fusions[1,1])) {
    fusions.mat <- momaObj$fusions
    fusions.mat[fusions.mat == 1] <- "fus"
    fusions.mat[fusions.mat == 0] <- NA
  }
  
  # CNVs. Input should be GISITC scores so separate into high/low events
  # Filter to only keep fCNVs if provided
  cnv <- momaObj$cnv
  if(!is.null(fCNV)){
    cnv <- cnv[na.omit(match(fCNV, rownames(cnv))),]
    if(nrow(cnv) == 0) {
      stop("CNV matrix after functional CNV filtering is empty. 
           Check that names are in same format or if too many fCNVs are provided.
           Quitting...")
    }
  }
  
  amps <- dels <- cnv
  
  amps[amps < 0.5] <- NA
  amps[amps >= 0.5] <- "amp"
  
  dels[dels > -0.5] <- NA
  dels[dels <= -0.5] <- "del"
  
  
  rownames(amps) <- rownames(dels) <- mapEntrez(rownames(cnv))
  
  ##########################
  # Restructure saturation data 
  # to prep for plotting
  ##########################
  
  
  # get subtype event tables
  subtype.tables <- getSubtypeEventTables(genomic.saturation, clustering.solution, checkpoints)
  
  # get summary table of unique events added in for each regulator 
  ## could be clarified/improved
  ## also potential improvement: look for inflection points of huge jumps of new unique events and highlight those regulators in particular?
  tissue.coverage.df <- mergeDataBySubtype(genomic.saturation, clustering.solution, 100)
  
  
  # initalize object to save plots in
  tmp.oncoplots <- list()
  tmp.curveplots <- list()
  
  if(!is.null(important.genes)) {
    message("List of important genes has been provided. Will prioritize plotting these events first")
  }
  
  for(k in seq_along(subtype.tables)) {
    
    ###########
    # Oncoprint Events Plots
    ###########
    samples.thisCluster <- names(clustering.solution[clustering.solution == k])
    message("")
    message("Number of samples in cluster ", k, ": ", length(samples.thisCluster))
    
    # subset genomic event matrices for this cluster
    snpmat.thisClus <- snpmat[,colnames(snpmat) %in% samples.thisCluster]
    #cnv.thisClus <- cnv[,colnames(cnv) %in% samples.thisCluster]
    amps.thisClus <- amps[,colnames(amps) %in% samples.thisCluster]
    dels.thisClus <- dels[,colnames(dels) %in% samples.thisCluster]
    fusions.thisClus <- NULL
    if (!is.null(fusions.mat)) {	
      fusions.thisClus <- fusions.mat[,colnames(fusions.mat) %in% samples.thisCluster]
      if(ncol(fusions.thisClus) == 0) {
        fusions.thisClus <- NULL
      }
    }
    
    p.oncoprint <- oncoprintPlot(summary.vec = subtype.tables[[k]], snpmat.thisClus, amps.thisClus, dels.thisClus, fusions.thisClus, 
                                 important.genes, band2gene, max.events, k)
    tmp.oncoplots[[k]] <- p.oncoprint
    
    #########
    # Saturation Curve Plots
    #########
    p.coverage <- genomicPlotSmall(tissue.coverage.df, fraction=0.85, tissue.cluster=k)
    tmp.curveplots[[k]] <- p.coverage
    
  }
  
  list("oncoprint.plots" = tmp.oncoplots, "curve.plots" = tmp.curveplots)
  
}










#' Helper function to get subtype specific events
#' @param saturation.data : genomic saturation object from MOMA. List indexed by 
#' cluster then sample then regulator with the number of events associated with 
#' each additional regulator
#' @param sample.clustering : clustering vector with sample names and 
#' cluster designations
#' @param checkpoints : from momaObj
#' @return a table that has counts of how many times a particular event 
#' happens in a cluster
#' @keywords internal
getSubtypeEventTables <- function(saturation.data, sample.clustering, 
                                  checkpoints) {
  
  subtype.coverage <- list()
  coverage.allSubtypes <- saturation.data
  clustering <- sample.clustering
  
  # aggregate statistics across all samples, but use different subtype 
  # specific solutions
  for (cluster.id in unique(clustering)) {
    # dataframe with sample, type, gene names for each 
    # amp/del events can be locations. 
    coverage <- coverage.allSubtypes[[cluster.id]]
    sample.set <- names(coverage)
    
    # get number of regulators decided for this checkpoint depending on threshold 
    mr.cutoff <- length(checkpoints[[cluster.id]])
    
    #if (isTRUE(checkpoint.spec)) {
    #mr.cutoff <- length(momaObj$checkpoints.clusterSpecific[[cluster.id]])
    #} else {
    # mr.cutoff <- length(momaObj$checkpoints[[cluster.id]])
    #}
    
    subtype.coverage[[cluster.id]] <- makeCoverageDf(coverage, cutoff=mr.cutoff)
    
  }
  names(subtype.coverage) <- unique(clustering)
  
  # make summary table that counts the number of events in the group
  subtype.tables <- lapply(names(subtype.coverage), function(cluster.id) {
    table(apply(subtype.coverage[[cluster.id]], 1, function(x) paste0(x[1], ':', x[2])))
  })
  subtype.tables
}


#' Helper function for making the coverage dataframe 
#' @param coverage.list : List indexed by sample name, 
#' contains mut/fus/amp/del interactions
#' @param cutoff : number of regulators to include
#' @return dataframe with each sample and which events are 
#' captured by the checkpoint mrs
#' @keywords internal
makeCoverageDf <- function(coverage.list, cutoff) {
  
  df <- c()
  for (name in names(coverage.list)) {
    # skip empty/missing sample data
    if (is.null(coverage.list[[name]])) { next }
    
    for (type in c("mut", "amp", "del", "fus")) {
      
      events.thisSample <- coverage.list[[name]][[cutoff]][[type]]
      if (is.null(events.thisSample)) {
        non.null.idx <- which(vapply(coverage.list[[name]], 
                                     function(x) !is.null(x), 
                                     FUN.VALUE = logical(1))) ## ??
        real.cutoff <- max(non.null.idx[non.null.idx < cutoff])
        events.thisSample <- coverage.list[[name]][[real.cutoff]][[type]]
      }
      
      CT <- length(events.thisSample)
      if (CT==0) { next }
      submat <- cbind(rep(name, CT), rep(type, CT), events.thisSample)
      df <- rbind(df, submat)
    }
  }
  
  df <- data.frame(event=df[,3], type=df[,2], sample=df[,1])
  df	
}


#' Create data frame from coverage data, including number of total events 
#' 'covered' and unique events
#' @param genomic.saturation : data from genomic saturation function
#' @param sample.clustering : clustering vector with sample names and 
#' cluster designations
#' @param topN : number of regulators to look through. default is 100
#' @return dataframe with coverage data for genomic events
#' @keywords internal
mergeDataBySubtype <- function(genomic.saturation, sample.clustering, 
                               topN = 100)  {
  
  # generate summary stats for each subtype	
  full.df <- c()
  for (subtype in unique(sample.clustering)) {	
    
    # unnecessary 
    # subtype.samples <- names(sample.clustering[sample.clustering==subtype])
    coverage.subtype <- genomic.saturation[[subtype]]
    df <- mergeData(coverage.subtype, topN)
    
    # append a column specifying the subtype	
    df$subtype <- rep(subtype, nrow(df))
    full.df <- rbind(full.df, df)
  }
  
  full.df
}


#' Helper function for mergeDataBySubtype
#' @param coverage.range : genomic saturation for a particular subtype
#' @param topN : max number of top regulators to search through
#' @return dataframe with coverage data for genomic events
#' @keywords internal
mergeData <- function(coverage.range, topN)  {
  
  data <- c()
  for (i in seq_len(topN)) {
    # count for each sample
    # $mut/amp/del all point to either a NA or a vector of names of the event. If NA the length will be zero
    # so simply count the number of each type of event 
    count <- unlist(lapply(coverage.range, function(x) {
      num.events <- length(x[[i]]$mut)+length(x[[i]]$amp)+length(x[[i]]$del)
    }))
    count <- na.omit(count)
    
    # apply over each sample, get the coverage for each
    fraction <- unlist(lapply(coverage.range, function(x) {
      # critically: must omit the NAs so they don't interfere with count
      event.fractions <- x[[i]]$total.frac
      event.fractions
    }))
    fraction <- na.omit(fraction)
    
    all.events <- unlist(lapply(coverage.range, function(x) {
      c(x[[i]]$mut, x[[i]]$amp, x[[i]]$del)
    }))
    all.events <- na.omit(all.events)
    
    data <- rbind(data, c(i, mean(count), mean(fraction), length(unique(all.events))))
  }
  df <- data.frame(mean=data[,2], k=data[,1], fraction=data[,3], unique.events=data[,4]) 
  df	
}


#' Function to plot genomic events in the style of oncoPrint/cBioPortal
#' @importFrom tidyr separate
#' @importFrom dplyr arrange select mutate mutate_all everything bind_rows group_by n desc
#' @importFrom tibble deframe as_tibble rownames_to_column tibble column_to_rownames
#' @importFrom stringr str_split_fixed
#' @importFrom rlang .data
#' @importFrom grid textGrob
#' @param summary.vec : named vector of the counts, named 'Event name':'Type'
#' where type is 'mut', 'amp', 'del', 'fus'. Mutations are in Entrez ID
#' Amp/Deletion CNV events are in genomic band location
#' @param snpmat.thisClus : SNP matrix subset to samples in current cluster
#' @param amps.thisClus : CNV matrix subset to samples in current cluster (just amplifications)
#' @param dels.thisClus : CNV matrix subset to samples in current cluster (just deletions)
#' @param fusions.thisClus : Fusion matrix subset to samples in current cluster
#' @param important.genes : well known genes to highlight in the analysis
#' @param band2gene : mapping of genomic location IDs to gene name: vector of 
#' HUGO gene ids, named by genomic location
#' @param max.events : maximum number of events to plot for the oncoplots
#' @param k : current cluster number
#' @return oncoprint event plot
#' @keywords internal
oncoprintPlot <- function(summary.vec, snpmat.thisClus, amps.thisClus, 
                          dels.thisClus, fusions.thisClus, important.genes, 
                          band2gene, max.events, k) {
  
  data <- data.frame(coverage=names(summary.vec), freq=as.numeric(summary.vec)) %>%
    tidyr::separate(.data$coverage, c("id", "type"), ":", remove = FALSE) %>%
    dplyr::arrange(dplyr::desc(.data$freq))
  
  ## Split into separate data types
  mut.data <- data[data$type == "mut",] %>% 
    dplyr::mutate(id = mapEntrez(.data$id)) %>% 
    dplyr::select(.data$id, .data$freq) %>% 
    tibble::deframe()
  
  amp.data <- data[data$type == "amp",] %>%
    dplyr::select(.data$id, .data$freq) %>% 
    dplyr::mutate(gene = NA)
  
  del.data <- data[data$type == "del",] %>%
    dplyr::select(.data$id, .data$freq) %>% 
    dplyr::mutate(gene = NA)
  
  fus.data <- data[data$type == "fus",] %>% 
    dplyr::select(.data$id, .data$freq) %>% 
    tibble::deframe()
  
  
  if(is.null(important.genes)) {
    # if no list of important genes has been provided just plot top events
    
    # subset mutations by name
    mut.mat <- tibble::as_tibble(snpmat.thisClus[names(mut.data),], rownames = NA) %>% 
      tibble::rownames_to_column(var = "genomic.event") %>% 
      dplyr::mutate(type = "mut") %>% 
      dplyr::mutate_all(as.character) %>% 
      dplyr::select(.data$genomic.event, .data$type, dplyr::everything())
    
    # for each cytoband region subset the genes in that region and get the gene 
    # with the highest number of events to represent it
    
    # amplifications 
    for(row in seq_len(nrow(amp.data))) {
      cnv.loc <- amp.data$id[row]
      genes.inBand <- as.character(band2gene[which(names(band2gene)==cnv.loc)])
      band.mat <- amps.thisClus[intersect(genes.inBand, rownames(amps.thisClus)),]
      if(is.null(nrow(band.mat))) { 
        # only one gene case, just use that one
        amp.data$gene[row] <- intersect(genes.inBand, rownames(amps.thisClus))
      } else {
        rank.events <- apply(band.mat, 1, function(x) sum(!is.na(x))) %>% sort(decreasing = TRUE)  
        # take the one with most events cause sometimes genes in the band won't all have same score
        amp.data$gene[row] <- names(rank.events)[1]
      }
      
    }
    
    amps.mat <- amps.thisClus[amp.data$gene, ]
    amps.mat <- cbind(genomic.event = amp.data$id, type = rep("amp", nrow(amps.mat)), amps.mat) %>% 
      as.data.frame(stringsAsFactors = FALSE)
    
    # deletions
    for(row in seq_len(nrow(del.data))) {
      cnv.loc <- del.data$id[row]
      genes.inBand <- as.character(band2gene[which(names(band2gene)==cnv.loc)])
      band.mat <- dels.thisClus[intersect(genes.inBand, rownames(dels.thisClus)),]
      if(is.null(nrow(band.mat))) { 
        # only one gene case, just use that one
        del.data$gene[row] <- intersect(genes.inBand, rownames(dels.thisClus))
      } else {
        rank.events <- apply(band.mat, 1, function(x) sum(!is.na(x))) %>% sort(decreasing = TRUE)  
        # take the one with most events cause sometimes genes in the band won't all have same score
        del.data$gene[row] <- names(rank.events)[1]
      }
      
    }
    
    dels.mat <- dels.thisClus[del.data$gene, ]
    dels.mat <- cbind(genomic.event = del.data$id, type = rep("del", nrow(dels.mat)), dels.mat) %>%
      as.data.frame(stringsAsFactors = FALSE)
    
    # fusions
    
    if(!is.null(fusions.thisClus) & length(fus.data) > 0) {
      fus.mat <- tibble::as_tibble(fusions.thisClus[names(fus.data),], rownames = NA) %>% 
        tibble::rownames_to_column(var = "genomic.event") %>% 
        dplyr::mutate(type = "fus") %>% 
        dplyr::mutate_all(as.character) %>% 
        dplyr::select(.data$genomic.event, .data$type, dplyr::everything())
    } else {
      fus.mat <- NULL
    }
    
    
    mat.to.plot <- dplyr::bind_rows(mut.mat, amps.mat, dels.mat, fus.mat)
    
  } 
  
  
  # merge rows that are duplicated (have muts and amp/dels)
  dups.count <- mat.to.plot %>% dplyr::group_by(.data$genomic.event) %>% 
    dplyr::summarise(n = dplyr::n())
  dups.count <- dups.count[dups.count$n > 1, 1]
  dups <- mat.to.plot[mat.to.plot$genomic.event %in% dups.count$genomic.event,]
  
  
  if(nrow(dups) > 0 ) {
    dups.names <- unique(dups$genomic.event)
    # remove duplicates from the current matrix
    mat.to.plot <- mat.to.plot[!mat.to.plot$genomic.event %in% dups.count$genomic.event,]
    for (name in dups.names) {
      # collapse all types of events together into one row and then rebind to main matrix
      # having NA;event is fine syntax for use in oncoprint function
      #### added in function to keep amps and dels separate, only merge mut with amps/dels
      
      merged <- dups[dups$genomic.event == name,]
      if(!"mut" %in% merged$type) {
        # means that only events are amps/dels, don't merge. 
        # add a/d to the end do be able to differentiate later 
        # (and to not run into issue of duplicate rownames in the matrix)
        merged$genomic.event <- paste(merged$genomic.event, merged$type, sep = "_")
      } else if ("mut" %in% merged$type & nrow(merged) == 2) {
        # means that one event is a mut and other is amp/del
        # merge together like before by collapsing the rows together
        merged <- apply(merged, 2, paste0, collapse = ";")
        merged[1] <- name
      } else if (nrow(merged) == 3) {
        # means that there is one of each type of event mut, amp and del
        # merge mut with amp and del separately
        
        amp.merged <- merged[merged$type %in% c("amp", "mut"),] %>% apply(2, paste0, collapse = ";")
        amp.merged[1] <- paste(name, "amp", sep = "_")
        
        del.merged <- merged[merged$type %in% c("del", "mut"),] %>% apply(2, paste0, collapse = ";")
        del.merged[1] <- paste(name, "del", sep = "_")
        
        merged <- dplyr::bind_rows(amp.merged, del.merged)
      } else {
        stop("Problem merging duplicates, double check gene names.")
      }
      
      mat.to.plot <- dplyr::bind_rows(mat.to.plot, merged)
    }
  }
  
  # replace new NA's with regular ones so that plots can be counted for events again
  mat.to.plot[mat.to.plot == "NA;NA"] <- NA
  mat.to.plot[mat.to.plot == "NA;NA;NA"] <- NA
  
  # get row sums, in terms of events and take top ones. filter for events that happen at least twice here
  mat.to.plot <- mat.to.plot %>% dplyr::select(-.data$type) %>%
    tibble::column_to_rownames(var = "genomic.event") %>% 
    as.matrix()
  num.events <- apply(mat.to.plot, 1, function(x) sum(!is.na(x))) %>% sort(decreasing = TRUE) 
  num.events <- num.events[num.events > 1] 
  
  # add events in chunks of frequency until max is reached
  final.mat <- matrix(nrow = 0, ncol = ncol(mat.to.plot))
  freq.range <- unique(num.events)
  freq.idx <- 1
  while(nrow(final.mat) < max.events & freq.idx <= length(unique(num.events))) {
    to.add <- names(num.events[num.events == freq.range[freq.idx]])
    to.add.mat <- matrix(mat.to.plot[to.add,], nrow = length(to.add), dimnames = list(rownames = to.add))
    # final.mat <- rbind(final.mat, to.add.mat)
    new.final.mat <- rbind(final.mat, to.add.mat)
    if(nrow(new.final.mat) > max.events) {
      break
    } else {
      final.mat <- new.final.mat
    }
    freq.idx <- freq.idx + 1
  }
  
  # check if all genes are unique (as per merging of muts with amps/dels)
  # if so remove _cnv ending, else leave it on
  
  final.plot.names <- stringr::str_split_fixed(rownames(final.mat), pattern = "_", n = 2)
  if(length(unique(final.plot.names[,1])) == nrow(final.mat)) {
    rownames(final.mat) <- final.plot.names[,1]
  }
  
  message("Plotting ", nrow(final.mat), " total events for subtype ")
  
  if(nrow(final.mat) == 0) {
    return(grid::textGrob("No significant events to plot for this subtype"))
  }
  
  
  ##########
  # Heatmap plot
  #########
  
  
  heatmap_legend_param = list(title = "Alterations", 
                              at = c("amp", "del", "mut", "fus"), 
                              labels = c("Amplification", "Deletion", "Mutation", "Fusion"))
  
  
  # color parameters for oncoprint
  col = c("del" = "blue", "amp" = "red", "mut" = "#008000", "fus" = 'gold2', "NA" = "grey22")
  alter_fun = list(
    background = function(x, y, w, h) {
      grid.rect(x, y, w*0.6, h*0.75, 
                gp = gpar(fill = "#CCCCCC", col = NA))
    },
    # big blue - dels
    del = function(x, y, w, h) {
      grid.rect(x, y, w*0.6, h*0.75, 
                gp = gpar(fill = col["del"], col = NA))
    },
    # big red - amps
    amp = function(x, y, w, h) {
      grid.rect(x, y, w*0.6, h*0.75, 
                gp = gpar(fill = col["amp"], col = NA))
    },
    
    # small green
    mut = function(x, y, w, h) {
      grid.rect(x, y, w*0.6, h*0.33, 
                gp = gpar(fill = col["mut"], col = NA))
    },
    
    # big yellow
    fus = function(x, y, w, h) {
      grid.rect(x, y, w*0.6, h*0.75, 
                gp = gpar(fill = col["fus"], col = NA))
    }
  )
  
  # optimize size of labels
  if(nrow(final.mat) >= 55) {
    label.size <- 3
  } else if (nrow(final.mat) >= 40 ) {
    label.size <- 4
  } else if (nrow(final.mat) >= 30) {
    label.size <- 5
  } else if (nrow(final.mat) >= 20) {
    label.size <- 6
  } else {
    label.size <- 7
  }
  
  
  title <- paste("Genomic Events for Subtype", k)
  
  # set NAs to empty string in final mat to avoid Complex Heatmap warning
  final.mat[is.na(final.mat)] <- ""
  
  ht <- ComplexHeatmap::oncoPrint(final.mat,
                                  alter_fun = alter_fun, col = col,
                                  heatmap_legend_param = heatmap_legend_param,
                                  pct_side = "right", pct_gp = gpar(fontsize = label.size),
                                  column_title = title, column_title_gp = gpar(fontsize = 8),
                                  row_names_side = "left", row_names_gp = gpar(fontsize = label.size),
                                  show_heatmap_legend = TRUE,
                                  top_annotation = HeatmapAnnotation(
                                    column_barplot = anno_oncoprint_barplot(height = unit(.4, "cm"), axis_param = list(gp = gpar(fontsize = 4)))),
                                  right_annotation = rowAnnotation(
                                    row_barplot = anno_oncoprint_barplot(width = unit(1, "cm"), axis_param = list(gp = gpar(fontsize = 5)))))
  
  
  
  p <- grid::grid.grabExpr(draw(ht, padding = unit(c(0, 0, 0, 0), "mm")))
  
  p
  
}







##### No longer the primary plot style but keeping as an option plot style
##### TODO: Need to make it a freestanding function
#' Plot barchart of genomic events 
#' 
#' @importFrom rlang .data
#' @param summary.vec : named vector of the counts, named 'Event name':'Type'
#' where type is 'mut', 'amp', 'del', 'fus'. Mutations are in Entrez ID
#' Amp/Deletion CNV events are in genomic band location
#' @param highlight.genes : well known genes to highlight in the analysis in 
#' @param genomeBand_2_gene : mapping of genomic location IDs to gene name: 
#' vector of HUGO gene ids, named by genomic loci
#' @param samples.total : number of samples in the subtype, used to calculate percentages
#' @param max.muts : maximum number of mutations to get per sample, default is 10
#' @param max.cnv : maximum number of cnvs to per sample, default is 5
#' @return plot object
#' @keywords internal 
plotEvents <- function(summary.vec, highlight.genes=NULL, genomeBand_2_gene=NULL,
                       samples.total, max.muts = 10, max.cnv = 5) {
  
  data <- data.frame(coverage=names(summary.vec), Freq=as.numeric(summary.vec))
  # order by individual frequency
  data <- data[order(-data$Freq),]
  
  data$id <- unlist(lapply(data$coverage, function(label) {
    label <- as.character(label)
    name <- strsplit(label, ':')[[1]][1]
    hugo <- mapEntrez(as.character(name))
    if (is.na(hugo)) {
      return (name)
    } else {
      return (hugo)
    }
  }))
  
  # make a new column: for CNV band locations
  data$type <- unlist(lapply(data$coverage, function(label) {
    type <- strsplit(as.character(label), ':')[[1]][2]
    type
  }))
  
  # get order dataframe of events 
  mapped <- getDataFrame(data, highlight.genes, genomeBand_2_gene, max.muts = 10, max.cnv = 5)
  # check the size: if we have too many events to display, then select the top N unique IDs
  # already sorted by frequency of occurence. 
  if (nrow(mapped) > 50) {
    # add at least half simply with mutated gene labels
    mut.data <- mapped[mapped$type=='mut',]
    add.labels <- na.omit(unique(mut.data[order(-mut.data$Freq),][seq_len(25),]$id))
    # and add cnv
    additional <- setdiff(unique(mapped$id), add.labels)[seq_len(50-length(add.labels))]
    mapped <- mapped[mapped$id %in% union(additional, add.labels),]
  }
  
  # The palette with black: (unnecessary?)
  # cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
  
  # if exporting as object then just use default of 10 and then change by adding geom/layer when plotting
  # else if plotting directly then try to adjust text size to fit based on number of events
  
  # data.size <- nrow(mapped)
  # if(is.null(output)) {
  #   print("No file output name found, using pre-selected text size for plotting")
  #   y.textSize <- 10
  # } else {
  #   print("Output plot name found, adjusting text size based on number of events")
  #   if (num.subtypes > 5 && data.size > 50) {
  #     y.textSize <- 1
  #   } else if (num.subtypes > 4 && data.size > 25) {
  #     y.textSize <- 2
  #   } else if (num.subtypes > 4 && data.size <= 25) {
  #     y.textSize <- 3
  #   } else if (data.size < 20) {
  #     y.textSize <- 5
  #   } else if (data.size < 40) {
  #     y.textSize <- 6
  #   }
  # }
  
  message('Number of entries in events matrix: ', nrow(mapped))
  #print (paste('Using font size ', y.textSize))
  
  
  # Scale frequency to a percentage of samples in that cluster not number of occurences
  mapped$freq.percentage <- mapped$Freq/samples.total
  
  p <- ggplot2::qplot(x=mapped$id, y=mapped$freq.percentage, fill=mapped$type, data=mapped, geom = "col") + coord_flip() +
    scale_fill_manual(values = c("mut"='#00BA38', "amp"= '#F8766D', "del" = '#619CFF', "fus" = '#FF8C00' )) +
    ylab("Frequency") + xlab("Event") 
  #theme(axis.text.y = element_text(size=y.textSize))
  #labs(title=label) 
  
  # if (!is.null(output)) {
  #   p <- p + labs(title = paste(plot.label, "subtype", k))
  #   ggsave(output)
  # }
  
  p
}



#' Helper function to get data frame for bar plot plot.events function

#' @param data data.frame with $type, $id, $Freq per event
#' @param highlight.genes genes to look for in mutations/cnv lists (if looking
#'  for specific genes because of prior knowledge)
#' @param genomeBand_2_gene mapping of genomic location IDs to gene name: 
#' vector of HUGO gene ids, named by genomic loci
#' @param max.muts maximum number of mutations to get per sample, default is 10
#' @param max.cnv maximum number of cnvs to per sample, default is 5
#' @return ordered data frame with each genomic event and it's frequency
#' @keywords internal
getDataFrame <- function(data, highlight.genes, genomeBand_2_gene, 
                         max.muts = 10, max.cnv = 5) {
  
  #### edit logic of this section ?
  
  
  MAX.muts <- max.muts
  MAX.CNV.loc <- max.cnv
  ## all all highlight genes and events to the 
  
  ## for each CNV location event, find genes that overlap with the highlight list. 
  ## add duplicate entries for those genes
  mapped <- c()
  loc.data <- data[data$type=='del',]
  for (row in seq_len(nrow(loc.data))) {
    loc <- loc.data[row, 3]
    genes.inBand <- as.character(genomeBand_2_gene[which(names(genomeBand_2_gene)==loc)])
    hgIB <- intersect(genes.inBand, highlight.genes)
    
    if (length(hgIB)==0) { next }
    
    mapped <- rbind(mapped, data.frame(coverage=loc.data[row, 1], 
                                       Freq=loc.data[row, 2], 
                                       id=hgIB, 
                                       type=loc.data[row, 4]))
  }
  loc.data <- data[data$type=='amp',]
  for (row in seq_len(nrow(loc.data))) {
    loc <- loc.data[row, 3]
    genes.inBand <- as.character(genomeBand_2_gene[which(names(genomeBand_2_gene)==loc)])
    hgIB <- intersect(genes.inBand, highlight.genes)
    
    if (length(hgIB)==0) { next }
    
    mapped <- rbind(mapped, data.frame(coverage=loc.data[row, 1], 
                                       Freq=loc.data[row, 2], 
                                       id=hgIB, 
                                       type=loc.data[row, 4]))
  }
  
  # find mutations in key regions
  loc.data <- data[data$type=='mut',]
  for (row in seq_len(nrow(loc.data))) {
    gene <- loc.data[row, 3]
    hgIB <- intersect(gene, highlight.genes)
    if (length(hgIB)==0) { next }
    mapped <- rbind(mapped, data.frame(coverage=loc.data[row, 1], 
                                       Freq=loc.data[row, 2], 
                                       id=hgIB, 
                                       ype=loc.data[row, 4]))
  }
  
  # HUGO ids: add additional mutation events outside of the highlight genes
  all.gene.ids <- as.character(mapped$id)
  # add in top X mutations not in the driver list
  mut.data <- data[data$type=='mut',]
  i = 1	
  for (row in seq_len(nrow(mut.data))) {
    if (mut.data[row,]$id %in% all.gene.ids) { next }
    if (i > MAX.muts) { break }
    mapped <- rbind(mapped, mut.data[row,])
    i = 1+i
  }
  
  # add all fusions
  mapped <- rbind(mapped, data[data$type=='fus',])
  
  # add CNV band locations (a few)
  subset <- data[apply(cbind(data$type=='amp', data$type=='del'), 1, any),]
  if (nrow(subset) > MAX.CNV.loc) { subset <- subset[seq_len(MAX.CNV.loc),] }
  mapped <- rbind(mapped, subset)
  
  # order by total frequency of events by gene/id
  mapped$event_sums <- vapply(mapped$id, function(id) { 
    sum(as.numeric(mapped[which(mapped$id==as.character(id)),]$Freq)) }, 
    FUN.VALUE = numeric(1))
  mapped <- mapped[order(-mapped$event_sums),]
  mapped$id <- factor(mapped$id, levels=unique(rev(mapped$id)))
  
  mapped
}


#' Make small genomic plot
#' @importFrom tidyr drop_na
#' @importFrom rlang .data
#' @importFrom grDevices colorRampPalette
#' @import ggplot2
#' @import magrittr
#' @param input.df : tissue.coverage.df with mean, k, fraction and unique events.
#' @param fraction : what fraction coverage to use for genomic curve threshold
#' @param tissue.cluster : which cluster subsample to look at
#' @return output .png
#' @keywords internal
genomicPlotSmall <- function(input.df, fraction=0.85, tissue.cluster=NULL)  { 
  
  #### need to add in null matrix function ###
  
  
  # get number of subtypes and colors for plotting
  num.subtypes <- length(unique(input.df$subtype))
  getPalette <- colorRampPalette(brewer.pal(n = 8, name = "Dark2"))
  subtype.colors <- getPalette(num.subtypes)
  color.this.sub <- subtype.colors[tissue.cluster]
  
  
  subtype.df <- input.df[input.df$subtype == tissue.cluster,] 
  subtype.df <- subtype.df %>% tidyr::drop_na(.data$fraction)
  
  
  sweep <- subtype.df$fraction
  names(sweep) <- sort(unique(subtype.df$k))
  best.k <- fitCurvePercent(sweep, frac=fraction)
  message("Number of MRs in checkpoint: ", best.k)
  
  
  # mean statistic for these samples
  mean.stat <- unlist(lapply(sort(unique(subtype.df$k)), function(k) {
    mean(subtype.df[which(subtype.df$k==k),]$fraction)
  }))
  sweep <- mean.stat
  # Get mean # of samples
  mean.events.stat <- unlist(lapply(sort(unique(subtype.df$k)), function(k) {
    mean(subtype.df[which(subtype.df$k==k),]$mean)
  }))
  names(mean.events.stat) <- sort(unique(input.df$k))
  # the average multiplier across all the bands:
  # use to count the mean number of events on the right side
  y.multiplier <- mean(na.omit(mean.events.stat/mean.stat))
  
  # first make a condensed plot just the first ~100 events
  ymax <- subtype.df[nrow(subtype.df),]$fraction
  
  # make plot
  p.100 <- ggplot(subtype.df, aes(.data$k, .data$fraction)) + geom_line(color = color.this.sub, size=1.5, alpha=0.75) +
    xlab("Number of MRs") +
    scale_y_continuous(
      "Mean Fraction",
      sec.axis = sec_axis(~ . * y.multiplier, name = "Count"),
      limits=c(0,ymax+0.01)
    ) +
    #geom_ribbon(aes(ymin=null.bq, ymax=null.uq), color='#808080', alpha=0.2, linetype=2) +
    geom_vline(xintercept=as.numeric(best.k), linetype=3) +
    xlim(0,100) +
    #labs(title=tissue) +
    theme(axis.text.x = element_text(size=10), axis.text.y = element_text(size=10), legend.position="none")
  
  # return the plot itself
  p.100 
}
Any scripts or data that you put into this service are public.
MOMA documentation built on Nov. 8, 2020, 6:01 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
MOMA
Multi Omic Master Regulator Analysis

R/make.saturation.plots.r
In MOMA: Multi Omic Master Regulator Analysis

Defines functions genomicPlotSmall getDataFrame plotEvents oncoprintPlot mergeData mergeDataBySubtype makeCoverageDf getSubtypeEventTables makeSaturationPlots

Documented in genomicPlotSmall getDataFrame getSubtypeEventTables makeCoverageDf makeSaturationPlots mergeData mergeDataBySubtype oncoprintPlot plotEvents

Try the MOMA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MOMA Multi Omic Master Regulator Analysis

R/make.saturation.plots.r In MOMA: Multi Omic Master Regulator Analysis

Defines functions genomicPlotSmall getDataFrame plotEvents oncoprintPlot mergeData mergeDataBySubtype makeCoverageDf getSubtypeEventTables makeSaturationPlots

Documented in genomicPlotSmall getDataFrame getSubtypeEventTables makeCoverageDf makeSaturationPlots mergeData mergeDataBySubtype oncoprintPlot plotEvents

Try the MOMA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MOMA
Multi Omic Master Regulator Analysis

R/make.saturation.plots.r
In MOMA: Multi Omic Master Regulator Analysis