maftools: Summarize, Analyze and Visualize MAF Files

Documented in tcgaCompare

#' Compare mutation load against TCGA cohorts
#' @description Compares mutation load in input MAF against all of 33 TCGA cohorts derived from MC3 project.
#' @param maf \code{\link{MAF}} object(s) generated by \code{\link{read.maf}}
#' @param capture_size capture size for input MAF in MBs. Default NULL. If provided plot will be scaled to mutations per mb. TCGA capture size is assumed to be 50mb.
#' @param tcga_capture_size capture size for TCGA cohort in MB. Default 50
#' @param cohortName name for the input MAF cohort. Default "Input"
#' @param tcga_cohorts restrict tcga data to these cohorts.
#' @param primarySite If TRUE uses primary site of cancer as labels instead of TCGA project IDs. Default FALSE.
#' @param col color vector for length 2 TCGA cohorts and input MAF cohort. Default gray70 and black.
#' @param bg_col background color. Default'#EDF8B1', '#2C7FB8'
#' @param medianCol color for median line. Default red.
#' @param logscale Default TRUE
#' @param decreasing Default FALSE. Cohorts are arranged in increasing mutation burden.
#' @param rm_hyper Remove hyper mutated samples (outliers)? Default FALSE
#' @return data.table with median mutations per cohort
#' @examples
#' laml.maf <- system.file("extdata", "tcga_laml.maf.gz", package = "maftools")
#' laml <- read.maf(maf = laml.maf)
#' tcgaCompare(maf = laml, cohortName = "AML")
#' @export

tcgaCompare = function(maf, capture_size = NULL, tcga_capture_size = 50, cohortName = NULL, tcga_cohorts = NULL, primarySite = FALSE, col = c('gray70', 'black'), bg_col = c('#EDF8B1', '#2C7FB8'), medianCol = 'red', decreasing = FALSE, logscale = TRUE, rm_hyper = FALSE){

  tcga.cohort = system.file('extdata', 'tcga_cohort.txt.gz', package = 'maftools')

  if(Sys.info()[['sysname']] == 'Windows'){
    tcga.cohort.gz = gzfile(description = tcga.cohort, open = 'r')
    tcga.cohort <- suppressWarnings( data.table(read.csv( file = tcga.cohort.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) )
    close(tcga.cohort.gz)
  } else{
    tcga.cohort = data.table::fread(cmd = paste('zcat <', tcga.cohort), sep = '\t', stringsAsFactors = FALSE)
  }

  if(primarySite){
    tcga.cohort = tcga.cohort[,.(Tumor_Sample_Barcode, total, site)]
    colnames(tcga.cohort)[3] = 'cohort'
  }else{
    tcga.cohort = tcga.cohort[,.(Tumor_Sample_Barcode, total, cohort)]
  }

  if(!is.null(tcga_cohorts)){
    tcga.cohort = tcga.cohort[cohort %in% tcga_cohorts]
    if(nrow(tcga.cohort) == 0){
      stop("Something went wrong. Provide correct names for 'tcga_cohorts' arguments")
    }
  }

  if(length(maf) == 1){
    maf = list(maf)
  }
  maf.mutload = lapply(maf, function(m){
    x = getSampleSummary(m)[,.(Tumor_Sample_Barcode, total)]
    x
  })

  if(is.null(cohortName)){
    cohortName = paste0('Input', seq_len(length(maf)))
  }else if(length(cohortName) != length(maf)){
    stop("Please provide names for all input cohorts")
  }

  names(maf.mutload) = cohortName
  maf.mutload = data.table::rbindlist(l = maf.mutload, idcol = "cohort")

  #maf.mutload[,cohort := cohortName]
  tcga.cohort$total = as.numeric(as.character(tcga.cohort$total))
  maf.mutload$total = as.numeric(as.character(maf.mutload$total))

  if(rm_hyper){
    #Remove outliers from tcga cohort
    tcga.cohort = split(tcga.cohort, as.factor(tcga.cohort$cohort))
    tcga.cohort = lapply(tcga.cohort, function(x){
      xout = boxplot.stats(x = x$total)$out
      if(length(xout) > 0){
        message(paste0("Removed ", length(xout), " outliers from ", x[1,cohort]))
        x = x[!total %in% xout]
      }
      x
    })
    tcga.cohort = data.table::rbindlist(l = tcga.cohort)
    #Remove outliers from input cohort
    xout = boxplot.stats(x = maf.mutload$total)$out
    if(length(xout) > 0){
      message(paste0("Removed ", length(xout), " outliers from Input MAF"))
      maf.mutload = maf.mutload[!total %in% xout]
    }
  }

  if(is.null(capture_size)){
    tcga.cohort = rbind(tcga.cohort, maf.mutload)
    pt.test = pairwise.t.test(x = tcga.cohort$total, g = tcga.cohort$cohort, p.adjust.method = "fdr")
    pt.test.pval = as.data.frame(pt.test$p.value)
    data.table::setDT(x = pt.test.pval, keep.rownames = TRUE)
    colnames(pt.test.pval)[1] = 'Cohort'
    message("Performing pairwise t-test for differences in mutation burden..")
    pt.test.pval = data.table::melt(pt.test.pval, id.vars = "Cohort")
    colnames(pt.test.pval) = c("Cohort1", "Cohort2", "Pval")
    tcga.cohort[, plot_total := total]
  }else{
    message(paste0("Capture size [TCGA]:  ", tcga_capture_size))
    message(paste0("Capture size [Input]: ", capture_size))
    maf.mutload[,total_perMB := total/capture_size]
    tcga.cohort[,total_perMB := total/tcga_capture_size]
    tcga.cohort = rbind(tcga.cohort, maf.mutload)
    message("Performing pairwise t-test for differences in mutation burden (per MB)..")
    pt.test = pairwise.t.test(x = tcga.cohort$total_perMB, g = tcga.cohort$cohort, p.adjust.method = "fdr")
    pt.test.pval = as.data.frame(pt.test$p.value)
    data.table::setDT(x = pt.test.pval, keep.rownames = TRUE)
    colnames(pt.test.pval)[1] = 'Cohort'
    pt.test.pval = data.table::melt(pt.test.pval, id.vars = "Cohort")
    colnames(pt.test.pval) = c("Cohort1", "Cohort2", "Pval")
    tcga.cohort[, plot_total := total_perMB]
  }

  #Median mutations
  tcga.cohort.med = tcga.cohort[,.(.N, median(plot_total)),cohort][order(V2, decreasing = decreasing)]
  tcga.cohort$cohort = factor(x = tcga.cohort$cohort,levels = tcga.cohort.med$cohort)
  colnames(tcga.cohort.med) = c('Cohort', 'Cohort_Size', 'Median_Mutations')
  tcga.cohort$TCGA = ifelse(test = tcga.cohort$cohort %in% cohortName,
                            yes = 'Input', no = 'TCGA')

  #Plotting data
  tcga.cohort = split(tcga.cohort, as.factor(tcga.cohort$cohort))
  plot.dat = lapply(seq_len(length(tcga.cohort)), function(i){
                    x = tcga.cohort[[i]]
                    x = data.table::data.table(rev(seq(i-1, i, length.out = nrow(x))),
                                                   x[order(plot_total, decreasing = TRUE), plot_total],
                                               x[,TCGA])
                    x
                    })
  names(plot.dat) = names(tcga.cohort)
  if(logscale){
    y_lims = range(log10(data.table::rbindlist(l = plot.dat)[,V2]))
  }else{
    y_lims = range(data.table::rbindlist(l = plot.dat)[,V2])
  }

  #y_lims = range(log10(unlist(lapply(plot.dat, function(x) max(x[,V2], na.rm = TRUE)))))
  y_min = floor(min(y_lims))
  y_max = ceiling(max(y_lims))
  y_lims = c(y_min, y_max)
  y_at = pretty(y_lims)

  par(mar = c(4, 3, 2.5, 1.5))
  plot(NA, NA, xlim = c(0, length(plot.dat)), ylim = y_lims, axes = FALSE, xlab = NA, ylab = NA, frame.plot = TRUE)
  rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4], col = grDevices::adjustcolor(col = "gray", alpha.f = 0.1))
  rect(xleft = seq(0, length(plot.dat)-1, 1), ybottom = min(y_lims), xright = seq(1, length(plot.dat), 1),
       ytop = y_max, col = grDevices::adjustcolor(col = bg_col, alpha.f = 0.2),
       border = NA)
  #rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4], col = grDevices::adjustcolor(col = bg_col, alpha.f = 0.2))

  abline(h = pretty(y_lims), lty = 2, col = "gray70")
  #abline(v = seq(1, length(plot.dat)), lty = 1, col = "gray70")
  lapply(seq_len(length(plot.dat)), function(i){
    x = plot.dat[[i]]
    if(x[1, V3] == "Input"){
      if(logscale){
        points(x$V1, log10(x$V2), pch = 16, cex = 0.4, col = col[2])
      }else{
        points(x$V1, x$V2, pch = 16, cex = 0.4, col = col[2])
      }
    }else{
      if(logscale){
        points(x$V1, log10(x$V2), pch = 16, cex = 0.4, col = col[1])
      }else{
        points(x$V1, x$V2, pch = 16, cex = 0.4, col = col[1])
      }
    }
  })

  samp_sizes = lapply(plot.dat, nrow)
  axis(side = 1, at = seq(0.5, length(plot.dat)-0.5, 1), labels = names(plot.dat),
       las = 2, tick = FALSE, line = -0.8)
  axis(side = 3, at = seq(0.5, length(plot.dat)-0.5, 1), labels = unlist(samp_sizes),
       las = 2, tick = FALSE, line = -0.8, font = 3)

  tcga.cohort.med[, Median_Mutations_log10 := log10(Median_Mutations)]

  if (decreasing) {
    sidePos = 2
    linePos = 1.6
  } else {
    sidePos = 2 # 4 is bad
    linePos = 1.6
  }

  if(logscale){
    if(is.null(capture_size)){
      axis(side = 2, at = y_at, las = 2, line = -0.6, tick = FALSE, labels = 10^(y_at))
      mtext(text = "TMB", side = sidePos, line = linePos)
      #colnames(tcga.cohort.med) = c("Cohort", "Cohort_Size", "Median_Mutations", "Median_Mutations_perMB")
    }else{
      axis(side = 2, at = y_at, las = 2, line = -0.6, tick = FALSE, labels = 10^(y_at))
      mtext(text = "TMB (per MB)", side = sidePos, line = linePos)
      #colnames(tcga.cohort.med) = c("Cohort", "Cohort_Size", "Median_Mutations_perMB", "Median_Mutations_perMB_log10")
    }
  }else{
    if(is.null(capture_size)){
      axis(side = 2, at = y_at, las = 2, line = -0.6, tick = FALSE)
      mtext(text = "TMB", side = sidePos, line = linePos)
      #colnames(tcga.cohort.med) = c("Cohort", "Cohort_Size", "Median_Mutations", "Median_Mutations_perMB")
    }else{
      axis(side = 2, at = y_at, las = 2, line = -0.6, tick = FALSE)
      mtext(text = "TMB (per MB)", side = sidePos, line = linePos)
    }
  }

  if(logscale){
    lapply(seq_len(nrow(tcga.cohort.med)), function(i){
      segments(x0 = i-1, x1 = i, y0 = tcga.cohort.med[i, Median_Mutations_log10],
               y1 = tcga.cohort.med[i, Median_Mutations_log10], col = medianCol)
    })
  }else{
    lapply(seq_len(nrow(tcga.cohort.med)), function(i){
      segments(x0 = i-1, x1 = i, y0 = tcga.cohort.med[i, Median_Mutations],
               y1 = tcga.cohort.med[i, Median_Mutations], col = medianCol)
    })
  }

  tcga.cohort = data.table::rbindlist(l = tcga.cohort)
  tcga.cohort[, plot_total := NULL]
  tcga.cohort[, TCGA := NULL]
  pt.test.pval = pt.test.pval[!is.na(Pval)][order(Pval, decreasing = FALSE)]
  list(median_mutation_burden = tcga.cohort.med, mutation_burden_perSample = tcga.cohort, pairwise_t_test = pt.test.pval)
}