#' Compare mutation load against TCGA cohorts
#' @description Compares mutation load in input MAF against all of 33 TCGA cohorts derived from MC3 project.
#' @param maf \code{\link{MAF}} object(s) generated by \code{\link{read.maf}}
#' @param capture_size capture size for input MAF in MBs. Default NULL. If provided plot will be scaled to mutations per mb. TCGA capture size is assumed to be 50mb.
#' @param tcga_capture_size capture size for TCGA cohort in MB. Default 50
#' @param cohortName name for the input MAF cohort. Default "Input"
#' @param tcga_cohorts restrict tcga data to these cohorts.
#' @param primarySite If TRUE uses primary site of cancer as labels instead of TCGA project IDs. Default FALSE.
#' @param col color vector for length 2 TCGA cohorts and input MAF cohort. Default gray70 and black.
#' @param bg_col background color. Default'#EDF8B1', '#2C7FB8'
#' @param medianCol color for median line. Default red.
#' @param logscale Default TRUE
#' @param decreasing Default FALSE. Cohorts are arranged in increasing mutation burden.
#' @param rm_hyper Remove hyper mutated samples (outliers)? Default FALSE
#' @return data.table with median mutations per cohort
#' @examples
#' laml.maf <- system.file("extdata", "tcga_laml.maf.gz", package = "maftools")
#' laml <- read.maf(maf = laml.maf)
#' tcgaCompare(maf = laml, cohortName = "AML")
#' @export
tcgaCompare = function(maf, capture_size = NULL, tcga_capture_size = 50, cohortName = NULL, tcga_cohorts = NULL, primarySite = FALSE, col = c('gray70', 'black'), bg_col = c('#EDF8B1', '#2C7FB8'), medianCol = 'red', decreasing = FALSE, logscale = TRUE, rm_hyper = FALSE){
tcga.cohort = system.file('extdata', 'tcga_cohort.txt.gz', package = 'maftools')
if(Sys.info()[['sysname']] == 'Windows'){
tcga.cohort.gz = gzfile(description = tcga.cohort, open = 'r')
tcga.cohort <- suppressWarnings( data.table(read.csv( file = tcga.cohort.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) )
close(tcga.cohort.gz)
} else{
tcga.cohort = data.table::fread(cmd = paste('zcat <', tcga.cohort), sep = '\t', stringsAsFactors = FALSE)
}
if(primarySite){
tcga.cohort = tcga.cohort[,.(Tumor_Sample_Barcode, total, site)]
colnames(tcga.cohort)[3] = 'cohort'
}else{
tcga.cohort = tcga.cohort[,.(Tumor_Sample_Barcode, total, cohort)]
}
if(!is.null(tcga_cohorts)){
tcga.cohort = tcga.cohort[cohort %in% tcga_cohorts]
if(nrow(tcga.cohort) == 0){
stop("Something went wrong. Provide correct names for 'tcga_cohorts' arguments")
}
}
if(length(maf) == 1){
maf = list(maf)
}
maf.mutload = lapply(maf, function(m){
x = getSampleSummary(m)[,.(Tumor_Sample_Barcode, total)]
x
})
if(is.null(cohortName)){
cohortName = paste0('Input', seq_len(length(maf)))
}else if(length(cohortName) != length(maf)){
stop("Please provide names for all input cohorts")
}
names(maf.mutload) = cohortName
maf.mutload = data.table::rbindlist(l = maf.mutload, idcol = "cohort")
#maf.mutload[,cohort := cohortName]
tcga.cohort$total = as.numeric(as.character(tcga.cohort$total))
maf.mutload$total = as.numeric(as.character(maf.mutload$total))
if(rm_hyper){
#Remove outliers from tcga cohort
tcga.cohort = split(tcga.cohort, as.factor(tcga.cohort$cohort))
tcga.cohort = lapply(tcga.cohort, function(x){
xout = boxplot.stats(x = x$total)$out
if(length(xout) > 0){
message(paste0("Removed ", length(xout), " outliers from ", x[1,cohort]))
x = x[!total %in% xout]
}
x
})
tcga.cohort = data.table::rbindlist(l = tcga.cohort)
#Remove outliers from input cohort
xout = boxplot.stats(x = maf.mutload$total)$out
if(length(xout) > 0){
message(paste0("Removed ", length(xout), " outliers from Input MAF"))
maf.mutload = maf.mutload[!total %in% xout]
}
}
if(is.null(capture_size)){
tcga.cohort = rbind(tcga.cohort, maf.mutload)
pt.test = pairwise.t.test(x = tcga.cohort$total, g = tcga.cohort$cohort, p.adjust.method = "fdr")
pt.test.pval = as.data.frame(pt.test$p.value)
data.table::setDT(x = pt.test.pval, keep.rownames = TRUE)
colnames(pt.test.pval)[1] = 'Cohort'
message("Performing pairwise t-test for differences in mutation burden..")
pt.test.pval = data.table::melt(pt.test.pval, id.vars = "Cohort")
colnames(pt.test.pval) = c("Cohort1", "Cohort2", "Pval")
tcga.cohort[, plot_total := total]
}else{
message(paste0("Capture size [TCGA]: ", tcga_capture_size))
message(paste0("Capture size [Input]: ", capture_size))
maf.mutload[,total_perMB := total/capture_size]
tcga.cohort[,total_perMB := total/tcga_capture_size]
tcga.cohort = rbind(tcga.cohort, maf.mutload)
message("Performing pairwise t-test for differences in mutation burden (per MB)..")
pt.test = pairwise.t.test(x = tcga.cohort$total_perMB, g = tcga.cohort$cohort, p.adjust.method = "fdr")
pt.test.pval = as.data.frame(pt.test$p.value)
data.table::setDT(x = pt.test.pval, keep.rownames = TRUE)
colnames(pt.test.pval)[1] = 'Cohort'
pt.test.pval = data.table::melt(pt.test.pval, id.vars = "Cohort")
colnames(pt.test.pval) = c("Cohort1", "Cohort2", "Pval")
tcga.cohort[, plot_total := total_perMB]
}
#Median mutations
tcga.cohort.med = tcga.cohort[,.(.N, median(plot_total)),cohort][order(V2, decreasing = decreasing)]
tcga.cohort$cohort = factor(x = tcga.cohort$cohort,levels = tcga.cohort.med$cohort)
colnames(tcga.cohort.med) = c('Cohort', 'Cohort_Size', 'Median_Mutations')
tcga.cohort$TCGA = ifelse(test = tcga.cohort$cohort %in% cohortName,
yes = 'Input', no = 'TCGA')
#Plotting data
tcga.cohort = split(tcga.cohort, as.factor(tcga.cohort$cohort))
plot.dat = lapply(seq_len(length(tcga.cohort)), function(i){
x = tcga.cohort[[i]]
x = data.table::data.table(rev(seq(i-1, i, length.out = nrow(x))),
x[order(plot_total, decreasing = TRUE), plot_total],
x[,TCGA])
x
})
names(plot.dat) = names(tcga.cohort)
if(logscale){
y_lims = range(log10(data.table::rbindlist(l = plot.dat)[,V2]))
}else{
y_lims = range(data.table::rbindlist(l = plot.dat)[,V2])
}
#y_lims = range(log10(unlist(lapply(plot.dat, function(x) max(x[,V2], na.rm = TRUE)))))
y_min = floor(min(y_lims))
y_max = ceiling(max(y_lims))
y_lims = c(y_min, y_max)
y_at = pretty(y_lims)
par(mar = c(4, 3, 2.5, 1.5))
plot(NA, NA, xlim = c(0, length(plot.dat)), ylim = y_lims, axes = FALSE, xlab = NA, ylab = NA, frame.plot = TRUE)
rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4], col = grDevices::adjustcolor(col = "gray", alpha.f = 0.1))
rect(xleft = seq(0, length(plot.dat)-1, 1), ybottom = min(y_lims), xright = seq(1, length(plot.dat), 1),
ytop = y_max, col = grDevices::adjustcolor(col = bg_col, alpha.f = 0.2),
border = NA)
#rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4], col = grDevices::adjustcolor(col = bg_col, alpha.f = 0.2))
abline(h = pretty(y_lims), lty = 2, col = "gray70")
#abline(v = seq(1, length(plot.dat)), lty = 1, col = "gray70")
lapply(seq_len(length(plot.dat)), function(i){
x = plot.dat[[i]]
if(x[1, V3] == "Input"){
if(logscale){
points(x$V1, log10(x$V2), pch = 16, cex = 0.4, col = col[2])
}else{
points(x$V1, x$V2, pch = 16, cex = 0.4, col = col[2])
}
}else{
if(logscale){
points(x$V1, log10(x$V2), pch = 16, cex = 0.4, col = col[1])
}else{
points(x$V1, x$V2, pch = 16, cex = 0.4, col = col[1])
}
}
})
samp_sizes = lapply(plot.dat, nrow)
axis(side = 1, at = seq(0.5, length(plot.dat)-0.5, 1), labels = names(plot.dat),
las = 2, tick = FALSE, line = -0.8)
axis(side = 3, at = seq(0.5, length(plot.dat)-0.5, 1), labels = unlist(samp_sizes),
las = 2, tick = FALSE, line = -0.8, font = 3)
tcga.cohort.med[, Median_Mutations_log10 := log10(Median_Mutations)]
if (decreasing) {
sidePos = 2
linePos = 1.6
} else {
sidePos = 2 # 4 is bad
linePos = 1.6
}
if(logscale){
if(is.null(capture_size)){
axis(side = 2, at = y_at, las = 2, line = -0.6, tick = FALSE, labels = 10^(y_at))
mtext(text = "TMB", side = sidePos, line = linePos)
#colnames(tcga.cohort.med) = c("Cohort", "Cohort_Size", "Median_Mutations", "Median_Mutations_perMB")
}else{
axis(side = 2, at = y_at, las = 2, line = -0.6, tick = FALSE, labels = 10^(y_at))
mtext(text = "TMB (per MB)", side = sidePos, line = linePos)
#colnames(tcga.cohort.med) = c("Cohort", "Cohort_Size", "Median_Mutations_perMB", "Median_Mutations_perMB_log10")
}
}else{
if(is.null(capture_size)){
axis(side = 2, at = y_at, las = 2, line = -0.6, tick = FALSE)
mtext(text = "TMB", side = sidePos, line = linePos)
#colnames(tcga.cohort.med) = c("Cohort", "Cohort_Size", "Median_Mutations", "Median_Mutations_perMB")
}else{
axis(side = 2, at = y_at, las = 2, line = -0.6, tick = FALSE)
mtext(text = "TMB (per MB)", side = sidePos, line = linePos)
}
}
if(logscale){
lapply(seq_len(nrow(tcga.cohort.med)), function(i){
segments(x0 = i-1, x1 = i, y0 = tcga.cohort.med[i, Median_Mutations_log10],
y1 = tcga.cohort.med[i, Median_Mutations_log10], col = medianCol)
})
}else{
lapply(seq_len(nrow(tcga.cohort.med)), function(i){
segments(x0 = i-1, x1 = i, y0 = tcga.cohort.med[i, Median_Mutations],
y1 = tcga.cohort.med[i, Median_Mutations], col = medianCol)
})
}
tcga.cohort = data.table::rbindlist(l = tcga.cohort)
tcga.cohort[, plot_total := NULL]
tcga.cohort[, TCGA := NULL]
pt.test.pval = pt.test.pval[!is.na(Pval)][order(Pval, decreasing = FALSE)]
list(median_mutation_burden = tcga.cohort.med, mutation_burden_perSample = tcga.cohort, pairwise_t_test = pt.test.pval)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.