knitr::opts_chunk$set(warning = FALSE, message = FALSE)
suppressPackageStartupMessages(library("readr")) suppressPackageStartupMessages(library("ggplot2")) suppressPackageStartupMessages(library("dplyr")) suppressPackageStartupMessages(library("qdapRegex")) suppressPackageStartupMessages(library("tibble")) library("annoFuse")
To assess filtering sensitivity of annoFuse, we analyzed a subset of samples from TCGA and compared fusions retained with annoFuse filtering and oncogenic prioritization to those deemed the final call set in a previously published analysis by The Fusion Analysis Working Group PMID: 29617662.
A group of 160 samples were randomly selected from the following cancers: BLCA(10), BRCA(11), CESC(5), COAD(11), ESCA(5) , GBM(7), HNSC(10), KIRP(9), LGG(9), LIHC(9), LUAD(5), LUSC(11), OV(9), PAAD(8), PCPG(2), PRAD(14), SARC(6), SKCM(9), TGCT(6), THCA(4).
# Sample manifest was obtained from CAVATICA Data browser and TCGA files were copied to working project. tcga_manifest <- read_tsv(system.file("extdata", "sample_aliquot_tcga.tsv", package = "annoFuseData")) # Load [Final Fusion Call Set tab of Supplemental Table S1 of PMID: 29617662](https://pubmed.ncbi.nlm.nih.gov/29617662/) final_fusion <- read_tsv(system.file("extdata", "final_fusion_mmc2.txt", package = "annoFuseData")) %>% # subset of final fusions in overlapping samples dplyr::filter( Sample %in% tcga_manifest$aliquot_id ) %>% as.data.frame() %>% unique()
We ran kf-rnaseq-workflow to obtain fusion calls from STAR-Fusion and Arriba and the expression level quantifications from RSEM.
We then standardized STAR-Fusion and Arriba fusion calls using fusion_standardization
.
# merged STAR-Fusion and arriba calls tcga_arriba_df <- read_tsv(system.file("extdata", "merged_arriba_tcga.tsv", package = "annoFuseData")) tcga_starfusion_df <- read_tsv(system.file("extdata", "merged_starfusion_tcga.tsv", package = "annoFuseData")) # format formatted_starfusion <- annoFuse::fusion_standardization(fusion_calls = tcga_starfusion_df, caller = "STARFUSION", tumorID = tcga_starfusion_df$Sample) # format formatted_arriba <- annoFuse::fusion_standardization(fusion_calls = tcga_arriba_df, caller = "ARRIBA", tumorID = tcga_arriba_df$Sample) # Merge starfusion and arriba formatted_calls <- rbind(formatted_starfusion, formatted_arriba)
Artifact and QC filtering was performed using fusion_filtering_QC
. We filtered fusions using the following range of spanningDelta : 10, 20, 30, 40, 50, 100, 150, and 200.
merged_10 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 10) merged_20 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 20) merged_30 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 30) merged_40 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 40) merged_50 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 50) merged_100 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 100) merged_150 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 150) merged_200 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 200)
# get overlap with sample_id/aliquot_id with completed tasks get_tcga_overlap <- function(x, tcga_manifest, final_fusion) { samples_in_final_fusion <- x %>% left_join(tcga_manifest, by = c("Sample" = "sample_id")) %>% dplyr::filter(aliquot_id %in% final_fusion$Sample) %>% dplyr::select(aliquot_id, FusionName) %>% unique() final_fusion_subset <- final_fusion %>% dplyr::filter(Sample %in% samples_in_final_fusion$aliquot_id & Fusion %in% samples_in_final_fusion$FusionName) %>% nrow() return(final_fusion_subset) }
We then calculated overlaps of the subsetted published final fusions with our filtered fusion calls across all spanningDelta cutoffs.
# get merged files for cutoff 10,20,30,40,50,100,150,200 with sensitivity length_final_fusion_subset <- final_fusion$Fusion[ # only keep fusions from raw merged fusion calls # since other callers were also used in the TCGA publication which captured other fusions which(final_fusion$Fusion %in% formatted_calls$FusionName)] %>% length() # 603 hist_df <- merged_10 %>% dplyr::mutate("cutoff" = 10, "sensitivity" = get_tcga_overlap(merged_10, tcga_manifest, final_fusion) / length_final_fusion_subset) %>% dplyr::bind_rows(merged_20 %>% dplyr::mutate("cutoff" = 20, "sensitivity" = get_tcga_overlap(merged_20, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>% dplyr::bind_rows(merged_30 %>% dplyr::mutate("cutoff" = 30, "sensitivity" = get_tcga_overlap(merged_30, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>% dplyr::bind_rows(merged_40 %>% dplyr::mutate("cutoff" = 40, "sensitivity" = get_tcga_overlap(merged_40, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>% dplyr::bind_rows(merged_50 %>% dplyr::mutate("cutoff" = 50, "sensitivity" = get_tcga_overlap(merged_50, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>% dplyr::bind_rows(merged_100 %>% dplyr::mutate("cutoff" = 100, "sensitivity" = get_tcga_overlap(merged_100, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>% dplyr::bind_rows(merged_150 %>% dplyr::mutate("cutoff" = 150, "sensitivity" = get_tcga_overlap(merged_150, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>% dplyr::bind_rows(merged_200 %>% dplyr::mutate("cutoff" = 200, "sensitivity" = get_tcga_overlap(merged_200, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>% dplyr::mutate("change_spanning" = .$SpanningFragCount - .$JunctionReadCount) hist_df <- as.data.frame(hist_df) dat_text <- data.frame( label = c( paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 10), "change_spanning"]))), paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 20), "change_spanning"]))), paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 30), "change_spanning"]))), paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 40), "change_spanning"]))), paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 50), "change_spanning"]))), paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 100), "change_spanning"]))), paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 150), "change_spanning"]))), paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 200), "change_spanning"]))) ), cutoff = c(10, 20, 30, 40, 50, 100, 150, 200) )
Here, we visualize the distribution of spanningDelta
(spanningFragCount - JunctionReadCount) across fusions called from the TCGA to assess a cutoff for spanningDelta
ggplot(hist_df, aes(x = change_spanning)) + geom_histogram(binwidth = 5) + scale_x_continuous(limits = c(0, 200)) + facet_wrap(. ~ cutoff) + theme_publication(base_size = 20) + geom_text( data = dat_text, mapping = aes(x = 100, y = 1200, label = label), hjust = -0.1, vjust = -1 ) + theme(axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15))
We find that sensitivity reaches 96% at a spanningDelta cutoff of 100
extra_breaks <- c(0.7, 0.8, .85, 0.92, 0.96, 1.00) breaks <- sort(c(extra_breaks, with(hist_df, pretty(range(sensitivity))))) ggplot(hist_df, aes(x = cutoff, y = sensitivity)) + geom_line() + ylab("sensitivity") + theme_publication(base_size = 20) + scale_y_continuous( breaks = breaks, limits = range(breaks) ) + theme(axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15))
hist_df %>% select(cutoff,sensitivity) %>% unique() %>% remove_rownames()
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.