expression_filter_fusion: Expression filtering with user provided expression Matrix and...

View source: R/expression_filter_fusion.R

expression_filter_fusion | R Documentation

Expression filtering with a user-provided expression matrix and standardized fusion calls

Description

Filters standardized fusion calls by gene expression, using a user-provided expression matrix.

Usage

expression_filter_fusion(
  standardFusioncalls,
  expressionMatrix,
  expressionFilter
)

Arguments

standardFusioncalls

A data frame of fusion calls from STAR-Fusion or Arriba, standardized to run through the filtering steps

expressionMatrix

Expression matrix for samples used in cohort for fusion calls

expressionFilter

FPKM/TPM threshold used to decide that a gene is not expressed

Value

Standardized fusion calls filtered to keep only fusions in which at least one gene is expressed

Examples

## Not run: 
# Example workflow: standardize caller output, apply QC filters, build a
# per-gene expression table, then filter fusions by expression.

# Attach packages whose functions are used unqualified below
# (%>%, arrange, desc, distinct from dplyr; colsplit from reshape2).
library(dplyr)
library(reshape2)

# standardize
fusionfileArriba <- read_arriba_calls(
  system.file("extdata", "arriba_example.tsv", package = "annoFuseData")
)
fusionfileStarFusion <- read_starfusion_calls(
  system.file("extdata", "starfusion_example.tsv", package = "annoFuseData")
)
formattedArriba <- fusion_standardization(fusionfileArriba,
  caller = "ARRIBA",
  tumorID = "tumorID"
)
formattedStarFusion <- fusion_standardization(fusionfileStarFusion,
  caller = "STARFUSION",
  tumorID = "tumorID"
)

# merge standardized fusion calls
standardFusioncalls <- rbind(formattedStarFusion, formattedArriba) %>%
  as.data.frame()
fusionQCFiltered <- fusion_filtering_QC(
  standardFusioncalls = standardFusioncalls,
  readingFrameFilter = "in-frame|frameshift|other",
  artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap|ConjoinG",
  junctionReadCountFilter = 1,
  spanningFragCountFilter = 10,
  readthroughFilter = TRUE
)

# expression based filter to capture only fusions where at least 1 gene is
# expressed
expressionFile <- system.file(
  "extdata", "example.rsem.genes.results.gz",
  package = "annoFuseData"
)
# readr is not attached above, so call read_tsv() through its namespace
expressionMatrix <- readr::read_tsv(expressionFile)

# split gene id and symbol
expressionMatrix <- cbind(
  expressionMatrix,
  colsplit(
    expressionMatrix$gene_id,
    pattern = "_",
    names = c("EnsembleID", "GeneSymbol")
  )
)

# collapse to matrix of HUGO symbols x Sample identifiers
# for duplicated gene symbols keep only the row with the greatest FPKM:
# sort by decreasing FPKM, then keep the first row seen per symbol
# (if ties occur, the first occurrence is kept)
expressionMatrix.collapsed <- expressionMatrix %>%
  arrange(desc(FPKM)) %>%
  distinct(GeneSymbol, .keep_all = TRUE) %>%
  tibble::remove_rownames() %>%
  # Rename FPKM to "tumorID" by name while selecting, rather than by column
  # position; presumably this must match the tumorID value passed to
  # fusion_standardization() above -- confirm against the function source.
  dplyr::select(EnsembleID, GeneSymbol, tumorID = FPKM, gene_id)

expressionFiltered <- expression_filter_fusion(
  standardFusioncalls = fusionQCFiltered,
  expressionMatrix = expressionMatrix.collapsed,
  expressionFilter = 1
)

## End(Not run)


d3b-center/annoFuse documentation built on Oct. 2, 2024, 4:17 a.m.