R/fusion_standardization.R

Defines functions shape_output fusion_standardization

Documented in fusion_standardization shape_output

#' Standardizes fusion calls
#'
#' Various fusion callers have different formats that make aggregating and filtering
#' data difficult. By standardizing fusion callers output we capture the required
#' columns which we use for downstream analysis
#'
#' @param fusion_calls A dataframe from star fusion or arriba (more callers to be added)
#' @param caller string options STARFUSION/ARRIBA
#' @param tumorID string or character vector of same length as fusion_calls
#' @param input_json_file (optional) json format config file to provide input and
#' output columns headers required for CUSTOM type and not required for other callers
#'
#' @author Krutika S Gaonkar, Saksham Phul (phuls@chop.edu)
#' @export
#'
#' @return Standardized fusion calls ready for filtering
#'
#' @examples
#' # read in arriba fusion file
#' fusionfileArriba <- read_arriba_calls(
#'   system.file("extdata", "arriba_example.tsv", package = "annoFuseData")
#' )
#' # read in starfusion file
#' fusionfileStarFusion <- read_starfusion_calls(
#'   system.file("extdata", "starfusion_example.tsv", package = "annoFuseData")
#' )
#' formattedArriba <- fusion_standardization(fusionfileArriba,
#'   caller = "ARRIBA",
#'   tumorID = "tumorID"
#' )
#' formattedStarFusion <- fusion_standardization(fusionfileStarFusion,
#'   caller = "STARFUSION",
#'   tumorID = "tumorID"
#' )
#' # read in CUSTOM type file
#' fusionfileCustom <- data.frame(
#'   Sample = c("BS_WDC88K6G", "BS_6J9HGSSB", "BS_K62F9BCS"),
#'   FusionName = c("KIAA1549--BRAF", "TFG--GPR128", "SPECC1L--NTRK2"),
#'   Gene1A = c("KIAA1549", "TFG", "SPECC1L"),
#'   Gene1B = c("BRAF", "GPR128", "NTRK2"),
#'   Gene2A = c("", "", ""),
#'   Gene2B = c("", "", ""),
#'   Fusion_Type = c("", "", ""),
#'   annots = c(
#'     "[Cosmic,ChimerPub,ChimerSeq,chimerdb_pubmed,ChimerKB,INTRACHROMOSOMAL[chr7:1.74Mb]]", 
#'     "[ChimerPub,GTEx,ChimerKB,Greger_Normal,ChimerSeq]", 
#'     "[INTERCHROMOSOMAL[chr22--chr9]]")
#' )
#' formattedCUSTOM <- fusion_standardization(fusionfileCustom,
#'   caller = "CUSTOM",
#'   tumorID = "All",
#'   input_json_file = system.file("extdata", "config", package = "annoFuseData")
#' )
#' # format of the input_json_file ("Input_header" : "Output_header")
#' #  {
#' #  "CUSTOM":{
#' # 	  	"Sample": "Sample_output",
#' # 		  "FusionName": "FusionName_output",
#' # 		  "Gene1A": "Gene1A_output",
#' # 		  "Gene1B": "Gene1B_output",
#' # 		  "Gene2A": "Gene2A_output",
#' # 		  "Gene2B": "Gene2B_output",
#' # 		  "Fusion_Type":"Fusion_Type_output",
#' # 		  "annots":"annots_output"
#' # 	    }
#' #  }
fusion_standardization <- function(fusion_calls,
                                   caller = c("STARFUSION", "ARRIBA", "CUSTOM"),
                                   tumorID = "tumorID",
                                   input_json_file = "No file exists") {
  stopifnot(is(fusion_calls, "data.frame"))
  stopifnot(is.character(caller))
  stopifnot(is.character(tumorID))
  
  if (caller == "STARFUSION") {
    if (!any(grepl("PROT_FUSION_TYPE", colnames(fusion_calls)))) {
      # add NA if --examine_coding_effect was not used while running starfusion
      fusion_calls$PROT_FUSION_TYPE <- NA
    }
    fusion_calls <- fusion_calls %>%
      # standardize fusion type column name
      dplyr::rename(
        Fusion_Type = PROT_FUSION_TYPE,
        FusionName = "#FusionName"
      ) %>%
      dplyr::mutate(
        # remove chr notation from breakpoint columns
        LeftBreakpoint = gsub("^chr", "", .data$LeftBreakpoint),
        RightBreakpoint = gsub("^chr", "", .data$RightBreakpoint),
        # remove strand information to match breakpoint locations
        LeftBreakpoint = gsub(":[-|+]$", "", .data$LeftBreakpoint),
        RightBreakpoint = gsub(":[-|+]$", "", .data$RightBreakpoint),
        # STARFusion does not return confidence information
        Confidence = NA,
        # standardize fusion types
        Fusion_Type = dplyr::case_when(
          Fusion_Type == "INFRAME" ~ "in-frame",
          Fusion_Type == "FRAMESHIFT" ~ "frameshift",
          TRUE ~ "other"
        ),
        Sample = tumorID,
        Caller = "STARFUSION"
      )
    formatted_data <- shape_output(fusion_calls)
    return(formatted_data)
  } else if (caller == "ARRIBA") {
    if (!any(colnames(fusion_calls) == "annots")) {
      fusion_calls$annots <- ""
    }
    fusion_calls <- fusion_calls %>%
      # standardizing fusion type annotation
      dplyr::rename(
        Fusion_Type = .data$reading_frame,
        Confidence = .data$confidence,
        # SpanningFragCount is equivalent to discordant_mates in Arriba
        SpanningFragCount = .data$discordant_mates
      ) %>%
      dplyr::mutate(
        LeftBreakpoint = gsub("^chr", "", .data$breakpoint1),
        RightBreakpoint = gsub("^chr", "", .data$breakpoint2),
        # readthrough information from arriba
        annots = paste(.data$annots, .data$type, sep = ","),
        # Intergenic gene fusion breakpoints in arriba are annotated as
        # "gene1A,gene2A". As comma is used as a common delimiter in files changing
        # it to "/"
        FusionName = paste0(gsub(",", "/", .data$`#gene1`), "--", gsub(",", "/", .data$gene2)),
        # JunctionReadCount is equivalent to split reads in Arriba. Arriba however
        # provides split_reads1 and split_reads2 to provide information of reads
        # anchoring in gene1 or gene2
        JunctionReadCount = .data$split_reads1 + .data$split_reads2,
        Fusion_Type = dplyr::case_when(
          !Fusion_Type %in% c("out-of-frame", "in-frame") ~ "other",
          Fusion_Type == "out-of-frame" ~ "frameshift",
          TRUE ~ "in-frame"
        ),
        Sample = tumorID,
        Caller = "ARRIBA"
      )
    formatted_data <- shape_output(fusion_calls)
    return(formatted_data)
  } else if (caller == "CUSTOM") {
    # Condition to check if required columns exists in the input and config file not provided by the user
    if (!file.exists(input_json_file) &&
        any(colnames(fusion_calls) == "Sample") &&
        any(colnames(fusion_calls) == "FusionName") &&
        any(colnames(fusion_calls) == "Gene1A") &&
        any(colnames(fusion_calls) == "Gene1B") &&
        any(colnames(fusion_calls) == "Gene2A") &&
        any(colnames(fusion_calls) == "Gene2B") &&
        any(colnames(fusion_calls) == "Fusion_Type") &&
        any(colnames(fusion_calls) == "annots")) {
      message("All required columns exists! ")
      return(fusion_calls)
    } else if (file.exists(input_json_file)) # if config file is provided
    {
      message("Annotating based on config file")
      
      json_data_frame <- as.data.frame(rjson::fromJSON(file = input_json_file)) # Get data from json file
      json_cols <- colnames(json_data_frame) # extract cols from json data frame
      caller <- strsplit(json_cols, split = "[.]")[[1]][1] # Get the caller from json file
      
      input_columns <- list() # define list to store input columns from json file
      output_columns <- list() # define list to store output columns from json file
      for (i in json_cols) {
        input_columns <- append(input_columns, strsplit(i, split = "[.]")[[1]][2]) # extract input columns
      }
      
      if ("Sample" %in% input_columns &&
          "FusionName" %in% input_columns &&
          "Gene1A" %in% input_columns &&
          "Gene1B" %in% input_columns &&
          "Gene2A" %in% input_columns &&
          "Gene2B" %in% input_columns &&
          "Fusion_Type" %in% input_columns &&
          "annots" %in% input_columns) # check if required column exists in config file else throw an exception
      {
        message("All required columns exists!")
        for (i in 1:ncol(json_data_frame)) { # for-loop over columns
          output_columns <- append(output_columns, json_data_frame[, i])
        }
        # json_dataframe <- do.call(rbind, Map(data.frame, input_name=input_columns, output_name=output_columns))
        
        input_columns_without_None <- input_columns
        output_columns_without_None <- output_columns
        output_columns_with_None <- list()
        index_none <- list()
        
        for (i in 1:length(input_columns)) { # loop to extract None from input json
          if (input_columns[i] == "None") {
            output_columns_with_None <- append(output_columns_with_None, output_columns[i])
            index_none <- append(index_none, i)
          }
        }
        
        if (length(index_none > 0)) { # check if none exist as an input in the config file
          input_columns_without_None <- input_columns[-unlist(index_none)]
          output_columns_without_None <- output_columns[-unlist(index_none)]
        }
        
        standard_calls <- fusion_calls %>% dplyr::select(unlist(input_columns_without_None))
        standard_calls <- gdata::rename.vars(standard_calls, from = unlist(input_columns_without_None), to = unlist(output_columns_without_None))
        for (i in output_columns_with_None)
        {
          if (i != "Caller") { # set caller for all the rows if json has None::Caller
            standard_calls[i] <- NA
          } else {
            standard_calls[i] <- caller
          }
        }
        standard_calls <- standard_calls[, unlist(output_columns)]
        # print(standard_calls)
        return(standard_calls)
      } else {
        stop(paste("Provide all the required columns. Required columns for Custom caller are: Sample FusionName Gene1A Gene1B Gene2A Gene2B Fusion_Type annots."))
      }
    } else # if user do not meet input expectations
    {
      stop(paste(caller, " caller requires specific columns or config file do not exists. Required columns are: Sample FusionName Gene1A Gene1B Gene2A Gene2B Fusion_Type annots."))
    }
  } else {
    stop(paste(caller, "is not a supported caller string."))
  }
}
#' function used by STARFUSION and ARRIBA to shape the final output columns as required
#' @param fusion_calls A dataframe used for star fusion or arriba method  (more callers to be added)
#'
#' @author Saksham Phul (phuls@chop.edu)
#' @export
#'
#' @return processed dataframe in the final desired format
shape_output <- function(fusion_calls) {
  # Get standard columns for filtering
  
  standard_calls <- fusion_calls %>%
    # select columns for standard fusion format
    dplyr::select(c(
      "LeftBreakpoint",
      "RightBreakpoint",
      "FusionName",
      "Sample",
      "Caller",
      "Fusion_Type",
      "JunctionReadCount",
      "SpanningFragCount",
      "Confidence",
      "annots"
    )) %>%
    # to obtain geneA and geneB for gene search below
    bind_cols(reshape2::colsplit(fusion_calls$FusionName, pattern = "--", names = c("GeneA", "GeneB"))) %>%
    # Intergenic fusion will have Gene1A,Gene2A,Gene1B,Gene2B
    separate(.data$GeneA, sep = "/", into = c("Gene1A", "Gene2A"), remove = FALSE) %>%
    separate(.data$GeneB, sep = "/", into = c("Gene1B", "Gene2B"), remove = FALSE) %>%
    # remove distance to fusion breakpoint from gene names in intergenic fusion
    mutate(
      Gene1A = gsub("[(].*", "", .data$Gene1A),
      Gene2A = gsub("[(].*", "", .data$Gene2A),
      Gene1B = gsub("[(].*", "", .data$Gene1B),
      Gene2B = gsub("[(].*", "", .data$Gene2B),
      BreakpointLocation = case_when(
        Gene1A == Gene1B & !grepl("/", FusionName) ~ "Intragenic",
        grepl("/", FusionName) ~ "Intergenic",
        TRUE ~ "Genic"
      ),
      SpanningDelta = SpanningFragCount - JunctionReadCount
    ) %>%
    as.data.frame()
  return(standard_calls)
}
d3b-center/annoFuse documentation built on Oct. 2, 2024, 4:17 a.m.