R/read_maf_dt.R

Defines functions read.maf

Documented in read.maf

#' Read MAF files.
#' @description Takes tab delimited MAF (can be plain text or gz compressed) file as an input and summarizes it in various ways. Also creates oncomatrix - helpful for visualization.
#' @details This function takes MAF file as input and summarizes them. If copy number data is available, e.g from GISTIC, it can be provided too via arguments gisticAllLesionsFile, gisticAmpGenesFile,
#' and gisticDelGenesFile. Copy number data can also be provided as a custom table containing Gene name, Sample name and Copy Number status.
#'
#' Note that if input MAF file contains multiple affected transcripts of a variant, this function by default removes them as duplicates, while keeping single unique entry per variant per sample.
#' If you wish to keep all of them, set removeDuplicatedVariants to FALSE.
#'
#'FLAGS - If you get a note on possible FLAGS while reading MAF, it means some of the top mutated genes are fishy. These genes are often non-pathogenic and passengers, but are frequently mutated in most of the public exome studies. Examples of such genes include TTN, MUC16, etc.
#'This note can be ignored without any harm, it's only generated as to make user aware of such genes. See references for details on FLAGS.
#'
#'@references Shyr C, Tarailo-Graovac M, Gottlieb M, Lee JJ, van Karnebeek C, Wasserman WW. FLAGS, frequently mutated genes in public exomes. BMC Med Genomics 2014; 7: 64.
#'
#' @param maf tab delimited MAF file. File can also be gz compressed. Required. Alternatively, you can also provide already read MAF file as a dataframe.
#' @param clinicalData Clinical data associated with each sample/Tumor_Sample_Barcode in MAF. Could be a text file or a data.frame. Default NULL.
#' @param useAll logical. Whether to use all variants irrespective of values in Mutation_Status. Defaults to TRUE. If FALSE, only uses with values Somatic.
#' @param rmFlags Default FALSE. Can be TRUE or an integer. If TRUE removes all the top 20 FLAG genes. If integer, remove top n FLAG genes.
#' @param gisticAllLesionsFile All Lesions file generated by gistic. e.g; all_lesions.conf_XX.txt, where XX is the confidence level. Default NULL.
#' @param gisticAmpGenesFile Amplification Genes file generated by gistic. e.g; amp_genes.conf_XX.txt, where XX is the confidence level. Default NULL.
#' @param gisticDelGenesFile Deletion Genes file generated by gistic. e.g; del_genes.conf_XX.txt, where XX is the confidence level. Default NULL.
#' @param gisticScoresFile scores.gistic file generated by gistic. Default NULL
#' @param cnLevel level of CN changes to use. Can be 'all', 'deep' or 'shallow'. Default uses all i.e, genes with both 'shallow' or 'deep' CN changes
#' @param cnTable Custom copynumber data if gistic results are not available. Input file or a data.frame should contain three columns in aforementioned order with gene name, Sample name and copy number status (either 'Amp' or 'Del'). Default NULL. Recommended to include additional columns `Chromosome` `Start_Position` `End_Position`
#' @param isTCGA Is input MAF file from TCGA source. If TRUE uses only first 12 characters from Tumor_Sample_Barcode.
#' @param removeDuplicatedVariants removes repeated variants in a particuar sample, mapped to multiple transcripts of same Gene. See Description. Default TRUE.
#' @param vc_nonSyn NULL. Provide manual list of variant classifications to be considered as non-synonymous. Rest will be considered as silent variants. Default uses Variant Classifications with High/Moderate variant consequences. https://m.ensembl.org/info/genome/variation/prediction/predicted_data.html: "Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site", "Translation_Start_Site","Nonsense_Mutation", "Nonstop_Mutation", "In_Frame_Del","In_Frame_Ins", "Missense_Mutation"
#' @param verbose TRUE logical. Default to be talkative and prints summary.
#' @return An object of class MAF.
#' @examples
#' laml.maf = system.file("extdata", "tcga_laml.maf.gz", package = "maftools") #MAF file
#' laml.clin = system.file('extdata', 'tcga_laml_annot.tsv', package = 'maftools') #clinical data
#' laml = read.maf(maf = laml.maf, clinicalData = laml.clin)
#' @import data.table
#' @seealso \code{\link{plotmafSummary}} \code{\link{write.mafSummary}}
#' @export


read.maf = function(maf, clinicalData = NULL, rmFlags = FALSE, removeDuplicatedVariants = TRUE, useAll = TRUE, gisticAllLesionsFile = NULL, gisticAmpGenesFile = NULL,
                    gisticDelGenesFile = NULL, gisticScoresFile = NULL, cnLevel = 'all', cnTable = NULL, isTCGA = FALSE, vc_nonSyn = NULL, verbose = TRUE){

  #1. Read MAF if its a file or convert to data.table if its data.frame
  start_time = proc.time()
  if (is.data.frame(x = maf)) {
    maf  = data.table::as.data.table(maf)
  } else{
    if (verbose) {
      cat('-Reading\n')
    }

    maf <-
      data.table::fread(
        file = maf,
        sep = "\t",
        stringsAsFactors = FALSE,
        verbose = FALSE,
        data.table = TRUE,
        showProgress = TRUE,
        header = TRUE,
        fill = TRUE,
        skip = "Hugo_Symbol",
        quote = ""
      )
  }

  #2. validate MAF file
  if(verbose){
    cat("-Validating\n")
  }
  maf = validateMaf(maf = maf, isTCGA = isTCGA, rdup = removeDuplicatedVariants, chatty = verbose)

  #3. validation check for variants classified as Somatic in Mutation_Status field.
  if(!useAll){
    cat('--Using only `Somatic` variants from Mutation_Status. Set useAll = TRUE to include everything.')
    if(length(colnames(maf)[colnames(x = maf) %in% 'Mutation_Status']) > 0){
      maf = maf[Mutation_Status %in% "Somatic"]
      if(nrow(maf) == 0){
        stop('No more Somatic mutations left after filtering for Mutation_Status! Maybe set useAll to TRUE ?')
      }
    }else{
      cat('Mutation_Status not found. Assuming all variants are Somatic and validated\n')
    }
  }

  #4. Seperate synonymous variants from non-syn variants
    #Variant Classification with Low/Modifier variant consequences. http://asia.ensembl.org/Help/Glossary?id=535
  if(is.null(vc_nonSyn)){
    vc.nonSilent = c("Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site", "Translation_Start_Site",
                     "Nonsense_Mutation", "Nonstop_Mutation", "In_Frame_Del",
                     "In_Frame_Ins", "Missense_Mutation")
  }else{
    vc.nonSilent = vc_nonSyn
  }
  # silent = c("3'UTR", "5'UTR", "3'Flank", "Targeted_Region", "Silent", "Intron",
  #            "RNA", "IGR", "Splice_Region", "5'Flank", "lincRNA", "De_novo_Start_InFrame", "De_novo_Start_OutOfFrame", "Start_Codon_Ins", "Start_Codon_SNP", "Stop_Codon_Del")
    #Variant Classification with High/Moderate variant consequences. http://asia.ensembl.org/Help/Glossary?id=535


    if(is(object = rmFlags, class2 = "logical")){
      if(rmFlags){
        flags = flags(top = 20)
        cat("-Removing", length(flags), "FLAG genes\n")
        maf = maf[!Hugo_Symbol %in% flags]
      }
    }else if(is(object = rmFlags, class2 = "numeric")){
      flags = flags(top = rmFlags)
      cat("-Removing", length(flags), "FLAG genes\n")
      maf = maf[!Hugo_Symbol %in% flags]
    }


  maf.silent = maf[!Variant_Classification %in% vc.nonSilent] #Silent variants
  if(nrow(maf.silent) > 0){
    maf.silent.vc = maf.silent[,.N, .(Tumor_Sample_Barcode, Variant_Classification)]
    maf.silent.vc.cast = data.table::dcast(data = maf.silent.vc, formula = Tumor_Sample_Barcode ~ Variant_Classification, fill = 0, value.var = 'N') #why dcast is not returning it as data.table ?
    summary.silent = data.table::data.table(ID = c('Samples',colnames(maf.silent.vc.cast)[2:ncol(maf.silent.vc.cast)]),
                                N = c(nrow(maf.silent.vc.cast), colSums(maf.silent.vc.cast[,2:ncol(maf.silent.vc.cast), with = FALSE])))

    maf = maf[Variant_Classification %in% vc.nonSilent] #Choose only non-silent variants from main table
    if(verbose){
      cat(paste0('-Silent variants: ', nrow(maf.silent)), '\n')
      #print(summary.silent)
    }
  }

  if(nrow(maf) == 0){
    stop("No non-synonymous mutations found\nCheck `vc_nonSyn`` argumet in `read.maf` for details")
  }

  #5. Process CN data if available.
  if(!is.null(gisticAllLesionsFile)){
    if(verbose){
      cat('-Processing GISTIC copy number data\n')
    }
    gisticIp = readGistic(gisticAllLesionsFile = gisticAllLesionsFile, gisticAmpGenesFile = gisticAmpGenesFile,
                          gisticDelGenesFile = gisticDelGenesFile, isTCGA = isTCGA, gisticScoresFile = gisticScoresFile, cnLevel = cnLevel, verbose = verbose)
    #gisticIp = gisticIp@data
    gisticIp = merge(
      gisticIp@data,
      gisticIp@cytoband.summary[, .(Unique_Name, Wide_Peak_Limits)],
      by.x = 'Cytoband',
      by.y = 'Unique_Name',
      all.x = TRUE
    )

    gisticIp = cbind(gisticIp, loci2df(loci = gisticIp$Wide_Peak_Limits))

    suppressWarnings(gisticIp[, id := paste(Hugo_Symbol, Tumor_Sample_Barcode, sep=':')])
    gisticIp = gisticIp[!duplicated(id)]
    gisticIp[,id := NULL]
    if(verbose){
      cat("--Start and end coordinates for CN altered genes are derived from the corresponding `Wide Peak Limits`\n")
    }

    maf = data.table::rbindlist(list(maf, gisticIp), fill = TRUE, use.names = TRUE)
    maf$Tumor_Sample_barcode = factor(x = maf$Tumor_Sample_barcode,
                                      levels = unique(c(levels(maf$Tumor_Sample_barcode), unique(as.character(gisticIp$Tumor_Sample_barcode)))))

    #oncomat = createOncoMatrix(maf, chatty = verbose)
  }else if(!is.null(cnTable)){
    if(verbose){
      cat('-Processing copy number data\n')
    }
    if(is.data.frame(cnTable)){
      cnDat = data.table::copy(cnTable)
      data.table::setDT(x = cnDat)
    }else{
      cnDat = data.table::fread(input = cnTable, sep = '\t', stringsAsFactors = FALSE, header = TRUE, colClasses = 'character')
    }
    colnames(cnDat)[1:3] = c('Hugo_Symbol', 'Tumor_Sample_Barcode', 'Variant_Classification')
    if(isTCGA){
      cnDat[,Tumor_Sample_Barcode := substr(x = cnDat$Tumor_Sample_Barcode, start = 1, stop = 12)]
    }
    cnDat$Variant_Type = 'CNV'
    suppressWarnings(cnDat[, id := paste(Hugo_Symbol, Tumor_Sample_Barcode, sep=':')])
    cnDat = cnDat[!duplicated(id)]
    cnDat[,id := NULL]
    maf = data.table::rbindlist(l = list(maf, cnDat), fill = TRUE, use.names = TRUE)
    maf$Tumor_Sample_barcode = factor(x = maf$Tumor_Sample_barcode,
                                      levels = unique(c(levels(maf$Tumor_Sample_barcode), unique(as.character(cnDat$Tumor_Sample_barcode)))))
  }

  #Set factors
  maf$Variant_Classification = as.factor(as.character(maf$Variant_Classification))
  maf$Variant_Type = as.factor(as.character(maf$Variant_Type))

  if(verbose){
    cat('-Summarizing\n')
  }


  m = MAF(nonSyn = maf, syn = maf.silent, clinicalData = clinicalData, verbose = verbose)


  if(verbose){
    cat("-Finished in",data.table::timetaken(start_time),"\n")
  }

  m
}
PoisonAlien/maftools documentation built on Nov. 10, 2024, 6:01 p.m.