R/mutFilterCom.R

Defines functions mutFilterCom

Documented in mutFilterCom

#' mutFilterCom
#' @description Apply common filtering strategies on a MAF data frame.
#'
#' @param maf An MAF data frame.
#' @param PONfile Panel-of-Normals files, which can be either obtained through 
#' GATK (https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON-)
#' or generated by users. Should have at least four columns: CHROM, POS, REF, ALT
#' @param PONformat The format of PON file, either "vcf" or "txt". Default: "vcf"
#' @param panel The sequencing panel applied on the dataset. Parameters
#' for \code{\link{mutFilterQual}} function are set differently for different
#' panels. Default: "Customized". Options: "MSKCC", "WES".
#' @param tumorDP Threshold of tumor total depth. Default: 20
#' @param normalDP Threshold of normal total depth. Default: 10
#' @param tumorAD Threshold of tumor alternative allele depth. Default: 5
#' @param normalAD Threshold of normal alternative allele depth. Default: Inf
#' @param VAF Threshold of VAF value. Default: 0.05
#' @param VAFratio Threshold of VAF ratio (tVAF/nVAF). Default: 0.
#' @param SBmethod Method will be used to detect strand bias,
#' including 'SOR' and 'Fisher'. Default: 'SOR'. SOR: StrandOddsRatio
#' (https://gatk.broadinstitute.org/hc/en-us/articles/360041849111-
#' StrandOddsRatio)
#' @param SBscore Cutoff strand bias score used to filter variants.
#' Default: 3.
#' @param maxIndelLen Maximum length of indel accepted to be included.
#' Default: 50.
#' @param minInterval Maximum length of interval between an SNV and an indel
#' accepted to be included. Default: 10.
#' @param tagFILTER Variants with spcific tag in the FILTER column will be kept,
#' Default: 'PASS'.
#' @param dbVAF Threshold of VAF value for databases. Default: 0.01.
#' @param ExAC Whether to filter variants listed in ExAC with VAF higher than
#' cutoff(set in VAF parameter). Default: TRUE.
#' @param Genomesprojects1000 Whether to filter variants listed in
#' Genomesprojects1000 with VAF higher than cutoff(set in VAF parameter).
#' Default: TRUE.
#' @param ESP6500 Whether to filter variants listed in ESP6500 with VAF higher
#' than cutoff(set in VAF parameter). Default: TRUE.
#' @param gnomAD Whether to filter variants listed in gnomAD with VAF higher
#' than cutoff(set in VAF parameter). Default: TRUE.
#' @param dbSNP Whether to filter variants listed in dbSNP. Default: FALSE.
#' @param keepCOSMIC Whether to keep variants in COSMIC even
#' they have are present in germline database. Default: TRUE.
#' @param keepType A group of variant classifications will be kept,
#' including 'exonic', 'nonsynonymous' and 'all'. Default: 'exonic'.
#' @param bedFile A file in bed format that contains region information.
#' Default: NULL.
#' @param bedHeader Whether the input bed file has a header or not. 
#' Default: FALSE.
#' @param bedFilter Whether to filter the information in bed file or not, which
#' only leaves segments in Chr1-Ch22, ChrX and ChrY. Default: TRUE.
#' @param mutFilter Whether to directly return a filtered MAF data frame.
#' If FALSE, a simulation filtration process will be run, and the original MAF
#' data frame with tags in CaTag column, and  a filter report will be returned.
#' If TRUE, a filtered MAF data frame and a filter report will be generated.
#' Default: FALSE.
#' @param selectCols Columns will be contained in the filtered data frame.
#' By default (TRUE), the first 13 columns and 'Tumor_Sample_Barcode' column.
#' Or a vector contains column names will be kept.
#' @param report Whether to generate report automatically. Default: TRUE
#' @param reportFile File name of the report. Default: 'FilterReport.html'
#' @param reportDir Path to the output report file. Default: './'.
#' @param TMB Whether to calculate TMB. Default: TRUE. Note: CaMutQC uses 
#' unfiltered maf to calculate TMB value.
#' @param assay Methodology and assay will be applied as a reference, including
#' 'MSK-v3', 'MSK-v2', 'MSK-v1', 'FoundationOne', 'Pan-Cancer Panel' and
#' 'Customized'. Default: 'MSK-v3'.
#' @param genelist A vector of panel gene list, only useful when assay is set to
#' 'Customized'.
#' @param mutType A group of variant classifications that will be kept,
#' only useful when assay is set to 'Pan-Cancer Panel' or 'Customized',
#' including 'exonic' and 'nonsynonymous'. Default: 'nonsynonymous'.
#' @param cancerType Type of cancer whose filtering parameters
#' need to be referred to.  Options are: "COADREAD", "BRCA", "LIHC", "LAML",
#' "LCML", "UCEC", "UCS", "BLCA", "KIRC" and "KIRP"
#' @param reference A specific study whose filtering strategies
#' need to be referred to.
#' Format: "Last_name_of_the_first_author_et_al-Journal-Year-Cancer_type"
#' Options are: "Haraldsdottir_et_al-Gastroenterology-2014-UCEC",
#' "Cherniack_et_al-Cancer_Cell-2017-UCS",
#' "Mason_et_al-Leukemia-2015-LCML",
#' "Gerlinger_et_al-Engl_J_Med-2012-KIRC"
#' "Zhu_et_al-Nat_Commun-2020-KIRP"
#' @param progressbar Whether to show progress bar when running this function
#' Default: TRUE
#' @param codelog If TRUE, your code, along with the parameters you set, 
#' will be export in a log file. It will be convenient for users to repeat 
#' experiments. Default: FALSE
#' @param codelogFile Where to store the codelog, only useful when codelog is
#' set to TRUE. Default: "mutFilterCom.log"
#' @param verbose Whether to generate message/notification during the 
#' filtration process. Default: TRUE.
#'
#' @return An MAF data frame after common strategy filtration
#' @return A filter report in HTML format
#' @import ggplot2 DT
#' @importFrom methods is
#'
#' @export mutFilterCom
#' @examples
#' maf <- vcfToMAF(system.file("extdata",
#' "WES_EA_T_1_mutect2.vep.vcf", package="CaMutQC"))
#' mafF <- mutFilterCom(maf,
#' PONfile=system.file("extdata", "PON_test.txt", package="CaMutQC"),
#' TMB=FALSE, report=FALSE, PONformat="txt", verbose=FALSE)


mutFilterCom <- function(maf, PONfile, PONformat = "vcf", panel = "Customized", 
                         tumorDP = 20, normalDP = 10, tumorAD = 5, 
                         normalAD = Inf, VAF = 0.05, VAFratio = 0,
                         SBmethod = 'SOR', SBscore = 3, maxIndelLen = 50,
                         minInterval = 10, tagFILTER = 'PASS', dbVAF = 0.01,
                         ExAC = TRUE, Genomesprojects1000 = TRUE, 
                         gnomAD = TRUE, dbSNP = FALSE, keepCOSMIC = TRUE,
                         keepType = 'exonic', bedFile = NULL, bedHeader = FALSE,
                         bedFilter = TRUE, mutFilter = FALSE, ESP6500 = TRUE,
                         selectCols = TRUE, report = TRUE,
                         assay = 'MSK-v3', genelist = NULL,
                         mutType = 'nonsynonymous',
                         reportFile = 'FilterReport.html', reportDir = './',
                         TMB = TRUE, cancerType = NULL, reference = NULL,
                         progressbar = TRUE, codelog = FALSE, 
                         codelogFile = "mutFilterCom.log", verbose = TRUE) {
    # check user input
    if (!(is(maf, "data.frame"))) {
        stop("maf input should be a data frame, did you get it from vcfToMAF function?")
    }
    
    # run mutFilterTech
    if (verbose) {
        message("  Filtration for technical issue is running")
    }
    mafFilteredT <- mutFilterTech(maf, panel=panel, tumorDP=tumorDP,
                            normalDP=normalDP, tumorAD=tumorAD, VAF=VAF,
                            normalAD=normalAD, VAFratio=VAFratio, 
                            SBmethod=SBmethod, SBscore=SBscore, 
                            maxIndelLen=maxIndelLen, minInterval=minInterval, 
                            tagFILTER=tagFILTER, progressbar=progressbar,
                            PONfile=PONfile, PONformat=PONformat, 
                            verbose=verbose)
    # filter first for report usage
    mafFilteredTs <- mafFilteredT[mafFilteredT$CaTag == "0", ]
    # run mutSelection
    if (verbose) {
        message("\n")
        message("  Cancer somatic variant selection is running")
    }
    mafFilteredS <- mutSelection(mafFilteredT, dbVAF=dbVAF, ExAC=ExAC,
                            Genomesprojects1000=Genomesprojects1000,
                            ESP6500=ESP6500, gnomAD=gnomAD, dbSNP=dbSNP,
                            keepCOSMIC=keepCOSMIC, keepType=keepType,
                            bedFile=bedFile, bedFilter=bedFilter,
                            bedHeader=bedHeader, progressbar=progressbar,
                            verbose=verbose)
    # filter first for report usage
    mafFilteredS2 <- mutSelection(mafFilteredTs, dbVAF=dbVAF, ExAC=ExAC,
                   Genomesprojects1000=Genomesprojects1000, dbSNP=dbSNP,
                   ESP6500=ESP6500, gnomAD=gnomAD, keepCOSMIC=keepCOSMIC,
                   keepType=keepType, bedFile=bedFile,bedHeader=bedHeader,
                   bedFilter=bedFilter, progressbar=FALSE, verbose=FALSE)
    mafFilteredF <- mafFilteredS2[mafFilteredS2$CaTag == '0', ]
    if (nrow(mafFilteredF) == 0){ warning('No variant left after filtration.')}
    if (TMB){
        TMBvalue <- calTMB(maf, bedFile=bedFile, assay=assay,
                            genelist=genelist, mutType=mutType, 
                            bedHeader=bedHeader, bedFilter=bedFilter)
        mes <- paste0("Method used to calculate TMB: ", assay, ".")
        message(mes)
        mes <- paste0("Estimated TMB is: ", TMBvalue, ".")
        message(mes)
    }
    # report generation
    if (report){
        rmarkdown::render(system.file("rmd", "CaMutQC-FilterReport.Rmd",
        package="CaMutQC"), output_file=reportFile, output_dir=reportDir)
    }
    # export codelog if asked
    if (codelog) {
        printer <- file(codelogFile, "w")
        # export date and running code 
        writeLines(paste0(date(), " \n"), con=printer)
        running_code <- paste0("mutFilterCom(maf, panel=", panel, ", tumorDP=", 
                      tumorDP, ", normalDP=", normalDP, ", tumorAD=", tumorAD,
                      ", normalAD=", normalAD, ", VAF=", VAF, ", VAFratio=",
                      VAFratio, ", SBmethod=", SBmethod, ", SBscore=", 
                      SBscore, ", maxIndelLen=", maxIndelLen, ", minInterval=",
                      minInterval, ", tagFILTER=", tagFILTER, ", dbVAF=",
                      dbVAF, ", ExAC=", ExAC, ", Genomesprojects1000=", 
                      Genomesprojects1000, ", ESP6500=", ESP6500, ", gnomAD=",
                      gnomAD, ", dbSNP=", dbSNP, ", keepCOSMIC=", keepCOSMIC,
                      ", keepType=", keepType, ", bedFile=", bedFile,
                      ", bedHeader=", bedHeader, ", bedFilter=", bedFilter,
                      ", mutFilter=", mutFilter, ", selectCols=", selectCols,
                      ", report=", report, ", assay=", assay, ", genelist=",
                      genelist, ", mutType=", mutType,", reportDir=", reportDir,
                      ", reportFile=", reportFile, ", TMB=", TMB, 
                      ", cancerType=", cancerType, ", reference=", reference, 
                      ", progressbar=", progressbar, ", codelog=", codelog, 
                      ", codelogFile=", codelogFile, ", PONfile=", PONfile, 
                      ", PONformat=", PONformat, ", verbose=", verbose, ")")
        writeLines(running_code, con=printer)
        close(printer)
    }
    if (mutFilter) {
        if (selectCols){
            if (isTRUE(selectCols)){
                return(mafFilteredF[, c(seq_len(12), 16)])
            }else{
                if (all(selectCols %in% colnames(mafFilteredF))){
                    return(mafFilteredF[, selectCols])
                }else{ stop('Not all selected columns can be found in MAF.') }
            }
        }
    }else{ return(mafFilteredS) }
}
likelet/CaMutQC documentation built on Aug. 17, 2024, 4 a.m.