R/mutFilterDB.R

Defines functions dbMessage mutFilterDB

Documented in mutFilterDB

#' mutFilterDB
#' @description Filter variants in germline database.
#'
#' @param maf An MAF data frame, generated by \code{\link{vcfToMAF}} function.
#' @param dbVAF Threshold of VAF value for database annotations. Default: 0.01.
#' @param ExAC Whether to filter variants listed in ExAC with VAF higher than
#' cutoff (set in dbVAF parameter). Default: TRUE.
#' @param Genomesprojects1000 Whether to filter variants listed in
#' Genomesprojects1000 with VAF higher than cutoff (set in dbVAF parameter).
#' Default: TRUE.
#' @param ESP6500 Whether to filter variants listed in ESP6500 with VAF higher
#' than cutoff (set in dbVAF parameter). Default: TRUE.
#' @param gnomAD Whether to filter variants listed in gnomAD with VAF higher
#' than cutoff (set in dbVAF parameter). Default: TRUE.
#' @param dbSNP Whether to filter variants listed in dbSNP. Default: FALSE.
#' @param keepCOSMIC Whether to keep variants in COSMIC even
#' they are present in germline database. Default: TRUE.
#' @param verbose Whether to generate message/notification during the 
#' filtration process. Default: TRUE.
#' @importFrom methods is
#'
#' @return An MAF data frame after filtration for database
#' and clinical significance
#'
#' @export mutFilterDB
#' @examples
#' maf <- vcfToMAF(system.file("extdata",
#' "WES_EA_T_1_mutect2.vep.vcf", package="CaMutQC"))
#' mafF <- mutFilterDB(maf)

mutFilterDB <- function(maf, dbVAF = 0.01, ExAC = TRUE,
                        Genomesprojects1000 = TRUE, ESP6500 = TRUE,
                        gnomAD = TRUE, dbSNP = FALSE, keepCOSMIC = TRUE,
                        verbose = TRUE){
    # check user input
    if (!(is(maf, "data.frame"))) {
        stop("maf input should be a data frame, did you get it from vcfToMAF function?")
    }
    
    # create NULL tags first
    for (t in c("tags1", "tags2", "tags3", "tags4", "tags5", "tags6", "tags7")){
        assign(t, NULL)
    }
    # ExAC filtration
    if (ExAC){
        if ('ExAC_AF' %in% colnames(maf) & any(!(is.na(maf$ExAC_AF)))){
            tags1 <- rownames(maf[((!(is.na(maf$ExAC_AF)))&(maf$ExAC_AF >= dbVAF)), ])
        }else{if (verbose) {dbMessage("ExAC")}}}
    # 1000 Genomesprojects filtration
    if (Genomesprojects1000){
        if ('GMAF' %in% colnames(maf) & any(!(is.na(maf$GMAF)))){
          tags2 <- rownames(maf[((!(is.na(maf$GMAF))) & (maf$GMAF >= dbVAF)), ])
        }else{if (verbose) {dbMessage("Genomesprojects1000")}}}
    # gnomAD filtration
    if (gnomAD){
        if ('gnomAD_AF' %in% colnames(maf) & any(!(is.na(maf$gnomAD_AF))) ){
          tags3 <- rownames(maf[((!(is.na(maf$gnomAD_AF))) & 
                                   (maf$gnomAD_AF >= dbVAF)), ])
        }else{if (verbose) {dbMessage("gnomAD")}}}
    # ESP6500 filtration
    if (ESP6500){
        if ('AA_MAF' %in% colnames(maf) & any(!(is.na(maf$AA_MAF)))){
          tags4 <- rownames(maf[((!(is.na(maf$AA_MAF)))&(maf$AA_MAF >= dbVAF)), ])
        }else if ('EA_MAF' %in% colnames(maf) & any(!(is.na(maf$EA_MAF)))){
          tags5 <- rownames(maf[((!(is.na(maf$EA_MAF)))&(maf$EA_MAF >= dbVAF)), ])
        }else{if (verbose) {dbMessage("ESP6500")}}}
    if (keepCOSMIC) {
        tags6 <- rownames(maf[grep('COS', maf[, 'Existing_variation']), ])
    }
    if (dbSNP){
        tags7 <- rownames(maf[grep('rs', maf[, 'Existing_variation']), ])
    }
    tags <- unique(c(tags1, tags2, tags3, tags4, tags5, tags6, tags7))
    tags <- tags[!(tags %in% tags6)]
    # patho tags: with pathogenic but removing pathogenicity
    pathoTags <- setdiff(as.character(grep('pathogenic', maf$CLIN_SIG, 
                  ignore.case=TRUE)), as.character(grep('pathogenicity', 
                                        maf$CLIN_SIG, ignore.case=TRUE)))
    tags <- intersect(tags, setdiff(rownames(maf), pathoTags))
    maf[tags, 'CaTag'] <- paste0(maf[tags, 'CaTag'], 'D')
    return(maf)
}

# generate message when certain database annotation is missing
dbMessage <- function(db) {
    firstMes <- paste0('\n  This VCF file wasn\'t annotated by ', db,' database.')
    secondMes <- paste0(' No variants will be filtered based on ', db , '.')
    finalMes <- paste0(firstMes, secondMes)
    message(finalMes)
}
likelet/CaMutQC documentation built on Aug. 17, 2024, 4 a.m.