R/TMB.R

Defines functions readBedPanel mutCountRegion calTMB

Documented in calTMB

#' calTMB
#' @description Calculate Tumor Mutational Burden (TMB) in specific regions.
#'
#' @param maf An MAF data frame, generated by \code{\link{vcfToMAF}} function.
#' @param bedFile A file in bed format that contains region information.
#' Default: NULL.
#' @param bedHeader Whether the input bed file has a header or not. 
#' Default: FALSE.
#' @param assay Methodology and assay will be applied as a reference, including
#' 'MSK-v3', 'MSK-v2', 'MSK-v1', 'FoundationOne', 'Pan-Cancer Panel' and
#' 'Customized'. Default: 'MSK-v3'.
#' @param genelist A vector of panel gene list, only useful when assay is set to
#' 'Customized'.
#' @param mutType A group of variant classifications that will be kept,
#' only useful when assay is set to 'Pan-Cancer Panel' or 'Customized',
#' including 'exonic', 'nonsynonymous'. and 'all' Default: 'nonsynonymous'.
#' @param bedFilter Whether to filter the information in bed file or not, which
#' only leaves segments in Chr1-Ch22, ChrX and ChrY. Default: TRUE.
#' @return A TMB value.
#' @importFrom methods is
#' @export calTMB
#' @examples
#' maf <- vcfToMAF(system.file("extdata", "WES_EA_T_1_mutect2.vep.vcf",
#' package="CaMutQC"))
#' TMB_value <- calTMB(maf, bedFile=system.file("extdata/bed/panel_hg38",
#' "FlCDx-hg38.rds", package="CaMutQC"))

calTMB <- function(maf, bedFile = NULL, bedHeader = FALSE, assay = 'MSK-v3', 
                   genelist = NULL, mutType = 'nonsynonymous', bedFilter = TRUE){
    # check user input
    if (!(is(maf, "data.frame"))) {
        stop("maf input should be a data frame, did you get it from vcfToMAF function?")
    }
    
    # obtain genome build version and filter maf 
    if (length(unique(maf$NCBI_Build)) == 1) {
        genVer <- unique(maf$NCBI_Build)
        # give user a message about the bed used in CaMutQC
        mes <- paste0("Bed files in CaMutQC are not accurate.", 
                      " The result serves only as a reference. \n")
        warning(mes)
        res <- readBedPanel(assay=assay, genVer=genVer, maf=maf, bedFile=bedFile)
        maf <- res[[1]]
        bedFile <- res[[2]]
    }else{stop("More than 1 unique NCBI_Build value, or missing value detected.")}
    ## select variants based on chosen assay
    if (strsplit(assay, split = '-')[[1]][1] == 'MSK'){
        # filter on VAF, VAFratio and AD
        maf <- mutFilterQual(maf, tumorAD=5, VAFratio=5, VAF=0.01,
                             tumorDP=0, normalDP=0)
        # filter for non-/hotspot genes (COSMIC genes)
        mafCOS <- maf[grep('COSV', maf[, 'Existing_variation']), ]
        mafNonCOS <- maf[setdiff(rownames(maf), rownames(mafCOS)), ]
        mafCOSFilter <- mutFilterQual(mafCOS, tumorDP=20, tumorAD=8, normalDP=0,
                                 VAF=0.02, VAFratio=0)
        mafNonCOSFilter <- mutFilterQual(mafNonCOS, tumorDP=20, tumorAD=10,
                                    normalDP=0, VAF=0.05, VAFratio=0)
        maf <- unique(rbind(mafCOSFilter, mafNonCOSFilter))
        # filter non-exonic variants
        maf <- mutFilterType(maf)
    }else if (assay %in% c('FoundationOne', 'Pan-Cancer Panel')) {
        # filter noncoding variants
        noncoding <- read.table(system.file("extdata", "noncoding.txt",
                                            package="CaMutQC"), header=TRUE)
        maf <- maf[which(!(maf$One_Consequence %in%
                                          noncoding$Noncoding_Variant_Types)), ]
        # filter dbsnp variants
        tags1 <- rownames(maf[grep('rs', maf[, 'Existing_variation']), ])
        maf <- maf[setdiff(rownames(maf), tags1), ]
        # filter variants with ExAC >= 2
        ## detect whether the VCF file has ExAC annotation or not
        if ('ExAC_AF' %in% colnames(maf)){ maf <- maf[maf$ExAC_AF < 2, ]
        }else{
            mes <- paste0('ExAC information cannot be found in VCF file.',
                          ' No variants will be filtered based on ExAC.')
            warning(mes)
        }
        # filter stop-gain variants in tumor suppressor genes
        TSGs <- read.table(system.file("extdata", "TSGenelist.txt",
                           package="CaMutQC"), sep="\t", header=TRUE)
        TSGsAll <- c(TSGs$GeneSymbol,unique(unlist(strsplit(TSGs$Alias, "\\|"))))
        maf <- maf[(!((maf$Hugo_Symbol %in% TSGsAll) &
                             (maf$One_Consequence == 'stop_gained'))), ]
        # filter variants in COSMIC
        tags2 <- rownames(maf[grep('COSV', maf[, 'Existing_variation']), ])
        maf <- maf[setdiff(rownames(maf), tags2), ]
    }else if (assay == 'Customized'){
        if (!(is.null(genelist))){
            maf <- maf[(maf$Hugo_Symbol %in% genelist), ]
        }
        ## select certain variant type
        maf <- mutFilterType(maf, keepType=mutType)
    }
    bed <- readBed(bedFile, bedHeader=bedHeader)
    if (bedFilter) {
        chromVaild <- c(paste0('chr', seq_len(22)), 'chrX', 'chrY')
        bed <- bed[which(bed[, 1] %in% chromVaild), ]
    }
    # filter based on CaTag, and remove empty rows
    maf <- maf[maf$CaTag == '0' & !(is.na(maf$NCBI_Build)), ]
    # return 0 if no variants left
    if (nrow(maf) == 0){ return(0)
    }else{
        ## sort bed object
        bedProc <- unique(bed[, seq_len(3)])
        colnames(bedProc) <- c('chrom', 'chromStart', 'chromEnd')
        bedProc <- unique(cbind(bedProc,
                          interval=bedProc$chromEnd-bedProc$chromStart, Num=0))
        rownames(bedProc) <- seq_len(nrow(bedProc))
        maf <- maf[, c("Chromosome", "Start_Position", "End_Position")]
        chrs <- unique(maf$Chromosome)
        # warn if there is no overlap between the chr in maf and bed regions
        if (length(intersect(chrs, unique(bed$V1))) == 0) {
            mes <- paste0("No overlap between chr info in maf and chr info in ", 
                          "bed file.\n", "Maybe '1' in maf should be 'chr1'?")
            stop(mes)
        }
        # iterate through every chromosome
        for (chr in chrs) {
            mafTarget <- maf[maf$Chromosome == chr, ]
            bedTarget <- bedProc[bedProc$chrom == chr, ]
            l <- vapply(seq_len(nrow(bedTarget)), function(i) {
                mutCountRegion(mafTarget, bedTarget[i, ]) }, numeric(1))
            bedProc[bedProc$chrom == chr, 'Num'] <- l
        }
        TMB <- round(mean(bedProc$Num/bedProc$interval) * 1000000, 3)
        return(TMB)
    }
}

## helper function for counting variants in specific region
mutCountRegion <- function(mutLoc, bedSingle){
    count <- sum(mutLoc$Start_Position >= bedSingle[1, 'chromStart'] & 
                     mutLoc$End_Position <= bedSingle[1, 'chromEnd'])
    count <- count + bedSingle[1, 'Num']
    return(count)
}

## helper function for reading corresponding bed and panel file
readBedPanel <- function(assay, genVer, maf, bedFile){
    # check genome version
    if (!(genVer %in% c("GRCh37", "GRCh38"))){ stop("Invalid human genome version!") }
    if (assay == 'MSK-v3') {
        panelGene <- read.table(system.file("extdata/Panel_gene",
                             "MSK-IMPACT_gene_v3_468.txt", package="CaMutQC"))
        if (is.null(bedFile)){
            bedFile <- system.file("extdata/bed/panel_hg19", "MSK_v3-hg19.rds",
                                   package="CaMutQC")
            if (genVer == "GRCh38") {
                bedFile <- str_replace_all(bedFile, "19", "38")
            }
        }
    }else if (assay == 'MSK-v2') {
        panelGene <- read.table(system.file("extdata/Panel_gene",
                             "MSK-IMPACT_gene_v2_410.txt", package="CaMutQC"))
        if (is.null(bedFile)) {
            bedFile <- system.file("extdata/bed/panel_hg19", "MSK_v2-hg19.rds",
                                   package="CaMutQC")
            if (genVer == "GRCh38") {
                bedFile <- str_replace_all(bedFile, "19", "38")
            }
        }
    }else if (assay == 'MSK-v1') {
        panelGene <- read.table(system.file("extdata/Panel_gene",
                             "MSK-IMPACT_gene_v1_341.txt", package="CaMutQC"))
        if (is.null(bedFile)){
            bedFile <- system.file("extdata/bed/panel_hg19", "MSK_v1-hg19.rds",
                                   package="CaMutQC")
            if (genVer == "GRCh38") {
                bedFile <- str_replace_all(bedFile, "19", "38")
            }
        }
    }else if (assay == 'FoundationOne') {
        panelGene <- read.table(system.file("extdata/Panel_gene",
                            "FoundationOne_genelist.txt", package="CaMutQC"))
        if (is.null(bedFile)){
            bedFile <- system.file("extdata/bed/panel_hg19", "FlCDx-hg19.rds",
                                   package="CaMutQC")
            if (genVer == "GRCh38") {
                bedFile <- str_replace_all(bedFile, "19", "38")
            }
        }
    } else if (assay == 'Pan-Cancer Panel') {
        panelGene <- read.table(system.file("extdata/Panel_gene",
                                "Pan-cancer_genelist.txt", package="CaMutQC"))
        maf <- mutFilterQual(maf, tumorDP=0, normalDP=0,
                             tumorAD=0, VAF=0.05, VAFratio=0)
        if (is.null(bedFile)){
            bedFile <- system.file("extdata/bed/panel_hg19", 
                                   "Pan-cancer-hg19.rds", package="CaMutQC")
            if (genVer == "GRCh38") {
                bedFile <- str_replace_all(bedFile, "19", "38")
            }
        }
    }else if (assay != 'Customized') { stop('Invalid assay!') }
    if (exists("panelGene")) {
        maf <- maf[which(maf$Hugo_Symbol %in% panelGene$V1), ]
    }
    return(list(maf, bedFile))
}
likelet/CaMutQC documentation built on Aug. 17, 2024, 4 a.m.