R/mutFilterAdj.R

Defines functions selectIndel mutFilterAdj

Documented in mutFilterAdj

#' mutFilterAdj
#' @description Filter SNVs with adjacent indels
#'
#' @param maf An MAF data frame, generated by \code{\link{vcfToMAF}} function.
#' @param maxIndelLen Maximum length of indel accepted to be included.
#' Default: 50
#' @param minInterval Minimum length of interval between an SNV and an indel
#' accepted to be included. Default: 10
#'
#' @return An MAF data frame after filtration for adjacent variants.
#'
#' @import dplyr tidyr
#' @importFrom methods is
#'
#' @export mutFilterAdj
#' @examples
#' maf <- vcfToMAF(system.file("extdata",
#' "WES_EA_T_1_mutect2.vep.vcf", package="CaMutQC"))
#' mafF <- mutFilterAdj(maf)

mutFilterAdj <- function(maf, maxIndelLen = 50, minInterval = 10){
    # check user input
    if (!(is(maf, "data.frame"))) {
      stop("maf input should be a data frame, did you get it from vcfToMAF function?")
    }
  
    if ("DEL" %in% unique(maf$Variant_Type) | 
        "INS" %in% unique(maf$Variant_Type)) {
        # create an indel bed with indels of length <= maxIndelLen
        bedFrame <- selectIndel(maf, maxIndelLen, minInterval)
        # directly return the dataframe if there is no satisfied indel
        if (is.null(bedFrame)) {
            return(maf)
        }
        snpFrame <- maf[maf$Variant_Type == "SNP", ]
        # add tags to variants in expanded bed
        nTags <- rownames(snpFrame[snpFrame$Chromosome 
                                     %in% bedFrame$Chromosome 
                                     & snpFrame$Start_Position 
                                     %in% bedFrame$Location,])
        maf[nTags, 'CaTag'] <- paste0(maf[nTags, 'CaTag'] , 'A')
    }
    # else: directly return maf if there is no INDEL in maf data frame
    return(maf)
}

# select indel with length <= maxIndelLen, and return the corresponding bed
selectIndel <- function(mafDat, maxIndelLen = 50, minInterval = 10) {
    indels <- rownames(mafDat[(mafDat$Variant_Type %in% c("DEL", "INS")) & 
                              (mafDat$End_Position - mafDat$Start_Position <=
                                 maxIndelLen),])
    # if no indel satisfies the requirement, return NULL
    if (length(indels) == 0) {
        return(NULL)
    }
    chrs <- mafDat[indels, "Chromosome"]
    starts <- mafDat[indels, "Start_Position"]
    ends <- mafDat[indels, "End_Position"]
    tmpbed <- data.frame(Chromosome=chrs, 
                         Start_Position=starts - minInterval, 
                         End_Position=ends + minInterval)
    # generate the bed framn first
    finalbed <- data.frame(matrix(ncol=2))
    colnames(finalbed) <- c("Chromosome", "Location")
    # iterate through every row of bed file, split it into single base
    # Generate the bed frame
    finalbed <- data.frame(Chromosome = character(), Location = integer())
    # Iterate through every row of the bed file and split it into single bases
    finalbed <- bind_rows(lapply(seq_len(nrow(tmpbed)), function(i) {
      currentbed <- data.frame(Chromosome=rep(tmpbed[i, 1], tmpbed[i, 3] - tmpbed[i, 2] + 1),
                               Location=tmpbed[i, 2]:tmpbed[i, 3])
    }), .id="ID")
    # Remove the ID column added by bind_rows
    finalbed <- finalbed[, -1]
    
    # remove duplicated rows
    finalbed <- na.omit(finalbed[!duplicated(finalbed), ])
    rownames(finalbed) <- seq_len(nrow(finalbed))
    return(finalbed)
}
likelet/CaMutQC documentation built on Aug. 17, 2024, 4 a.m.