#' mutFilterAdj
#' @description Filter SNVs with adjacent indels
#'
#' @param maf An MAF data frame, generated by \code{\link{vcfToMAF}} function.
#' @param maxIndelLen Maximum length of indel accepted to be included.
#' Default: 50
#' @param minInterval Minimum length of interval between an SNV and an indel
#' accepted to be included. Default: 10
#'
#' @return An MAF data frame after filtration for adjacent variants.
#'
#' @import dplyr tidyr
#' @importFrom methods is
#'
#' @export mutFilterAdj
#' @examples
#' maf <- vcfToMAF(system.file("extdata",
#' "WES_EA_T_1_mutect2.vep.vcf", package="CaMutQC"))
#' mafF <- mutFilterAdj(maf)
mutFilterAdj <- function(maf, maxIndelLen = 50, minInterval = 10){
# check user input
if (!(is(maf, "data.frame"))) {
stop("maf input should be a data frame, did you get it from vcfToMAF function?")
}
if ("DEL" %in% unique(maf$Variant_Type) |
"INS" %in% unique(maf$Variant_Type)) {
# create an indel bed with indels of length <= maxIndelLen
bedFrame <- selectIndel(maf, maxIndelLen, minInterval)
# directly return the dataframe if there is no satisfied indel
if (is.null(bedFrame)) {
return(maf)
}
snpFrame <- maf[maf$Variant_Type == "SNP", ]
# add tags to variants in expanded bed
nTags <- rownames(snpFrame[snpFrame$Chromosome
%in% bedFrame$Chromosome
& snpFrame$Start_Position
%in% bedFrame$Location,])
maf[nTags, 'CaTag'] <- paste0(maf[nTags, 'CaTag'] , 'A')
}
# else: directly return maf if there is no INDEL in maf data frame
return(maf)
}
# select indel with length <= maxIndelLen, and return the corresponding bed
selectIndel <- function(mafDat, maxIndelLen = 50, minInterval = 10) {
indels <- rownames(mafDat[(mafDat$Variant_Type %in% c("DEL", "INS")) &
(mafDat$End_Position - mafDat$Start_Position <=
maxIndelLen),])
# if no indel satisfies the requirement, return NULL
if (length(indels) == 0) {
return(NULL)
}
chrs <- mafDat[indels, "Chromosome"]
starts <- mafDat[indels, "Start_Position"]
ends <- mafDat[indels, "End_Position"]
tmpbed <- data.frame(Chromosome=chrs,
Start_Position=starts - minInterval,
End_Position=ends + minInterval)
# generate the bed framn first
finalbed <- data.frame(matrix(ncol=2))
colnames(finalbed) <- c("Chromosome", "Location")
# iterate through every row of bed file, split it into single base
# Generate the bed frame
finalbed <- data.frame(Chromosome = character(), Location = integer())
# Iterate through every row of the bed file and split it into single bases
finalbed <- bind_rows(lapply(seq_len(nrow(tmpbed)), function(i) {
currentbed <- data.frame(Chromosome=rep(tmpbed[i, 1], tmpbed[i, 3] - tmpbed[i, 2] + 1),
Location=tmpbed[i, 2]:tmpbed[i, 3])
}), .id="ID")
# Remove the ID column added by bind_rows
finalbed <- finalbed[, -1]
# remove duplicated rows
finalbed <- na.omit(finalbed[!duplicated(finalbed), ])
rownames(finalbed) <- seq_len(nrow(finalbed))
return(finalbed)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.