#' mutFilterPON
#' @description Filter variants based on Panel of Normals
#'
#' @param maf An MAF data frame, generated by \code{\link{vcfToMAF}} function.
#' @param PONfile Panel-of-Normals files, which can be either obtained through
#' GATK (https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON-)
#' or generated by users. Should have at least four columns: CHROM, POS, REF, ALT
#' @param PONformat The format of PON file, either "vcf" or "txt". Default: "vcf"
#' @param verbose Whether to generate message/notification during the
#' filtration process. Default: TRUE.
#'
#' @return An MAF data frame where some variants have P tag in CaTag column
#' for PON filtration.
#' @import vcfR stringr
#' @importFrom methods is
#'
#' @export mutFilterPON
#' @examples
#' maf <- vcfToMAF(system.file("extdata",
#' "WES_EA_T_1_mutect2.vep.vcf", package="CaMutQC"))
#' mafF <- mutFilterPON(maf, PONfile=system.file("extdata",
#' "PON_test.txt", package="CaMutQC"), PONformat="txt")
## PON filtration using external dataset and info flag
mutFilterPON <- function(maf, PONfile, PONformat = "vcf", verbose = TRUE) {
# check user input
if (!(is(maf, "data.frame"))) {
stop("maf input should be a data frame, did you get it from vcfToMAF function?")
}
# give recommendation for PON file
if (verbose) {
url <- "https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON-"
mes <- paste0("\n PON file can be obtained from Public GATK Panel of Normals at ",
url)
message(mes)
}
# first filter using the flag in maf column
if('PON' %in% colnames(maf)) {
# add P tag if variants have been labelled
preLabelled <- rownames(maf[!(is.na(maf$PON)), ])
}else{
preLabelled <- NULL
}
# joint useful information for matching
genomeVersion <- unique(maf$NCBI_Build)
if (length(genomeVersion) > 1){
stop('There are more than one version of genome in this dataset.')
} else if (!(genomeVersion %in% c('GRCh38', 'GRCh37'))){
stop('Invaild genome version. CaMutQC only supports human genome so far.')
} else {
# load PON file
if (verbose) {
message(' Loading PON data...')
}
if (PONformat == "vcf") {
invisible(capture.output(somatic <- read.vcfR(PONfile)))
# give PON a unique label, so they can be easily found and targeted
somatic@fix[, 'ID'] <- 1
pon_maf <- merge(maf, somatic@fix, all.x=TRUE,
by = c("Chromosome", "Start_Position",
"Reference_Allele", "Tumor_Seq_Allele2"),
by.y = c("CHROM", "POS", "REF", "ALT"), all.y=FALSE)
}else if (PONformat == "txt") {
PON_frame <- read.table(PONfile, header=TRUE, sep = "\t")
PON_frame$ID <- 1
# get the intersect veriants using merging action
pon_maf <- merge(maf, PON_frame, all.x=TRUE,
by.x = c("Chromosome", "Start_Position",
"Reference_Allele", "Tumor_Seq_Allele2"),
by.y = c("CHROM", "POS", "REF", "ALT"), all.y=FALSE)
}else{
stop("Wrong PONformat, should be 'vcf' or 'txt'")
}
# add tag qualified variants
discard <- rownames(pon_maf[which(pon_maf$ID == 1), ])
maf[union(discard, preLabelled), 'CaTag'] <- paste0(maf[union(discard,
preLabelled),
'CaTag'], 'P')
return(maf)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.