#' Basic VCF filter function
#'
#' Function to remove artifacts and low confidence/quality variant calls.
#'
#'
#' @param vcf \code{CollapsedVCF} object, read in with the \code{readVcf}
#' function from the VariantAnnotation package.
#' @param tumor.id.in.vcf The tumor id in the \code{CollapsedVCF} (optional).
#' @param use.somatic.status If somatic status and germline data is available,
#' then use this information to remove non-heterozygous germline SNPs or
#' germline SNPS with biased allelic fractions.
#' @param snp.blacklist A file with blacklisted genomic regions. Must
#' be parsable by \code{import} from \code{rtracklayer}, for a example a
#' BED file with file extension \sQuote{.bed}.
#' @param af.range Exclude variants with allelic fraction smaller or greater than
#' the two values, respectively. The higher value removes homozygous SNPs,
#' which potentially have allelic fractions smaller than 1 due to artifacts or
#' contamination. If a matched normal is available, this value is ignored,
#' because homozygosity can be confirmed in the normal.
#' @param contamination.range Count variants in germline databases with allelic fraction
#' in the specified range. If the number of these putative contamination
#' variants exceeds an expected value and if they are found on almost all
#' chromosomes, the sample is flagged as potentially contaminated and extra
#' contamination estimation steps will be performed later on.
#' @param min.coverage Minimum coverage in tumor. Variants with lower coverage
#' are ignored.
#' @param min.base.quality Minimium base quality in tumor. Requires a \code{BQ}
#' genotype field in the VCF. Values below this value will be ignored.
#' @param max.base.quality Maximum base quality in tumor. Requires a \code{BQ}
#' genotype field in the VCF. Variants exceeding this value will have their
#' BQ capped at this value.
#' @param base.quality.offset Subtracts the specified value from the base quality score.
#' Useful to add some cushion for too optimistically calibrated scores.
#' Requires a \code{BQ} genotype field in the VCF.
#' @param min.supporting.reads Minimum number of reads supporting the alt
#' allele. If \code{NULL}, calculate based on coverage and assuming sequencing
#' error of 10^-3.
#' @param error Estimated sequencing error rate. Used to calculate minimum
#' number of supporting reads using \code{\link{calculatePowerDetectSomatic}}
#' when base quality scores are not available.
#' @param target.granges \code{GenomicRanges} object specifiying the target
#' postions. Used to remove off-target reads. If \code{NULL}, do not check
#' whether variants are on or off-target.
#' @param remove.off.target.snvs If set to a true value, will remove all SNVs
#' outside the covered regions.
#' @param model.homozygous If set to \code{TRUE}, does not remove homozygous
#' variants. Ignored in case a matched normal is provided in the VCF.
#' @param interval.padding Include variants in the interval flanking regions of
#' the specified size in bp. Requires \code{target.granges}.
#' @param DB.info.flag Flag in INFO of VCF that marks presence in common
#' germline databases. Defaults to \code{DB} that may contain somatic variants
#' if it is from an unfiltered germline database.
#' @return A list with elements \item{vcf}{The filtered \code{CollapsedVCF}
#' object.} \item{flag}{A flag (\code{logical(1)}) if problems were
#' identified.} \item{flag_comment}{A comment describing the flagging.}
#' @author Markus Riester
#' @seealso \code{\link{calculatePowerDetectSomatic}}
#' @examples
#'
#' # This function is typically only called by runAbsolute via
#' # fun.filterVcf and args.filterVcf.
#' vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN")
#' vcf <- readVcf(vcf.file, "hg19")
#' vcf.filtered <- filterVcfBasic(vcf)
#'
#' @export filterVcfBasic
#' @importFrom GenomeInfoDb seqnames seqlevelsStyle seqlevelsStyle<-
#' genomeStyles sortSeqlevels
#' @importFrom SummarizedExperiment rowRanges
#' @importFrom stats pbeta
filterVcfBasic <- function(vcf, tumor.id.in.vcf = NULL,
use.somatic.status = TRUE, snp.blacklist = NULL, af.range = c(0.03, 0.97),
contamination.range = c(0.01, 0.075), min.coverage = 15, min.base.quality = 25,
max.base.quality = 50, base.quality.offset = 1,
min.supporting.reads = NULL, error = 0.001, target.granges = NULL,
remove.off.target.snvs = TRUE, model.homozygous = FALSE,
interval.padding = 50, DB.info.flag = "DB") {
flag <- NA
flag_comment <- NA
if (is.null(tumor.id.in.vcf)) {
tumor.id.in.vcf <- .getTumorIdInVcf(vcf)
}
if (use.somatic.status) {
n <- .countVariants(vcf)
vcf <- .testGermline(vcf, tumor.id.in.vcf)
flog.info("Removing %i non heterozygous (in matched normal) germline SNPs.",
n - .countVariants(vcf))
} else {
info(vcf)$SOMATIC <- NULL
}
if (!is.null(min.base.quality) && min.base.quality > 0) {
vcf <- .filterVcfByBQ(vcf, tumor.id.in.vcf, min.base.quality)
}
bqs <- .getBQFromVcf(vcf, tumor.id.in.vcf, max.base.quality = max.base.quality,
na.sub = TRUE, error = error, base.quality.offset = base.quality.offset)
if (length(unique(bqs)) > 1) {
flog.info("Base quality scores range from %.0f to %0.f (offset by %0.f)", min(bqs), max(bqs),
base.quality.offset)
} else {
if (is.null(bqs[1])) {
.stopRuntimeError("NULL value observed for BQS.")
}
flog.info("Global base quality score of %0.f", bqs[1])
}
if (median(bqs) > 40) {
flog.info("High median base quality score (%.0f). UMI-barcoded data?", median(bqs))
}
cutoffs <- .calcMinSupportingReadsForFiltering(vcf, tumor.id.in.vcf, bqs, min.supporting.reads)
n <- .countVariants(vcf)
.sufficientReads <- function(vcf, ref, cutoffs) {
idx <- rep(TRUE, nrow(vcf))
.filterVcfByAD <- function(vcf, min.supporting.reads, depth) {
# remove variants with insufficient reads. This includes reference alleles
# to remove homozygous germline variants.
do.call(rbind, geno(vcf)$AD[, tumor.id.in.vcf])[, ifelse(ref, 1, 2)] >=
min.supporting.reads |
geno(vcf)$DP[, tumor.id.in.vcf] < depth
}
bqs <- .getBQFromVcf(vcf, tumor.id.in.vcf, max.base.quality = max.base.quality,
na.sub = TRUE, error = error, base.quality.offset = base.quality.offset)
if (length(bqs) != length(vcf)) {
.stopRuntimeError("BQ scores and VCF do not align.")
}
for (i in seq(nrow(cutoffs))) {
colid <- match(bqs, colnames(cutoffs))
idx <- idx & .filterVcfByAD(vcf, cutoffs[i, colid], as.numeric(rownames(cutoffs))[i])
}
idx
}
idxNotAlt <- .sufficientReads(vcf, ref = FALSE, cutoffs)
vcf <- .removeVariants(vcf, !idxNotAlt, "sufficient reads")
idxNotHomozygous <- .sufficientReads(vcf, ref = TRUE, cutoffs)
if (!sum(idxNotHomozygous)) .stopUserError("None of the heterozygous variants in provided VCF passed filtering.")
#--------------------------------------------------------------------------
# heurestic to find potential contamination
#--------------------------------------------------------------------------
idx <- info(vcf)[[DB.info.flag]] & idxNotHomozygous &
(unlist(geno(vcf)$FA[, tumor.id.in.vcf]) < contamination.range[2] |
unlist(geno(vcf)$FA[, tumor.id.in.vcf]) > (1 - contamination.range[2]))
if (!sum(idx)) {
# this usually only happens with wrong input where all germline SNPs are removed
fractionContaminated <- 0
} else {
fractionContaminated <- sum(idx) / sum(info(vcf)[[DB.info.flag]] & idxNotHomozygous)
}
minFractionContaminated <- 0.02
if (fractionContaminated > 0) {
expectedAllelicFraction <- contamination.range[1] * 0.75
powerDetectCont <- calculatePowerDetectSomatic(mean(geno(vcf)$DP[, tumor.id.in.vcf],
na.rm = TRUE), f = expectedAllelicFraction, verbose = FALSE)$power
minFractionContaminated <- min(0.2, max(minFractionContaminated, powerDetectCont * 0.5))
flog.info("Initial testing for significant sample cross-contamination: %s",
ifelse(fractionContaminated > minFractionContaminated, "maybe", "unlikely"))
}
# do we have many low allelic fraction calls that are in germline databases on basically
# all chromosomes? then we found some contamination
if (sum(runLength(seqnames(rowRanges(vcf[idx]))) > 3) >= 20 &&
fractionContaminated >= minFractionContaminated) {
flag <- TRUE
flag_comment <- "POTENTIAL SAMPLE CONTAMINATION"
}
# If we have a matched normal, we can distinguish LOH from homozygous
# in 100% pure samples. If not we need to see a sufficient number
# of alt alleles to believe a heterozygous normal genotype.
if (use.somatic.status || model.homozygous) {
af.range[2] <- 1
if (model.homozygous) af.range[2] <- Inf
} else {
vcf <- .removeVariants(vcf, !idxNotHomozygous, "homozygous")
}
vcf <- .removeVariants(vcf,
unlist(geno(vcf)$DP[, tumor.id.in.vcf]) < min.coverage,
"min.coverage")
vcf <- .removeVariants(vcf,
unlist(geno(vcf)$FA[, tumor.id.in.vcf]) < af.range[1],
"af.range")
# remove homozygous germline
vcf <- .removeVariants(vcf,
info(vcf)[[DB.info.flag]] &
geno(vcf)$FA[, tumor.id.in.vcf] >= af.range[2],
"homozygous af.range")
flog.info("Removing %i variants with AF < %.3f or AF >= %.3f or insufficient supporting reads or depth < %i.",
n - .countVariants(vcf), af.range[1], af.range[2], min.coverage)
n <- .countVariants(vcf)
if (!is.null(snp.blacklist)) {
for (i in seq_along(snp.blacklist)) {
blackBed <- try(import(snp.blacklist[i]))
if (is(blackBed, "try-error")) {
.stopUserError("Could not import snp.blacklist ", snp.blacklist[[i]],
":", blackBed)
}
ov <- suppressWarnings(overlapsAny(vcf, blackBed))
vcf <- .removeVariants(vcf, ov, "blacklist")
flog.info("Removing %i blacklisted variants.",
n - .countVariants(vcf))
}
}
if (!is.null(target.granges)) {
vcf <- .annotateVcfTarget(vcf, target.granges, interval.padding)
if (remove.off.target.snvs) {
n.vcf.before.filter <- .countVariants(vcf)
# make sure all SNVs are in covered exons
key <- paste0(.getPureCNPrefixVcf(vcf), "OnTarget")
vcf <- .removeVariants(vcf, info(vcf)[[key]] <= 0, "intervals")
flog.info("Removing %i variants outside intervals.",
n.vcf.before.filter - .countVariants(vcf))
}
}
if (!is.null(info(vcf)[[DB.info.flag]]) &&
sum(info(vcf)[[DB.info.flag]]) < nrow(vcf) / 2) {
flog.warn("Less than half of variants are annoted as germline database member. Make sure that VCF %s",
"contains both germline and somatic variants.")
}
# remove seqlevels we filtered out
seqlevels(vcf) <- seqlevelsInUse(vcf)
list(
vcf = vcf,
flag = flag,
flag_comment = flag_comment
)
}
# If a matched normal is available, this will remove all
# heterozygous germline SNPs with biased allelic fractions
.testGermline <-
function(vcf, tumor.id.in.vcf, allowed = 0.05) {
# extract normal allelic fractions and total coverage from the VCF
arAll <- as.numeric(geno(vcf)$FA[, -match(tumor.id.in.vcf,
colnames(geno(vcf)$FA))])
dpAll <- as.numeric(geno(vcf)$DP[, -match(tumor.id.in.vcf,
colnames(geno(vcf)$DP))])
# calculate probability that true allelic fraction in normal is 0.5
pBeta <-pbeta(0.5,
shape1 = arAll * dpAll + 1,
shape2 = (1 - arAll) * dpAll + 1, log.p = FALSE)
# keep only somatic, non-biased and if allelic ratio is close
# enough. The latter is useful for ultra-deep sequencing, when
# non-reference bias can lead to small p-values.
idx <- info(vcf)$SOMATIC | (pBeta > 0.025 & pBeta < 1 - 0.025) |
(arAll > 0.5 - allowed & arAll < 0.5 + allowed)
idx <- idx & dpAll > 5
.removeVariants(vcf, !idx, "matched germline")
}
# calculate allelic fraction from read depths
.addFaField <- function(vcf, field = "FA") {
if (is.null(geno(vcf)$AD)) {
.stopRuntimeError("No AD geno field in VCF.")
}
matrixFA <- do.call(rbind, apply(geno(vcf)$AD, 1, function(x) lapply(x,
function(y) y[2] / sum(y))))
newGeno <- DataFrame(
Number = "A", Type = "Float",
Description = "Allele fraction of the alternate allele",
row.names = field)
geno(header(vcf)) <- rbind(geno(header(vcf)), newGeno)
geno(vcf)[[field]] <- matrixFA
vcf
}
.addDpField <- function(vcf, field="DP") {
if (is.null(geno(vcf)$AD)) {
.stopRuntimeError("No AD geno field in VCF.")
}
matrixDP <- apply(geno(vcf)$AD, 2, sapply, sum)
newGeno <- DataFrame(
Number = 1, Type = "Integer",
Description = "Approximate read depth",
row.names = field)
geno(header(vcf)) <- rbind(geno(header(vcf)), newGeno)
geno(vcf)[[field]] <- matrixDP
vcf
}
.addBqField <- function(vcf, error = 0.001, field = "BQ") {
flog.warn("Did not find base quality scores, will use global error rate of %.4f instead.",
error)
matrixBQ <- matrix(-10 * log10(error),
nrow = nrow(vcf), ncol = ncol(vcf),
dimnames = list(rownames(vcf), colnames(vcf)))
newGeno <- DataFrame(
Number = "A", Type = "Float",
Description = "Average base quality",
row.names = field)
geno(header(vcf)) <- rbind(geno(header(vcf)), newGeno)
geno(vcf)[[field]] <- matrixBQ
vcf
}
.addFilterFlag <- function(vcf) {
newFilter <- DataFrame(
Description = "Ignored by PureCN",
row.names = "purecn_ignore")
fixed(header(vcf))$FILTER <- rbind(fixed(header(vcf))$FILTER, newFilter)
vcf
}
.getNormalIdInVcf <- function(vcf, tumor.id.in.vcf) {
samples(header(vcf))[-match(tumor.id.in.vcf, samples(header(vcf)))]
}
.getTumorIdInVcf <- function(vcf, sampleid=NULL) {
.getTumorId <- function(vcf, sampleid) {
if (ncol(vcf) == 1) return(1)
if (ncol(vcf) != 2) {
.stopUserError("VCF contains ", ncol(vcf), " samples. Should be ",
"either one or two.")
}
if (!is.null(geno(vcf)$GT)) {
cs <- colSums(geno(vcf)$GT == "0")
if (any(is.na(cs))) {
flog.warn("GT field in VCF contains missing values.")
}
cs <- colSums(geno(vcf)$GT == "0", na.rm = TRUE)
if (max(cs) > 0) return(which.min(cs))
cs <- colSums(geno(vcf)$GT == "0/0", na.rm = TRUE)
if (max(cs) > 0) return(which.min(cs))
}
if (!is.null(geno(vcf)$FA)) {
cs <- apply(geno(vcf)$FA, 2, function(x) length(which(as.numeric(x) == 0)))
if (max(cs) > 0) return(which.min(cs))
}
if (sampleid %in% samples(header(vcf))) {
return(match(sampleid, samples(header(vcf))))
}
flog.warn("Cannot determine tumor vs. normal in VCF. %s",
"Assuming it is first sample. Specify tumor via sampleid argument.")
return(1)
}
tumor.id.in.vcf <- .getTumorId(vcf, sampleid = sampleid)
if (!is.numeric(tumor.id.in.vcf) || tumor.id.in.vcf < 1 ||
tumor.id.in.vcf > 2) {
.stopRuntimeError("Tumor id not in expected range.")
}
tumor.id.in.vcf <- samples(header(vcf))[tumor.id.in.vcf]
# check that sampleid matches tumor and not normal.
if (!is.null(sampleid)) {
if (sampleid %in% samples(header(vcf)) &&
sampleid != tumor.id.in.vcf) {
flog.warn("Sampleid looks like a normal in VCF, not like a tumor.")
tumor.id.in.vcf <- sampleid
}
}
tumor.id.in.vcf
}
.checkVcfFieldAvailable <- function(vcf, field) {
if (is.null(geno(vcf)[[field]]) ||
sum(is.finite(as.numeric(geno(vcf)[[field]][, 1]))) < 1) {
return(FALSE)
}
return(TRUE)
}
.readAndCheckVcf <- function(vcf.file, genome, DB.info.flag = "DB",
POPAF.info.field = "POP_AF",
min.pop.af = 0.001, error = 0.001,
check.DB = TRUE, vcf.field.prefix = NULL) {
if (is(vcf.file, "character")) {
vcf <- readVcf(vcf.file, genome)
} else if (!is(vcf.file, "CollapsedVCF")) {
.stopUserError("vcf.file neither a filename nor a CollapsedVCF ",
"object.")
} else {
vcf <- vcf.file
}
# add PureCN ignore flag
vcf <- .addFilterFlag(vcf)
flog.info("Found %i variants in VCF file.", length(vcf))
triAllelic <- elementNROWS(alt(vcf)) > 1
if (sum(triAllelic)) {
n <- .countVariants(vcf)
vcf <- .removeVariants(vcf, triAllelic, "triallelic")
flog.info("Removing %i triallelic sites.",n - length(vcf))
}
if (is.null(info(vcf)$SOMATIC)) {
# try to add a SOMATIC flag for callers that do not
# provide one (matched tumor/normal cases only)
vcf <- .addSomaticField(vcf)
}
# attempt to find GATK4 name for POP_AF field
# if POP_AF does not exist
if (!POPAF.info.field %in% names(info(vcf)) &&
"POPAF" %in% names(info(vcf))) {
# try to add an DP geno field if missing
POPAF.info.field <- "POPAF"
}
if (!.checkVcfFieldAvailable(vcf, "DP")) {
# try to add an DP geno field if missing
vcf <- .addDpField(vcf)
}
# check for NAs in DP
idx <- is.na(rowSums(geno(vcf)$DP))
if (any(idx)) {
n <- .countVariants(vcf)
vcf <- .removeVariants(vcf, idx, "NA DP")
flog.warn("DP FORMAT field contains NAs. Removing %i variants.", n - length(vcf))
}
if (check.DB) {
if (is.null(info(vcf)[[DB.info.flag]]) ||
all(unlist(sapply(info(vcf)[[DB.info.flag]], is.na)))) {
# try to add an DB field based on rownames
vcf <- .addDbField(vcf, DB.info.flag, POPAF.info.field, min.pop.af)
} else if (!is.null(info(vcf)[[POPAF.info.field]])) {
flog.warn("VCF contains both %s and %s INFO fields. Will ignore %s.",
DB.info.flag, POPAF.info.field, POPAF.info.field)
}
# check for NAs in DB
idx <- is.na(info(vcf)[[DB.info.flag]])
if (sum(idx)) {
flog.warn("DB INFO flag contains NAs")
info(vcf)[[DB.info.flag]][idx] <- FALSE
}
cntLikelyGL <- sum(info(vcf)[[DB.info.flag]], na.rm = TRUE)
flog.info("%i (%.1f%%) variants annotated as likely germline (%s INFO flag).",
cntLikelyGL, cntLikelyGL / length(vcf) * 100, DB.info.flag)
if (!cntLikelyGL) {
.stopUserError("VCF either contains no germline variants or variants are not properly annotated.")
}
}
if (is.null(geno(vcf)$AD)) {
vcf <- .addADField(vcf)
} else {
vcf <- .checkADField(vcf)
}
if (!.checkVcfFieldAvailable(vcf, "FA")) {
# try to add an FA geno field if missing
vcf <- .addFaField(vcf)
}
idx <- !apply(geno(vcf)$FA, 1, complete.cases)
if (any(idx)) {
flog.warn("Found %i variants with missing allelic fraction starting with %s. Removing them.",
sum(idx), rownames(vcf)[idx][1])
vcf <- .removeVariants(vcf, idx, "NA FA")
}
# we don't know yet which sample is tumor. in case normal has no base quality
# scores, pick the sample with lowest number of NAs
bqs <- lapply(seq(ncol(vcf)), function(i) .getBQFromVcf(vcf, i, na.sub = FALSE))
bq <- bqs[[which.min(sapply(bqs, function(x) sum(is.na(x))))]]
if (is.null(bq) || all(is.na(bq))) {
vcf <- .addBqField(vcf, error)
} else if (any(is.na(bq))) {
flog.warn("BQ FORMAT field contains NAs.")
}
meta(header(vcf))$purecnprefix <- DataFrame(
Value = vcf.field.prefix,
row.names = "purecnprefix"
)
vcf
}
.getPureCNPrefixVcf <- function(vcf) {
prefix <- meta(header(vcf))$purecnprefix[[1]]
prefix <- if (is.null(prefix)) "" else prefix
return(prefix)
}
.checkADField <- function(vcf) {
refs <- apply(geno(vcf)$AD, 2, function(x) sapply(x, function(y) y[2]))
if (!any(complete.cases(refs))) {
# VarScan2 does only provide alt in AD
flog.warn("AD field misses ref counts.")
matrixAD <- do.call(cbind, lapply(samples(header(vcf)), function(j) {
dp <- unlist(geno(vcf)$DP[, j])
alt <- sapply(geno(vcf)$AD[, j], function(x) x[1])
ref <- dp - alt
AD <- lapply(seq_along(dp), function(i) as.integer(c(ref[i], alt[i])))
names(AD) <- names(dp)
AD
}))
colnames(matrixAD) <- samples(header(vcf))
geno(vcf)$AD <- matrixAD
}
vcf
}
.addDbField <- function(vcf, DB.info.flag,
POPAF.info.field,
min.pop.af) {
if (!is.null(info(vcf)$SOMATIC) &&
sum(colSums(geno(vcf)$DP) > 0) == 2 &&
any(info(vcf)$SOMATIC)) {
db <- !info(vcf)$SOMATIC
flog.warn("vcf.file has no DB info field for membership in germline databases.%s",
" Found and used somatic status instead.")
} else if (!is.null(info(vcf)[[POPAF.info.field]]) &&
max(unlist(info(vcf)[[POPAF.info.field]]), na.rm = TRUE) > 0.05) {
if (max(unlist(info(vcf)[[POPAF.info.field]]), na.rm = TRUE) > 1.1) {
flog.info("Maximum of POPAF INFO is > 1, assuming -log10 scaled values")
db <- info(vcf)[[POPAF.info.field]] < -log10(min.pop.af)
} else {
db <- info(vcf)[[POPAF.info.field]] > min.pop.af
}
db <- sapply(db, function(x) x[[1]])
flog.warn("vcf.file has no DB info field for membership in germline databases. Found and used valid population allele frequency > %f instead.",
min.pop.af)
} else {
db <- grepl("^rs", rownames(vcf))
if (!sum(db)) {
.stopUserError("vcf.file has no ", DB.info.flag, " or ", POPAF.info.field,
" info field for membership in germline databases.")
} else {
flog.warn("vcf.file has no %s or %s info field for membership in germline databases.%s",
DB.info.flag, POPAF.info.field, " Guessing it based on available dbSNP ID.")
}
}
newInfo <- DataFrame(
Number = 0,
Type = "Flag",
Description = "Likely somatic status, based on SOMATIC or Cosmic.CNT info fields, population allele frequency, or germline database membership",
row.names = DB.info.flag)
info(header(vcf)) <- rbind(info(header(vcf)), newInfo)
info(vcf)[[DB.info.flag]] <- db
vcf
}
.addSomaticField <- function(vcf) {
# add SOMATIC flag only for GATK4 MuTect2 output
if (ncol(vcf) < 2) {
return(vcf)
} else {
newInfo <- DataFrame(
Number = 0, Type = "Flag",
Description = "Somatic event",
row.names = "SOMATIC")
info(header(vcf)) <- rbind(info(header(vcf)), newInfo)
if (!is.null(info(vcf)$P_GERMLINE)) {
info(vcf)$SOMATIC <- unlist(info(vcf)$P_GERMLINE < log10(0.5))
info(vcf)$SOMATIC[is.infinite(info(vcf)$SOMATIC) |
is.na(info(vcf)$SOMATIC)] <- FALSE
} else if (!is.null(info(vcf)$GERMQ)) {
flog.warn("Found GERMQ info field with Phred scaled germline probabilities.")
info(vcf)$SOMATIC <- unlist(info(vcf)$GERMQ > -10 * log10(0.5))
info(vcf)$SOMATIC[is.infinite(info(vcf)$SOMATIC) |
is.na(info(vcf)$SOMATIC) ] <- FALSE
} else {
flog.warn("Having trouble guessing SOMATIC status...")
info(vcf)$SOMATIC <- apply(geno(vcf)$GT, 1, function(x) x[1] != x[2])
}
}
vcf
}
.addADField <- function(vcf, field="AD") {
# FreeBayes
if (!length(setdiff(c("DP", "AO", "RO"), names(geno(vcf))))) {
matrixAD <- do.call(cbind, lapply(samples(header(vcf)), function(j) {
ao <- unlist(geno(vcf)$AO[, j])
ro <- unlist(geno(vcf)$RO[, j])
AD <- lapply(seq_along(ao), function(i) as.integer(c(ro[i], ao[i])))
names(AD) <- names(ao)
AD
}))
colnames(matrixAD) <- samples(header(vcf))
newGeno <- DataFrame(
Number = ".", Type = "Integer",
Description = "Allelic depths for the ref and alt alleles in the order listed",
row.names = field)
geno(header(vcf)) <- rbind(geno(header(vcf)), newGeno)
geno(vcf)[[field]] <- matrixAD
} else {
.stopUserError("vcf.file has no AD geno field containing read depths ",
"of ref and alt.")
}
vcf
}
.addCosmicCNT <- function(vcf, cosmic.vcf.file, Cosmic.CNT.info.field) {
if (!is.null(info(vcf)[[Cosmic.CNT.info.field]])) {
flog.info("VCF already COSMIC annotated. Skipping.")
return(vcf)
}
cosmicSeqStyle <- .getSeqlevelsStyle(headerTabix(
TabixFile(cosmic.vcf.file))$seqnames)
vcfRenamedSL <- vcf
if (!length(intersect(seqlevelsStyle(vcf), cosmicSeqStyle))) {
seqlevelsStyle(vcfRenamedSL) <- cosmicSeqStyle[1]
}
flog.info("Reading COSMIC VCF...")
# look-up the variants in COSMIC
cosmic.vcf <- readVcf(cosmic.vcf.file, genome = genome(vcfRenamedSL)[1],
ScanVcfParam(which = rowRanges(vcfRenamedSL),
info = "CNT",
fixed = "ALT",
geno = NA
)
)
ov <- findOverlaps(vcfRenamedSL, cosmic.vcf, type = "equal")
# make sure that alt alleles match
idx <- as.logical(alt(vcf[queryHits(ov)]) ==
alt(cosmic.vcf[subjectHits(ov)]))
ov <- ov[idx]
if (!length(ov)) return(vcf)
if (is.null(info(cosmic.vcf)$CNT)) {
flog.warn("Cosmic VCF has no CNT info field. %s",
"Giving up COSMIC annotation.")
return(vcf)
}
newInfo <- DataFrame(
Number = 1, Type = "Integer",
Description = "How many samples in Cosmic have this mutation",
row.names = Cosmic.CNT.info.field)
info(header(vcf)) <- rbind(info(header(vcf)), newInfo)
info(vcf)[[Cosmic.CNT.info.field]] <- NA
info(vcf)[[Cosmic.CNT.info.field]][queryHits(ov)] <- info(cosmic.vcf[subjectHits(ov)])$CNT
vcf
}
.calcTargetedGenome <- function(granges) {
tmp <- reduce(granges)
sum(width(tmp)) / (1000^2)
}
.annotateVcfTarget <- function(vcf, target.granges, interval.padding) {
target.granges.padding <- .padGranges(target.granges, interval.padding)
flog.info("Total size of targeted genomic region: %.2fMb (%.2fMb with %ibp padding).",
.calcTargetedGenome(target.granges),
.calcTargetedGenome(target.granges.padding),
interval.padding)
idxTarget <- overlapsAny(vcf, target.granges)
idxPadding <- overlapsAny(vcf, target.granges.padding)
prefix <- .getPureCNPrefixVcf(vcf)
key <- paste0(prefix, "OnTarget")
newInfo <- DataFrame(
Number = 1, Type = "Integer",
Description = "1: On-target; 2: Flanking region; 0: Off-target.",
row.names = key)
info(header(vcf)) <- rbind(info(header(vcf)), newInfo)
info(vcf)[[key]] <- 0
info(vcf)[[key]][idxPadding] <- 2
info(vcf)[[key]][idxTarget] <- 1
# report stats in log file
targetsWithSNVs <- overlapsAny(target.granges.padding, vcf)
percentTargetsWithSNVs <- sum(targetsWithSNVs, na.rm = TRUE) /
length(targetsWithSNVs) * 100
flog.info("%.1f%% of targets contain variants.",
percentTargetsWithSNVs)
vcf
}
.getBQFromVcf <- function(vcf, tumor.id.in.vcf, max.base.quality = 50,
na.sub = TRUE, error = 0.001, base.quality.offset = 1) {
x <- NULL
if (!is.null(geno(vcf)$BQ)) {
# Mutect 1
x <- as.numeric(geno(vcf)$BQ[, tumor.id.in.vcf])
} else if (!is.null(geno(vcf)$MBQ)) {
# Mutect 2
x <- as.numeric(geno(vcf)$MBQ[, tumor.id.in.vcf])
} else if (!is.null(info(vcf)$MBQ)) {
# Mutect 2.2
x <- sapply(info(vcf)$MBQ, function(x) x[2])
} else if (!is.null(rowRanges(vcf)$QUAL)) {
# Freebayes
x <- as.numeric(rowRanges(vcf)$QUAL)
}
if (!is.null(x)) {
x <- pmin(max.base.quality, x)
idx <- is.na(x)
if (any(idx) && na.sub) {
x[idx] <- -10 * log10(error)
}
x <- x - base.quality.offset
# make sure we have integers here
x <- floor(x)
}
return(x)
}
.filterVcfByBQ <- function(vcf, tumor.id.in.vcf, min.base.quality) {
n.vcf.before.filter <- .countVariants(vcf)
idx <- .getBQFromVcf(vcf, tumor.id.in.vcf, base.quality.offset = 0) < min.base.quality
if (sum(idx, na.rm = TRUE) / length(idx) > 0.5) {
flog.warn("Many variants removed by min.base.quality (%i) filter. You might want to lower the cutoff.", min.base.quality)
}
vcf <- .removeVariants(vcf, idx, "BQ", na.rm = FALSE)
flog.info("Removing %i low quality variants with non-offset BQ < %i.",
n.vcf.before.filter - .countVariants(vcf), min.base.quality)
vcf
}
.checkVcfNotEmpty <- function(vcf) {
if (!.countVariants(vcf)) .stopUserError("None of the variants in provided VCF passed filtering.")
}
# in future this will use the FILTER flag to remove variants
.removeVariants <- function(vcf, idx, label, na.rm = TRUE) {
if (is.null(idx)) {
flog.warn("Could not identify variants at filter step %s.", label)
return(vcf)
}
if (is(idx, "integer")) {
idx <- seq(length(vcf)) %in% idx
}
if (any(is.na(idx))) {
flog.warn("Variant ids contain NAs at filter step %s.", label)
idx[is.na(idx)] <- na.rm
}
# TODO: once used flags, make sure this includes already filtered variants
if (all(idx)) {
.stopUserError("No variants passed filter ", label, ".")
}
vcf[!idx]
}
# in future this will use the FILTER flag to count
.countVariants <- function(vcf) {
if (!is(vcf, "VCF")) {
.stopRuntimeError("Not a VCF object in .countVariants.")
}
nrow(vcf)
}
#returns a bqs by coverage matrix with min reads cutoffs
.calcMinSupportingReadsForFiltering <- function(vcf, tumor.id.in.vcf, bqs, min.supporting.reads) {
errorsp <- sort(unique(bqs))
errors <- 10^(-errorsp / 10)
# find supporting read cutoffs based on coverage and sequencing error
#--------------------------------------------------------------------------
if (is.null(min.supporting.reads)) {
depthsall <- geno(vcf)$DP[, tumor.id.in.vcf]
depths <- sort(unique(round(log2(depthsall))))
depths <- 2^depths
# this should not happen, but 0 coverage will confuse the power calculations
if (min(depths) < 1) {
depths[depths < 1] <- 1
depths <- unique(depths)
}
cutoffs <- sapply(errors, function(e) sapply(depths, function(d)
min(d + 1, suppressWarnings(.calcMinSupportingReadsForPower(coverage = d, error = e)))))
colnames(cutoffs) <- errorsp
rownames(cutoffs) <- depths
if (any(is.na(cutoffs))) {
.stopRuntimeError("Observed NAs in minimum supporting reads calculation.")
}
flog.info("Minimum number of supporting reads ranges from %.0f to %.0f, depending on coverage and BQS.",
min(cutoffs), max(cutoffs))
} else {
cutoffs <- matrix(min.supporting.reads, nrow = 1, ncol = length(errorsp),
dimnames = list(0, errorsp))
}
return(cutoffs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.