#' Compute allele frequencies at germline heterozygous positions
#
#' Given a set of genomic coordinates this function identifies heterozygous positions in a bam file
#' given by \code{normalBam} and retuns the minor allele frequency at these positions in a bam file
#' specified by \code{tumorBam}.
#'
#' @param normalBam The path to a bam file
#' @param tumorBam The path to a bam file
#' @param genome The genome build of \code{tumorBam} and \code{normalBam}. Possible values are "hg38", "hg19", and "hg18".
#' @param positions A \code{GRanges} object consisting of the genomic regions of interest. The default set of positions is the \code{snps} object from \code{svfilters.hg38}, \code{svfilters.hg19} or \code{svilters.hg18} depending on the user-specified \code{genome} argument. The \code{snps} object contains 1,000,000 positions that are frequently seen as heterozygous, therefore fewer positions are needed to find a sufficient number of heterozygous positions. The \code{snps} object contains more than enough positions for tumor ploidy/ploidy analysis on well-covered WGS bam files, although for bam files genertated from targeted capture-based sequencing data it is recommended to use all or most the \code{dbsnp150_snps} object in \code{svfilters.hg38}, \code{svfilters.hg19} or \code{svfilters.hg18} to find a sufficient number of heterozygous positions as these objects contain over 12,000,000 positions.
#'
#' @param region If \code{region} is specified only SNPs in \code{position} that fall in \code{region} will be used. This argument is useful for capture-based sequencing technologies (e.g. Whole-exome sequencing) where we tend to only have high enough coverage to accurately call SNPs in the targeted regions. By supplying a \code{GRanges} object containing the targeted regions this function avoids calculating coverage metrics and allele frequencies for low coverage off-target regions. Alternatively, users can provide the full path to a bed file in BED format.
#' @param n The number of positions to use for pileup in \code{normalBam}. The \code{snps} objects
#' in \code{svfilters.hg38}, \code{svfilters.hg19} and \code{svfilters.hg18} contain 1 million frequently heterozygous positions. By specifying the \code{n} argument a random sample of the positions object of length \code{n} will be used. For tumor purity/ploidy analysis 10,000 heterozygous positions well spread across the genome is typically plenty. The default value of \code{n = 50000} is generally sufficient to achieve this on a 30X WGS bam file. For sequencing data from targeted capture protocols, it is recommended to use the full set of SNPs in \code{dbsnp150_snp} in \code{svfilters.hg38}, \code{svfilters.hg19} or \code{svfilters.hg18} to identify a suffient number of SNPs with high enough coverage.
#' @param minCovNormal The minimum coverage of a position in \code{normalBam} to be considered.
#' @param minCovTumor The minimum coverage of a position in \code{tumorBam} to be considered.
#' @param min_base_quality The minimum Phred score of a base for it to be counted
#' @param minMafNormal The minimum minor allele frequency (MAF) in order to consider a position
#' as heterozygous in \code{normalBam}.
#' @param minMafTumor The minimum minor allele frequency (MAF) in order to consider a position
#' as heterozygous in \code{tumorBam}. It is recommended to set this value to at least 0.05
#' when setting \code{normalBam = NULL} to avoid outputting homozygous positions.
#'
#' @details If using this function to generate allele frequencies in a tumor
#' sample at germline heterozygous positions identified in a matched normal sample then
#' \code{tumorBam} should point to the bam file for the tumor sample and \code{normalBam}
#' should point to the bam file for its matched normal.
#'
#' @examples
#' extdir <- system.file("extdata", package="svbams")
#' bam1 <- file.path(extdir, "cgov10t.bam")
#' bam2 <- file.path(extdir, "cgov44t_revised.bam")
#'
#' data(snps, package = "svfilters.hg19")
#' snps <- keepSeqlevels(snps, c("chr3", "chr5"), pruning.mode = "coarse")
#' \dontrun{
#' svAF(normalBam=bam1,
#' tumorBam=bam2,
#' genome="hg19",
#' positions = snps,
#' n = 1000,
#' minCovNormal = 10,
#' minCovTumor = 10,
#' minMafNormal = 0.3,
#' minMafTumor = 0)
#'}
#' @return
#' A data.frame with the following columns:
#'
#' \strong{Chrom}: The chromosome of the event \cr
#' \strong{Pos}: The coordinate of the event \cr
#' \strong{RefBase}: The base in the reference genome (corresponds to the refUCSC column in dbSNP build 150) \cr
#' \strong{AltBase}: The other observed base \cr
#' \strong{Normal.Mut.Count}: The coverage of AltBase in normalBam \cr
#' \strong{Normal.Coverage}: The distinct coverage of RefBase + AltBase in normalBam \cr
#' \strong{Tumor.Mut.Count}: The coverage of AltBase in tumorBam \cr
#' \strong{Tumor.Coverage}: The distinct coverage of RefBase + AltBase in tumorBam \cr
#' \strong{Tumor.MAF}: The minor allele frequency of the event in tumorBam
#'
#' @export
#'
svAF <- function(normalBam,
tumorBam,
genome,
positions,
region,
n = 50000,
minCovNormal = 20,
minCovTumor = 20,
minMafNormal = 0.3,
minMafTumor = 0,
min_base_quality = 0) {
if (!is.null(normalBam)) {
if (!file.exists(normalBam)) {
stop(paste0(normalBam), " doesn't exist!")
}
}
if (!is.null(tumorBam)) {
if (!file.exists(tumorBam)) {
stop(paste0(tumorBam), " doesn't exist!")
}
}
if (minCovNormal < 0) {
stop("minCovNormal cannot take on a negative value.")
}
if (minCovTumor < 0) {
stop("minCovTumor cannot take on a negative value.")
}
if ((minMafNormal < 0) | (minMafNormal > 0.5)) {
stop("minMafNormal must be in the range [0,0.5]")
}
if ((minMafTumor < 0) | (minMafTumor > 0.5)) {
stop("minMafTumor must be in the range [0,0.5]")
}
if (missing(positions)) {
data("snps", package = paste0("svfilters.", genome), envir = environment())
SNPs <- get("snps")
} else {
SNPs <- positions
}
if (min_base_quality < 0) {
stop("The value of 'min_base_quality' must be greater than or equal to 0")
}
if (n < 1) {
stop("The value of 'n' must be at least 1")
}
if (n > length(SNPs)) {
stop("The value of 'n' must be less than or equal to the number of SNPs in 'positions'")
}
if (!(genome %in% c("hg18", "hg19", "hg38"))) {
stop(paste0(genome, " is not a possible value for 'genome'. Possible values include 'hg19' and 'hg18'"))
}
if (!missing(region)) {
if (!is(region, "GRanges")) {
if (file.exists(region)) {
region <- bed2gr(bedPath = region)
} else {
stop("The file path given as an argument to 'region' does not exist. Make sure that the full path to
the bed file is specified e.g. '/Users/XSVuser/Documents/capture-region.bed'")
}
}
}
out.df <- data.frame(Chrom = character(0),
Pos = character(0),
RefBase = character(0),
AltBase = character(0),
Normal.Mut.Count = character(0),
Normal.Coverage = character(0),
Tumor.Mut.Count = character(0),
Tumor.Coverage = character(0),
Tumor.MAF = character(0))
if (!missing(region)) {
SNPs <- subsetByOverlaps(SNPs, region)
if (length(SNPs) > 0) {
message(paste0(length(SNPs), " SNPs in 'positions' overlap with 'region'"))
} else {
stop(paste0(length(SNPs), " SNPs in 'positions' overlap with 'region'"))
}
}
if (n > length(SNPs)) {
n <- length(SNPs)
message("Scaling down 'n' to ", n)
}
SNPs <- SNPs[sample(1:length(SNPs), size = n, replace = FALSE)]
if (!is.null(normalBam) & !is.null(tumorBam)) {
message(paste0("Computing pileup at ", n, " position(s) in ", normalBam))
querySNPs <- SNPs
normalPU <- pileup(normalBam,
scanBamParam=ScanBamParam(which=querySNPs,
flag = scanBamFlag(isDuplicate = FALSE)),
pileupParam=PileupParam(distinguish_strands=FALSE,
include_deletions=FALSE,
min_base_quality=min_base_quality))
if (nrow(normalPU) == 0) {
warning(paste0("0 coverage was found at every specified position in", normalBam), call. = FALSE)
return(out.df)
}
message("Identifying heterozygous positions")
normalSNPs <- filterSNPs(pu = normalPU, SNPs = querySNPs, min.cov = minCovNormal, min.maf = minMafNormal, keepSingles = FALSE)
if (length(normalSNPs) == 0) {
warning(paste0("0 heterozygous positions were found in", normalBam), call. = FALSE)
return(out.df)
}
message(paste0("Computing pileup at ", length(normalSNPs), " position(s) in ", tumorBam))
tumorPU <- pileup(tumorBam,
scanBamParam=ScanBamParam(which=normalSNPs,
flag = scanBamFlag(isDuplicate = FALSE)),
pileupParam=PileupParam(distinguish_strands=FALSE,
include_deletions=FALSE,
min_base_quality=min_base_quality))
if (nrow(tumorPU) == 0) {
warning(paste0("0 coverage was found at every identified germline heterozygous position in", tumorBam), call. = FALSE)
return(out.df)
}
tumorSNPs <- filterSNPs(pu = tumorPU, SNPs = querySNPs, min.cov = minCovTumor, min.maf = minMafTumor, keepSingles = TRUE)
alleleFreqs <- calcAlleleFreq(tumorSNPs = tumorSNPs, normalSNPs = normalSNPs)
}
## When tumorBam is specified and normalBam isn't, do pileup in tumorBam with minMafTumor
if (is.null(normalBam) & !is.null(tumorBam)) {
querySNPs <- SNPs
message(paste0("Computing pileup at ", length(querySNPs), " position(s) in ", tumorBam))
tumorPU <- pileup(tumorBam,
scanBamParam=ScanBamParam(which=querySNPs,
flag = scanBamFlag(isDuplicate = FALSE)),
pileupParam=PileupParam(distinguish_strands=FALSE,
include_deletions=FALSE,
min_base_quality=min_base_quality))
if (nrow(tumorPU) == 0) {
warning(paste0("0 coverage was found at every identified germline heterozygous position in", tumorBam), call. = FALSE)
return(out.df)
}
message(paste0("Identifying positions with MAF >= ", minMafTumor))
if (minMafTumor > 0) {
tumorSNPs <- filterSNPs(pu = tumorPU, SNPs = querySNPs, min.cov = minCovTumor, min.maf = minMafTumor, keepSingles = FALSE)
} else if (minMafTumor == 0) {
tumorSNPs <- filterSNPs(pu = tumorPU, SNPs = querySNPs, min.cov = minCovTumor, min.maf = minMafTumor, keepSingles = TRUE)
}
if (length(tumorSNPs) == 0) {
warning(paste0("0 heterozygous positions were found in", tumorBam), call. = FALSE)
return(out.df)
}
alleleFreqs <- calcAlleleFreq(tumorSNPs = tumorSNPs, normalSNPs = NULL)
}
message(paste0("Retuning allele frequencies for ", length(alleleFreqs), " position(s)"))
alleleFreqs
}
# Takes as input a pileup output, returns a
# GRanges of heterozygous positions if called with keepSingles = FALSE.
# If called with keepSingles = TRUE, then keeps homozygous positions
# as long as they match the SNP database. This is to detect high
# purity LOH (e.g. LOH in a cell line). The high purity LOH
# will be missed in a tumor-only analysis because we can't
# distingiush it from being germline homozygous.
filterSNPs <- function(pu, SNPs, min.cov, min.maf, keepSingles) {
position <- NULL
pu$position <- paste(pu$seqnames, pu$pos, sep = ":")
## If a position has more than 2 bases counted (becaue of errors, mutations,
## ...) then only keep the 2 most frequent alleles
potentialHet <- as.data.frame(
pu %>%
group_by(position) %>%
top_n(2, count)
)
# If the second most frequent allele is tied in counts with the third most frequent (and potentially 4th)
# then that entry is maintained at this point. I remove this position because the MAF
# is smiilar to the frequency or an error base so I am less confident of the true MAF.
# Note that this case is very rare.
potentialHet <- as.data.frame(
potentialHet %>%
dplyr::group_by(position) %>%
dplyr::filter(n() < 3)
)
## If keepSingles == FALSE then single-allele positions are removed. This will
## be the case when identifying hets in a matchedNormal, or in a tumor-only
## analysis to remove homozygotes. These positions are kept in the tumor in
## T/N analysis because of LOH in a high purity sample
if (keepSingles == FALSE) {
potentialHet <- as.data.frame(
potentialHet %>%
dplyr::group_by(position) %>%
dplyr::filter(n() > 1)
)
}
if (nrow(potentialHet) == 0) {
return(GRanges())
}
## Remove position with coverage < min.cov
## Remove positions with alt allele frequency < min.maf
tot <- NULL
summed <- potentialHet %>%
group_by(position) %>%
summarize(tot = sum(count)) %>%
filter(tot < min.cov)
maf <- NULL
mafs <- potentialHet %>%
group_by(position) %>%
mutate(maf = count / sum(count)) %>%
filter(maf < min.maf)
rem <- c(summed$position, mafs$position)
potentialHet <- subset(potentialHet, !(position %in% rem))
if (nrow(potentialHet) == 0) {
return(GRanges())
}
ids <- as.character(unique(potentialHet$position))
keepers <- GRanges()
for (i in ids) {
pos <- potentialHet[which(potentialHet$position == i),]
if (nrow(pos) == 2) {
genotype <- paste(pos$nucleotide, collapse = "/")
} else if (nrow(pos) == 1) {
genotype <- as.character(pos$nucleotide)
}
chrom <- strsplit(i, split = ":")[[1]][1]
start <- as.integer(strsplit(i, split = ":")[[1]][2])
end <- start
coord <- GRanges(seqnames = chrom,
ranges = IRanges(start = start,
end = end))
snp <- subsetByOverlaps(SNPs, coord)
## Only include positions that match the snp database for both reference and
## alt alleles if 2 alleles are present. If one allele is present, only that
## allele must match the SNP database
if (nrow(pos) == 2) {
if ((length(grep(snp$refUCSC, genotype)) == 0) ||
(length(grep(gsub("/", "|", snp$altAllele), genotype)) == 0)) {
next
}
} else if (nrow(pos == 1)) {
if ((length(grep(snp$refUCSC, genotype)) == 0) &&
(length(grep(snp$altAllele, genotype)) == 0)) {
next
}
}
if (length(which(pos$nucleotide == snp$refUCSC)) == 1) {
snp$refCount <- pos$count[which(pos$nucleotide == snp$refUCSC)]
} else {
snp$refCount <- 0
}
altAllele <- strsplit(snp$altAllele, split = "/")[[1]]
ind <- which(pos$nucleotide %in% altAllele)
if (length(ind) == 1) {
snp$altCount <- pos$count[ind]
snp$altAllele <- pos$nucleotide[ind]
} else if (length(ind) == 0) {
snp$altCount <- 0
}
keepers <- c(keepers, snp)
}
return(keepers)
}
calcAlleleFreq <- function(tumorSNPs, normalSNPs = NULL, min.cov) {
message("Computing allele frequencies")
if (!is.null(normalSNPs)) {
## Matching the indicies of tumorSNPs and normalSNPs
normalSNPs <- subsetByOverlaps(normalSNPs, tumorSNPs)
normalSNPs <- sort(normalSNPs)
}
tumorSNPs <- sort(tumorSNPs)
maf <- tumorSNPs$refCount / (tumorSNPs$refCount + tumorSNPs$altCount)
maf[which(maf > 0.5)] <- 1 - maf[which(maf > 0.5)]
if (!is.null(normalSNPs)) {
g <- GRanges(as.character(seqnames(tumorSNPs)),
IRanges(start(ranges(tumorSNPs)), width=1),
RefBase=tumorSNPs$refUCSC,
AltBase = tumorSNPs$altAllele,
Normal.Mut.Count = normalSNPs$altCount,
Normal.Coverage = (normalSNPs$refCount +
normalSNPs$altCount),
Tumor.Mut.Count = tumorSNPs$altCount,
Tumor.Coverage = (tumorSNPs$altCount +
tumorSNPs$refCount),
Tumor.MAF = round(maf, digits = 2),
seqinfo = seqinfo(tumorSNPs))
}
if (is.null(normalSNPs)) {
g <- GRanges(as.character(seqnames(tumorSNPs)),
IRanges(start(ranges(tumorSNPs)), width=1),
RefBase=tumorSNPs$refUCSC,
AltBase = tumorSNPs$altAllele,
Normal.Mut.Count = NA,
Normal.Coverage = NA,
Tumor.Mut.Count = tumorSNPs$altCount,
Tumor.Coverage = (tumorSNPs$altCount +
tumorSNPs$refCount),
Tumor.MAF = round(maf, digits = 2),
seqinfo = seqinfo(tumorSNPs))
}
g
}
## Input path to bed file, returns GRanges of regions
## It is expected that bedPath points to a tab-delimited text
## file in 3-column bed format. If more than 3 columns are present,
## it is expected that:
## Column 1: Chromosome
## Column 2: Start Coordinate
## Column 3 : End Coordinate
bed2gr <- function(bedPath) {
bed <- read.table(bedPath, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
bed.gr <- NULL
try({
bed.gr <- GRanges(seqnames = bed$V1,
ranges = IRanges(start = bed$V2,
end = bed$V3))
}, silent = TRUE)
if (is.null(bed.gr)) {
stop("The supplied bed file cannot be coerced to a GRanges object.
Check that the bed file follows BED formatting rules (see https://genome.ucsc.edu/FAQ/FAQformat.html#format1).
Note that only the first three BED fields are required.")
} else {
return(bed.gr)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.