countPhredScores <- function(qual) {
qual <- as.character(qual)
qual <- lapply(qual, function(x) unlist(strsplit(x, "")))
qual <- lapply(qual, function(x) {names(x) <- seq(1, length(x)); return(x)})
qual <- as.data.frame(do.call("bind_rows", qual))
qualProb <- apply(qual, 2, table)
qualProb <- as.matrix(do.call("bind_rows", qualProb))
qualProb <- qualProb[, order(phred2ASCIIOffset(colnames(qualProb)))]
qualProb[is.na(qualProb)] <- 0
return(qualProb)
}
findReads <- function(bamFile,
tag = character(0),
what=scanBamWhat(),
which=IRangesList(),
isSupplementary=NA,
reverseComplement=NA,
mapqFilter=0) {
f <- scanBamFlag(isDuplicate = FALSE,
isSupplementaryAlignment = isSupplementary)
p <- ScanBamParam(flag = f,
tag = tag,
simpleCigar = FALSE,
reverseComplement = reverseComplement,
what = what,
which = which,
mapqFilter = mapqFilter)
bam <- scanBam(bamFile, param = p)[[1]]
return(bam)
}
calcPhredScoreProfile <- function(bamFilePath, mapqFilter=0, maxFileSize=1,
targetRegions=NULL, subsampleRatio=NA,
subsampleRegionLength=1e+5,
disableSubsampling=FALSE, threads=1) {
if(missing(bamFilePath)) stop("bamFilePath is required")
bamFile <- BamFile(bamFilePath)
seqInfo <- scanBamHeader(bamFile)$targets
colnames(targetRegions) <- c("chr", "start", "end")
targetRegions <- targetRegions[targetRegions$chr %in% names(seqInfo), ]
if (length(targetRegions) != 0 & is(targetRegions, "GenomicRanges")) {
targetRegions <- data.frame(seqnames(targetRegions),
start(targetRegions),
end(targetRegions))
}
if (file.info(bamFilePath)$size / 1e+9 <= maxFileSize |
disableSubsampling) {
if (length(targetRegions) == 0) {
regions <- GRanges(names(seqInfo), IRanges(1, seqInfo))
regions <-
unlist(slidingWindows(regions, width=50000, step = 50000))
} else {
colnames(targetRegions) <- c("chr", "start", "end")
regions <- GRanges(targetRegions$chr, IRanges(targetRegions$start,
targetRegions$end))
}
if (file.info(bamFilePath)$size / 1e+9 <= maxFileSize) {
message("BAM filesize smaller than ",
maxFileSize,
" gigabytes, calculate Phred quality score profile ",
"based on all reads.")
} else {
message("Subsampling disabled, ",
"calculate Phred quality score profile based on all reads.")
}
} else {
if (is.na(subsampleRatio)) {
subsampleRatio <-
maxFileSize * 1e+9 / file.info(bamFilePath)$size
}
message("BAM filesize greater than ",
maxFileSize,
" gigabytes, calculation based on subsampled reads ",
"(subsample ratio: ",
subsampleRatio, ").")
if (length(targetRegions) == 0) {
nRegions <- seqInfo * subsampleRatio / subsampleRegionLength
seqInfo <- seqInfo[nRegions >= 1]
nRegions <- ceiling(nRegions[nRegions >= 1])
if (length(seqInfo) == 0) {
stop("subsampleRegionLength is too large ",
"with this subsampleRatio.")
}
subsampleRegions <- GRanges()
for (i in seq_along(seqInfo)) {
startPos <- sample(seq(1, seqInfo[i], subsampleRegionLength),
nRegions[i], replace = FALSE)
subsampleRegion <-
GRanges(seqnames = names(seqInfo[i]),
ranges = IRanges(startPos,
startPos +
subsampleRegionLength - 1))
suppressWarnings(subsampleRegions <-
c(subsampleRegions, subsampleRegion))
}
regions <- sort(subsampleRegions)
} else {
regions <- GRanges(targetRegions$chr, IRanges(targetRegions$start,
targetRegions$end))
regions <- regions[sample(seq_along(regions),
round(length(regions) * subsampleRatio))]
}
}
message("Get reads from BAM file in ", length(regions), " regions.",
"\nUsing ", threads, " thread(s).")
cl <- makeCluster(threads)
registerDoParallel(cl)
readInfo <- foreach(i = seq_along(regions),
.combine = "c",
.inorder = TRUE,
.verbose = FALSE,
.errorhandling = "remove",
.packages = c("Biostrings", "dplyr",
"GenomicRanges", "Rsamtools"),
.export = c("findReads", "countPhredScores")
) %dopar% {
temp <- findReads(bamFile,
what = "qual",
which = regions[i],
reverseComplement=TRUE,
mapqFilter = mapqFilter)
if (length(temp$qual) > 1) {
temp$qual <- countPhredScores(temp$qual)
return(temp)
}
}
stopCluster(cl)
quals <- unname(readInfo[names(readInfo) == "qual"])
if (length(quals) == 0) {
stop("No reads found.")
}
readLen <- max(unlist(lapply(quals, nrow)))
PhredScores <- unique(unlist(lapply(quals, colnames)))
PhredScores <- PhredScores[order(phred2ASCIIOffset(PhredScores))]
for (i in seq_along(quals)) {
qual <- quals[[i]]
PhredScoreDiff <- setdiff(PhredScores, colnames(qual))
if (length(PhredScoreDiff) > 0 ) {
temp = matrix(0, nrow = nrow(qual), ncol = length(PhredScoreDiff))
colnames(temp) <- PhredScoreDiff
quals[[i]] <- cbind(qual, temp)
quals[[i]] <- quals[[i]][, PhredScores]
}
rowDiff <- readLen - nrow(qual)
if (rowDiff > 0) {
temp = matrix(0, nrow = rowDiff, ncol = length(PhredScores))
quals[[i]] <- rbind(quals[[i]], temp)
}
}
qualProb <- Reduce("+", quals)
qualProb <- qualProb / rowSums(qualProb)
return(qualProb)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.