#' Pileup the mitochondrial reads in a BAM, for variant calling or CN purposes.
#'
#' If a BAM filename is given, but no ScanBamParam, scanMT(bam) will be called.
#' Human mitochondrial genomes (GRCh37+ and hg38+) are fully supported, mouse
#' mitochondrial genomes (C57BL/6J aka NC_005089) are only somewhat supported.
#'
#' @param bam BAM (must be indexed!) file name or object with @bam slot
#' @param sbp optional ScanBamParam object (autogenerated if missing)
#' @param pup optional PileupParam object (autogenerated if missing)
#' @param callIndels call indels? (This can reduce compute and RAM if FALSE.)
#' @param ref aligned reference mitogenome (default is rCRS/GRCh37+)
#' @param ... additional args to pass on to the pileup() function
#'
#' @return an MVRanges object, constructed from pileup results
#'
#' @import GenomicAlignments
#' @import GenomeInfoDb
#' @import Rsamtools
#'
#' @examples
#' library(MTseekerData)
#' BAMdir <- system.file("extdata", "BAMs", package="MTseekerData")
#' patientBAMs <- list.files(BAMdir, pattern="^pt.*.bam$")
#' (bam <- file.path(BAMdir, patientBAMs[1]))
#' (sbp <- scanMT(bam))
#' (mvr <- pileupMT(bam, sbp=sbp, callIndels=TRUE, ref="rCRS"))
#'
#' @export
pileupMT <- function(bam, sbp=NULL, pup=NULL, callIndels=TRUE, ref=c("rCRS","GRCh37","GRCh38","hg38", "GRCm38","C57BL/6J","NC_005089","mm10"), ...) {
# List of reference genomes and their lengths
refSeqLengths <- .refSeqLengths() # synonyms
# Stop if the user forgets to specify a reference genome (FIXME: autodetect)
if (length(ref) > 1) {
stop("You forgot to set a reference genome! MTseeker currently supports: ", paste(names(refSeqLengths), collapse= ", "))
}
# scant support for other mitogenomes
if (!ref %in% names(refSeqLengths)) {
stop("Only the ", paste(names(refSeqLengths), collapse=" and "),
" mitochondrial genome",
ifelse(length(refSeqLengths) > 1, "s are", " is"),
" supported by pileupMT() at present.")
}
# Determine which reference genome to use
# Currently supports mouse and human
ref <- .getRefSyn(match.arg(ref)) # matching
# If you are working with multiple files
if (length(bam) > 1) {
mvrl <- MVRangesList(lapply(bam, pileupMT, ref=ref, sbp=sbp))
sampNames <- lapply(bam,
function(x)
base::sub(paste0(".", ref), "",
base::sub(".bam", "", basename(x))))
names(mvrl) <- sampNames
return(mvrl)
}
# can support multiple bams if we have sample names
# perhaps it is worthwhile to autoextract them now
if (is.null(sbp)) sbp <- scanMT(bam, mapqFilter=20)
bamWhat(sbp) <- "strand"
if (is.null(pup)) pup <- PileupParam(distinguish_strands=TRUE,
distinguish_nucleotides=TRUE,
ignore_query_Ns=TRUE,
include_deletions=callIndels,
include_insertions=callIndels)
pu <- pileup(file=bam, scanBamParam=sbp, pileupParam=pup, ...)
# may be handy for editing
refSeqDNA <- .getRefSeq(ref)
# Need the total number of reads in order to calculate coverage
readsSBP <- sbp
bamWhat(readsSBP) <- "seq"
reads <- readGAlignments(file=bam, param=readsSBP)
numReads <- length(reads)
readsWidth <- median(qwidth(reads)) - 1
# Name of the bam/sample
sampNames <- base::sub(paste0(".", ref), "", base::sub(".bam", "", basename(bam)))
# number of reads * length of reads / length of ref sequence
covg <- round(numReads * readsWidth / width(refSeqDNA))
if (is.na(covg)) covg <- 0
if (callIndels == FALSE) {
indels <- subset(pu, FALSE) # brute force
} else {
# will need to handle '-' and '+' separately
indels <- subset(pu, nucleotide %in% c('-', '+'))
if (nrow(indels) > 0) {
# for obtaining the supporting reads and CIGARs from the BAM:
indels$start <- indels$pos
indels$end <- indels$pos
indelSBP <- sbp
bamWhich(indelSBP) <- as(indels, "GRanges")
bamWhat(indelSBP) <- "seq"
indelReads <- readGAlignments(file=bam, param=indelSBP)
# Get the reads the physically contain the insertion and deletions
indelIndex <- grep("D|I", cigar(indelReads))
# Every other read will support the reference
everything <- seq(1, length(indelReads), 1)
refIndex <- which(is.na(match(everything, indelIndex)))
refSupport <- indelReads[refIndex]
# Now this contains just indel reads
# Hopefully this will speed things up with less iteration
indelReads <- indelReads[indelIndex]
# Initialize to get the alternative sequences
mcols(indelReads)$indelStart <- NA_integer_
mcols(indelReads)$indelEnd <- NA_integer_
mcols(indelReads)$ref <- NA_character_
mcols(indelReads)$alt <- NA_character_
# This is to help with debugging
mcols(indelReads)$bam <- sampNames
#mcols(indelReads)$potentialHaplo <- FALSE
message("Tallying indels for: ", sampNames)
if (length(indelIndex) > 500) {
message("This may take a while, determining ", length(indelIndex),
" reads for indels from ", sampNames)
}
# Empty GAlignment that will have new indels appended to it
newIndelReads <- indelReads[0]
# Returns the ref and alt read for each variant
#lapply(indelReads, .reverseCigar, ref=ref, reference=refSeqDNA)
for (i in 1:length(indelReads)) {
newIndelReads <- append(newIndelReads,
.reverseCigar(indelReads[i], ref, refSeqDNA, i))
}
# Converts to a MVRanges
mvrIndel <- .indelToMVR(newIndelReads,refSupport, ref, bam, covg)
# Copied and pasted from below
mvrIndel$VAF <- altDepth(mvrIndel)/totalDepth(mvrIndel)
metadata(mvrIndel)$refseq <- refSeqDNA
metadata(mvrIndel)$bam <- basename(bam)
metadata(mvrIndel)$sbp <- indelSBP
metadata(mvrIndel)$pup <- pup
mvrIndel$bam <- basename(bam)
genome(mvrIndel) <- ref
}
# data(fpFilter_Triska, package="MTseeker") # for when they are...
}
message("Tallying SNVs for: ", sampNames)
# this may belong in a separate helper function...
pu <- subset(pu, nucleotide %in% c('A','C','G','T'))
pu$which_label <- NULL # confusing here
pu$ref <- factor(strsplit(as.character(refSeqDNA), '')[[1]],
levels=levels(pu$nucleotide))[pu$pos]
pu$alt <- pu$nucleotide
pu$totalDepth <- .byPos(pu, "count")
is.na(pu$alt) <- (pu$nucleotide == pu$ref)
# Only want to keep variants that differ from reference
keep <- which(!is.na(pu$alt))
pu <- pu[keep,]
# If there exists SNPs, turn the pu into MVRanges
if (nrow(pu) != 0) {
strandMvr <- .puToMvr(pu, refSeqDNA, ref, bam, covg)
# Because strandedness (heavy and light strands) matters sometimes
# Some of the variants appear twice, despite having the same consequences
# All I wants to do is add together their alt depths and set all the strands to +
# They are all represented as through they are on the + strand anyways
heavyMvr <- strandMvr[which(strand(strandMvr) == "+")]
lightMvr <- strandMvr[which(strand(strandMvr) == "*")]
matchedLight <- match(names(lightMvr), names(heavyMvr))
# These are light stranded variants that do not have duplicated heavy stranded variants
# However they are still representing what the variant would look like on the heavy strand
uniqueLight <- which(is.na(matchedLight))
uniqueLightMvr <- lightMvr[uniqueLight]
# These are overlapping light strands
matchedLightIndex <- which(!is.na(matchedLight))
matchedLightMvr <- lightMvr[matchedLightIndex]
# Keep track of variants found on both strands for debugging purposes
if (length(heavyMvr) > 0) heavyMvr$bothStrands <- FALSE
if (length(uniqueLightMvr) > 0) uniqueLightMvr$bothStrands <- FALSE
# Only have to add the altDepths since the refDepths will be the same
# for 'duplicated' variants
if (length(heavyMvr) > 0) {
altDepth(heavyMvr[matchedLight[matchedLightIndex]]) <-
altDepth(heavyMvr[matchedLight[matchedLightIndex]]) +
altDepth(matchedLightMvr)
heavyMvr[matchedLight[matchedLightIndex]]$bothStrands <- TRUE
}
# Merge together the light and heavy strand variants
mvr <- MVRanges(c(heavyMvr, uniqueLightMvr), coverage=covg)
# Recalculate the totalDepth
totalDepth(mvr) <- altDepth(mvr) + refDepth(mvr)
mvr <- sort(mvr, ignore.strand=T)
metadata(mvr)$bam <- basename(bam)
metadata(mvr)$sbp <- sbp
metadata(mvr)$pup <- pup
names(mvr) <- MTHGVS(mvr)
}
# There are no indels and no SNPs
# Create an empty MVRanges
if (nrow(pu) == 0 && nrow(indels) == 0) {
mvr <- MVRanges(GRanges(c(seqnames=NULL,ranges=NULL,strand=NULL)),
coverage = covg)
return(mvr)
}
# If there are no SNPs but there are indels
else if (nrow(pu) == 0 && nrow(indels) > 0 ) {
mvr <- mvrIndel
}
# Merge indels and SNPs together if both exist
else if (nrow(indels) > 0 && nrow(pu) != 0) {
mvr <- MVRanges(c(mvr, mvrIndel), coverage=covg)
mvr <- sort(mvr, ignore.strand=T)
names(mvr) <- MTHGVS(mvr)
}
metadata(mvr)$bam <- basename(bam)
metadata(mvr)$sbp <- sbp
metadata(mvr)$pup <- pup
metadata(mvr)$coverageRle <- coverage(mvr)
# A double check to make sure these two values get tallied
totalDepth(mvr) <- altDepth(mvr) + refDepth(mvr)
mvr$VAF <- altDepth(mvr) / totalDepth(mvr)
return(mvr)
}
# helper fn
.byPos <- function(x, y) {
rowsum(as.numeric(x[,y]), as.character(x$pos), na.rm=1)[as.character(x$pos),]
}
# helper fn
.puToGR <- function(pu) {
#makeGRangesFromDataFrame(pu, start.field="pos", end.field="pos", strand="*",
# keep.extra.columns=TRUE)
makeGRangesFromDataFrame(pu, start.field="pos", end.field="pos",
keep.extra.columns=TRUE)
}
# helper fn
.refSeqLengths <- function() {
refSeqLengths <- c()
data(rCRSeq, package="MTseeker") # human
data(NC_005089seq, package="MTseeker") # mouse
# data(mtRefs, package="MTseeker") # both
refSeqLengths["rCRS"] <- width(rCRSeq)
refSeqLengths["GRCh38"] <- width(rCRSeq)
refSeqLengths["hg38"] <- width(rCRSeq)
refSeqLengths["GRCh37"] <- width(rCRSeq)
refSeqLengths["NC_005089"] <- width(NC_005089seq)
refSeqLengths["GRCm38"] <- width(NC_005089seq)
refSeqLengths["mm10"] <- width(NC_005089seq)
refSeqLengths["C57BL/6J"] <- width(NC_005089seq)
return(refSeqLengths)
}
# helper fn
.getRefSyn <- function(ref) {
# simplify references down to the supported "mouse" or "human" mitogenomes
if (ref %in% c("C57BL/6J", "NC_005089", "GRCm38","mm10")) ref <- "NC_005089"
if (ref %in% c("hg38", "GRCh37", "GRCh38")) ref <- "rCRS"
return(ref)
}
# helper fn
.getRefSeq <- function(ref) {
data(rCRSeq, package="MTseeker") # human
data(NC_005089seq, package="MTseeker") # mouse
refs <- DNAStringSet(c(rCRS=rCRSeq[['chrM']],
NC_005089=NC_005089seq[['chrM']]))
return(refs[.getRefSyn(ref)])
}
# helper fn
# Will take a single read and return the reference and alternative sequence
.reverseCigar <- function(indelRead, ref, reference, index) {
# These are reads that support the reference (no indels found in them)
if ( (!grepl("I", cigar(indelRead))) && (!grepl("D", cigar(indelRead))) ) {
return(indelRead[0])
}
else {
# Keep track of how many base pairs are being added in the cigar string
addPos <- 0 # Matches (M in cigar strings)
softPos <- 0 # Soft clips (S in cigar strings)
splicePos <- 0 # Spliced zones (N in cigar strings)
splitCigar <- gsub("([[:digit:]]+[[:alpha:]])", "\\1 ", cigar(indelRead))
splitCigar <- unlist(strsplit(splitCigar, " "))
# Find which elements of the cigar string are insertion or deletion
# Are there multiple insertions and deletions?
indelIndex <- which((grepl("I|D", splitCigar)))
# Create a new indel read to edit
newIndelRead <- indelRead
# Iterate through in case there are multiple deletions and insertions
for (i in 1:length(indelIndex)) {
#potentialHaplo <- FALSE
# Find the number of base pairs that are soft clips
# Only want to look at the information before the currently considered indel
softPos <- grep("S", splitCigar[1:indelIndex[i]])
softPos <- sum(as.numeric(gsub("\\D", "", splitCigar[softPos])))
if (length(softPos) == 0) softPos <- 0
# Find the number of base pairs that are matches
# Only want to look at the information before the currently considered indel
addPos <- grep("M", splitCigar[1:indelIndex[i]])
addPos <- sum(as.numeric(gsub("\\D", "", splitCigar[addPos])))
if (length(addPos) == 0) addPos <- 0
# There can be spliced reads
# Only have to add this onto the reference sequence
splicePos <- grep("N", splitCigar[1:indelIndex[i]])
splicePos <- sum(as.numeric(gsub("\\D", "", splitCigar[splicePos])))
if (length(splicePos) == 0) splicePos <- 0
# If there is a previous indel in the same read
# Need to consider how the start position will change on the reference
# If there was an insertion beforehand, be careful when calculation startPos
# Because those bp are adding length that does not exist into the ref seq
# Add when there is a deletion
# Subtract when there is an insertion
if (i != 1) {
# Indices or previous insertions or deletions in the same read
prevIns <- which(grepl("I", splitCigar[1: (indelIndex[i] - 1) ]))
prevDel <- which(grepl("D", splitCigar[1: (indelIndex[i] - 1) ]))
# Sums of how many bp are deleted or inserted
prevInsSum <- sum(as.numeric(gsub("\\D", "", splitCigar[prevIns])))
prevDelSum <- sum(as.numeric(gsub("\\D", "", splitCigar[prevDel])))
}
## Now do the actual insertions or deletions
# Insertions
if (grepl("I", splitCigar[indelIndex[i]])) {
# Start of range for the reference
# Always off by one
startPos <- start(indelRead) + addPos - 1 + splicePos
# Get the start index for insertion for the raw read
altIndexStart <- addPos + softPos
# Multiple indels
# Must take into consideration how that will effect the start positions
# Add to the reference with previous deletion (insertions do not exist in the reference)
# Add to the alternate with previous insertion (deletions do not add length to the alternate)
if (i != 1) {
startPos <- startPos + prevDelSum
altIndexStart <- altIndexStart + prevInsSum
}
# According to VRanges documentation, insertions have width=1
# So start and end are the same for the reference
endPos <- startPos
# Get the reference
refs <- extractAt(reference, IRanges(startPos, endPos))
refs <- unname(as.character(unlist(refs)))
# Length of insertion
insLength <- as.numeric(gsub("\\D", "", splitCigar[indelIndex[i]]))
# Get the alt sequence from the raw read
altSeq <- as.character(mcols(indelRead)$seq)
altSplit <- unlist(strsplit(altSeq, split=""))
altIndex <- seq(altIndexStart, altIndexStart + insLength, 1)
altInd <- altSplit[altIndex]
alt <- paste(altInd, collapse="")
# If the first element of cigar string is to do the insertion
# We have no way of knowning the element before the insertion
# So we have to use the reference
if (indelIndex[i] == 1) {
alt <- paste0(refs, alt, collapse="")
altInd <- c(refs, altInd)
}
# Raw read has an "N" where the insertion occurs
# Toss it out
if (grepl("N", alt)) {
altInd <- ""
alt <- ""
refs <- ""
startPos <- NA_integer_
endPos <- NA_integer_
}
# A simple test to make sure insertion was done correctly
# That became less simple
else if (altInd[1] != refs) {
# If the variants falls within a haplogroup region
# Just leave it be
data(haplomask_whitelist)
#haploSnps <- subsetByOverlaps(ranges(haplomask_whitelist[[1]]), IRanges(startPos, endPos))
#haploDels <- subsetByOverlaps(ranges(haplomask_whitelist[[2]]), IRanges(startPos, endPos))
haploIns <- subsetByOverlaps(ranges(haplomask_whitelist[[3]]), IRanges(startPos, endPos))
if (length(haploIns) > 0 && ref == "rCRS") {
#message(paste("Potential haplogroup insertion variant at position:", as.character(startPos), "for sample:", as.character(mcols(indelRead)$bam)))
#potentialHaplo <- TRUE
# Leaving this if statement because this exception is fine to keep
}
else {
message(paste("Incorrect insertion for indel read index:", as.character(index), "for", as.character(mcols(indelRead)$bam)))
comp <- .sanCheck(reference, startPos, indelRead, softPos, refs, alt, splitCigar)
}
}
# Sanity check!
#comp <- .sanCheck(reference, startPos, indelRead, softPos, refs, alt, splitCigar)
} # I
# Deletions
else if (grepl("D", splitCigar[indelIndex[i]])) {
# Create range for the deletion
# This must be zero indexed or something, because it is always off by one
startPos <- start(indelRead) + addPos - 1 + splicePos
# Get the index for where the deletion takes place
altIndexStart <- addPos + softPos
# Multiple indels
# Must take into consideration how the start and end values will change
if (i != 1) {
startPos <- startPos + prevDelSum
altIndexStart <- altIndexStart + prevInsSum
}
# Length of deletion and end position
delLength <- as.numeric(gsub("\\D", "", splitCigar[indelIndex[i]]))
# In rCRS, there is an N located at bp 3107
if (ref == "rCRS" && startPos == 3107 && delLength == 1) {
startPos <- startPos - 1
altIndexStart <- altIndexStart - 1
}
# Odd 1 off error that should be deleting the N at 3107
# Showed up in TCGA data
if (ref == "rCRS" && startPos == 3105 && delLength == 1) {
startPos <- startPos + 1
altIndexStart <- altIndexStart + 1
}
endPos <- startPos + delLength
# Get the reference
refs <- extractAt(reference, IRanges(startPos, endPos))
refs <- unname(as.character(unlist(refs)))
refSplit <- unlist(strsplit(refs, split=""))
# Get the alt sequence from the raw read
altSeq <- as.character(mcols(indelRead)$seq)
altSplit <- unlist(strsplit(altSeq, split=""))
alt <- altSplit[altIndexStart]
# If the first element of cigar string is to do the deletion
# We have no way of knowning the element before the deletion
# So we have to use the reference
if (indelIndex[i] == 1) {
alt <- refSplit[1]
}
# If the raw read has an "N" where the deletion occurs
# Toss it out
if (grepl("N", alt)) {
alt <- ""
refs <- ""
startPos <- NA_integer_
endPos <- NA_integer_
}
# A simple check to ensure the deletion is supposedly happening correctly
# That became less simple
else if ( alt != refSplit[1] ) {
# Keeps track if the variant falls in a common haplogroup region (ancestral variation)
data(haplomask_whitelist)
#haploSnps <- subsetByOverlaps(ranges(haplomask_whitelist[[1]]), IRanges(startPos, endPos))
haploDels <- subsetByOverlaps(ranges(haplomask_whitelist[[2]]), IRanges(startPos, endPos))
#haploIns <- subsetByOverlaps(ranges(haplomask_whitelist[[3]]), IRanges(startPos, endPos))
if (length(haploDels) > 0 && ref == "rCRS") {
#message(paste("Potential haplogroup deletion variant at position:", as.character(startPos), "for sample:",as.character(mcols(indelRead)$bam)))
#potentialHaplo <- TRUE
# Leave the if statement here because we do not want to remove haplogroup variants
}
else if (startPos == 3106) {
# Continue
# A lot of the bp before the N at 3107 are different
}
else {
message(paste("Incorrect deletion for indel read index:", as.character(index), "for", as.character(mcols(indelRead)$bam)))
comp <- .sanCheck(reference, startPos, indelRead, softPos, refs, alt, splitCigar)
}
}
# Sanity check!
#comp <- .sanCheck(reference, startPos, indelRead, softPos, refs, alt, splitCigar)
} # D
# Store the information
# Only have one indel
if (i == 1) {
mcols(newIndelRead)$indelStart <- startPos
mcols(newIndelRead)$indelEnd <- endPos
mcols(newIndelRead)$alt <- alt
mcols(newIndelRead)$ref <- refs
#mcols(newIndelRead)$potentialHaplo <- potentialHaplo
}
# Append another indel to the first one
# Occurs when you have multiple indels in a single read
else {
appendIndelRead <- indelRead
mcols(appendIndelRead)$indelStart <- startPos
mcols(appendIndelRead)$indelEnd <- endPos
mcols(appendIndelRead)$alt <- alt
mcols(appendIndelRead)$ref <- refs
#mcols(appendIndelRead)$potentialHaplo <- potentialHaplo
newIndelRead <- append(newIndelRead, appendIndelRead)
}
} # For
} # Else
# Get rid of any indel reads which contained bad information
# They should have NA for their start position
keep <- which(!is.na(mcols(newIndelRead)$indelStart))
newIndelRead <- newIndelRead[keep]
return(newIndelRead)
}
# helper fn
# Turn pileup snvs into mvranges
.puToMvr <- function(pu, refSeqDNA, ref, bam, covg) {
pu$altDepth <- ifelse(is.na(pu$alt), NA_integer_, pu$count)
pu$refDepth <- pu$totalDepth - .byPos(pu, "altDepth")
pu$isAlt <- !is.na(pu$alt)
pu$alleles <- .byPos(pu, "isAlt") + 1
#pu$strand <- as.character(pu$strand)
columns <- c("seqnames","pos","ref","alt","totalDepth","refDepth","altDepth", "strand")
gr <- keepSeqlevels(.puToGR(subset(pu, isAlt|alleles==1)[,columns]),
unique(pu$seqnames))
# seqnames for mouse genome is "MT"
seqinfo(gr) <- Seqinfo(levels(seqnames(gr)), width(refSeqDNA),
isCircular=TRUE, genome=ref)
vr <- makeVRangesFromGRanges(gr)
# Keeping all light strands as * bc MVRanges blows up otherwise
# VRanges are not allowed to have light strand variants apparently
strand(vr) <- as.factor(strand(gr))
strand(vr[which(strand(vr) == "-")]) <- "*"
vr <- keepSeqlevels(vr, levels(seqnames(gr)))
seqinfo(vr) <- seqinfo(gr)
mvr <- MVRanges(vr, coverage=covg)
#strand(mvr) <- rle(as.factor(strand(gr)))
sampleNames(mvr) <- base::sub(paste0(".", ref), "",
base::sub(".bam", "",
basename(bam)))
names(mvr)[which(mvr$ref != mvr$alt)] <- MTHGVS(subset(mvr, ref != alt))
altDepth(mvr)[is.na(altDepth(mvr))] <- 0
mvr$VAF <- altDepth(mvr)/totalDepth(mvr)
metadata(mvr)$refseq <- refSeqDNA
mvr$bam <- basename(bam)
genome(mvr) <- ref
names(mvr) <- MTHGVS(mvr)
return(mvr)
}
# helper fn
# Will make turn the indel information into a MVRanges
.indelToMVR <- function(indelReads, refSupport, ref, bam, covg) {
# This is mostly copied and pasted from the mvr created at the end of pileup
indelSupport <- as.data.frame(indelReads)
indelSupport$which_label <- NULL
# Set these columns to be able to turn into a GRanges
indelSupport$refDepth <- NA_integer_
indelSupport$altDepth <- 1
indelSupport$totalDepth <- 1
indelSupport$pos <- indelSupport$indelStart
### Not sure what this line is supposed to do
#pu$alleles <- .byPos(pu, "isAlt") + 1
# Turn into GRanges
columns <- c("seqnames","pos","ref","alt","totalDepth","refDepth","altDepth", "strand")
grIndel <- keepSeqlevels(.puToGR(indelSupport[,columns]), unique(indelSupport$seqnames))
seqinfo(grIndel) <- Seqinfo(levels(seqnames(grIndel)), width(.getRefSeq(ref)), isCircular=TRUE, genome=ref)
# Turn into VRanges
vrIndel <- makeVRangesFromGRanges(grIndel)
vrIndel <- keepSeqlevels(vrIndel, levels(seqnames(grIndel)))
seqinfo(vrIndel) <- seqinfo(grIndel)
# Need to mess around with strands again
strand(vrIndel) <- strand(grIndel)
strand(vrIndel[strand(vrIndel) == "-"]) <- "*"
# Turn into MVRanges
mvrIndel <- MVRanges(vrIndel, coverage = covg)
sampleNames(mvrIndel) <- base::sub(paste0(".", ref), "", base::sub(".bam", "", basename(bam)))
# Fix ranges
indelRanges <- IRanges(as.numeric(indelSupport$indelStart), as.numeric(indelSupport$indelEnd))
ranges(mvrIndel) <- indelRanges
# Assign names
names(mvrIndel) <- MTHGVS(mvrIndel)
####################################
strandMvr <- mvrIndel
# Indels with duplicated variants
# Need to consolidate them together
dupHeavy <- strandMvr[which(strand(strandMvr) == "+")]
dupLight <- strandMvr[which(strand(strandMvr) == "*")]
# Now have the correct depth for each variant
# Each variant now appears once for each strand
uniqueHeavy <- unique(dupHeavy)
count <- table(names(dupHeavy))
altDepth(uniqueHeavy[names(count)]) <- unname(count)
uniqueLight <- unique(dupLight)
count <- table(names(dupLight))
altDepth(uniqueLight[names(count)]) <- unname(count)
# Find any variants that have reads on both heavy and light strands
matchedLight <- match(names(uniqueLight), names(uniqueHeavy))
# For debugging purposes, keep track of which variants came from both strands
# Variants that were only found on the light strand
lightMvr <- uniqueLight[which(is.na(matchedLight))]
# These are overlapping light strands
matchedLightIndex <- which(!is.na(matchedLight))
matchedLightMvr <- uniqueLight[matchedLightIndex]
uniqueHeavy$bothStrands <- FALSE
lightMvr$bothStrands <- FALSE
# Only have to add the altDepths since the refDepths will be the same for 'duplicated' variants
altDepth(uniqueHeavy[matchedLight[matchedLightIndex]]) <- altDepth(uniqueHeavy[matchedLight[matchedLightIndex]]) + altDepth(matchedLightMvr)
uniqueHeavy[matchedLight[matchedLightIndex]]$bothStrands <- TRUE
# Combine them together
mvr <- MVRanges(c(uniqueHeavy, lightMvr), coverage=covg)
mvr <- sort(mvr, ignore.strand=T)
###############################
# Only keep unique variants (based upon names that were just assigned)
#mvrIndelunique <- sort(unique(mvrIndel), ignore.strand=T)
# Assign altDepths based upon what was unique
# Sometimes the order gets mixed up, so keep track of it
#order <- match(names(mvrIndelunique), names(table(names(mvrIndel))))
#count <- unname(table(names(mvrIndel))[order])
#altDepth(mvrIndelunique) <- count
# Determine the refDepth
refDepth(mvr) <- unname(countOverlaps(mvr, refSupport))
# Recalculate totalDepth
totalDepth(mvr) <- refDepth(mvr) + altDepth(mvr)
#mvr <- MVRanges(mvrIndelunique, coverage = covg)
return(mvr)
}
# helper fn
# Usually commented out unless you run into an error
# A sanity check to compare the raw read against the reference
# Prints other information like start position, cigar string, and the calculated alt and ref
.sanCheck <- function(reference, startPos, indelRead, softPos, refs, alt, splitCigar) {
altSeq <- as.character(mcols(indelRead)$seq)
# Get the corresponding ref sequence
start <- start(indelRead) - softPos
if (start < 0) start <- 1
refRange <- IRanges(start, end(indelRead))
refSeq <- as.character(unlist(extractAt(reference, refRange)))
# Compare the two strings
# Tried doing this using positions based upon cigar string but sometimes got off by one errors
comp <- pairwiseAlignment(refSeq, altSeq)
show(comp)
show(splitCigar)
print(paste("ref: ", refs))
print(paste("alt: ", alt))
print(paste("start: ", as.character(startPos)))
return(comp)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.