#' test harness for simultaneous mouse/human MT variant calling (for tracking)
#'
#' Note that the default pileup params here do NOT distinguish strands in PDXs.
#'
#' FIXME: add an iterator to create (human, mouse) MVRangesLists when running
#' pileupXenograft on thousands of individual cells (as in PDX studies).
#'
#' @param bam the BAM or object with a @bam slot
#' @param sbp optional ScanBamParam object (autogenerated if missing)
#' @param pup optional PileupParam object (autogenerated if missing)
#' @param callIndels call indels? (This can reduce compute and RAM if FALSE.)
#' @param minVAF minimum VAF to accept (default is 0.03, usual NuMT cutoff)
#' @param minDepth minimum read depth to accept (default is 20 for sanity)
#' @param verbose be verbose, as for profiling? (default is FALSE / quiet)
#' @param ... additional args to pass on to the pileup() function
#'
#' @return a list of MVRanges (one human, one mouse)
#'
#' @examples
#'
#' BAMdir <- system.file("extdata", "BAMs", package="MTseekerData")
#' addPath <- function(x) file.path(BAMdir, x)
#' pdxBAMs <- addPath(list.files(path=BAMdir, pattern="^PDX.*.bam$"))
#' names(pdxBAMs) <- sapply(strsplit(basename(pdxBAMs), "_"), `[`, 1)
#' mvrls <- lapply(pdxBAMs, pileupXenograft, verbose=TRUE)
#' vapply(mvrls, names, character(2))
#'
#' @export
pileupXenograft <- function(bam, sbp=NULL, pup=NULL, callIndels=FALSE, minVAF=0.03, minDepth=20, verbose=FALSE, ...) {
t_0 <- Sys.time()
# we will split this:
if (verbose) message("Assembling joint mitochondrial pileup...")
pu <- .getMtPile(bam, sbp, pup, strand=FALSE, callIndels=callIndels)
t_pu <- Sys.time() - t_0
if (verbose) message("Pileup took ", round(t_pu, 3), " ", attr(t_pu, "units"))
# detect multiple references
# FIXME: refactor this into a function
if (verbose) message("Fixing references...")
refChrs <- levels(factor(pu$seqnames))
equivs <- c(mm10="GRCm38", hg38="GRCh38")
names(refChrs) <- sapply(strsplit(refChrs, "_"), `[`, 1)
for (i in names(equivs)) names(refChrs) <- sub(i, equivs[i], names(refChrs))
refGenome <- names(refChrs)
names(refGenome) <- refChrs
# split by reference
# FIXME: generalize to arbitrary meta-MT-genomes
if (verbose) message("Splitting pileup by species...")
pu$seqnames <- factor(pu$seqnames) # avoid nonsense
pus <- lapply(split(pu, refGenome[pu$seqnames]), .addRefSeq)
pus <- lapply(pus, .addPuCoverage)
for (reference in names(pus)) pus[[reference]][, "reference"] <- reference
names(pus) <- sub("GRCh38", "human", sub("GRCm38", "mouse", names(pus)))
# lapply(pus, attr, "reference") # to verify it worked...
# call variants
if (callIndels == TRUE) {
message("Indel calling is not currently supported in xenografts; skipping")
}
if (verbose) message("Dropping sites with VAF < ", minDepth, "...")
variants <- lapply(pus, .nonRef, minVAF=minVAF, minDepth=minDepth)
# turn into a VRangesList, add read counts, coerce to MVRanges
# FIXME: refactor the hell out of this
if (verbose) message("Converting to a VRangesList...")
variantVRL <- VRangesList(lapply(lapply(variants, .puGR), .grVR))
variantVRL <- VRangesList(lapply(variantVRL, .addReadCounts, bam=bam))
variantVRL <- VRangesList(lapply(variantVRL,
function(v)
new("MVRanges", v, coverage=.getCov(v))))
# tidy up the results
# FIXME: refactor to function
if (verbose) message("Tidying up...")
mergedRef <- paste(sapply(variantVRL,
function(x) unique(mcols(x)$reference)),
collapse="_")
for (i in names(variantVRL)) {
ref <- unique(variantVRL[[i]]$reference)
variantVRL[[i]] <- keepSeqlevels(variantVRL[[i]],
seqlevelsInUse(variantVRL[[i]]))
seqlevels(variantVRL[[i]]) <- paste0(ref, "_MT")
seqlengths(variantVRL[[i]]) <- width(attr(pus[[i]], "reference"))
genome(variantVRL[[i]]) <- mergedRef
isCircular(variantVRL[[i]]) <- TRUE
}
# convert to an MVRangesList
if (verbose) message("Converting to an MVRangesList...")
res <- as(variantVRL, "MVRangesList")
t_1 <- Sys.time() - t_0
if (verbose) message("Finished in ", round(t_1, 3), " ", attr(t_1, "units"))
return(res)
}
# helper
.puGR <- function(pu) {
if (nrow(pu) == 0) {
res <- GRanges()
} else {
res <- makeGRangesFromDataFrame(pu, start.field="pos", end.field="pos",
keep.extra.columns=TRUE)
}
metadata(res)$coverageRle <- attr(pu, "coverageRle")
return(res)
}
# helper
.grVR <- function(gr) {
if (length(gr) == 0) {
res <- VRanges()
} else {
res <- makeVRangesFromGRanges(gr)
}
# is this necessary?
metadata(res)$coverageRle <- metadata(gr)$coverageRle
return(res)
}
# helper
.addRefSeq <- function(pu) {
chr <- unique(sapply(strsplit(as.character(pu$seqnames), "_"), `[`, 1))
stopifnot(length(chr) == 1)
attr(pu, "reference") <- MTseeker:::.getRefSeq(chr)
return(pu)
}
# helper
.getReadCounts <- function(x, bam, equivs=NULL) {
if (is.null(equivs)) equivs <- c(mm10="GRCm38", hg38="GRCh38")
mapped <- subset(idxstatsBam(bam), mapped > 0)[, c("seqnames", "mapped")]
rownames(mapped) <- as.character(mapped[,"seqnames"])
for (e in names(equivs)) {
rownames(mapped) <- sub(e, equivs[e], sub("_+", "_", rownames(mapped)))
}
mapped[unique(as.character(seqnames(x))), "mapped"]
}
# helper
.addReadCounts <- function(mvr, bam, readLength=98) {
metadata(mvr)$bam <- bam
metadata(mvr)$readCount <- .getReadCounts(mvr, bam=bam)
return(mvr)
}
# helper
.nonRef <- function(pu, minVAF=0.05, minDepth=20) {
coverageRle <- attr(pu, "coverageRle")
pu$which_label <- NULL # confusing here, although used for coverageRle
pu$ref <- factor(strsplit(as.character(attr(pu, "reference")[[1]]), "")[[1]],
levels=levels(pu$nucleotide))[pu$pos]
pu <- subset(pu, nucleotide %in% c('A','C','G','T'))
pu$alt <- pu$nucleotide
pu$totalDepth <- MTseeker:::.byPos(pu, "count")
is.na(pu$alt) <- (pu$nucleotide == pu$ref)
pu$altDepth <- ifelse(is.na(pu$alt), NA_integer_, pu$count)
pu$refDepth <- pu$totalDepth - MTseeker:::.byPos(pu, "altDepth")
pu$isAlt <- !is.na(pu$alt)
pu$alleles <- MTseeker:::.byPos(pu, "isAlt") + 1
res <- subset(pu, alleles > 1 & totalDepth >= minDepth)
res$VAF <- res$count / res$totalDepth
res$variant <- paste0(res$reference, ":m.", res$pos, res$ref, ">", res$alt)
res <- subset(res, VAF >= minVAF & isAlt)
attr(res, "coverageRle") <- coverageRle
return(res)
}
# helper
.getMtPile <- function(bam, sbp=NULL, pup=NULL, strand=FALSE, callIndels=FALSE){
if (is.null(sbp)) sbp <- MTseeker:::scanMT(bam, mapqFilter=20)
if (strand) bamWhat(sbp) <- "strand"
if (is.null(pup)) pup <- PileupParam(min_mapq=20,
min_base_quality=13,
ignore_query_Ns=TRUE,
min_nucleotide_depth=1,
distinguish_strands=strand,
distinguish_nucleotides=TRUE,
include_deletions=callIndels,
include_insertions=callIndels)
pileup(file=bam, scanBamParam=sbp, pileupParam=pup)
}
# helper
.puToMvr <- function(pu, refSeqDNA, ref, bam, covg) {
pu$altDepth <- ifelse(is.na(pu$alt), NA_integer_, pu$count)
pu$refDepth <- pu$totalDepth - MTseeker:::.byPos(pu, "altDepth")
pu$isAlt <- !is.na(pu$alt)
pu$alleles <- .byPos(pu, "isAlt") + 1
#pu$strand <- as.character(pu$strand)
columns <-
c("seqnames","pos","ref","alt","totalDepth","refDepth","altDepth", "strand")
gr <- keepSeqlevels(.puGR(subset(pu, isAlt|alleles==1)[,columns]),
unique(pu$seqnames))
# seqnames for mouse genome is "MT"
seqinfo(gr) <- Seqinfo(levels(seqnames(gr)), width(refSeqDNA),
isCircular=TRUE, genome=ref)
vr <- makeVRangesFromGRanges(gr)
vr <- keepSeqlevels(vr, levels(seqnames(gr)))
seqinfo(vr) <- seqinfo(gr)
mvr <- MVRanges(vr, coverage=covg)
#strand(mvr) <- rle(as.factor(strand(gr)))
sampleNames(mvr) <- base::sub(paste0(".", ref), "",
base::sub(".bam", "",
basename(bam)))
names(mvr)[which(mvr$ref != mvr$alt)] <- MTHGVS(subset(mvr, ref != alt))
altDepth(mvr)[is.na(altDepth(mvr))] <- 0
mvr$VAF <- altDepth(mvr)/totalDepth(mvr)
metadata(mvr)$refseq <- refSeqDNA
mvr$bam <- basename(bam)
genome(mvr) <- ref
names(mvr) <- MTHGVS(mvr)
return(mvr)
}
# helper fn
.strpop <- function(x, sep=" ", idx=NULL) {
vec <- strsplit(x, sep)[[1]]
if (is.null(idx)) idx <- length(vec)
paste(vec[idx], collapse=sep)
}
# helper fn
.getPuSeqLen <- function(pu) {
as.integer(.strpop(levels(factor(pu$which_label)), "(:|\\-)"))
}
# helper fn
.addPuCoverage <- function(pu) {
covg <- rep(0, .getPuSeqLen(pu))
covg[pu$pos] <- pu$count
attr(pu, 'coverageRle') <- Rle(covg)
return(pu)
}
# helper fn
.getCoverageRle <- function(x) {
metadata(x)$coverageRle
}
# helper fn
.getCov <- function(x) {
mean(.getCoverageRle(x))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.