#' extract nucleotide counts for targeted variants from the BAM file.
#' @description Given a BAM file and target loci, `bamreadcounts` fetches redcounts for A, T, G, C, Ins, and Del. Function name is an homage to https://github.com/genome/bam-readcount
#' @param bam Input bam file(s). Required.
#' @param loci Loci file. Can be a tsv file or a data.frame. First two columns should contain chromosome and position (by default assumes coordinates are 1-based)
#' @param zerobased are coordinates zero-based. Default FALSE.
#' @param mapq Map quality. Default 10
#' @param sam_flag SAM FLAG to filter reads. Default 1024
#' @param op Output file basename. Default parses from BAM file
#' @param fa Indexed fasta file. If provided, extracts and adds reference base to the output tsv.
#' @param nthreads Number of threads to use. Each BAM file will be launched on a separate thread. Works only on Unix and macOS.
#' @useDynLib maftools, .registration = TRUE
#' @export
bamreadcounts = function(bam = NULL, loci = NULL, zerobased = FALSE, mapq = 10, sam_flag = 1024, op = NULL, fa = NULL, nthreads = 4){
if(any(is.null(bam), is.null(loci))){
stop("Missing BAM or loci file!")
}
op_files = lapply(bam, function(x){
bam_ext = substr(x = basename(x), start = nchar(basename(path = x))-3, nchar(basename(x)))
if(bam_ext != ".bam"){
stop("Input file is not a BAM file: ", x)
}
if(!file.exists(x)){
stop("BAM file does not exist: ", x)
}
if(!file.exists(paste0(x , ".bai"))){
#Check for foo.bai as well
x_bn = unlist(data.table::tstrsplit(x = basename(x), split = "\\.", keep = 1))
if(! file.exists(paste0(dirname(x),"/", x_bn, ".bai"))){
stop("Index file for ", x, " not found!")
}
}
gsub(pattern = "\\.bam$", replacement = "", x = basename(x), ignore.case = TRUE)
})
if(is.null(op)){
op = as.character(unlist(op_files))
op_files = lapply(op, function(x) {
paste0(x, "_nucleotide_counts")
})
op_files = as.character(unlist(op_files))
}else{
if(length(op) != length(bam)){
stop("No. of output file names must be equal to no. of BAM files.")
}
op_files = paste0(op, "_nucleotide_counts")
}
if(all(file.exists(op_files))){
warning("Counts are already generated!")
res = lapply(seq_along(op_files), function(x){
data.table::fread(file = paste0(op_files[x], ".tsv"), sep = "\t", header = TRUE)
})
names(res) = op
return(res)
}
if(is.null(fa)){
fa = "NULL"
}
if(is.data.frame(loci)){
colnames(loci)[c(1:2)] = c("chr", "start")
}else if(file.exists(loci)){
loci = data.table::fread(file = loci)
colnames(loci)[c(1:2)] = c("chr", "start")
}else{
stop("loci must be a data.frame or a tsv file!")
}
data.table::setDF(x = loci)
if(zerobased){
loci$start = as.numeric(loci$start) + 1
}
lfile = tempfile(pattern = 'bamrc_', fileext = ".tsv")
data.table::fwrite(x = loci[,c(1:2)], file = lfile, col.names = FALSE, sep = "\t", row.names = FALSE)
cat("Fetching readcounts from BAM files..\n")
parallel::mclapply(seq_along(bam), function(idx){
cat("Processing", bam[idx], ":\n")
withCallingHandlers(suppressWarnings(invisible(.Call("ntc", bam[idx], lfile, mapq, sam_flag, fa, op_files[idx], PACKAGE = "maftools"))))
}, mc.cores = nthreads)
res = lapply(seq_along(op_files), function(x){
data.table::fread(file = paste0(op_files[x], ".tsv"), sep = "\t", header = TRUE)
})
names(res) = op
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.