##' Using \code{Rsamtools} functions to manipulate tabix file, the indexed BED file is read
##' and the features present in the selected region are retrieved.
##' @title Retrieve a subset of an indexed BED file
##' @param file a BED file containing the data to be read.
##' It should be compressed with \code{bgzip} and indexed by tabix algorithm.
##' @param subset.reg a data.frame or GRanges object with the regions to subset from.
##' If a data.frame, it must have columns named 'chr', 'start' and 'end'.
##' @param header should the first line of \code{file} be used as header. Default is \code{TRUE}.
##' @return a data.frame with the retrieved BED information.
##' @author Jean Monlong, Diego Garrido-MartÃn
##' @export
##' @import data.table
read.bedix <- function(file, subset.reg = NULL, header = TRUE)
{
if(!is.character(file)){
file <- as.character(file)
}
if(!file.exists(file)){
file <- paste0(file, ".bgz")
}
if(!file.exists(file)){
stop(file, "Input file not found (with and without .bgz extension).")
}
if(!file.exists(paste0(file, ".tbi"))){
stop(paste0(file, ".tbi"), "Index file not found.")
}
if(is.null(subset.reg)){
return(utils::read.table(file, as.is = TRUE, header = header))
}
if (is.data.frame(subset.reg)) {
if(!all(c("chr", "start", "end") %in% colnames(subset.reg))){
stop("Missing column in 'subset.reg'. 'chr', 'start' and 'end' are required.")
}
subset.reg <- with(subset.reg,
GenomicRanges::GRanges(chr, IRanges::IRanges(start, end)))
} else if (class(subset.reg) != "GRanges") {
stop("'subset.reg' must be a data.frame or a GRanges object.")
}
subset.reg <- subset.reg[order(as.character(GenomicRanges::seqnames(subset.reg)),
GenomicRanges::start(subset.reg))]
read.chunk <- function(gr){
bed <- tryCatch(unlist(Rsamtools::scanTabix(file, param = GenomicRanges::reduce(gr)),
use.names = FALSE, recursive = FALSE), error = function(e) c())
if (length(bed) == 0) {
return(NULL)
}
ncol <- length(strsplit(bed[1], "\t")[[1]])
bed <- matrix(unlist(strsplit(bed, "\t", fixed = TRUE), use.names=FALSE, recursive = FALSE),
length(bed), ncol, byrow = TRUE)
bed <- data.table::data.table(bed)
if (header) {
data.table::setnames(bed,
as.character(utils::read.table(file, nrows = 1, as.is = TRUE)))
}
bed <- bed[, lapply(.SD, function(ee)utils::type.convert(as.character(ee), as.is = TRUE))]
bed <- as.data.frame(bed)
return(bed)
}
if (length(subset.reg) > 10000) {
chunks <- cut(1:length(subset.reg), ceiling(length(subset.reg)/10000))
bed.df <- plyr::ldply(levels(chunks), function(ch.id){
read.chunk(subset.reg[which(chunks == ch.id)])
})
} else {
bed.df <- read.chunk(subset.reg)
}
if(!is.null(bed.df)){
bed.df <- bed.df[order(bed.df$chr, bed.df$start), ]
}
return(bed.df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.