#' Calls SpliceWiz's C++ function to retrieve coverage from a COV file
#'
#' This function returns an RLE / RLEList or data.frame
#' containing coverage data from the given COV file\cr\cr
#' COV files are generated by SpliceWiz's [processBAM] and [BAM2COV] functions.
#' It records alignment coverage for each nucleotide in the given BAM file.
#' It stores this data in "COV" format, which is an indexed BGZF-compressed
#' format specialised for the storage of unstranded and stranded alignment
#' coverage in RNA sequencing.\cr\cr
#' Unlike BigWig files, COV files store coverage for both positive and negative
#' strands.\cr\cr
#' These functions retrieves coverage data from the specified COV file. They
#' are computationally efficient as they utilise random-access to rapidly
#' search for the requested data from the COV file.\cr\cr
#'
#' @param file (Required) The file name of the COV file
#' @param seqname (Required for `getCoverage_DF`) A string denoting the
#' chromosome name. If left blank in `getCoverage`, retrieves RLEList
#' containing coverage of the entire file.
#' @param start,end 1-based genomic coordinates. If `start = 0` and
#' `end = 0`, will retrieve RLE of specified chromosome.
#' @param strand Either "*", "+", or "-"
#' @param strandMode The stranded-ness of the RNA-seq experiment. "unstranded"
#' means that an unstranded protocol was used. Stranded protocols can be
#' either "forward", where the first read is the same strand as the
#' expressed transcript, or "reverse" where the second strand is the same
#' strand as the expressed transcript.
#' @param regions A GRanges object for a set of regions to obtain mean / total
#' coverage from the given COV file.
#' @param region In `getCoverageBins`, a single query region as a GRanges object
#' @param bins In `getCoverageBins`, the number of bins to divide the given
#' `region`. If `bin_size` is given, overrides this parameter
#' @param bin_size In `getCoverageBins`, the number of nucleotides per bin
#' @return
#' For `getCoverage`: If seqname is left as "", returns an RLEList of the
#' whole BAM file, with each RLE in the list containing coverage data for
#' one chromosome. Otherwise, returns an RLE containing coverage data for
#' the requested genomic region
#'
#' For `getCoverage_DF`: Returns a two-column data frame, with the first column
#' `coordinate` denoting genomic coordinate, and the second column `value`
#' containing the coverage depth for each coordinate nucleotide.
#'
#' For `getCoverageRegions`: Returns a GRanges object with an extra metacolumn:
#' `cov_mean`, which gives the mean coverage of each of the given ranges.
#'
#' For `getCoverageBins`: Returns a GRanges object which spans the given
#' `region`, divided by the number of `bins` or by width as given by
#' `bin_size`. Mean coverage in each bin is calculated (returned by the
#' `cov_mean` metadata column). This function is useful for retrieving
#' coverage of a large region for visualisation, especially when the
#' size of the region vastly exceeds the width of the figure.
#'
#' @examples
#' se <- SpliceWiz_example_NxtSE()
#'
#' cov_file <- covfile(se)[1]
#'
#' # Retrieve Coverage as RLE
#'
#' cov <- getCoverage(cov_file, seqname = "chrZ",
#' start = 10000, end = 20000,
#' strand = "*"
#' )
#'
#' # Retrieve Coverage as data.frame
#'
#' cov.df <- getCoverage_DF(cov_file, seqname = "chrZ",
#' start = 10000, end = 20000,
#' strand = "*"
#' )
#'
#' # Retrieve mean coverage of 100-nt window regions as defined
#' # in a GRanges object:
#'
#' gr <- GenomicRanges::GRanges(
#' seqnames = "chrZ",
#' ranges = IRanges::IRanges(
#' start = seq(1, 99901, by = 100),
#' end = seq(100, 100000, by = 100)
#' ), strand = "-"
#' )
#'
#' gr.unstranded <- getCoverageRegions(cov_file,
#' regions = gr,
#' strandMode = "unstranded"
#' )
#'
#' gr.stranded <- getCoverageRegions(cov_file,
#' regions = gr,
#' strandMode = "reverse"
#' )
#'
#' # Retrieve binned coverage of a large region
#'
#' gr.fetch <- getCoverageBins(
#' cov_file,
#' region = GenomicRanges::GRanges(seqnames = "chrZ",
#' ranges = IRanges::IRanges(start = 100, end = 100000),
#' strand = "*"
#' ),
#' bins = 2000
#' )
#'
#' # Plot coverage using ggplot:
#'
#' require(ggplot2)
#'
#' ggplot(cov.df, aes(x = coordinate, y = value)) +
#' geom_line() + theme_white
#'
#' ggplot(as.data.frame(gr.unstranded),
#' aes(x = (start + end) / 2, y = cov_mean)) +
#' geom_line() + theme_white
#'
#' ggplot(as.data.frame(gr.fetch),
#' aes(x = (start + end)/2, y = cov_mean)) +
#' geom_line() + theme_white
#'
#' # Export COV data as BigWig
#'
#' cov_whole <- getCoverage(cov_file)
#' bw_file <- file.path(tempdir(), "sample.bw")
#' rtracklayer::export(cov_whole, bw_file, "bw")
#' @name Coverage
#' @md
NULL
#' @describeIn Coverage Retrieves alignment coverage as an RLE or RLElist
#' @export
getCoverage <- function(file, seqname = "", start = 0, end = 0,
strand = c("*", "+", "-")) {
strand <- match.arg(strand)
if (!(strand %in% c("*", "+", "-"))) {
.log(paste("In getCoverage(),",
"Invalid strand. '*', '+' or '-'"))
}
strand_int <- ifelse(strand == "*", 2,
ifelse(strand == "+", 1, 0))
if (!is.numeric(start) || !is.numeric(end) ||
(as.numeric(start) > as.numeric(end) & as.numeric(end) > 0)) {
.log(paste("In getCoverage(),",
"Zero or negative regions are not allowed"))
}
if (seqname == "") {
raw_list <- c_RLEList_From_Cov(normalizePath(file), strand_int)
final_list <- list()
if (length(raw_list) > 0) {
for (i in seq_len(length(raw_list))) {
final_list[[i]] <- S4Vectors::Rle(
raw_list[[i]]$values, raw_list[[i]]$lengths
)
}
} else {
return(NULL)
}
final_RLE <- as(final_list, "RleList")
names(final_RLE) <- names(raw_list)
return(final_RLE)
} else if (end == 0) {
raw_RLE <- c_RLE_From_Cov(
normalizePath(file), as.character(seqname),
0, 0, strand_int
)
final_RLE <- S4Vectors::Rle(raw_RLE$values, raw_RLE$lengths)
} else {
raw_RLE <- c_RLE_From_Cov(
normalizePath(file), as.character(seqname),
round(as.numeric(start)), round(as.numeric(end)),
strand_int
)
final_RLE <- S4Vectors::Rle(raw_RLE$values, raw_RLE$lengths)
}
}
.cov_process_regions <- function(file, gr, seq, strand_gr, strand_cov) {
# adds cov_mean from cov file to gr, only for given seqname seq
# strand_gr and strand_cov are matching strand info for gr and cov
if (!any(
as.character(seqnames(gr)) %in% seq &
as.character(strand(gr)) %in% strand_gr
)) {
return(gr)
}
todo <- which(
as.character(seqnames(gr)) %in% seq &
as.character(strand(gr)) %in% strand_gr
)
cov_data <- getCoverage(
file, seq,
min(start(gr[todo])) - 1,
max(end(gr[todo])),
strand_cov
)
if (is.null(gr$cov_mean)) gr$cov_mean <- 0
gr$cov_mean[todo] <- round(
aggregate(
cov_data, IRanges(start(gr[todo]), end(gr[todo])), FUN = mean
), 2
)
return(gr)
}
#' @describeIn Coverage Retrieves alignment coverage as a data.frame
#' @export
getCoverage_DF <- function(file, seqname = "", start = 0, end = 0,
strand = c("*", "+", "-")
) {
if (seqname == "") .log("seqname must not be omitted in getCoverage_DF")
cov <- getCoverage(file, seqname, start, end, strand)
view <- IRanges::Views(cov, start + 1, end)
view.df <- as.data.frame(view[[1]])
return(data.frame(
coordinate = seq(start + 1, end), value = view.df$value
))
}
#' @describeIn Coverage Retrieves total and mean coverage of a GRanges object
#' from a COV file
#' @export
getCoverageRegions <- function(file, regions,
strandMode = c("unstranded", "forward", "reverse")
) {
strandMode <- match.arg(strandMode)
if (strandMode == "") strandMode <- "unstranded"
if (!is(regions, "GRanges")) .log("regions must be a GRanges object")
if (!isCOV(file)) .log("Given file is not of COV format")
seqlevels <- c_Cov_Seqnames(normalizePath(file))
# trim regions by available seqlevels
if (!any(seqlevels %in% seqlevels(regions)))
.log("COV file and given regions have incompatible seqnames")
seqlevels(regions, pruning.mode = "coarse") <- seqlevels
if (length(regions) == 0) {
return(regions)
}
if (strandMode == "unstranded") {
for (seq in unique(seqnames(regions))) {
regions <- .cov_process_regions(file, regions, seq,
c("+", "-", "*"), "*")
}
} else if (strandMode == "forward") {
for (seq in unique(seqnames(regions))) {
regions <- .cov_process_regions(file, regions, seq, "*", "*")
regions <- .cov_process_regions(file, regions, seq, "+", "+")
regions <- .cov_process_regions(file, regions, seq, "-", "-")
}
} else {
for (seq in unique(seqnames(regions))) {
regions <- .cov_process_regions(file, regions, seq, "*", "*")
regions <- .cov_process_regions(file, regions, seq, "-", "+")
regions <- .cov_process_regions(file, regions, seq, "+", "-")
}
}
return(regions)
}
#' @describeIn Coverage Retrieves coverage of a single region from a COV file,
#' binned by the given number of bins or bin_size
#' @export
getCoverageBins <- function(file, region, bins = 2000,
strandMode = c("unstranded", "forward", "reverse"),
bin_size
) {
strandMode <- match.arg(strandMode)
if (strandMode == "") strandMode <- "unstranded"
if (!is(region, "GRanges")) .log("region must be a GRanges object")
region <- region[1]
if (!isCOV(file)) .log("Given file is not of COV format")
seqlevels <- c_Cov_Seqnames(normalizePath(file))
if(!(as.character(seqnames(region)) %in% seqlevels))
.log("Given region is on a chromosome that is missing in COV file")
if(!is.numeric(bins) || bins < 1) .log("`bins` must be a numeric value")
if(missing(bin_size) || !is.numeric(bin_size) ||
bin_size > width(region) || bin_size < 1) {
bin_size <- ceiling(width(region) / bins)
}
if (strandMode == "unstranded") {
strand <- "*"
} else if (strandMode == "reverse") {
if(strand(region) == "+") {
strand <- "-"
} else if(strand(region == "-")) {
strand <- "+"
} else {
strand <- "*"
}
} else {
strand <- as.character(strand(region))
}
df <- as.data.frame(.internal_get_coverage_as_df(
"sample", file,
as.character(seqnames(region)),
start(region), end(region), strand)
)
bin_gr <- .bin_ranges(df$x, binwidth = bin_size)
bin_gr$seqnames <- as.character(GenomeInfoDb::seqnames(region))
bin_gr$strand <- strand
gr.fetch <- .grDT(bin_gr)
df <- .bin_df(df, bin_size)
gr.fetch$cov_mean <- df$sample
return(gr.fetch)
}
################################# PLOT TRACKS INTERNALS ########################
.internal_get_coverage_as_df <- function(samples, files, seqname, start, end,
strand = c("*", "+", "-")) {
strand <- match.arg(strand)
if (!(strand %in% c("*", "+", "-"))) {
.log(paste("In getCoverage(),",
"Invalid strand. '*', '+' or '-'"))
}
covData <- list()
for (i in seq_len(length(files))) {
cov <- getCoverage(files[i], seqname, start - 1, end, strand)
view <- IRanges::Views(cov, start, end)
view.df <- as.data.frame(view[[1]])
covData[[i]] <- view.df
}
df <- do.call(cbind, covData)
colnames(df) <- samples
x <- seq(start, end)
df <- cbind(x, df)
return(df)
}
.bin_df <- function(df, binwidth = 3, exon_gr = NULL) {
DT <- as.data.table(df)
brks <- seq(1,
nrow(DT) + 1,
length.out = (nrow(DT) + 1) / binwidth
)
# Use single nucleotide resolution for exon bins
if(!is.null(exon_gr)) {
for(i in seq_len(length(exon_gr))) {
brks <- c(brks, which(
DT$x >= BiocGenerics::start(exon_gr[i]) &
DT$x <= BiocGenerics::end(exon_gr[i])
))
}
}
brks <- sort(unique(brks))
bin <- NULL
DT[, c("bin") := findInterval(seq_len(nrow(DT)), brks)]
DT2 <- DT[, lapply(.SD, mean, na.rm = TRUE), by = "bin"]
DT2[, c("bin") := NULL]
return(as.data.frame(DT2))
}
.bin_ranges <- function(coords, binwidth = 3, exon_gr = NULL) {
# coords is a vector of coordinates
brks <- seq(1,
length(coords) + 1,
length.out = (length(coords) + 1) / binwidth
)
if(!is.null(exon_gr)) {
for(i in seq_len(length(exon_gr))) {
brks <- c(brks, which(
coords >= BiocGenerics::start(exon_gr[i]) &
coords <= BiocGenerics::end(exon_gr[i])
))
}
}
brks <- sort(unique(brks))
starts <- ceiling(brks)[-length(brks)]
ends <- c(starts[-1] - 1, length(coords))
return(data.frame(
start = coords[starts],
end = coords[ends]
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.