# Generic -----------------------------------------------------------------
setGeneric(
  "extract_tornado",
  signature = "data",
  function(
    data, features, bins,
    pad_value = 0, ...
  ) standardGeneric("extract_tornado")
)
# Bigwig Files ------------------------------------------------------------
# Abuse magic summary method for bigwig files
#' @importClassesFrom rtracklayer BigWigFileList
#' @importFrom S4Vectors nrun
setMethod(
  "extract_tornado",
  signature = c(data = "BigWigFileList"),
  function(
    data, features, bins,
    pad_value = 0, ...
  ) {
    ans <- matrix(0, nrow = length(features), ncol = nrun(bins))
    ans <- vapply(
      data, summary, ans,
      which = features, size = nrun(bins),
      as = "matrix", type = "mean",
      defaultValue = pad_value
    )
    aperm(ans, c(2, 1, 3))
  }
)
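# Example (a minimal sketch with hypothetical file names; 'features' is assumed
# to be an equal-width GRanges, here 1000 bp windows, and 'bins' an Rle such as
# S4Vectors::Rle(1:25, 40)):
# bwfl <- rtracklayer::BigWigFileList(c(a = "sample_a.bw", b = "sample_b.bw"))
# ans  <- extract_tornado(bwfl, features, bins)
# dim(ans) # nrun(bins) x length(features) x length(bwfl), given the aperm above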
# Convert BigWigFile to BigWigFileList
#' @importClassesFrom rtracklayer BigWigFile
#' @importFrom rtracklayer BigWigFileList
setMethod(
  "extract_tornado",
  signature = c(data = "BigWigFile"),
  function(
    data, features, bins,
    pad_value = 0, ...
  ) {
    nms <- names(data)
    data <- BigWigFileList(setNames(path(data), nms))
    callGeneric(data, features, bins, pad_value, ...)
  }
)
# Try to coerce characters to BigWigFileLists
setMethod(
  "extract_tornado",
  signature = c(data = "character"),
  function(
    data, features, bins,
    pad_value = 0, ...
  ) {
    stopifnot(
      "The `data` argument doesn't appear to be a valid path to a file." =
        all(file.exists(data)),
      "The file type is unsupported." =
        all(endsWith(data, ".bw"))
    )
    data <- BigWigFileList(data)
    callGeneric(data, features, bins, pad_value, ...)
  }
)
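# Example (hypothetical paths): character input is validated and coerced, then
# re-dispatched on the BigWigFileList method above.
# extract_tornado(c("sample_a.bw", "sample_b.bw"), features, bins)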
# Genomic Ranges ----------------------------------------------------------
setOldClass("ListOfRleList")
#' @importClassesFrom GenomicRanges GRanges GNCList
#' @importFrom GenomicRanges coverage GNCList
#' @importFrom IRanges overlapsAny reduce
setMethod(
  "extract_tornado",
  signature = c(data = "GRanges"),
  function(
    data, features, bins,
    pad_value = 0, ...
  ) {
    data <- data[overlapsAny(data, GNCList(reduce(features)))]
    data <- coverage(data)
    data <- structure(list(data), class = "ListOfRleList")
    callGeneric(data, features, bins, pad_value, ...)
  }
)
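# Example (a minimal sketch with made-up ranges): coverage() turns the ranges
# into one Rle per seqlevel, which is what the "ListOfRleList" method consumes.
# gr <- GenomicRanges::GRanges("chr1", IRanges::IRanges(c(1, 5), c(10, 20)))
# GenomicRanges::coverage(gr) # RleList: chr1 has coverage runs of 1, 2 and 1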
#' @importClassesFrom GenomicRanges GRangesList
#' @importFrom IRanges stack
setMethod(
  "extract_tornado",
  signature = c(data = "GRangesList"),
  function(
    data, features, bins,
    pad_value = 0, ...
  ) {
    data <- stack(data)
    data <- data[overlapsAny(data, GNCList(reduce(features)))]
    data <- lapply(split(data, data$name), coverage)
    data <- structure(data, class = "ListOfRleList")
    callGeneric(data, features, bins, pad_value, ...)
  }
)
# TabixFile ---------------------------------------------------------------
#' @importClassesFrom Rsamtools TabixFile
#' @importFrom Rsamtools headerTabix
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom utils read.table
setMethod(
  "extract_tornado",
  signature = c(data = "TabixFile"),
  function(
    data, features, bins,
    pad_value = 0,
    barcode_column = NULL,
    barcode_groups = NULL,
    ...
  ) {
    if (is.null(barcode_column) || is.null(barcode_groups)) {
      barcode_column <- barcode_groups <- NULL
    }
    if (!is.null(barcode_groups)) {
      groups <- names(barcode_groups) %||% seq_along(barcode_groups)
      groups <- rep.int(groups, lengths(barcode_groups))
      barcodes <- unlist(barcode_groups, use.names = FALSE)
    } else {
      barcodes <- NULL
    }
    # Setup column classes from tabix header
    info <- headerTabix(data)$indexColumns
    colclasses <- rep(NA, max(info, barcode_column))
    colclasses[info] <- c("character", "integer", "integer")
    colclasses[barcode_column] <- "character"
    # Read in Tabix data
    data <- unlist(
      Rsamtools::scanTabix(data, param = features),
      use.names = FALSE
    )
    data <- read.table(
      text = data, sep = "\t", quote = "", comment.char = "",
      colClasses = colclasses, check.names = FALSE, encoding = "UTF-8",
      as.is = TRUE
    )
    # Generate index for splitting
    if (!is.null(barcodes)) {
      idx <- groups[match(data[[barcode_column]], barcodes)]
    } else {
      idx <- rep(1L, nrow(data))
    }
    data <- GRanges(
      data[[info[["seq"]]]],
      IRanges(
        data[[info[["start"]]]],
        data[[info[["end"]]]]
      )
    )
    data <- structure(
      lapply(split(data, idx), coverage),
      class = "ListOfRleList"
    )
    callGeneric(data, features, bins, pad_value, ...)
  }
)
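# Example (a hypothetical single-cell fragment file): 'barcode_column' is the
# 1-based column holding cell barcodes and 'barcode_groups' a (preferably named)
# list of barcode vectors; one coverage track is computed per group.
# tbx <- Rsamtools::TabixFile("fragments.tsv.gz")
# extract_tornado(
#   tbx, features, bins,
#   barcode_column = 4,
#   barcode_groups = list(ctrl = c("AAACGG-1", "TTTGCC-1"), treat = "GGCCTA-1")
# )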
# Rle lists ---------------------------------------------------------------
# Essentially does all the non-bigwig file operations
setMethod(
  "extract_tornado",
  signature = c(data = "ListOfRleList"),
  function(
    data, features, bins,
    pad_value = 0, ...
  ) {
    data <- pad_coverage(data, features, pad_value)
    # Calculate indices
    feat_len <- length(features)
    index <- rep(end(bins), feat_len)
    index <- index + rep(0:(feat_len - 1) * length(bins), each = nrun(bins))
    # Summarise coverage
    template <- matrix(0, nrow = nrun(bins), ncol = length(features))
    ans <- vapply(
      data, summarise_core, template,
      feats = features, index = index, dim = dim(template)
    )
    ans / runLength(bins)[1]
  }
)
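# Worked example of the index construction above (small numbers for clarity,
# assuming two equal-width features):
# bins <- S4Vectors::Rle(1:3, 2)       # three bins of width 2, length 6
# rep(end(bins), 2)                    # c(2, 4, 6, 2, 4, 6)
# rep(0:1 * length(bins), each = 3)    # c(0, 0, 0, 6, 6, 6)
# # -> index c(2, 4, 6, 8, 10, 12): bin ends within the concatenated coverage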
# Helpers -----------------------------------------------------------------
# Old wrapper, replaced by ListOfRleList objects
# summarise_coverage <- function(coverage, feats, bins) {
#   template <- matrix(0, nrow = nrun(bins), ncol = length(feats))
#
#   # Calculate indices
#   index <- rep(end(bins), length(feats))
#   index <- index + rep(0:(length(feats) - 1) * length(bins), each = nrun(bins))
#
#   ans <- vapply(coverage, summarise_workhorse, template,
#                 feats = feats, index = index, dim = dim(template))
#
#   # Flip minus strand orientation
#   swap <- which(decode(strand(feats) == "-"))
#   ans[, swap, ] <- ans[rev(seq_len(nrun(bins))), swap, ]
#
#   ans / runLength(bins)[1]
# }
#' @importFrom GenomeInfoDb seqnames seqlevelsStyle
#' @importFrom S4Vectors decode Rle
#' @importFrom stats setNames
#' @importMethodsFrom GenomicRanges range
pad_coverage <- function(coverage, features, pad_value = 0) {
  # Get 'empirical' seqlengths from features
  features <- range(features, ignore.strand = TRUE)
  features <- setNames(end(features), decode(seqnames(features)))
  covnames <- Reduce(intersect, lapply(coverage, names))
  padme <- check_seqlevels(covnames, unique(names(features)))
  lapply(coverage, function(covr) {
    seqs <- intersect(names(covr), names(features))
    pad <- features[seqs] - lengths(covr)[seqs]
    pad <- pad[seqs]
    for (i in names(pad)[pad > 0]) {
      covr[[i]] <- c(covr[[i]], Rle(pad_value, pad[i]))
    }
    # 'padme' is an unnamed character vector of missing seqlevels
    for (i in padme) {
      covr[[i]] <- Rle(pad_value, features[i])
    }
    return(covr)
  })
}
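# Example (a minimal sketch): coverage that stops short of the last feature end
# is padded with 'pad_value' so every Rle spans the full empirical seqlength.
# covr  <- list(IRanges::RleList(chr1 = S4Vectors::Rle(1L, 50)))
# feats <- GenomicRanges::GRanges("chr1", IRanges::IRanges(1, 100))
# lengths(pad_coverage(covr, feats)[[1]]) # chr1 padded from 50 to 100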
check_seqlevels <- function(seqlevels_x, seqlevels_y) {
  if (length(intersect(seqlevels_x, seqlevels_y)) == 0) {
    style_cover <- seqlevelsStyle(seqlevels_x)[[1]]
    style_feats <- seqlevelsStyle(seqlevels_y)[[1]]
    if (style_feats != style_cover) {
      msg <- paste0(
        "The features and data have no common sequences. This might be due ",
        "to a difference in style: the features follow the '", style_feats,
        "' convention, whereas the data follow the '", style_cover,
        "' convention. Try adjusting the features with ",
        "`GenomeInfoDb::seqlevelsStyle()`."
      )
    } else {
      msg <- paste0(
        "The features and data have no common sequences, even though they ",
        "share style conventions of naming sequences."
      )
    }
    stop(msg, call. = FALSE)
  }
  dif <- setdiff(seqlevels_y, seqlevels_x)
  if (length(dif)) {
    difnames <- comma_and(dif)
    msg <- paste0(
      "The following sequence names were missing from the data but present ",
      "in the features: ", difnames, ". The data will be padded for these ",
      "sequences."
    )
    warning(msg, call. = FALSE)
  }
  dif
}
# Core function -----------------------------------------------------------
# This function exists for cases where the input is an RleList object.
#
# There is a trick in `summarise_core` inspired by the 'summed area table',
# but in just 1 dimension (https://en.wikipedia.org/wiki/Summed-area_table).
# Instead of calculating a bunch of means over bins, the cumulative sum is
# calculated, which is later split into means.
#
# Example:
# # Classic: split along bins, calculate sums, divide by lengths
# classic <- function(x, bins) {
#   unname(vapply(split(x, bins), sum, numeric(1))) / runLength(bins)[1]
# }
#
# # Only consider the ends of the cumulative sum
# trick <- function(x, bins) {
#   x <- cumsum(x)[end(bins)]
#   x[-1] <- x[-1] - x[-length(x)]
#   x / runLength(bins)[1]
# }
#
# # Test data
# x <- rnorm(1e6)
# bins <- Rle(1:1e4, 100)
#
# # Compare
# bench::mark(
#   {classic(x, bins)},
#   {trick(x, bins)}
# )
# Previous take on `summarise_core`, which expanded the runs before taking the
# cumulative sum:
# summarise_core <- function(coverage, feats, index, dim) {
#   # Select coverage at features
#   coverage <- coverage[feats]
#
#   # Calculate cumulative sums
#   coverage <- unlist(coverage, use.names = FALSE, recursive = FALSE)
#   coverage <- cumsum(rep.int(runValue(coverage), runLength(coverage)))
#
#   # Take bin ends and subtract previous bin end
#   coverage <- coverage[index]
#   coverage <- coverage - c(0, coverage[-length(coverage)])
#
#   # Reshape as matrix
#   dim(coverage) <- dim
#   coverage
# }
#' Summarise coverage over binned features
#'
#' Computes per-bin coverage sums for every feature by taking cumulative sums
#' at the bin ends.
#'
#' @param data A named `RleList` object, wherein names correspond to
#'   `seqlevels(feats)`.
#' @param feats A `GRanges` object of equal width wherein the `seqlevels()`
#'   correspond to the names in `data`.
#' @param index An `integer` vector with bin ends.
#' @param dim An `integer(2)` with the output dimensions.
#'
#' @return A `numeric` matrix with the dimensions given in `dim`: bins in rows
#'   and features in columns, containing per-bin coverage sums.
#' @noRd
summarise_core <- function(data, feats, index, dim) {
  # Select coverage at features
  data <- data[feats]
  data <- unlist(data, recursive = FALSE, use.names = FALSE)
  # Find which run each bin end falls into and how far it falls short of that
  # run's end (non-positive)
  coverend <- end(data)
  interval <- findInterval(index, coverend, left.open = TRUE) + 1
  overflow <- index - coverend[interval]
  # Cumulative sums up to each run end, corrected back to the exact bin end
  ans <- cumsum(runValue(data) * runLength(data))[interval]
  ans <- ans + overflow * runValue(data)[interval]
  # Difference consecutive cumulative sums to get per-bin sums
  ans <- ans - c(0, ans[-length(ans)])
  dim(ans) <- dim
  ans
}
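# Example (a minimal sketch): with a single 12 bp feature and three bins of
# width 4, the cumulative sums at the bin ends are differenced into bin sums.
# data  <- IRanges::RleList(chr1 = S4Vectors::Rle(c(0, 2, 1), c(4, 4, 4)))
# feats <- GenomicRanges::GRanges("chr1", IRanges::IRanges(1, 12))
# summarise_core(data, feats, index = c(4, 8, 12), dim = c(3, 1))
# #> a 3 x 1 matrix with bin sums 0, 8 and 4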