#' @title Get Profile Data surrounding specified ranges
#'
#' @description Get coverage Profile Data surrounding specified ranges
#'
#' @details
#' This will take all provided ranges and set as identical width ranges,
#' extending by the specified amount both up and downstream of the centre of the
#' provided ranges. By default, the ranges extensions are symmetrical and only
#' the upstream range needs to be specified, however this parameterisation
#' allows for non-symmetrical ranges to be generated.
#'
#' These uniform width ranges will then be used to extract the value contained
#' in the score field from one or more BigWigFiles. Uniform width ranges are
#' then broken into bins of equal width and the average score found within each
#' bin.
#'
#' The binned profiles are returned as a DataFrameList called `profile_data` as
#' a column within the resized GRanges object.
#' Column names in each DataFrame are `score`, `position` and `bp`.
#'
#' If passing a BigWigFileList, profiles will be obtained in series by
#' default. To run in parallel pass a \link[BiocParallel]{MulticoreParam} object
#' to the `BPPARAM` argument.
#'
#' @param x A BigWigFile or BigWiFileList
#' @param gr A GRanges object
#' @param upstream The distance to extend upstream from the centre of each
#' range within `gr`
#' @param downstream The distance to extend downstream from the centre of each
#' range within `gr`
#' @param bins The total number of bins to break the extended ranges into
#' @param mean_mode The method used for calculating the score for each bin.
#' See \link[EnrichedHeatmap]{normalizeToMatrix} for details
#' @param log logical(1) Should the returned values be log2-transformed
#' @param offset Value added to data if log-transforming. Ignored otherwise
#' @param n_max Upper limit on the number of ranges to return profile data for.
#' By default, no limit will be applied .
#' @param BPPARAM Passed internally to \link[BiocParallel]{bplapply}
#' @param ... Passed to \link[EnrichedHeatmap]{normalizeToMatrix}
#'
#' @return
#' GRanges or GrangesList with column profile_data, as described above
#'
#' @examples
#' bw <- system.file("tests", "test.bw", package = "rtracklayer")
#' gr <- GRanges("chr2:1000")
#' pd <- getProfileData(bw, gr, upstream = 500, bins = 10)
#' pd
#' pd$profile_data
#'
#' @import GenomicRanges
#' @importFrom rtracklayer import.bw BigWigFile BigWigFileList
#' @importFrom tibble as_tibble
#' @importFrom tidyr pivot_longer nest
#' @importFrom tidyselect all_of
#' @importFrom S4Vectors DataFrame mcols
#' @importFrom IRanges SplitDataFrameList
#' @importFrom dplyr left_join
#' @importFrom BiocParallel SerialParam bplapply bpisup bpstart bpstop
#' @importFrom GenomeInfoDb seqlevels keepSeqlevels
#' @rdname getProfileData-methods
#' @export
setMethod(
"getProfileData",
signature = signature(x = "BigWigFile", gr = "GenomicRanges"),
function(
x, gr, upstream = 2500, downstream = upstream, bins = 100,
mean_mode = "w0", log = TRUE, offset = 1, n_max = Inf, ...
) {
if (!requireNamespace('EnrichedHeatmap', quietly = TRUE))
stop("Please install 'EnrichedHeatmap' to use this function.")
stopifnot(upstream > 0 & downstream > 0 & bins > 0)
## Manage the seqinfo objects to avoid warnings & errors
bw_seqlevels <- seqlevels(x)
gr <- gr[as.character(seqnames(gr)) %in% bw_seqlevels]
stopifnot(length(gr) > 0)
bw_seqlevels <- intersect(bw_seqlevels, seqnames(gr))
gr <- keepSeqlevels(gr, bw_seqlevels)
gr <- gr[!duplicated(gr)]
n <- length(gr)
if (n > n_max) {
stopifnot(n_max > 0)
message("Downsampling to a maximum of ", n_max, " ranges")
ind <- sort(sample.int(n, n_max))
gr <- gr[ind]
}
ids <- as.character(gr)
## Set the bins & resize
bin_width <- as.integer((upstream + downstream) / bins)
gr_resize <- resize(gr, width = 1, fix = "center")
gr_resize <- promoters(gr_resize, upstream, downstream)
vals <- import.bw(x, which = gr_resize)
stopifnot("score" %in% .mcolnames(vals))
stopifnot(is.logical(log) & is.numeric(offset))
if (log) vals$score <- log2(vals$score + offset)
mat <- EnrichedHeatmap::normalizeToMatrix(
signal = vals,
target = resize(gr_resize, width = 1, fix = "center"),
extend = (upstream + downstream) / 2, w = bin_width,
mean_mode = mean_mode, value_column = "score", ...
)
mat <- as.matrix(mat)
rownames(mat) <- ids
tbl <- as_tibble(mat, rownames = "range")
tbl <- pivot_longer(
tbl, cols = all_of(colnames(mat)), names_to = "position",
values_to = "score"
)
tbl[["position"]] <- factor(tbl[["position"]], levels = colnames(mat))
tbl[["bp"]] <- seq(
-upstream + bin_width / 2, downstream - bin_width / 2,
by = bin_width
)[as.integer(tbl[["position"]])]
tbl <- nest(tbl, profile_data = all_of(c("score", "position", "bp")))
gr_tbl <- left_join(as_tibble(gr), tbl, by = "range")
gr_resize$profile_data <- SplitDataFrameList(
lapply(gr_tbl$profile_data, DataFrame), compress = TRUE
)
gr_resize
}
)
#' @rdname getProfileData-methods
#' @export
setMethod(
"getProfileData",
signature = signature(x = "BigWigFileList", gr = "GenomicRanges"),
function(
x, gr, upstream = 2500, downstream = upstream, bins = 100,
mean_mode = "w0", log = TRUE, offset = 1, BPPARAM = SerialParam(), ...
) {
## Check BiocParallel is ready to go
if (!bpisup(BPPARAM)) {
bpstart(BPPARAM)
on.exit(bpstop(BPPARAM))
}
out <- bplapply(
x,
getProfileData,
gr = gr, upstream = upstream, downstream = downstream, bins = bins,
mean_mode = mean_mode, log = log, offset = offset, ...,
BPPARAM = BPPARAM
)
as(out, "GRangesList")
}
)
#' @rdname getProfileData-methods
#' @export
setMethod(
"getProfileData",
signature = signature(x = "character", gr = "GenomicRanges"),
function(
x, gr, upstream = 2500, downstream = upstream, bins = 100,
mean_mode = "w0", log = TRUE, offset = 1, ...
) {
stopifnot(all(file.exists(x)))
if (length(x) == 1) {
x <- BigWigFile(x)
} else {
x <- BigWigFileList(x)
}
getProfileData(
x, gr, upstream, downstream, bins, mean_mode, log, offset, ...
)
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.