#' @rdname getGRegionsStat2
#' @title Statistic of Genomic Regions
#' @description A function to estimate the summarized measures of a specified
#' variable given in a GRanges object (a column from the metacolums of the
#' GRanges object) after split the GRanges object into intervals.
#' @details This function split a Grange object into intervals genomic regions
#' (GRs) of fixed size A summarized statistic (mean, median, geometric mean
#' or sum) is calculated for the specified variable values from each region.
#' Notice that if win.size == step.size, then non-overlapping windows are
#' obtained.
#' @param GR A \code{\link[GenomicRanges]{GRanges-class}} object or a
#' \code{\link[GenomicRanges]{GRangesList-class}} object carrying the
#' variables of interest in the GRanges metacolumn(s).
#' @param win.size An integer for the size of the windows/regions size of the
#' intervals of genomics regions.
#' @param step.size Interval at which the regions/windows must be defined
#' @param grfeatures A GRanges object corresponding to an annotated genomic
#' feature. For example, gene region, transposable elements, exons,
#' intergenic region, etc. If provided, then parameters 'win.size' and
#' step.size are ignored and the statistics are estimated for 'grfeatures'.
#' @param stat Statistic used to estimate the summarized value of the variable
#' of interest in each interval/window. Posible options are:
#'
#' \describe{
#' \item{\strong{'mean':}}{The mean of values inside each region.}
#' \item{\strong{'gmean':}}{The geometric mean of values inside each region.}
#' \item{\strong{'median':}}{The median of values inside each region.}
#' \item{\strong{'density':}}{The density of values inside each region. That
#' is, the sum of values found in each region divided by the width of
#' the region.}
#' \item{\strong{'count':}}{Compute the number/count of positions with values
#' greater than zero inside each regions.}
#' \item{\strong{'denCount':}}{The number of sites with value > 0 inside each
#' region divided by the width of the region.}
#' \item{\strong{'sum':}}{The sum of values inside each region.}
#' }
#'
#' If \strong{GR} have zero metacolum, then it is set \emph{stat = "count"} and
#' all the sites are included in the computation.
#' @param column Integer number denoting the column where the variable of
#' interest is located in the metacolumn of the GRanges object. Default is 1L if
#' the number of columns is greater than 1, otherwise NULL.
#' @param absolute Optional. Logic (default: FALSE). Whether to use the absolute
#' values of the variable provided. For example, the difference of methylation
#' levels could take negative values (TV) and we would be interested on the sum
#' of abs(TV), which is sum of the total variation distance.
#' @param select.strand Optional. If provided,'+' or '-', then the summarized
#' statistic is computed only for the specified DNA chain.
#' @param maxgap,minoverlap,type See
#' \code{\link[IRanges]{findOverlaps-methods}} in the
#' \strong{IRanges} package for a description of these arguments.
#' @param ignore.strand When set to TRUE, the strand information is ignored in
#' the overlap calculations.
#' @param scaling integer (default 1). Scaling factor to be used when
#' stat = 'density'. For example, if scaling = 1000, then density * scaling
#' denotes the sum of values in 1000 bp.
#' @param logbase A positive number: the base with respect to which logarithms
#' are computed when parameter 'entropy = TRUE' (default: logbase = 2).
#' @param missings Whether to write '0' or 'NA' on regions where there is not
#' data to compute the statistic.
#' @param naming Logical value. If TRUE, the rows GRanges object will be
#' given the names(grfeatures). Default is FALSE.
#' @param na.rm Logical value. If TRUE, the NA values will be removed.
#' @param verbose Logical. Default is TRUE. If TRUE, then the progress of the
#' computational tasks is given.
#' @param num.cores,tasks Parameters for parallel computation using package
#' \code{\link[BiocParallel]{BiocParallel-package}}: the number of cores to use,
#' i.e. at most how many child processes will be run simultaneously (see
#' \code{\link[BiocParallel]{bplapply}} and the number of tasks per job (only
#' for Linux OS). Only used when signal is a list of
#' \code{\link[GenomicRanges]{GRanges-class}} object.
#' @return A \code{\link[GenomicRanges]{GRanges-class}} object or a
#' \code{\link[GenomicRanges]{GRangesList-class}} object with the new genomic
#' regions and their corresponding summarized statistic.
#' @examples
#' library(GenomicRanges)
#' set.seed(1)
#' gr <- GRanges(seqnames = Rle( c('chr1', 'chr2', 'chr3', 'chr4'),
#' c(5, 5, 5, 5)),
#' ranges = IRanges(start = 1:20, end = 1:20),
#' strand = rep(c('+', '-'), 10),
#' A = seq(1, 0, length = 20))
#' gr$B <- runif(20)
#' grs <- getGRegionsStat2(gr, win.size = 4, step.size = 4)
#' grs
#'
#' ## Selecting the positive strand
#' grs <- getGRegionsStat2(gr, win.size = 4,
#' step.size = 4, select.strand = '+')
#' grs
#'
#' ## Selecting the negative strand
#' grs <- getGRegionsStat2(gr, win.size = 4,
#' step.size = 4, select.strand = '-')
#' grs
#'
#' @importFrom GenomeInfoDb seqnames seqlengths seqlevels
#' @importFrom GenomicRanges GRanges findOverlaps
#' @importFrom IRanges IRanges width
#' @importFrom stats median
#' @importFrom dplyr group_by summarise_all '%>%'
#' @importFrom S4Vectors subjectHits queryHits DataFrame mcols
#' @importFrom S4Vectors mcols<-
#' @importFrom BiocGenerics strand start end
#' @importFrom MethylIT uniqueGRanges sortBySeqnameAndStart unlist
#' @importFrom utils txtProgressBar
#' @seealso \code{\link{getGRegionsStat}}
#' @author Robersy Sanchez (\url{https://github.com/genomaths}).
#' @export
#' @aliases getGRegionsStat2
setGeneric("getGRegionsStat2",
function(GR,
win.size = 1,
step.size = 1,
grfeatures = NULL,
stat = c("sum", "mean", "gmean", "median",
"density", "count", "denCount"),
column = NULL,
absolute = FALSE,
select.strand = NULL,
maxgap = -1L,
minoverlap = 0L,
select = "all",
ignore.strand = TRUE,
type = c("within", "start", "end", "equal", "any"),
scaling = 1000L,
logbase = 2,
missings = 0,
naming = FALSE,
na.rm = TRUE,
num.cores = 1L,
tasks = 0,
verbose = TRUE,
...) {
## These NULL quiet: no visible binding for global variable 'x2'
if (!inherits(GR, "GRanges"))
stop("GR object must inherits from a GRanges class")
if (!is.null(grfeatures) && !inherits(grfeatures, "GRanges")) {
stop("* 'grfeatures', if provided, must be a GRanges object")
}
stat <- match.arg( stat,
c("sum", "mean", "gmean", "median",
"density", "count", "denCount"))
if (!is.element(missings, c(0, NA)))
missings <- NA
type <- match.arg(type)
m <- ncol(mcols(GR))
if (!is.null(column))
GR <- GR[, column]
else
if (m > 0)
GR <- GR[, 1L]
## === Some functions to use ===
stats <- function(x, stat = c(), absolute, na.rm) {
if (absolute)
x <- abs(x)
x <- switch(
stat,
count = sum(x != 0, na.rm = na.rm),
sum = sum(x, na.rm = na.rm),
mean = mean(x, na.rm = na.rm),
gmean = exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x)),
median = median(x, na.rm = na.rm),
density = sum(x, na.rm = na.rm),
denCount = sum(x != 0, na.rm = na.rm))
}
fn <- function(x)
stats(x, stat = stat, absolute = absolute, na.rm = na.rm)
## =============================== ##
if (!is.null(select.strand)) {
## possible values '-', '+', NULL
if (!is.element(select.strand, unique(strand(GR)))) {
stop("* The GRanges object does not have strand named ",
"'", select.strand, "'")
}
idx <- which(as.character(strand(GR)) == select.strand)
GR <- GR[idx]
ignore.strand <- FALSE
}
## Progress bar
if (verbose) {
# setup progress bar
pb <- txtProgressBar(max = 100, style = 3)
on.exit(close(pb)) # on exit, close progress bar
}
## === If genomic features are not specified ===
if (is.null(grfeatures)) {
if (verbose)
setTxtProgressBar(pb, 1) # update progress bar
all.wins <- slidingGRanges(GR = GR, win.size = win.size,
step.size = step.size)
## sites of interest inside of the windows
Hits <- findOverlaps(GR, all.wins, maxgap = maxgap,
minoverlap = minoverlap,
select = select,
ignore.strand = ignore.strand,
type = type)
if (length(Hits) > 0) {
m <- ncol(mcols(GR))
all.wins <- all.wins[ subjectHits(Hits) ]
GR <- GR[queryHits(Hits)]
strand(all.wins) <- strand(GR)
if (m > 0) {
mcols(all.wins) <- mcols(GR)
} else {
mcols(all.wins) <- 1
stat <- "count"
}
chr <- seqnames(all.wins)
if (!ignore.strand)
strands <- strand(all.wins)
else strands <- rep("*", length(all.wins))
## Variable to mark the limits of each GR
all.wins$cluster.id <- paste(
chr,
start(all.wins),
end(all.wins),
strands, sep = "_")
GR <- all.wins
rm(all.wins); gc()
GR <- as.data.frame(GR)
GR <- GR[, -c(1:5)]
if (verbose)
setTxtProgressBar(pb, 25) # update progress bar
GR <- GR %>% group_by(cluster.id) %>% summarise_all(list(fn))
if (verbose)
setTxtProgressBar(pb, 75) # update progress bar
strands <- matrix(unlist(strsplit(GR$cluster.id, "_")),
ncol = 4, byrow = TRUE)
GR <- data.frame(GR[, -1], chr = strands[, 1],
start = as.numeric(strands[, 2]),
end = as.numeric(strands[, 3]),
strand = as.character(strands[, 4]))
GR <- makeGRangesFromDataFrame(GR, keep.extra.columns = TRUE)
if (stat == "density" || stat == "denCount") {
widths <- width(GR)
mcols(GR) <- (scaling * as.matrix(mcols(GR))/widths)
}
colnames(mcols(GR)) <- stat
} else {
m <- ncol(mcols(GR))
if (m > 1) {
mcols(all.wins) <- matrix(
missings,
nrow = length(all.wins),
ncol = m)
colnames(mcols(all.wins)) <- colnames(mcols(GR))
} else
mcols(all.wins) <- missings
GR <- all.wins
warnings("There is not overlap between the 'GR' & the regions")
}
} else {
## sites of interest inside of the windows
if (verbose)
setTxtProgressBar(pb, 1) # update progress bar
Hits <- findOverlaps(
GR,
grfeatures,
maxgap = maxgap,
select = select,
minoverlap = minoverlap,
ignore.strand = ignore.strand,
type = type)
if (length(Hits) > 0) {
m <- ncol(mcols(GR))
grfeatures <- grfeatures[subjectHits(Hits)]
GR <- GR[ queryHits(Hits) ]
if (m > 0) {
mcols(grfeatures) <- matrix(missings,
nrow = length(grfeatures),
ncol = m)
mcols(grfeatures) <- mcols(GR)
} else {
mcols(grfeatures) <- 1
stat <- "count"
}
chr <- seqnames(grfeatures)
if (!ignore.strand) strands <- strand(grfeatures)
else strands <- rep("*", length(grfeatures))
if (class(names(grfeatures)) == "character" && naming)
grfeatures$cluster.id <- paste(chr, start(grfeatures),
end(grfeatures),
strands,
names(grfeatures), sep = "_")
else grfeatures$cluster.id <- paste(chr, start(grfeatures),
end(grfeatures), strands,
sep = "_")
GR <- grfeatures
rm(grfeatures)
gc()
GR <- as.data.frame(GR)
GR <- GR[, -c(1:5)]
if (verbose) setTxtProgressBar(pb, 25) # update progress bar
GR <- GR %>% group_by(cluster.id) %>% summarise_all(list(fn))
if (verbose)
setTxtProgressBar(pb, 75) # update progress bar
if (naming)
strands <- matrix(unlist(strsplit(GR$cluster.id, "_")),
ncol = 5, byrow = TRUE)
else strands <- matrix(unlist(strsplit(GR$cluster.id, "_")),
ncol = 4, byrow = TRUE)
GR <- data.frame(GR[, -1], chr = strands[, 1],
start = as.numeric(strands[, 2]),
end = as.numeric(strands[, 3]),
strand = as.character(strands[, 4]))
GR <- makeGRangesFromDataFrame(GR, keep.extra.columns = TRUE)
if (naming) names(GR) <- strands[, 5]
if (stat == "density") {
widths <- width(GR)
mcols(GR) <- (scaling * as.matrix(mcols(GR))/widths)
}
colnames(mcols(GR)) <- stat
} else {
m <- ncol(mcols(GR))
if (m > 1) {
mcols(grfeatures) <- matrix(
missings,
nrow = length(grfeatures),
ncol = m)
} else
mcols(grfeatures) <- missings
colnames(mcols(grfeatures)) <- colnames(mcols(GR))
GR <- grfeatures
warnings("There is not overlap between the 'GR' & the regions")
}
}
GR <- sortBySeqnameAndStart(GR)
if (verbose)
setTxtProgressBar(pb, 100) # update progress bar
return(GR)
}
)
#' @aliases getGRegionsStat2
#' @rdname getGRegionsStat2
#' @export
setMethod("getGRegionsStat2", signature(GR = "pDMP"),
function(GR,
win.size = 1,
step.size = 1,
grfeatures = NULL,
stat = c("sum", "mean", "gmean", "median",
"density", "count", "denCount"),
column = NULL,
absolute = FALSE,
select.strand = NULL,
maxgap = -1L,
minoverlap = 0L,
select = "all",
ignore.strand = TRUE,
type = c("within", "start", "end", "equal", "any"),
scaling = 1000L,
logbase = 2,
missings = 0,
naming = FALSE,
na.rm = TRUE,
num.cores = 1L,
tasks = 0,
verbose = TRUE,
...) {
if (verbose)
progressbar <- TRUE else progressbar <- FALSE
if (Sys.info()["sysname"] == "Linux") {
bpparam <- MulticoreParam(workers = num.cores, tasks = tasks,
progressbar = progressbar)
} else {
bpparam <- SnowParam(workers = num.cores, type = "SOCK",
progressbar = progressbar)
}
GR <- bplapply(GR, getGRegionsStat2,
win.size = win.size,
step.size = step.size,
grfeatures = grfeatures,
stat = stat,
column = column,
absolute = absolute,
select.strand = select.strand,
maxgap = maxgap,
minoverlap = minoverlap,
select = select,
ignore.strand = ignore.strand,
type = type,
scaling = scaling,
logbase = logbase,
missings = missings,
naming = naming,
na.rm = na.rm,
verbose = FALSE,
...,
BPPARAM = bpparam)
return(GR)
}
)
#' @aliases getGRegionsStat2
#' @rdname getGRegionsStat2
#' @export
setMethod("getGRegionsStat2", signature(GR = "InfDiv"),
function(GR,
win.size = 1,
step.size = 1,
grfeatures = NULL,
stat = c("sum", "mean", "gmean", "median",
"density", "count", "denCount"),
column = NULL,
absolute = FALSE,
select.strand = NULL,
maxgap = -1L,
minoverlap = 0L,
select = "all",
ignore.strand = TRUE,
type = c("within", "start", "end", "equal", "any"),
scaling = 1000L,
logbase = 2,
missings = 0,
naming = FALSE,
na.rm = TRUE,
num.cores = 1L,
tasks = 0,
verbose = TRUE,
...) {
if (verbose)
progressbar <- TRUE else progressbar <- FALSE
if (Sys.info()["sysname"] == "Linux") {
bpparam <- MulticoreParam(workers = num.cores, tasks = tasks,
progressbar = progressbar)
} else {
bpparam <- SnowParam(workers = num.cores, type = "SOCK",
progressbar = progressbar)
}
GR <- bplapply(GR, getGRegionsStat2,
win.size = win.size,
step.size = step.size,
grfeatures = grfeatures,
stat = stat,
column = column,
absolute = absolute,
select.strand = select.strand,
maxgap = maxgap,
minoverlap = minoverlap,
select = select,
ignore.strand = ignore.strand,
type = type,
scaling = scaling,
logbase = logbase,
missings = missings,
naming = naming,
na.rm = na.rm,
verbose = FALSE,
...,
BPPARAM = bpparam)
return(GR)
}
)
#' @aliases getGRegionsStat2
#' @rdname getGRegionsStat2
#' @export
setMethod("getGRegionsStat2", signature(GR = "list"),
function(GR,
win.size = 1,
step.size = 1,
grfeatures = NULL,
stat = c("sum", "mean", "gmean", "median",
"density", "count", "denCount"),
column = NULL,
absolute = FALSE,
select.strand = NULL,
maxgap = -1L,
minoverlap = 0L,
select = "all",
ignore.strand = TRUE,
type = c("within", "start", "end", "equal", "any"),
scaling = 1000L,
logbase = 2,
missings = 0,
naming = FALSE,
na.rm = TRUE,
num.cores = 1L,
tasks = 0,
verbose = TRUE,
...) {
if (verbose)
progressbar <- TRUE else progressbar <- FALSE
if (Sys.info()["sysname"] == "Linux") {
bpparam <- MulticoreParam(workers = num.cores, tasks = tasks,
progressbar = progressbar)
} else {
bpparam <- SnowParam(workers = num.cores, type = "SOCK",
progressbar = progressbar)
}
GR <- bplapply(GR, getGRegionsStat2,
win.size = win.size,
step.size = step.size,
grfeatures = grfeatures,
stat = stat,
column = column,
absolute = absolute,
select.strand = select.strand,
maxgap = maxgap,
minoverlap = minoverlap,
select = select,
ignore.strand = ignore.strand,
type = type,
scaling = scaling,
logbase = logbase,
missings = missings,
naming = naming,
na.rm = na.rm,
verbose = FALSE,
...,
BPPARAM = bpparam)
return(GR)
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.