Nothing
#' Identify regions data by a coverage filter and get a count matrix
#'
#' Given a set of un-filtered coverage data (see [fullCoverage]), create
#' candidate regions by applying a cutoff on the coverage values,
#' and obtain a count matrix where the number of rows corresponds to the number
#' of candidate regions and the number of columns corresponds to the number of
#' samples. The values are the mean coverage for a given sample for a given
#' region.
#'
#' @param fullCov A list where each element is the result from
#' [loadCoverage] used with `returnCoverage = TRUE`. Can be generated
#' using [fullCoverage]. If `runFilter = FALSE`, then
#' `returnMean = TRUE` must have been used.
#' @inheritParams filterData
#' @param L The width of the reads used. Either a vector of length 1 or length
#' equal to the number of samples.
#' @param totalMapped A vector with the total number of reads mapped for each
#' sample. The vector should be in the same order as the samples in
#' `fullCov`. Providing this argument adjusts the coverage to reads in
#' `targetSize` library prior to filtering. See [getTotalMapped] for
#' calculating this vector.
#' @param targetSize The target library size to adjust the coverage to. Used
#' only when `totalMapped` is specified. By default, it adjusts to
#' libraries with 80 million reads.
#' @param runFilter This controls whether to run [filterData] or not. If
#' set to `FALSE` then `returnMean = TRUE` must have been used to
#' create each element of `fullCov` and the data must have been
#' normalized (`totalMapped` equal to `targetSize`).
#' @param returnBP If `TRUE`, returns `$bpCoverage` explained below.
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{ If `TRUE` basic status updates will be printed along
#' the way.}
#' \item{chrsStyle }{ Default: `UCSC`. Passed to
#' [extendedMapSeqlevels] via [getRegionCoverage].}
#' \item{species }{ Default: `homo_sapiens`. Passed to
#' [extendedMapSeqlevels] via [getRegionCoverage].}
#' \item{currentStyle }{ Default: `NULL`. Passed to
#' [extendedMapSeqlevels] via [getRegionCoverage].}
#' }
#' Passed to [filterData], [findRegions] and [define_cluster].
#'
#' Note that [filterData] is used internally
#' by [loadCoverage] (and hence `fullCoverage`) and has the important
#' arguments `totalMapped` and `targetSize` which can be used to
#' normalize the coverage by library size. If you already used these arguments
#' when creating the `fullCov` object, then don't specify them a second
#' time in [regionMatrix]. If you have not used these arguments, we
#' recommend using them to normalize the mean coverage.
#'
#' @return A list with one entry per chromosome. Then per chromosome, a list
#' with three components.
#' \describe{
#' \item{regions }{ A set of regions based on the coverage filter cutoff as
#' returned by [findRegions].}
#' \item{bpCoverage }{ A list with one element per region. Each element is a matrix with numbers of rows equal to the number of base pairs in the region and number of columns equal to the number of samples. It contains the base-level coverage information for the regions. Only returned when `returnBP = TRUE`.}
#' \item{coverageMatrix }{ A matrix with the mean coverage by sample for each
#' candidate region.}
#' }
#'
#' @details This function uses several other [derfinder-package]
#' functions. Inspect the code if interested.
#'
#' You should use at most one core per chromosome.
#'
#' @author Leonardo Collado-Torres
#' @export
#'
#' @importMethodsFrom IRanges nrow
#' @importFrom BiocParallel bpmapply
#' @importFrom GenomeInfoDb 'seqlengths<-'
#'
#' @examples
#' ## Create some toy data
#' library("IRanges")
#' x <- Rle(round(runif(1e4, max = 10)))
#' y <- Rle(round(runif(1e4, max = 10)))
#' z <- Rle(round(runif(1e4, max = 10)))
#' fullCov <- list("chr21" = DataFrame(x, y, z))
#'
#' ## Calculate a proxy of library size
#' libSize <- sapply(fullCov$chr21, sum)
#'
#' ## Run region matrix normalizing the coverage
#' regionMat <- regionMatrix(
#' fullCov = fullCov, maxRegionGap = 10L,
#' maxClusterGap = 300L, L = 36, totalMapped = libSize, targetSize = 4e4
#' )
#' \dontrun{
#' ## You can alternatively use filterData() on fullCov to reduce the required
#' ## memory before using regionMatrix(). This can be useful when mc.cores > 1
#' filteredCov <- lapply(fullCov, filterData,
#' returnMean = TRUE, filter = "mean",
#' cutoff = 5, totalMapped = libSize, targetSize = 4e4
#' )
#' regionMat2 <- regionMatrix(filteredCov,
#' maxRegionGap = 10L,
#' maxClusterGap = 300L, L = 36, runFilter = FALSE
#' )
#' }
#'
#' ## regionMatrix() can work with multiple chrs as shown below.
#' fullCov2 <- list("chr21" = DataFrame(x, y, z), "chr22" = DataFrame(x, y, z))
#' regionMat2 <- regionMatrix(
#' fullCov = fullCov2, maxRegionGap = 10L,
#' maxClusterGap = 300L, L = 36, totalMapped = libSize, targetSize = 4e4
#' )
#'
#' ## Combine results from multiple chromosomes
#' library("GenomicRanges")
#'
#' ## First extract the data
#' regs <- unlist(GRangesList(lapply(regionMat2, "[[", "regions")))
#' covMat <- do.call(rbind, lapply(regionMat2, "[[", "coverageMatrix"))
#' covBp <- do.call(c, lapply(regionMat2, "[[", "bpCoverage"))
#' ## Force the names to match
#' names(regs) <- rownames(covMat) <- names(covBp) <- seq_len(length(regs))
#' ## Combine into a list (not really needed)
#' mergedRegionMat <- list(
#' "regions" = regs, "coverageMatrix" = covMat,
#' "bpCoverage" = covBp
#' )
regionMatrix <- function(fullCov, cutoff = 5, L, totalMapped = 80e6,
targetSize = 80e6, runFilter = TRUE, returnBP = TRUE, ...) {
## Have to filter by something
stopifnot(!is.null(cutoff))
## fullCov has to be named
stopifnot(!is.null(names(fullCov)))
verbose <- .advanced_argument("verbose", TRUE, ...)
if (!runFilter) {
if (totalMapped != targetSize) {
stop("When using 'runFilter' = FALSE the arguments 'totalMapped' and 'targetSize' are not used. If you have not normalized the data, please do so before running regionMatrix(runFilter = FALSE).")
}
} else {
if (all(totalMapped == targetSize)) {
if (verbose) message("By using totalMapped equal to targetSize, regionMatrix() assumes that you have normalized the data already in fullCoverage(), loadCoverage() or filterData().")
}
}
## If filtering has been run, check that the information is there
if (!runFilter) {
if (!all(sapply(fullCov, function(x) {
all(c("coverage", "position", "meanCoverage") %in% names(x))
}))) {
stop("When 'runFilter' = FALSE, all the elements of 'fullCov' are expected to be the output from filterData(..., returnMean=TRUE) with some non-NULL 'cutoff'.")
}
}
## Define args
moreArgs <- list(
cutoff = cutoff, L = L, totalMapped = totalMapped,
targetSize = targetSize, runFilter = runFilter,
returnBP = returnBP, ...
)
## Define cluster
BPPARAM <- define_cluster(...)
## Get regions per chr
bpmapply(.regionMatrixByChr, fullCov, names(fullCov),
MoreArgs = moreArgs,
SIMPLIFY = FALSE, BPPARAM = BPPARAM
)
}
.regionMatrixByChr <- function(covInfo, chr, cutoff, L, totalMapped,
targetSize, runFilter, returnBP, ...) {
verbose <- .advanced_argument("verbose", TRUE, ...)
chrsStyle <- .advanced_argument("chrsStyle", getOption(
"chrsStyle",
"UCSC"
), ...)
species <- .advanced_argument("species", getOption(
"species",
"homo_sapiens"
), ...)
currentStyle <- .advanced_argument("currentStyle", NULL, ...)
if (verbose) {
message(paste(Sys.time(), "regionMatrix: processing", chr))
}
## Filter by 'one' or 'mean' and get mean coverage
if (runFilter) {
if (all(c("coverage", "position") %in% names(covInfo))) {
filt <- filterData(
data = covInfo$coverage, cutoff = cutoff,
returnMean = TRUE, returnCoverage = TRUE,
index = covInfo$position, filter = "mean",
totalMapped = totalMapped, targetSize = targetSize, ...
)
} else {
filt <- filterData(
data = covInfo, cutoff = cutoff,
returnMean = TRUE, returnCoverage = TRUE, filter = "mean",
totalMapped = totalMapped, targetSize = targetSize, ...
)
}
# if(is.null(filt$position)) filt$position <- Rle(TRUE, length(filt$meanCoverage))
## Identify regions
regs <- findRegions(
position = filt$position,
fstats = filt$meanCoverage, chr = chr, cutoff = cutoff, ...
)
## Prepare for getRegionCoverage
fullCovTmp <- list(filt)
seqlengths <- length(filt$position)
} else {
## Identify regions
regs <- findRegions(
position = covInfo$position,
fstats = covInfo$meanCoverage, chr = chr, cutoff = cutoff, ...
)
## Prepare for getRegionCoverage
fullCovTmp <- list(covInfo)
seqlengths <- length(covInfo$position)
}
## If there are no regions, return NULL
if (is.null(regs)) {
if (returnBP) {
return(list(
regions = GRanges(), coverageMatrix = NULL,
bpCoverage = NULL
))
} else {
return(list(regions = GRanges(), coverageMatrix = NULL))
}
}
## Format appropriately
names(regs) <- seq_len(length(regs))
## Set the length
names(seqlengths) <- chr
seqlengths(regs) <- seqlengths
## Prepare for getRegionCoverage
names(fullCovTmp) <- chr
## Get region coverage
regionCov <- getRegionCoverage(
fullCov = fullCovTmp, regions = regs,
totalMapped = NULL, targetSize = 0, verbose = verbose,
chrsStyle = chrsStyle, mc.cores = 1L, species = species,
currentStyle = currentStyle
)
if (verbose) message(paste(Sys.time(), "regionMatrix: calculating coverageMatrix"))
covMat <- lapply(regionCov, colSums)
covMat <- do.call(rbind, covMat)
if (verbose) message(paste(Sys.time(), "regionMatrix: adjusting coverageMatrix for 'L'"))
if (length(L) == 1) {
covMat <- covMat / L
} else if (length(L) == ncol(covMat)) {
for (i in length(L)) covMat[, i] <- covMat[, i] / L[i]
} else {
warning("Invalid 'L' value so it won't be used. It has to either be a integer/numeric vector of length 1 or length equal to the number of samples.")
}
## Finish
if (returnBP) {
res <- list(
regions = regs, coverageMatrix = covMat,
bpCoverage = regionCov
)
} else {
res <- list(regions = regs, coverageMatrix = covMat)
}
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.