#' Calculate p-values and identify regions
#'
#' First, this function finds the regions of interest according to specified
#' cutoffs. Then it permutes the samples and re-calculates the F-statistics.
#' The area of the statistics from these segments are then used to calculate
#' p-values for the original regions.
#'
#' @inheritParams calculateStats
#' @param fstats A numerical Rle with the F-statistics normally generated using
#' [calculateStats].
#' @param nPermute The number of permutations. Note that for a full chromosome,
#' a small amount (10) of permutations is sufficient. If set to 0, no
#' permutations are performed and thus no null regions are used, however, the
#' `$regions` component is created.
#' @param seeds An integer vector of length `nPermute` specifying the
#' seeds to be used for each permutation. If `NULL` no seeds are used.
#' @param chr A single element character vector specifying the chromosome name.
#' This argument is passed to [findRegions].
#' @param cutoff F-statistic cutoff to use to determine segments.
#' @param significantCut A vector of length two specifiying the cutoffs used to
#' determine significance. The first element is used to determine significance
#' for the P-values, while the second element is used for the Q-values (FDR
#' adjusted P-values).
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{ If `TRUE` basic status updates will be printed along
#' the way.}
#' \item{scalefac }{ This argument is passed to
#' [fstats.apply][derfinderHelper::fstats.apply] and should be the same as the one used
#' in [preprocessCoverage]. Default: 32.}
#' \item{method }{ Has to be either 'Matrix' (default), 'Rle' or 'regular'. See
#' details in [fstats.apply][derfinderHelper::fstats.apply].}
#' \item{adjustF }{ A single value to adjust that is added in the denominator
#' of the F-stat calculation. Useful when the Residual Sum of Squares of the
#' alternative model is very small. Default: 0.}
#' \item{writeOutput }{ If `TRUE` then the regions are saved before
#' calculating q-values, and then overwritten once the q-values are written.
#' This argument was introduced to save the results from the permutations (can
#' take some time) to investigate the problem described at
#' https://support.bioconductor.org/p/62026/}
#' \item{maxRegionGap }{ Passed to internal functions of [findRegions].
#' Default: 0.}
#' }
#' Passed to [findRegions], `smoothFunction` and
#' [define_cluster].
#' @inheritParams findRegions
#'
#' @return A list with four components:
#' \describe{
#' \item{regions }{ is a GRanges with metadata columns given by
#' [findRegions] with the additional metadata column `pvalues`:
#' p-value of the region calculated via permutations of the samples;
#' `qvalues`: the qvalues calculated using [qvalue][qvalue::qvalue];
#' `significant`: whether the p-value is less than 0.05 (by default);
#' `significantQval`: whether the q-value is less than 0.10 (by default).
#' It also includes the mean coverage of the region (mean from the mean
#' coverage at each base calculated in [preprocessCoverage]). Furthermore,
#' if `groupInfo` was not `NULL` in [preprocessCoverage], then
#' the group mean coverage is calculated as well as the log 2 fold change
#' (using group 1 as the reference). }
#' \item{nullStats}{ is a numeric Rle with the mean of the null statistics by
#' segment.}
#' \item{nullWidths}{ is a numeric Rle with the length of each of the segments
#' in the null distribution. The area can be obtained by multiplying the
#' absolute `nullstats` by the corresponding lengths.}
#' \item{nullPermutation}{ is a Rle with the permutation number from which the
#' null region originated from.}
#' }
#'
#' @author Leonardo Collado-Torres
#' @seealso [findRegions], [fstats.apply][derfinderHelper::fstats.apply],
#' [qvalue][qvalue::qvalue]
#' @export
#' @importMethodsFrom IRanges quantile nrow ncol mean lapply unlist cbind
#' @importFrom IRanges Views RleList nrow
#' @import S4Vectors
#' @importFrom BiocParallel bplapply
#' @importFrom qvalue qvalue
#' @importFrom derfinderHelper fstats.apply
#'
#' @examples
#' ## Collapse the coverage information
#' collapsedFull <- collapseFullCoverage(list(genomeData$coverage),
#' verbose = TRUE
#' )
#'
#' ## Calculate library size adjustments
#' sampleDepths <- sampleDepth(collapsedFull, probs = c(0.5), verbose = TRUE)
#'
#' ## Build the models
#' group <- genomeInfo$pop
#' adjustvars <- data.frame(genomeInfo$gender)
#' models <- makeModels(sampleDepths, testvars = group, adjustvars = adjustvars)
#'
#' ## Preprocess the data
#' ## Automatic chunksize used to then compare 1 vs 4 cores in the 'do not run'
#' ## section
#' prep <- preprocessCoverage(genomeData,
#' groupInfo = group, cutoff = 0,
#' scalefac = 32, chunksize = NULL, colsubset = NULL, mc.cores = 4
#' )
#'
#' ## Get the F statistics
#' fstats <- genomeFstats
#'
#' ## We recommend determining the cutoff to use based on the F-distribution
#' ## although you could also based it on the observed F-statistics.
#'
#' ## In this example we use a low cutoff used for illustrative purposes
#' cutoff <- 1
#'
#' ## Calculate the p-values and define the regions of interest.
#' regsWithP <- calculatePvalues(prep, models, fstats,
#' nPermute = 1, seeds = 1,
#' chr = "chr21", cutoff = cutoff, mc.cores = 1, method = "regular"
#' )
#' regsWithP
#' \dontrun{
#' ## Calculate again, but with 10 permutations instead of just 1
#' regsWithP <- calculatePvalues(prep, models, fstats,
#' nPermute = 10, seeds = 1:10,
#' chr = "chr21", cutoff = cutoff, mc.cores = 2, method = "regular"
#' )
#'
#' ## Check that they are the same as the previously calculated regions
#' library(testthat)
#' expect_that(regsWithP, equals(genomeRegions))
#'
#' ## Histogram of the theoretical p-values by region
#' hist(pf(regsWithP$regions$value, df1 - df0, n - df1), main = "Distribution
#' original p-values by region", freq = FALSE)
#'
#' ## Histogram of the permutted p-values by region
#' hist(regsWithP$regions$pvalues, main = "Distribution permutted p-values by
#' region", freq = FALSE)
#'
#' ## MA style plot
#' library("ggplot2")
#' ma <- data.frame(
#' mean = regsWithP$regions$meanCoverage,
#' log2FoldChange = regsWithP$regions$log2FoldChangeYRIvsCEU
#' )
#' ggplot(ma, aes(x = log2(mean), y = log2FoldChange)) +
#' geom_point() +
#' ylab("Fold Change (log2)") +
#' xlab("Mean coverage (log2)") +
#' labs(title = "MA style plot")
#'
#' ## Annotate the results
#' library("bumphunter")
#' genes <- annotateTranscripts(TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene)
#' annotation <- matchGenes(regsWithP$regions, genes)
#' head(annotation)
#' }
#'
calculatePvalues <- function(
coveragePrep, models, fstats, nPermute = 1L,
seeds = as.integer(gsub("-", "", Sys.Date())) + seq_len(nPermute),
chr, cutoff = quantile(fstats, 0.99, na.rm = TRUE),
significantCut = c(0.05, 0.1),
lowMemDir = NULL, smooth = FALSE, weights = NULL,
smoothFunction = bumphunter::locfitByCluster, ...) {
## Setup
if (is.null(seeds)) {
seeds <- rep(NA, nPermute)
}
## Run some checks
stopifnot(nPermute == length(seeds))
stopifnot(length(intersect(names(coveragePrep), c(
"coverageProcessed",
"mclapplyIndex", "position", "meanCoverage", "groupMeans"
))) ==
5)
stopifnot(length(intersect(names(models), c("mod", "mod0"))) ==
2)
stopifnot(length(significantCut) == 2 & all(significantCut >=
0 & significantCut <= 1))
## Advanged arguments
# @param verbose If \code{TRUE} basic status updates will be printed along the
# way.
verbose <- .advanced_argument("verbose", TRUE, ...)
# @param scalefac This argument is passed to
# \link[derfinderHelper]{fstats.apply} and should be the same as the one used
# in \link{preprocessCoverage}.
scalefac <- .advanced_argument("scalefac", 32, ...)
# @param method Has to be either 'Matrix' (default), 'Rle' or 'regular'. See
# details in derfinderHelper::fstats.apply().
method <- .advanced_argument("method", "Matrix", ...)
# @param adjustF A single value to adjust that is added in the denominator of
# the F-stat calculation. Useful when the Residual Sum of Squares of the
# alternative model is very small.
adjustF <- .advanced_argument("adjustF", 0, ...)
# @param writeOutput If \code{TRUE} then the regions are saved before
# calculating q-values, and then overwritten once the q-values are written.
# This argument was introduced to save the results from the permutations (can
# take some time) to investigate the problem described at
# https://support.bioconductor.org/p/62026/
writeOutput <- .advanced_argument("writeOutput", FALSE, ...)
## Identify the data segments
if (verbose) {
message(paste(
Sys.time(),
"calculatePvalues: identifying data segments"
))
}
## Extract data
position <- coveragePrep$position
means <- coveragePrep$meanCoverage
groupMeans <- coveragePrep$groupMeans
mclapplyIndex <- coveragePrep$mclapplyIndex
coverageProcessed <- coveragePrep$coverageProcessed
if (is.null(lowMemDir) & is.null(coverageProcessed)) {
stop("preprocessCoverage() was used with a non-null 'lowMemDir', so please specify 'lowMemDir'.")
}
rm(coveragePrep)
## Avoid re-calculating possible candidate DERs for every
## permutation
segmentIR <- .clusterMakerRle(
position = position, maxGap =
.advanced_argument("maxRegionGap", 0L, ...), ranges = TRUE
)
## Find the regions
regs <- findRegions(
position = position, chr = chr, fstats = fstats,
cutoff = cutoff, segmentIR = segmentIR, smooth = smooth,
weights = weights, smoothFunction = smoothFunction, ...
)
if (is.null(regs)) {
final <- list(
regions = NULL, nullStats = NULL, nullWidths = NULL,
nullPermutation = NULL
)
return(final)
}
## Assign mean coverage (overall)
indexIR <- IRanges(start = regs$indexStart, end = regs$indexEnd)
regs$meanCoverage <- mean(Views(means, indexIR))
## Calculate mean coverage by group and fold changes
if (length(groupMeans) > 0) {
regionGroupMean <- lapply(groupMeans, function(x) {
mean(Views(x, indexIR))
})
## Calculate fold coverage vs group 1
if (length(regionGroupMean) > 1) {
log2FoldChange <- vector("list", length(regionGroupMean) -
1)
names(log2FoldChange) <- names(regionGroupMean)[-1]
for (group in names(log2FoldChange)) {
log2FoldChange[[group]] <-
log2(regionGroupMean[[group]] / regionGroupMean[[1]])
}
}
## Finish up
names(regionGroupMean) <- paste0("mean", names(regionGroupMean))
mcols(regs) <- cbind(mcols(regs), DataFrame(regionGroupMean,
check.names = FALSE
))
if (length(regionGroupMean) > 1) {
names(log2FoldChange) <- paste0(
"log2FoldChange",
names(log2FoldChange), "vs", names(groupMeans)[1]
)
mcols(regs) <- cbind(mcols(regs), DataFrame(log2FoldChange,
check.names = FALSE
))
rm(log2FoldChange)
}
rm(regionGroupMean)
}
rm(fstats, means, groupMeans)
if (!smooth) rm(position)
## Pre-allocate memory
nullareas <- nullpermutation <- nullwidths <- nullstats <- vector(
"list",
length(seeds) * 2
)
last <- 0
nSamples <- seq_len(nrow(models$mod))
## Define cluster
BPPARAM <- define_cluster(...)
for (i in seq_along(seeds)) {
if (verbose) {
message(paste(
Sys.time(),
"calculatePvalues: calculating F-statistics for permutation",
i, "and seed", seeds[i]
))
}
if (!is.na(seeds[i])) {
set.seed(seeds[i])
}
idx.permute <- sample(nSamples)
## Permuted sample labels
mod.p <- models$mod[idx.permute, , drop = FALSE]
mod0.p <- models$mod0[idx.permute, , drop = FALSE]
## Get the F-statistics
fstats.output <- bplapply(mclapplyIndex, fstats.apply,
data = coverageProcessed, mod = mod.p, mod0 = mod0.p,
method = method, adjustF = adjustF, scalefac = scalefac,
lowMemDir = lowMemDir, BPPARAM = BPPARAM
)
fstats.output <- unlist(RleList(fstats.output), use.names = FALSE)
if (smooth) {
if (verbose) {
message(paste(Sys.time(), "calculatePvalues: smoothing F-statistics for permutation", i))
}
fstats.output <- .smootherFstats(fstats = fstats.output, position = position, weights = weights, smoothFunction = smoothFunction, ...)
}
## Find the segments
regs.perm <- findRegions(
chr = chr, fstats = fstats.output,
cutoff = cutoff, segmentIR = segmentIR, basic = TRUE, ...
)
## Calculate mean statistics
if (!is.null(regs.perm)) {
for (j in seq_len(2)) {
nullstats[[last + j]] <- regs.perm$stat
nullwidths[[last + j]] <- regs.perm$width
nullareas[[last + j]] <- regs.perm$area
nullpermutation[[last + j]] <- Rle(i, nrow(regs.perm))
}
}
last <- last + 2
## Finish loop
rm(idx.permute, fstats.output, regs.perm, mod.p, mod0.p)
}
nullstats <- do.call(c, nullstats[!sapply(nullstats, is.null)])
nullwidths <- do.call(c, nullwidths[!sapply(nullwidths, is.null)])
nullpermutation <- do.call(c, nullpermutation[!sapply(
nullpermutation,
is.null
)])
nullareas <- do.call(c, nullareas[!sapply(nullareas, is.null)])
## Order by area
regs <- regs[order(regs$area, decreasing = TRUE), ]
if (length(nullstats) > 0) {
## Proceed only if there is at least one null stats
## Calculate pvalues
if (verbose) {
message(paste(
Sys.time(),
"calculatePvalues: calculating the p-values"
))
}
regs$pvalues <- .calcPval(regs$area, nullareas)
regs$significant <- factor(regs$pvalues < significantCut[1],
levels = c(TRUE, FALSE)
)
.writeRegs(
regs = list(
regions = regs, nullStats = nullstats,
nullWidths = nullwidths, nullPermutation = nullpermutation
),
writeOutput = writeOutput, chr = chr
)
## Sometimes qvalue() fails due to incorrect pi0 estimates
qvalues <- tryCatch(qvalue(regs$pvalues), error = function(e) NULL)
if (!is.null(qvalues)) {
qvalues <- qvalues$qvalues
sigQval <- factor(qvalues < significantCut[2], levels = c(
TRUE,
FALSE
))
} else {
message(paste(Sys.time(), "calculatePvalues: skipping q-value calculation."))
qvalues <- rep(NA, length(regs$pvalues))
sigQval <- rep(NA, length(regs$pvalues))
}
regs$qvalues <- qvalues
regs$significantQval <- sigQval
} else {
if (verbose) {
message(paste(
Sys.time(),
"calculatePvalues: no null regions found. Skipping p-value calculation."
))
}
regs$pvalues <- rep(NA, length(regs))
regs$significant <- rep(NA, length(regs))
regs$qvalues <- rep(NA, length(regs))
regs$significantQval <- rep(NA, length(regs))
}
## Save the nullstats too
final <- list(
regions = regs, nullStats = nullstats,
nullWidths = nullwidths, nullPermutation = nullpermutation
)
.writeRegs(final, writeOutput, chr)
## Done =)
return(final)
}
.calcPval <- function(areas, nullareas) {
null <- Rle(sort(nullareas, decreasing = FALSE))
nullsum <- length(null) - cumsum(runLength(null))
int <- findInterval(areas, runValue(null))
greaterEqual <- c(length(null), nullsum)[int + 1]
res <- (greaterEqual + 1) / (length(null) + 1)
return(res)
}
.writeRegs <- function(regs, writeOutput, chr) {
## Save the output from calculatePvalues
if (writeOutput) {
regions <- regs
dir.create(chr, showWarnings = FALSE, recursive = TRUE)
save(regions, file = file.path(chr, "regions.Rdata"))
}
return(invisible(NULL))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.