#' This function computes the differentially methylated regions between two
#' conditions.
#'
#' @title Compute DMRs
#' @param methylationData1 the methylation data in condition 1
#' (see \code{\link{methylationDataList}}).
#' @param methylationData2 the methylation data in condition 2
#' (see \code{\link{methylationDataList}}).
#' @param regions a \code{\link{GRanges}} object with the regions where to
#' compute the DMRs. If \code{NULL}, the DMRs are computed genome-wide.
#' @param context the context in which the DMRs are computed (\code{"CG"},
#' \code{"CHG"} or \code{"CHH"}).
#' @param method the method used to compute the DMRs (\code{"noise_filter"},
#' \code{"neighbourhood"} or \code{"bins"}). The \code{"noise_filter"} method
#' uses a triangular kernel to smooth the number of reads and then performs a
#' statistical test to determine which regions dispay different levels of
#' methylation in the two conditions. The \code{"neighbourhood"} method
#' computates differentially methylated cytosines. Finally, the \code{"bins"}
#' method partiones the genome into equal sized tilling bins and performs the
#' statistical test between the two conditions in each bin. For all three
#' methods, the cytosines or bins are then merged into DMRs without affecting
#' the inital parameters used when calling the differentiall methylated
#' cytosines/bins (p-value, difference in methylation levels, minimum number of
#' reads per cytosine).
#' @param windowSize the size of the triangle base measured in nucleotides.
#' This parameter is required only if the selected method is
#' \code{"noise_filter"}.
#' @param kernelFunction a \code{character} indicating which kernel function to
#' be used. Can be one of \code{"uniform"}, \code{"triangular"},
#' \code{"gaussian"} or \code{"epanechnicov"}. This is required only if the
#' selected method is \code{"noise_filter"}.
#' @param lambda numeric value required for the Gaussian filter
#' (\code{K(x) = exp(-lambda*x^2)}). This is required only if the selected
#' method is \code{"noise_filter"} and the selected kernel function is
#' \code{"gaussian"}.
#' @param binSize the size of the tiling bins in nucleotides. This parameter is
#' required only if the selected method is \code{"bins"}.
#' @param test the statistical test used to call DMRs (\code{"fisher"} for
#' Fisher's exact test or \code{"score"} for Score test).
#' @param pValueThreshold DMRs with p-values (when performing the statistical
#' test; see \code{test}) higher or equal than \code{pValueThreshold} are
#' discarded. Note that we adjust the p-values using the Benjamini and
#' Hochberg's method to control the false discovery rate.
#' @param minCytosinesCount DMRs with less cytosines in the specified context
#' than \code{minCytosinesCount} will be discarded.
#' @param minProportionDifference DMRs where the difference in methylation
#' proportion between the two conditions is lower than
#' \code{minProportionDifference} are discarded.
#' @param minGap DMRs separated by a gap of at least \code{minGap} are not
#' merged. Note that only DMRs where the change in methylation is in the same
#' direction are joined.
#' @param minSize DMRs with a size smaller than \code{minSize} are discarded.
#' @param minReadsPerCytosine DMRs with the average number of reads lower than
#' \code{minReadsPerCytosine} are discarded.
#' @param cores the number of cores used to compute the DMRs.
#' @return the DMRs stored as a \code{\link{GRanges}} object with the following
#' metadata columns:
#' \describe{
#' \item{direction}{a number indicating whether the region lost (-1) or gain
#' (+1) methylation in condition 2 compared to condition 1.}
#' \item{context}{the context in which the DMRs was computed (\code{"CG"},
#' \code{"CHG"} or \code{"CHH"}).}
#' \item{sumReadsM1}{the number of methylated reads in condition 1.}
#' \item{sumReadsN1}{the total number of reads in condition 1.}
#' \item{proportion1}{the proportion methylated reads in condition 1.}
#' \item{sumReadsM2}{the number of methylated reads in condition 2.}
#' \item{sumReadsN2}{the total number reads in condition 2.}
#' \item{proportion2}{the proportion methylated reads in condition 2.}
#' \item{cytosinesCount}{the number of cytosines in the DMR.}
#' \item{regionType}{a string indicating whether the region lost (\code{"loss"})
#' or gained (\code{"gain"}) methylation in condition 2 compared to condition 1.}
#' \item{pValue}{the p-value (adjusted to control the false discovery rate with
#' the Benjamini and Hochberg's method) of the statistical test when the DMR was
#' called.}
#' }
#' @seealso \code{\link{filterDMRs}}, \code{\link{mergeDMRsIteratively}},
#' \code{\link{analyseReadsInsideRegionsForCondition}} and
#' \code{\link{DMRsNoiseFilterCG}}
#' @examples
#'
#' # load the methylation data
#' data(methylationDataList)
#'
#' # the regions where to compute the DMRs
#' regions <- GRanges(seqnames = Rle("Chr3"), ranges = IRanges(1,1E5))
#'
#' # compute the DMRs in CG context with noise_filter method
#' DMRsNoiseFilterCG <- computeDMRs(methylationDataList[["WT"]],
#' methylationDataList[["met1-3"]], regions = regions,
#' context = "CG", method = "noise_filter",
#' windowSize = 100, kernelFunction = "triangular",
#' test = "score", pValueThreshold = 0.01,
#' minCytosinesCount = 4, minProportionDifference = 0.4,
#' minGap = 200, minSize = 50, minReadsPerCytosine = 4,
#' cores = 1)
#'
#' \dontrun{
#' # compute the DMRs in CG context with neighbourhood method
#' DMRsNeighbourhoodCG <- computeDMRs(methylationDataList[["WT"]],
#' methylationDataList[["met1-3"]], regions = regions,
#' context = "CG", method = "neighbourhood",
#' test = "score", pValueThreshold = 0.01,
#' minCytosinesCount = 4, minProportionDifference = 0.4,
#' minGap = 200, minSize = 50, minReadsPerCytosine = 4,
#' cores = 1)
#'
#' # compute the DMRs in CG context with bins method
#' DMRsBinsCG <- computeDMRs(methylationDataList[["WT"]],
#' methylationDataList[["met1-3"]], regions = regions,
#' context = "CG", method = "bins", binSize = 100,
#' test = "score", pValueThreshold = 0.01, minCytosinesCount = 4,
#' minProportionDifference = 0.4, minGap = 200, minSize = 50,
#' minReadsPerCytosine = 4, cores = 1)
#'
#' }
#' @author Nicolae Radu Zabet and Jonathan Michael Foonlan Tsang
#' @export
computeDMRs <- function(methylationData1,
methylationData2,
regions = NULL,
context = "CG",
method="noise_filter",
windowSize = 100,
kernelFunction = "triangular",
lambda = 0.5,
binSize = 100,
test = "fisher",
pValueThreshold = 0.01,
minCytosinesCount = 4,
minProportionDifference = 0.4,
minGap = 200,
minSize = 50,
minReadsPerCytosine = 4,
cores = 1) {
##Parameters checking
cat("Parameters checking ...\n")
.validateMethylationData(methylationData1, variableName="methylationData1")
.validateMethylationData(methylationData2, variableName="methylationData2")
regions <- union(.validateGRanges(regions, methylationData1), .validateGRanges(regions, methylationData2))
.validateContext(context)
.stopIfNotAll(c(!is.null(method),
all(is.character(method)),
length(method) == 1,
all(method %in% c("noise_filter","neighbourhood","bins"))),
" method can be only noise_filter, neighbourhood or bins")
if(method == "noise_filter"){
.stopIfNotAll(c(.isInteger(windowSize, positive=TRUE)),
" the window size used by the interpolation method is an integer higher than 0")
.stopIfNotAll(c(!is.null(kernelFunction),
kernelFunction%in%c("uniform", "triangular", "gaussian", "epanechnicov")),
paste("Unknown kernel function: ", kernelFunction, ".
kernelFunction should be one of \"uniform\", \"triangular\",
\"gaussian\", \"epanechnicov\"",sep=""))
if(kernelFunction == "gaussian"){
.stopIfNotAll(c(!is.null(lambda),
is.numeric(lambda)),
" lambda needs to be a numeric value.")
}
}
if(method == "bins"){
.stopIfNotAll(c(.isInteger(binSize, positive=TRUE)),
" the bin size used by the method is an integer higher than 0")
}
.validateStatisticalTest(test)
.stopIfNotAll(c(!is.null(pValueThreshold),
is.numeric(pValueThreshold),
pValueThreshold > 0,
pValueThreshold < 1),
" the p-value threshold needs to be in the interval (0,1)")
.stopIfNotAll(c(.isInteger(minCytosinesCount, positive=TRUE)),
" the minCytosinesCount is an integer higher or equal to 0")
.stopIfNotAll(c(!is.null(minProportionDifference),
is.numeric(minProportionDifference),
minProportionDifference > 0,
minProportionDifference < 1),
" the minimum difference in methylation needs to be in the interval (0,1)")
.stopIfNotAll(c(.isInteger(minGap, positive=TRUE)),
" the minimum gap between DMRs is an integer higher or equal to 0")
.stopIfNotAll(c(.isInteger(minSize, positive=TRUE)),
" the minimum size of a DMR is an integer higher or equal to 0")
.stopIfNotAll(c(.isInteger(minReadsPerCytosine, positive=TRUE)),
" the minimum average number of reads in a DMR is an integer higher or equal to 0")
.stopIfNotAll(c(.isInteger(cores, positive=TRUE)),
" the number of cores to used when computing the DMRs needs to be an integer hirger or equal to 1.")
computedDMRs <- GRanges()
if(method == "noise_filter"){
computedDMRs <- .computeDMRsNoiseFilter(methylationData1 = methylationData1,
methylationData2 = methylationData2,
regions = regions,
context = context,
windowSize = windowSize,
kernelFunction = kernelFunction,
lambda = lambda,
test = test,
pValueThreshold = pValueThreshold,
minCytosinesCount = minCytosinesCount,
minProportionDifference =minProportionDifference,
minGap = minGap,
minSize = minSize,
minReadsPerCytosine = minReadsPerCytosine,
cores = cores)
} else if(method == "neighbourhood"){
computedDMRs <- .computeDMRsNeighbourhood(methylationData1 = methylationData1,
methylationData2 = methylationData2,
regions = regions,
context = context,
test = test,
pValueThreshold = pValueThreshold,
minCytosinesCount = minCytosinesCount,
minProportionDifference =minProportionDifference,
minGap = minGap,
minSize = minSize,
minReadsPerCytosine = minReadsPerCytosine,
cores = cores)
} else if(method == "bins"){
computedDMRs <- .computeDMRsBins(methylationData1 = methylationData1,
methylationData2 = methylationData2,
regions = regions,
context = context,
binSize = binSize,
test = test,
pValueThreshold = pValueThreshold,
minCytosinesCount = minCytosinesCount,
minProportionDifference =minProportionDifference,
minGap = minGap,
minSize = minSize,
minReadsPerCytosine = minReadsPerCytosine,
cores = cores)
} else{
cat("Unknown method: ",method," \n")
}
return(computedDMRs)
}
#' This function computes the differentially methylated regions between two conditions
#' using the noise filter method.
.computeDMRsNoiseFilter <- function(methylationData1,
methylationData2,
regions = NULL,
context = "CG",
windowSize = 100,
kernelFunction="triangular",
lambda=0.5,
test = "fisher",
pValueThreshold = 0.01,
minCytosinesCount = 4,
minProportionDifference = 0.4,
minGap = 200,
minSize = 50,
minReadsPerCytosine = 4,
cores = 1) {
regions <- reduce(regions)
# extract the methylation data in the correct context
cat("Extract methylation in the corresponding context \n")
contextMethylationData1 <- methylationData1[methylationData1$context%in%context]
rm(methylationData1)
localContextMethylationData1 <- contextMethylationData1[queryHits(findOverlaps(contextMethylationData1, regions))]
rm(contextMethylationData1)
contextMethylationData2 <- methylationData2[methylationData2$context%in%context]
rm(methylationData2)
localContextMethylationData2 <- contextMethylationData2[queryHits(findOverlaps(contextMethylationData2, regions))]
rm(contextMethylationData2)
localContextMethylationData <- .joinMethylationData(localContextMethylationData1, localContextMethylationData2)
rm(localContextMethylationData1, localContextMethylationData2)
regionsList <- .splitGRangesEqualy(regions, cores)
# inner loop function for parallel::mclapply
.computeDMRsInterpolationLoop = function(i){
computedDMRs <- GRanges()
for(index in 1:length(regionsList[[i]])){
currentRegion <- regionsList[[i]][index]
cat("Computing DMRs at ",.printGenomicRanges(currentRegion),"\n")
# Select the points in methylationData that we're interested in. These are the
# points that lie within 'regions', as well as any that lie within
# window.size of them.
windowSizeHalf <- floor((windowSize - 1)/2)
extendedRegion <- currentRegion
start(extendedRegion) <- start(currentRegion) - windowSizeHalf
end(extendedRegion) <- end(currentRegion) + windowSizeHalf
overlaps <- findOverlaps(localContextMethylationData, extendedRegion)
if(length(overlaps) > 0){
localMethylationData <- localContextMethylationData[queryHits(overlaps)]
cat("Calculating interpolations...\n")
#Rcpp
movingAverageMethylReads1 <- round(.movingAverage(start(currentRegion),
end(currentRegion),
start(localMethylationData),
localMethylationData$readsM1,
windowSizeHalf = windowSizeHalf))
movingAverageTotalReads1 <- round(.movingAverage(start(currentRegion),
end(currentRegion),
start(localMethylationData),
localMethylationData$readsN1,
windowSizeHalf = windowSizeHalf))
movingAverageProportion1 <- movingAverageMethylReads1 / movingAverageTotalReads1
#Rcpp
movingAverageMethylReads2 <- round(.movingAverage(start(currentRegion),
end(currentRegion),
start(localMethylationData),
localMethylationData$readsM2,
windowSizeHalf = windowSizeHalf))
movingAverageTotalReads2 <- round(.movingAverage(start(currentRegion),
end(currentRegion),
start(localMethylationData),
localMethylationData$readsN2,
windowSizeHalf = windowSizeHalf))
movingAverageProportion2 <- movingAverageMethylReads2 / movingAverageTotalReads2
cat("Identifying DMRs...\n")
pValue <- .computeAdjuestedPValues(test,
movingAverageMethylReads1,
movingAverageTotalReads1,
movingAverageMethylReads2,
movingAverageTotalReads2,
alternative = "two.sided")
# compute the differentially methylated cytosines
DMPs <- rep(0, times=width(currentRegion))
DMPs[is.na(pValue)] <- -2
bufferIndex <- !is.na(pValue) &
pValue < pValueThreshold &
abs(movingAverageProportion1 - movingAverageProportion2) >= minProportionDifference &
movingAverageTotalReads1 >=minReadsPerCytosine &
movingAverageTotalReads2 >=minReadsPerCytosine
DMPs[bufferIndex] <- sign(movingAverageProportion2[bufferIndex] - movingAverageProportion1[bufferIndex])
#join the differentially methylated cytosines into regions
rle <- rle(DMPs)
rle$cumulative <- cumsum(rle$lengths)
endOfRuns <- rle$cumulative + start(currentRegion) - 1
DMRs <- GRanges(
seqnames = seqnames(currentRegion),
ranges = IRanges(endOfRuns - rle$lengths + 1, endOfRuns),
strand = strand(currentRegion),
direction = rle$values,
context = paste(context, collapse = "_")
)
DMRs$direction[DMRs$direction == -2] <- NA
# Select the crude list of DMRs
DMRs <- DMRs[!is.na(DMRs$direction) & (DMRs$direction == 1 | DMRs$direction == -1)]
# append current DMRs to the global list of DMRs
if(length(computedDMRs) == 0){
computedDMRs <- DMRs
} else{
computedDMRs <- c(computedDMRs,DMRs)
}
}
}
return(computedDMRs)
}
# compute the DMRs
if(cores > 1){
cat("Compute the DMRs using ", cores, "cores\n")
computedDMRs <- parallel::mclapply(1:length(regionsList), .computeDMRsInterpolationLoop, mc.cores = cores)
} else {
computedDMRs <- lapply(1:length(regionsList), .computeDMRsInterpolationLoop)
}
computedDMRs <- unlist(GRangesList(computedDMRs))
if(length(computedDMRs) > 0){
computedDMRs <- computedDMRs[order(computedDMRs)]
cat("Analysed reads inside DMRs\n")
overlaps <- countOverlaps(localContextMethylationData, computedDMRs)
localContextMethylationDataDMRs <- localContextMethylationData[overlaps > 0]
if(cores > 1){
computedDMRsList <- IRanges::splitAsList(computedDMRs, rep(1:cores, length.out=length(computedDMRs)))
bufferComputedDMRsList <- parallel::mclapply(1:length(computedDMRsList), function(i){
.analyseReadsInsideRegions(localContextMethylationDataDMRs, computedDMRsList[[i]])},
mc.cores = cores)
computedDMRs <- unlist(GRangesList(bufferComputedDMRsList))
computedDMRs <- computedDMRs[order(computedDMRs)]
} else{
computedDMRs <- .analyseReadsInsideRegions(localContextMethylationDataDMRs, computedDMRs)
}
computedDMRs <- computedDMRs[computedDMRs$cytosinesCount > 0]
computedDMRs$pValue <- .computeaAjustedPValuesInDMRs(test, computedDMRs ,alternative = "two.sided")
computedDMRs <- computedDMRs[computedDMRs$pValue < pValueThreshold &
abs(computedDMRs$proportion1 - computedDMRs$proportion2) >= minProportionDifference &
computedDMRs$sumReadsN1/computedDMRs$cytosinesCount >=minReadsPerCytosine &
computedDMRs$sumReadsN2/computedDMRs$cytosinesCount >=minReadsPerCytosine]
cat("Merge DMRs iteratively\n")
# Get rid of small gaps between DMRs.
#computedDMRs <- .mergeRegions(computedDMRs, minGap = minGap, respectSigns = TRUE)
if(minGap > 0){
computedDMRs <- .smartMergeDMRs(computedDMRs,
minGap = minGap,
respectSigns = TRUE,
methylationData = localContextMethylationData,
minProportionDifference=minProportionDifference,
minReadsPerCytosine = minReadsPerCytosine,
pValueThreshold=pValueThreshold,
test=test,
alternative = "two.sided",
cores = cores)
}
cat("Filter DMRs \n")
if(length(computedDMRs) > 0){
#remove small DMRs
computedDMRs <- computedDMRs[width(computedDMRs) >= minSize]
if(length(computedDMRs) > 0){
#remove DMRswith few cytosines
computedDMRs <- computedDMRs[computedDMRs$cytosinesCount >= minCytosinesCount]
#recompute the adjusted p-values
if(length(computedDMRs) > 0){
computedDMRs$pValue <- .computeaAjustedPValuesInDMRs(test, computedDMRs ,alternative = "two.sided")
computedDMRs$regionType <- rep("loss", length(computedDMRs))
computedDMRs$regionType[which(computedDMRs$proportion1 < computedDMRs$proportion2)] <- "gain"
}
}
}
}
return(computedDMRs)
}
#' This function computes the differentially methylated regions between two conditions
#' using the neighbourhood method. This assumes the computation of differentially methylated
#' cytosines followed by smart merging of these cytosines while keeping the new DMRs
#' statistically significant.
.computeDMRsNeighbourhood <- function(methylationData1,
methylationData2,
regions = NULL,
context = "CG",
test = "fisher",
pValueThreshold = 0.01,
minCytosinesCount = 4,
minProportionDifference = 0.4,
minGap = 200,
minSize = 50,
minReadsPerCytosine = 4,
cores = 1) {
regions <- reduce(regions)
# extract the methylation data in the correct context
cat("Extract methylation in the corresponding context \n")
contextMethylationData1 <- methylationData1[methylationData1$context%in%context]
rm(methylationData1)
localContextMethylationData1 <- contextMethylationData1[queryHits(findOverlaps(contextMethylationData1, regions))]
rm(contextMethylationData1)
contextMethylationData2 <- methylationData2[methylationData2$context%in%context]
rm(methylationData2)
localContextMethylationData2 <- contextMethylationData2[queryHits(findOverlaps(contextMethylationData2, regions))]
rm(contextMethylationData2)
localContextMethylationData <- .joinMethylationData(localContextMethylationData1, localContextMethylationData2)
rm(localContextMethylationData1, localContextMethylationData2)
cat("Computing DMRs \n")
DMPs <- GRanges()
if(length(localContextMethylationData) > 0){
DMPs <- localContextMethylationData
DMPs$pValue <- .computeAdjuestedPValues(test,
DMPs$readsM1,
DMPs$readsN1,
DMPs$readsM2,
DMPs$readsN2,
alternative = "two.sided")
DMPs <- DMPs[!is.na(DMPs$pValue)]
DMPs$sumReadsM1 <- DMPs$readsM1
DMPs$sumReadsN1 <- DMPs$readsN1
DMPs$proportion1 <- DMPs$readsM1 / DMPs$readsN1
DMPs$sumReadsM2 <- DMPs$readsM2
DMPs$sumReadsN2 <- DMPs$readsN2
DMPs$proportion2 <- DMPs$readsM2 / DMPs$readsN2
DMPs$cytosinesCount <- 1
DMPs$direction <- sign(DMPs$proportion2 - DMPs$proportion1)
bufferIndex <- DMPs$pValue < pValueThreshold &
abs(DMPs$proportion2 - DMPs$proportion1) >= minProportionDifference &
DMPs$sumReadsN1 >=minReadsPerCytosine &
DMPs$sumReadsN2 >=minReadsPerCytosine
DMPs <- DMPs[bufferIndex]
strand(DMPs) <- "*"
}
computedDMRs <- GRanges()
if(length(DMPs) > 0){
cat("Merge DMRs iteratively\n")
# Get rid of small gaps between DMRs.
if(minGap > 0){
computedDMRs <- .smartMergeDMRs(DMPs,
minGap = minGap,
respectSigns = TRUE,
methylationData = localContextMethylationData,
minProportionDifference=minProportionDifference,
minReadsPerCytosine = minReadsPerCytosine,
pValueThreshold=pValueThreshold,
test=test,
alternative = "two.sided",
cores = cores)
} else{
computedDMRs <- DMPs
}
cat("Filter DMRs \n")
if(length(computedDMRs) > 0){
#remove small DMRs
computedDMRs <- computedDMRs[width(computedDMRs) >= minSize]
if(length(computedDMRs) > 0){
#remove DMRs with few cytosines
computedDMRs <- computedDMRs[computedDMRs$cytosinesCount >= minCytosinesCount]
#recompute the adjusted p-values
if(length(computedDMRs) > 0){
computedDMRs$pValue <- .computeaAjustedPValuesInDMRs(test, computedDMRs ,alternative = "two.sided")
computedDMRs$regionType <- rep("loss", length(computedDMRs))
computedDMRs$regionType[which(computedDMRs$proportion1 < computedDMRs$proportion2)] <- "gain"
}
}
}
}
return(computedDMRs)
}
#' This function computes the differentially methylated regions between two conditions
#' using the bins method.
.computeDMRsBins <- function(methylationData1,
methylationData2,
regions = NULL,
context = "CG",
binSize = 100,
test = "fisher",
pValueThreshold = 0.01,
minCytosinesCount = 4,
minProportionDifference = 0.4,
minGap = 200,
minSize = 50,
minReadsPerCytosine = 4,
cores = 1) {
regions <- reduce(regions)
# extract the methylation data in the correct context
cat("Extract methylation in the corresponding context \n")
contextMethylationData1 <- methylationData1[methylationData1$context%in%context]
rm(methylationData1)
localContextMethylationData1 <- contextMethylationData1[queryHits(findOverlaps(contextMethylationData1, regions))]
rm(contextMethylationData1)
contextMethylationData2 <- methylationData2[methylationData2$context%in%context]
rm(methylationData2)
localContextMethylationData2 <- contextMethylationData2[queryHits(findOverlaps(contextMethylationData2, regions))]
rm(contextMethylationData2)
localContextMethylationData <- .joinMethylationData(localContextMethylationData1, localContextMethylationData2)
rm(localContextMethylationData1, localContextMethylationData2)
regionsList <- .splitGRangesEqualy(regions, cores)
# inner loop function for parallel::mclapply
.computeDMRsBinsLoop = function(i){
computedDMRs <- GRanges()
for(index in 1:length(regionsList[[i]])){
currentRegion <- regionsList[[i]][index]
cat("Computing DMRs at ",.printGenomicRanges(currentRegion),"\n")
seqs <- seq(start(currentRegion), (end(currentRegion)-binSize), by = binSize);
bins <- GRanges(seqnames(currentRegion), IRanges(seqs, (seqs+binSize-1)))
overlapsBins <- findOverlaps(localContextMethylationData, currentRegion)
if(length(overlapsBins) > 0){
localMethylationData <- localContextMethylationData[queryHits(overlapsBins)]
cat("Count inside each bin...\n")
#bins <- .analyseReadsInsideRegions(localMethylationData, bins, context, cores)
bins <- .analyseReadsInsideBins(localMethylationData, bins, currentRegion)
cat("Filter the bins...\n")
# Get rid of the bins with fewer than minCytosinesCount cytosines inside.
bins <- bins[bins$cytosinesCount >= minCytosinesCount]
# Get rid of the bins with fewer than minReadsPerCytosine reads per cytosine.
bins <- bins[(bins$sumReadsN1/bins$cytosinesCount >= minReadsPerCytosine) &
(bins$sumReadsN2/bins$cytosinesCount >= minReadsPerCytosine)]
# Get rid of the bins with small difference in proportion of methylation
bins <- bins[(abs(bins$proportion1 - bins$proportion2) >= minProportionDifference)]
cat("Identifying DMRs...\n")
pValue <- .computeAdjuestedPValues(test, bins$sumReadsM1, bins$sumReadsN1, bins$sumReadsM2, bins$sumReadsN2, alternative = "two.sided")
bins <- bins[!is.na(pValue) & pValue < pValueThreshold ]
bins$context <- rep(paste(context, collapse = "_"), length(bins))
bins$direction <- rep(NA, length(bins))
bins$direction <- sign(bins$proportion2 - bins$proportion1)
# Select the crude list of DMRs
DMRs <- bins[!is.na(bins$direction) & (bins$direction == 1 | bins$direction == -1)]
# append current DMRs to the global list of DMRs
if(length(computedDMRs) == 0){
computedDMRs <- DMRs
} else{
computedDMRs <- c(computedDMRs,DMRs)
}
}
}
return(computedDMRs)
}
# compute the DMRs
if(cores > 1){
cat("Compute the DMRs using ", cores, "cores\n")
computedDMRs <- parallel::mclapply(1:length(regionsList), .computeDMRsBinsLoop, mc.cores = cores)
} else {
computedDMRs <- lapply(1:length(regionsList), .computeDMRsBinsLoop)
}
computedDMRs <- unlist(GRangesList(computedDMRs))
if(length(computedDMRs) > 0){
cat("Merge adjacent DMRs\n")
computedDMRs <- computedDMRs[order(computedDMRs)]
computedDMRs$pValue <- .computeaAjustedPValuesInDMRs(test, computedDMRs ,alternative = "two.sided")
cat("Merge DMRs iteratively\n")
# Get rid of small gaps between DMRs.
if(minGap > 0){
computedDMRs <- .smartMergeDMRs(computedDMRs,
minGap = minGap,
respectSigns = TRUE,
methylationData = localContextMethylationData,
minProportionDifference=minProportionDifference,
minReadsPerCytosine = minReadsPerCytosine,
pValueThreshold=pValueThreshold,
test=test,
alternative = "two.sided",
cores = cores)
}
cat("Filter DMRs \n")
if(length(computedDMRs) > 0){
#remove small DMRs
computedDMRs <- computedDMRs[width(computedDMRs) >= minSize]
if(length(computedDMRs) > 0){
#remove DMRs with few cytosines
computedDMRs <- computedDMRs[computedDMRs$cytosinesCount >= minCytosinesCount]
#recompute the adjusted p-values
if(length(computedDMRs) > 0){
computedDMRs$pValue <- .computeaAjustedPValuesInDMRs(test, computedDMRs ,alternative = "two.sided")
computedDMRs$regionType <- rep("loss", length(computedDMRs))
computedDMRs$regionType[which(computedDMRs$proportion1 < computedDMRs$proportion2)] <- "gain"
}
}
}
}
return(computedDMRs)
}
#' This function verifies whether a set of pottential DMRs (e.g. genes,
#' transposons, CpG islands) are differentially methylated or not.
#'
#' @title Filter DMRs
#' @param methylationData1 the methylation data in condition 1
#' (see \code{\link{methylationDataList}}).
#' @param methylationData2 the methylation data in condition 2
#' (see \code{\link{methylationDataList}}).
#' @param potentialDMRs a \code{\link{GRanges}} object with potential DMRs
#' where to compute the DMRs. This can be a a list of gene and/or transposable
#' elements coordinates.
#' @param context the context in which the DMRs are computed (\code{"CG"},
#' \code{"CHG"} or \code{"CHH"}).
#' @param test the statistical test used to call DMRs (\code{"fisher"} for
#' Fisher's exact test or \code{"score"} for Score test).
#' @param pValueThreshold DMRs with p-values (when performing the statistical
#' test; see \code{test}) higher or equal than \code{pValueThreshold} are
#' discarded. Note that we adjust the p-values using the Benjamini and
#' Hochberg's method to control the false discovery rate.
#' @param minCytosinesCount DMRs with less cytosines in the specified context
#' than \code{minCytosinesCount} will be discarded.
#' @param minProportionDifference DMRs where the difference in methylation
#' proportion between the two conditions is lower than
#' \code{minProportionDifference} are discarded.
#' @param minReadsPerCytosine DMRs with the average number of reads lower than
#' \code{minReadsPerCytosine} are discarded.
#' @param cores the number of cores used to compute the DMRs.
#' @return a \code{\link{GRanges}} object with 11 metadata columns that contain
#' the DMRs; see \code{\link{computeDMRs}}.
#' @seealso \code{\link{DMRsNoiseFilterCG}}, \code{\link{computeDMRs}},
#' \code{\link{analyseReadsInsideRegionsForCondition}}
#' and \code{\link{mergeDMRsIteratively}}
#' @examples
#' # load the methylation data
#' data(methylationDataList)
#' # load the gene annotation data
#' data(GEs)
#'
#' #select the genes
#' genes <- GEs[which(GEs$type == "gene")]
#'
#' # the regions where to compute the DMRs
#' regions <- GRanges(seqnames = Rle("Chr3"), ranges = IRanges(1,1E5))
#' genes <- genes[overlapsAny(genes, regions)]
#'
#' # filter genes that are differntially methylated in the two conditions
#' DMRsGenesCG <- filterDMRs(methylationDataList[["WT"]],
#' methylationDataList[["met1-3"]], potentialDMRs = genes,
#' context = "CG", test = "score", pValueThreshold = 0.01,
#' minCytosinesCount = 4, minProportionDifference = 0.4,
#' minReadsPerCytosine = 3, cores = 1)
#'
#' @author Nicolae Radu Zabet
#' @export
filterDMRs <- function(methylationData1,
methylationData2,
potentialDMRs,
context = "CG",
test = "fisher",
pValueThreshold = 0.01,
minCytosinesCount = 4,
minProportionDifference = 0.4,
minReadsPerCytosine = 3,
cores = 1) {
##Parameters checking
cat("Parameters checking ...\n")
.validateMethylationData(methylationData1, variableName="methylationData1")
.validateMethylationData(methylationData2, variableName="methylationData2")
regions <- union(getWholeChromosomes(methylationData1),
getWholeChromosomes(methylationData2))
.validateContext(context)
.validateGRanges(potentialDMRs, generateGenomeWide=FALSE, variableName="potentialDMRs", minLength=NULL)
.validateStatisticalTest(test)
.stopIfNotAll(c(!is.null(pValueThreshold), is.numeric(pValueThreshold), pValueThreshold > 0, pValueThreshold < 1),
" the p-value threshold needs to be in the interval (0,1)")
.stopIfNotAll(c(.isInteger(minCytosinesCount, positive=TRUE)),
" the minCytosinesCount is an integer higher or equal to 0")
.stopIfNotAll(c(!is.null(minProportionDifference), is.numeric(minProportionDifference), minProportionDifference > 0, minProportionDifference < 1),
" the minimum difference in methylation needs to be in the interval (0,1)")
.stopIfNotAll(c(.isInteger(minReadsPerCytosine, positive=TRUE)),
" the minimum number of reads in a bin is an integer higher or equal to 0")
.stopIfNotAll(c(.isInteger(cores, positive=TRUE)),
" the number of cores to use when computing the DMRs.")
regions <- reduce(regions)
if(length(potentialDMRs) > 0){
# extract the methylation data in the correct context
cat("Extract methylation in the corresponding context \n")
contextMethylationData1 <- methylationData1[methylationData1$context%in%context]
rm(methylationData1)
localContextMethylationData1 <- contextMethylationData1[queryHits(findOverlaps(contextMethylationData1, regions))]
rm(contextMethylationData1)
contextMethylationData2 <- methylationData2[methylationData2$context%in%context]
rm(methylationData2)
localContextMethylationData2 <- contextMethylationData2[queryHits(findOverlaps(contextMethylationData2, regions))]
rm(contextMethylationData2)
localContextMethylationData <- .joinMethylationData(localContextMethylationData1, localContextMethylationData2)
rm(localContextMethylationData1, localContextMethylationData2)
regionsList <- .splitGRangesEqualy(regions, cores)
# inner loop function for parallel::mclapply
.filterDMRsLoop = function(i){
computedDMRs <- GRanges()
for(index in 1:length(regionsList[[i]])){
currentRegion <- regionsList[[i]][index]
cat("Computing DMRs at ",.printGenomicRanges(currentRegion),"\n")
cat("Selecting data...\n")
# Select the points in methylationData that we're interested in. These are the
# points that lie within 'regions', as well as any that lie within
# window.size of them.
overlapsPotentialDMRs <- findOverlaps(potentialDMRs, currentRegion)
if(length(overlapsPotentialDMRs) > 0){
potentialDMRsLocal <- potentialDMRs[queryHits(overlapsPotentialDMRs)]
localMethylationData <- localContextMethylationData[queryHits(findOverlaps(localContextMethylationData, currentRegion))]
potentialDMRsLocal <- .analyseReadsInsideRegions(localMethylationData, potentialDMRsLocal)
if(length(computedDMRs) == 0){
computedDMRs <- potentialDMRsLocal
} else{
computedDMRs <- c(computedDMRs,potentialDMRsLocal)
}
}
}
return(computedDMRs)
}
# compute the DMRs
if(cores > 1){
cat("Compute the DMRs using ", cores, "cores\n")
computedDMRs <- parallel::mclapply(1:length(regionsList), .filterDMRsLoop, mc.cores = cores)
} else {
computedDMRs <- lapply(1:length(regionsList), .filterDMRsLoop)
}
computedDMRs <- unlist(GRangesList(computedDMRs))
if(length(computedDMRs) > 0){
cat("Identifying DMRs...\n")
pValue <- .computeAdjuestedPValues(test, computedDMRs$sumReadsM1, computedDMRs$sumReadsN1, computedDMRs$sumReadsM2, computedDMRs$sumReadsN2, alternative = "two.sided")
bufferIndex <- !is.na(pValue) &
pValue < pValueThreshold &
abs(computedDMRs$proportion1 - computedDMRs$proportion2) >= minProportionDifference &
computedDMRs$sumReadsN1/computedDMRs$cytosinesCount >= minReadsPerCytosine &
computedDMRs$sumReadsN2/computedDMRs$cytosinesCount >= minReadsPerCytosine &
computedDMRs$cytosinesCount >= minCytosinesCount
computedDMRs <- computedDMRs[bufferIndex]
}
} else{
computedDMRs <- GRanges()
}
if(length(computedDMRs) > 0){
computedDMRs$pValue <- .computeaAjustedPValuesInDMRs(test, computedDMRs ,alternative = "two.sided")
computedDMRs$regionType <- rep("loss", length(computedDMRs))
computedDMRs$regionType[which(computedDMRs$proportion1 < computedDMRs$proportion2)] <- "gain"
computedDMRs$direction <- rep(-1, length(computedDMRs))
computedDMRs$direction[which(computedDMRs$proportion1 < computedDMRs$proportion2)] <- 1
computedDMRs <- computedDMRs[order(computedDMRs)]
}
return(computedDMRs)
}
#' This function takes a list of DMRs and attempts to merge DMRs while keeping
#' the new DMRs statistically significant.
#'
#' @title Merge DMRs iteratively
#' @param DMRs the list of DMRs as a \code{\link{GRanges}} object;
#' e.g. see \code{\link{computeDMRs}}
#' @param minGap DMRs separated by a gap of at least \code{minGap} are not
#' merged.
#' @param respectSigns logical value indicating whether to respect the sign when
#' joining DMRs.
#' @param methylationData1 the methylation data in condition 1
#' (see \code{\link{methylationDataList}}).
#' @param methylationData2 the methylation data in condition 2
#' (see \code{\link{methylationDataList}}).
#' @param context the context in which the DMRs are computed (\code{"CG"},
#' \code{"CHG"}
#' or \code{"CHH"}).
#' @param minProportionDifference two adjacent DMRs are merged only if the
#' difference in methylation proportion of the new DMR is higher than
#' \code{minProportionDifference}.
#' @param minReadsPerCytosine two adjacent DMRs are merged only if the number of
#' reads per cytosine of the new DMR is higher than \code{minReadsPerCytosine}.
#' @param pValueThreshold two adjacent DMRs are merged only if the p-value of
#' the new DMR (see \code{test} below) is lower than \code{pValueThreshold}.
#' Note that we adjust the p-values using the Benjamini and Hochberg's method to
#' control the false discovery rate.
#' @param test the statistical test used to call DMRs (\code{"fisher"} for
#' Fisher's exact test or \code{"score"} for Score test).
#' @param alternative indicates the alternative hypothesis and must be one of
#' \code{"two.sided"}, \code{"greater"} or \code{"less"}.
#' @param cores the number of cores used to compute the DMRs.
#' @return the reduced list of DMRs as a \code{\link{GRanges}} object;
#' e.g. see \code{\link{computeDMRs}}
#' @seealso \code{\link{filterDMRs}}, \code{\link{computeDMRs}},
#' \code{\link{analyseReadsInsideRegionsForCondition}} and
#' \code{\link{DMRsNoiseFilterCG}}
#' @examples
#' # load the methylation data
#' data(methylationDataList)
#'
#' #load the DMRs in CG context they were computed with minGap = 200
#' data(DMRsNoiseFilterCG)
#'
#'
#' #merge the DMRs
#' DMRsNoiseFilterCGLarger <- mergeDMRsIteratively(DMRsNoiseFilterCG[1:100],
#' minGap = 500, respectSigns = TRUE,
#' methylationDataList[["WT"]],
#' methylationDataList[["met1-3"]],
#' context = "CG", minProportionDifference=0.4,
#' minReadsPerCytosine = 1, pValueThreshold=0.01,
#' test="score",alternative = "two.sided")
#'
#'
#' \dontrun{
#' #set genomic coordinates where to compute DMRs
#' regions <- GRanges(seqnames = Rle("Chr3"), ranges = IRanges(1,1E5))
#'
#' # compute DMRs and remove gaps smaller than 200 bp
#' DMRsNoiseFilterCG200 <- computeDMRs(methylationDataList[["WT"]],
#' methylationDataList[["met1-3"]], regions = regions,
#' context = "CG", method = "noise_filter",
#' windowSize = 100, kernelFunction = "triangular",
#' test = "score", pValueThreshold = 0.01,
#' minCytosinesCount = 1, minProportionDifference = 0.4,
#' minGap = 200, minSize = 0, minReadsPerCytosine = 1,
#' cores = 1)
#'
#' DMRsNoiseFilterCG0 <- computeDMRs(methylationDataList[["WT"]],
#' methylationDataList[["met1-3"]], regions = regions,
#' context = "CG", method = "noise_filter",
#' windowSize = 100, kernelFunction = "triangular",
#' test = "score", pValueThreshold = 0.01,
#' minCytosinesCount = 1, minProportionDifference = 0.4,
#' minGap = 0, minSize = 0, minReadsPerCytosine = 1,
#' cores = 1)
#' DMRsNoiseFilterCG0Merged200 <- mergeDMRsIteratively(DMRsNoiseFilterCG0,
#' minGap = 200, respectSigns = TRUE,
#' methylationDataList[["WT"]],
#' methylationDataList[["met1-3"]],
#' context = "CG", minProportionDifference=0.4,
#' minReadsPerCytosine = 1, pValueThreshold=0.01,
#' test="score",alternative = "two.sided")
#'
#' #check that all newley computed DMRs are identical
#' print(all(DMRsNoiseFilterCG200 == DMRsNoiseFilterCG0Merged200))
#'
#' }
#'
#' @author Nicolae Radu Zabet
#'
#' @export
mergeDMRsIteratively <- function(DMRs,
minGap,
respectSigns = TRUE,
methylationData1,
methylationData2,
context = "CG",
minProportionDifference=0.4,
minReadsPerCytosine = 4,
pValueThreshold=0.01,
test="fisher",
alternative = "two.sided",
cores = 1){
##Parameters checking
cat("Parameters checking ...\n")
.validateMethylationData(methylationData1, variableName="methylationData1")
.validateMethylationData(methylationData2, variableName="methylationData2")
.validateContext(context)
.validateGRanges(DMRs, generateGenomeWide=FALSE, variableName="DMRs", minLength=NULL)
.validateStatisticalTest(test)
.stopIfNotAll(c(!is.null(pValueThreshold),
is.numeric(pValueThreshold),
pValueThreshold > 0,
pValueThreshold < 1),
" the p-value threshold needs to be in the interval (0,1)")
.stopIfNotAll(c(!is.null(minProportionDifference),
is.numeric(minProportionDifference),
minProportionDifference > 0,
minProportionDifference < 1),
" the minimum difference in methylation needs to be in the interval (0,1)")
.stopIfNotAll(c(.isInteger(minGap, positive=TRUE)),
" the minimum gap between DMRs is an integer higher or equal to 0")
.stopIfNotAll(c(.isInteger(minReadsPerCytosine, positive=TRUE)),
" the minimum average number of reads in a DMR is an integer higher or equal to 0")
.stopIfNotAll(c(.isInteger(cores, positive=TRUE)),
" the number of cores to use when computing the DMRs.")
totalRegion <- reduce(DMRs, drop.empty.ranges=FALSE, min.gapwidth=minGap, ignore.strand=TRUE)
contextMethylationData1 <- methylationData1[methylationData1$context%in%context]
rm(methylationData1)
localContextMethylationData1 <- contextMethylationData1[queryHits(findOverlaps(contextMethylationData1, totalRegion))]
rm(contextMethylationData1)
contextMethylationData2 <- methylationData2[methylationData2$context%in%context]
rm(methylationData2)
localContextMethylationData2 <- contextMethylationData2[queryHits(findOverlaps(contextMethylationData2, totalRegion))]
rm(contextMethylationData2)
localContextMethylationData <- .joinMethylationData(localContextMethylationData1, localContextMethylationData2)
rm(localContextMethylationData1, localContextMethylationData2)
cat("Merge DMRs iteratively ...\n")
return(.smartMergeDMRs(DMRs,
minGap = minGap,
respectSigns = respectSigns,
methylationData = localContextMethylationData,
minProportionDifference=minProportionDifference,
minReadsPerCytosine = minReadsPerCytosine,
pValueThreshold=pValueThreshold,
test=test,
alternative = alternative,
cores = cores))
}
#' This function extracts from the methylation data the total number of reads,
#' the number of methylated reads and the number of cytosines in the specific
#' context from a region (e.g. DMRs)
#'
#' @title Analyse reads inside regions for condition
#' @param regions a \code{\link{GRanges}} object with a list of regions on the
#' genome; e.g. could be a list of DMRs
#' @param methylationData the methylation data in one condition
#' (see \code{\link{methylationDataList}}).
#' @param context the context in which to extract the reads (\code{"CG"},
#' \code{"CHG"} or \code{"CHH"}).
#' @param label a string to be added to the columns to identify the condition
#' @param cores the number of cores used to compute the DMRs.
#' @return a \code{\link{GRanges}} object with additional four metadata columns
#' \describe{
#' \item{sumReadsM}{the number of methylated reads}
#' \item{sumReadsN}{the total number of reads}
#' \item{proportion}{the proportion methylated reads}
#' \item{cytosinesCount}{the number of cytosines in the regions}
#' }
#' @seealso \code{\link{filterDMRs}}, \code{\link{computeDMRs}},
#' \code{\link{DMRsNoiseFilterCG}}, and \code{\link{mergeDMRsIteratively}}
#' @examples
#'
#' # load the methylation data
#' data(methylationDataList)
#'
#' #load the DMRs in CG context. These DMRs were computed with minGap = 200.
#' data(DMRsNoiseFilterCG)
#'
#' #retrive the number of reads in CHH context in WT
#' DMRsNoiseFilterCGreadsCHH <- analyseReadsInsideRegionsForCondition(
#' DMRsNoiseFilterCG[1:10],
#' methylationDataList[["WT"]], context = "CHH",
#' label = "WT")
#'
#'
#' @author Nicolae Radu Zabet
#'
#' @export
analyseReadsInsideRegionsForCondition <- function(regions,
methylationData,
context,
label = "",
cores = 1){
##Parameters checking
cat("Parameters checking ...\n")
.validateGRanges(regions, generateGenomeWide=FALSE, variableName="regions", minLength=NULL)
.validateMethylationData(methylationData, variableName="methylationData")
.validateContext(context)
.stopIfNotAll(c(.isInteger(cores, positive=TRUE)),
" the number of cores to use when computing the DMRs.")
cat("Extract methylation levels in corresponding context ...\n")
contextMethylationData <- methylationData[methylationData$context%in%context]
rm(methylationData)
cat("Compute reads inside each region ...\n")
if(length(regions) > 0){
if(cores > 1){
cat("Analyse reads in regions using ", cores, "cores\n")
regionsList <- split(regions, rep(1:cores, length.out=length(regions)))
.analyseReadsInsideRegionsForConditionLoop = function(i){
regionsLocal <- .analyseReadsInsideRegionsForCondition(regionsList[[i]],
contextMethylationData,
label = label,
context = context)
return(regionsLocal)
}
regions <- parallel::mclapply(1:length(regionsList),
.analyseReadsInsideRegionsForConditionLoop, mc.cores = cores)
regions <- unlist(GRangesList(regions))
} else{
regions <- .analyseReadsInsideRegionsForCondition(regions,
contextMethylationData,
label = label,
context = context)
}
regions <- regions[order(regions)]
}
return(regions)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.