### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### cometh: Estimate within-sample, within-fragment co-methylation.
###
# TODO: Merge cometh() and mantelhaen(), i.e., make mantelhaen() an option
# in cometh() via the 'method' argument.
# TODO: Should pair_type even by an option? That is, should the user have already
# decided whether to remove pairs of loci with NIL > 0? If so, provide a function
# filter_nil().
# TODO: All filter_* functions should identically either retain "filtered" or
# remove "filtered" objects. Use the same logic as dplyr.
# TODO: Finish documenting.
# TODO: Consistency of arguments, e.g., pair_type vs. pair.type
#' Estimate within-sample, within-fragment co-methylation using 2-tuples.
#'
#' @param methpat A \code{\link{MethPat}} object containing 2-tuples.
#' @param pair_type A character string giving the type of pairs to be
#' constructed when computing correlations. One of "\code{neighbours}" or
#' "\code{all}", can be abbreviated. Please see the below section, "Choice of
#' 'pair_type'", for details.
#' @param method A character string indicating which statistic to use to
#' estimate co-methylation. This must be (an abbreviation of) one of the
#' strings "\code{lor}" (\eqn{log_{2}(odds-ratio)}) or
#' "\code{pearson}" (Pearson product-moment correlation coefficient). Please
#' see the below section, "Choice of statistic", for details.
#' @param min_cov An \code{integer} specifying the minimum coverage required
#' in order to use a 2-tuple when estimating co-methylation.
#' @param alternative Indicates the alternative hypothesis and must be one of
#' "two.sided", "greater" or "less". You can specify just the initial letter.
#' "greater" corresponds to positive association, "less" to negative
#' association.
#' @param feature An optional \code{\link[GenomicRanges]{GRanges}} object with
#' the locations of a "genomic feature". This
#' \code{\link[GenomicRanges]{GRanges}} object must be disjoint (see
#' \code{\link[GenomicRanges]{isDisjoint}}). Please see the
#' below section, "Stratifying pairs by a genomic feature", for details.
#' @param offset A \code{numeric} vector with length 1 used when computing
#' M-values (default: 0.5).
#'
#' @section Choice of 'pair_type': \strong{TODO}
#'
#' @section Choice of statistic: \strong{TODO}
#'
#' @section Stratifying pairs by a genomic feature: \strong{TODO}
#'
#' @export
cometh <- function(methpat,
pair_type = c('all', 'strict_adjacent'),
ref_loci,
method = c("lor", "pearson"),
min_cov = 5L,
alternative = c("two.sided", "less", "greater"),
conf.level = 0.95,
feature,
offset = 0.5) {
# TODO: Check if this likely to occur in practice
if (nrow(methpat) > .Machine$integer.max) {
stop(paste0("Sorry, 'methLevelCor' doesn't yet support 'methpat' objects ",
"with more than ", .Machine$integer.max,
" (.Machine$integer.max) rows."))
}
if (!is(methpat, "MethPat") || size(methpat) != 2L) {
stop("'methpat' must be a 'MethPat' object containing 2-tuples.")
}
pair_type <- match.arg(pair_type)
if (pair_type == "strict_adjacent") {
stop("Sorry, 'pair_type = \"strict_adjacent\"' not yet implemented.")
# TODO: Check how many samples in methpat. It's easiest to implement
# strict_adjacent if there is only 1 sample in methpat. Otherwise have to have
# a different ref_loci for each sample and deal with that.
if (missing(ref_loci)) {
stop("If 'pair_type' = 'strict_adjacent', then must supply 'ref_loci',")
}
if(!is(ref_loci, "MTuples") || size(ref_loci) != 1) {
stop(paste0("'ref_loci' must be an 'MTuples' object containing ",
"1-tuples of methylation loci."))
}
seqinfo <- try(merge(seqinfo(methpat), seqinfo(ref_loci)), silent = TRUE)
if (is(seqinfo, "try-error")) {
# TODO: Stricter check of seqinfo compatability, e.g., identical?
stop("'methpat' and 'ref_loci' have incompatible 'seqinfo'.")
}
if (!all(methtype(methpat) %in% methtype(ref_loci))) {
stop("'methpat' and 'mtuples' have incompatible 'seqinfo'.")
}
# TODO: Check that all loci in methpat are also in ref_loci.
}
method <- match.arg(method)
min_cov <- as.integer(min_cov)
alternative <- match.arg(alternative)
if (!missing(conf.level) &&
(length(conf.level) != 1 || !is.finite(conf.level) || conf.level < 0 ||
conf.level > 1)) {
stop("'conf.level' must be a single number between 0 and 1")
}
if (!missing(feature)) {
rd_start <- GRanges(seqnames(methpat), IRanges(start(methpat), width = 1L),
strand(methpat), seqinfo = seqinfo(methpat))
rd_end <- GRanges(seqnames(methpat), IRanges(end(methpat), width = 1L),
strand(methpat), seqinfo = seqinfo(methpat))
pair_feature_status <- Rle(overlapsAny(rd_start, feature) +
overlapsAny(rd_end, feature))
rm(rd_start, rd_end)
} else {
pair_feature_status <- Rle(NA, nrow(methpat))
}
cov <- getCoverage(methpat)
# mc = those loci with the minimum coverage. Only compute statistic for these
# loci.
mc <- cov >= min_cov & !is.na(cov)
# TODO: Does a forced rm() actually help or will it be gc-ed anyway?
rm(cov)
if (method == "lor") {
statistic <- log2(assay(methpat, "MM")[mc] + offset) +
log2(assay(methpat, "UU")[mc] + offset) -
log2(assay(methpat, "MU")[mc] + offset) -
log2(assay(methpat, "UM")[mc] + offset)
# Compute confidence interval
sigma <- sqrt(1 / (assay(methpat, "MM")[mc] + offset) +
1 / (assay(methpat, "MU")[mc] + offset) +
1 / (assay(methpat, "UM")[mc] + offset) +
1 / (assay(methpat, "UU")[mc] + offset))
CI_lower <- switch(alternative,
less = -Inf,
greater = statistic - sigma * qnorm(conf.level),
two.sided = statistic - sigma *
qnorm((1 + conf.level) / 2))
CI_upper <- switch(alternative,
less = statistic + sigma * qnorm(conf.level),
greater = Inf,
two.sided = statistic + sigma *
qnorm((1 + conf.level) / 2))
} else if (method == "pearson") {
# Compute Pearson correlation (equivalent to phi coefficient).
num <- assay(methpat, "MM")[mc] * assay(methpat, "UU")[mc] -
assay(methpat, "UM")[mc] * assay(methpat, "MU")[mc]
denom <- sqrt(assay(methpat, "MM")[mc] + assay(methpat, "MU")[mc]) *
sqrt((assay(methpat, "UM")[mc] + assay(methpat, "UU")[mc])) *
sqrt((assay(methpat, "MM")[mc] + assay(methpat, "UM")[mc])) *
sqrt((assay(methpat, "MU")[mc] + assay(methpat, "UU")[mc]))
statistic <- num / denom
# Compute confidence interval using the Fisher transformation
# TODO: Can produce "In atanh(statistic) : NaNs produced".
# Suspect this is due to NA in statistic but haven't been able to
# reproduce.
z <- atanh(statistic)
sigma <- suppressWarnings(1 / sqrt(assay(methpat, "MM")[mc] +
assay(methpat, "MU")[mc] +
assay(methpat, "UM")[mc] +
assay(methpat, "UU")[mc] - 3L))
sigma[is.nan(sigma)] <- NA
CI_lower <- switch(alternative,
less = -Inf,
greater = z - sigma * qnorm(conf.level),
two.sided = z - sigma * qnorm((1 + conf.level) / 2))
CI_lower <- tanh(CI_lower)
CI_upper <- switch(alternative,
less = z + sigma * qnorm(conf.level),
greater = Inf,
two.sided = z + sigma * qnorm((1 + conf.level) / 2))
CI_upper <- tanh(CI_upper)
}
# This clunky construction of a data.table is necessary because working with
# Rle objects leads to integer overflow for large methpat objects.
data.table(
chr = unlist(lapply(seq_len(ncol(mc)), function(j, mc, sn) {
as.character(sn[mc[, j]])
}, mc = mc, sn = seqnames(methpat)), use.names = FALSE),
pos1 = unlist(lapply(seq_len(ncol(mc)), function(j, mc, s) {
s[mc[, j]]
}, mc = mc, s = start(methpat)), use.names = FALSE),
pos2 = unlist(lapply(seq_len(ncol(mc)), function(j, mc, e) {
e[mc[, j]]
}, mc = mc, e = end(methpat)), use.names = FALSE),
strand = unlist(lapply(seq_len(ncol(mc)), function(j, mc, strand) {
as.character(strand[mc[, j]])
}, mc = mc, strand = strand(methpat)), use.names = FALSE),
pair_feature_status = unlist(lapply(seq_len(ncol(mc)),
function(j, mc, pfs) {
as.vector(pfs[mc[, j]])
}, mc = mc, pfs = pair_feature_status),
use.names = FALSE),
sample = unlist(mapply(function(j, sn, mc) {
rep(sn, sum(mc[, j]))
}, j = seq_len(ncol(mc)), sn = colnames(methpat),
MoreArgs = list(mc = mc), SIMPLIFY = FALSE), use.names = FALSE),
statistic = statistic,
sigma = sigma,
CI_lower = CI_lower,
CI_upper = CI_upper)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.