# Copyright (C) 2015 Peter Hickey
#
# This file is part of methsim.
#
# methsim is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# methsim is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with methsim If not, see <http://www.gnu.org/licenses/>.
### =========================================================================
### MethylomeParam: An S4 class to store the parameters used to simulate
### SimulatedMethylome.
### -------------------------------------------------------------------------
###
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Design
###
### list(BSgenomeName, PartitionedMethylome, MethLevelDT, ComethDT,
### PatternFreqsDT, SampleName)
### BSgenomeName: A character vector with the name of the relevant
### BSgenome object.
### PartitionedMethylome: A PartitionedMethylome object.
### MethLevelDT: A data.table containing the
### MethylationTuples::methLevel data.
### ComethDT: A data.table containing the
### MethylationTuples::cometh data.
### MixtureWeights: A vector of weights representing the mixture of
### methylomes in the sample.
### SampleName: The sample name.
#' MethylomeParam class
#'
#' An S4 class for the parameters used by
#' \code{\link{simulate,MethylomeParam-method}}.
#'
#' @include PartitionedMethylome-class.R
#'
#' @aliases MethylomeParam
#'
#' @export
setClass("MethylomeParam",
slots = list(
BSgenomeName = "character",
PartitionedMethylome = "PartitionedMethylome",
MethLevelDT = "data.table",
ComethDT = "data.table",
MixtureWeights = "numeric",
SampleName = "character")
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Validity
###
.valid.MethylomeParam.BSgenomeName <- function(object) {
msg <- NULL
if (length(object@BSgenomeName) == 1L && is.character(object@BSgenomeName)) {
if (!(object@BSgenomeName %in% BSgenome::available.genomes())) {
msg <- Biobase::validMsg(msg, paste0("'BSgenomeName' must be an element ",
"of ",
"'BSgenome::available.genomes()'."))
}
} else {
msg <- Biobase::validMsg(msg, paste0("'BSgenomeName' must be a length one ",
"character vector."))
}
}
.valid.MethylomeParam.PartitionedMethylome <- function(object) {
msg <- NULL
if (!is(object@PartitionedMethylome, "PartitionedMethylome")) {
msg <- Biobase::validMsg(msg, paste0("'PartitionedMethylome' slot must ",
"be a 'PartitionedMethylome'."))
}
msg
}
.valid.MethylomeParam.MethLevelDT <- function(object) {
msg <- NULL
if (!is.data.table(object@MethLevelDT)) {
msg <- Biobase::validMsg(msg, paste0("'MethLevelDT' slot must be a ",
"'data.table'."))
}
if (!identical(colnames(object@MethLevelDT), c("type", "beta", "N"))) {
msg <- Biobase::validMsg(msg, paste0("Column names of the 'MethLevelDT' ",
"slot must be 'type', 'beta' and ",
"'N'."))
}
msg
}
.valid.MethylomeParam.ComethDT <- function(object) {
msg <- NULL
if (!is.data.table(object@ComethDT)) {
msg <- Biobase::validMsg(msg, paste0("'ComethDT' slot must be a ",
"'data.table'."))
}
if (!identical(colnames(object@ComethDT), c("IPD", "type", "statistic",
"N"))) {
msg <- Biobase::validMsg(msg, paste0("Column names of the 'ComethDT' ",
"slot must be 'IPD', 'type', ",
"'statistic' and 'N'." ))
}
msg
}
.valid.MethylomeParam.MixtureWeights <- function(object) {
msg <- NULL
if (!is.numeric(object@MixtureWeights)) {
msg <- Biobase::validMsg(msg, paste0("'MixtureWeights' slot must be a ",
"numeric vector."))
} else {
if (any(object@MixtureWeights > 1 || object@MixtureWeights < 0)) {
msg <- Biobase::validMsg(msg, paste0("'MixtureWeights' slot must all be ",
"between zero and one."))
}
if (!all.equal(sum(object@MixtureWeights), 1)) {
msg <- Biobase::validMsg(msg, paste0("'MixtureWeights' slot must sum to ",
"one."))
}
}
msg
}
.valid.MethylomeParam.SampleName <- function(object) {
msg <- NULL
if (!is.character(object@SampleName) | length(object@SampleName) != 1L) {
msg <- Biobase::validMsg(msg, paste0("'SampleName' slot must be a ",
"'character' vector with length 1."))
}
}
.valid.MethylomeParam <- function(object) {
# Include all .valid.MethylomeParam.* functions in this vector
msg <- c(.valid.MethylomeParam.BSgenomeName(object),
.valid.MethylomeParam.PartitionedMethylome(object),
.valid.MethylomeParam.MethLevelDT(object),
.valid.MethylomeParam.ComethDT(object),
.valid.MethylomeParam.MixtureWeights(object),
.valid.MethylomeParam.SampleName(object))
if (is.null(msg)) {
return(TRUE)
} else{
msg
}
}
S4Vectors::setValidity2("MethylomeParam", .valid.MethylomeParam)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Constructor
###
# TODO (long term): This is a barebones constructor. Might want to make
# MethLevelDT and ComethDT formal S4 classes.
#' @export
MethylomeParam <- function(BSgenomeName,
PartitionedMethylome,
MethLevelDT,
ComethDT,
MixtureWeights,
SampleName) {
# TODO: Argument checking
new("MethylomeParam",
BSgenomeName = BSgenomeName,
PartitionedMethylome = PartitionedMethylome,
MethLevelDT = MethLevelDT,
ComethDT = ComethDT,
MixtureWeights = MixtureWeights,
SampleName = SampleName)
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### simulate()
###
# A helper function called by simulate,MethylomeParam-method
.simulateMethylomeParam <- function(i, object, epsilon, ol, one_tuples,
two_tuples, dots) {
# Need to simulate a MarginalProb and LOR for all positive
# object@MixtureWeights
n_components <- sum(object@MixtureWeights > 0)
# Sample average methylation levels in each region
mldt <- object@MethLevelDT
rt <- regionType(object@PartitionedMethylome)
marginal_prob_by_region <- replicate(n_components,
.sampleMethLevelDT(mldt, rt),
simplify = "array")
# Add (resp. subtract) epsilon to zero (resp. one) elements of
# marginal_prob_by_region, otherwise the entire region will be
# zero (resp. one).
marginal_prob_by_region[marginal_prob_by_region == 1] <- 1 - epsilon
marginal_prob_by_region[marginal_prob_by_region == 0] <- epsilon
n_loci_per_region <- countSubjectHits(ol)
# Sample within-fragment co-methylation for each IPD-region_type
# combination.
# dots (...) are passed to methsim:::.sampleComethDT().
# For a seqlevel with n 1-tuples, there are (n - 1) elements of
# lor_by_pair; there is no value for the first 1-tuple because it
# has no predecessor (hence the need below to add a value of zero
# for the first 1-tuple on each seqlevel)
cdt <- object@ComethDT
pm <- object@PartitionedMethylome
# TODO: This is the slowest part, and it would be good to run the
# n_component replications in parallel. Need to pass down BPPARAM in a clever
# way to respect inheritance so that it doesn't blow up.
lor_by_pair <- replicate(n_components,
.sampleComethDT(two_tuples = two_tuples,
cometh_dt = cdt,
partitioned_methylome = pm,
dots),
simplify = "array")
# Add a value of LOR = 0 for the first methylation locus of each
# seqlevel.
cn <- paste0("component_", seq_len(n_components))
lor <- matrix(0,
nrow = sum(as.numeric(n_loci_per_region)),
ncol = n_components,
dimnames = list(NULL, cn))
first_loci <- start(seqnames(one_tuples))
lor[-c(first_loci), ] <- lor_by_pair
# Create SimulatedMethylome object
# NOTE: The column names are 'component1', ..., 'componentW',
# where W = n_components
marginal_prob <- matrix(rep(marginal_prob_by_region,
rep(n_loci_per_region, n_components)),
ncol = n_components,
dimnames = list(NULL, cn))
mixture_weights <- S4Vectors::DataFrame(lapply(object@MixtureWeights, Rle,
lengths = nrow(marginal_prob)))
colnames(mixture_weights) <- cn
assays <- S4Vectors::SimpleList(MarginalProb = marginal_prob,
LOR = lor,
MixtureWeights = mixture_weights)
new("SimulatedMethylome",
SummarizedExperiment(assays = assays, rowRanges = one_tuples))
}
# TODO: NULL vs. missing arguments; what's the best choice? (I remember
# reading some advice from Hadley on this issue).
# TODO: Investigate locus-specific average methylation levels rather than
# region-specifc methylation levels.
# TODO: Should arguments to .sampleComethDT() and/or epsilon be part of the
# MethylomeParam object? No. They specify how to construct the
# SimulatedMethylome; the same MethylomeParam can be used to create multiple
# SimulatedMethylome objects by different sampling schemes.
# TODO: Should user-messages be suppressible via suppressMessages() or a
# 'verbose' option.
#' Simulate a methylome.
#'
#' \code{simulate()} samples the distribution of average methylation level
#' (\code{MethLevelDT} slot) and the distributions of within-fragment
#' co-methylation (\code{ComethDT} slot) to compute the transition
#' probabilities under a first-order Markov process.
#'
#' @param object A \code{\link{MethylomeParam}} object.
#' @param nsim The number of methylomes to simulate using the
#' parameters given in \code{object}.
#' @param seed An object specifying if and how the random number generator
#' should be initialized ('seeded'). For the \code{MethylomeParam}
#' method, either \code{NULL} or an integer that will be used in a call to
#' \code{base::\link[base]{set.seed}} before simulating the samples
#' (methylomes). If set, the value is saved as the "\code{seed}" attribute of
#' the returned value. The default, \code{NULL}, will not change the random
#' generator state, and return \code{\link{.Random.seed}} as the "\code{seed}"
#' attribute, see 'Value'.
#' @param epsilon An offset added/subtracted to a region's sampled methylation
#' level to avoid zero/one values.
#' @param seqlevels A character vector of
#' \code{GenomeInfoDb::\link[GenomeInfoDb]{seqlevels}} at which to simulate a
#' methylome. If missing, the default is to use all available seqlevels.
#' @param BPPARAM An optional
#' \code{BiocParallel::\link[BiocParallel]{BiocParallelParam}} instance
#' determining the parallel back-end to be used during evaluation.
#' @param ... Optional arguments passed to the internal function
#' \code{methsim:::.sampleComethDT()}. Best avoided.
#'
#' @section Co-methylation:
#' \strong{TODO}: Describe \code{min_n}, \code{mean_fun}, \code{sd_fun}, etc.
#'
#' @note Currently only supports simulation of CpG methylation and unstranded
#' methylomes.
#'
#' @return A list of length \code{nsim} of SimulatedMethylome objects.
#'
#' @export
setMethod("simulate",
"MethylomeParam",
function(object,
nsim = 1,
seed = NULL,
epsilon = 0.01,
seqlevels,
BPPARAM = bpparam(),
...) {
# TODO: dots are tricky things; see
# http://adv-r.had.co.nz/Functions.html and
# http://adv-r.had.co.nz/Computing-on-the-language.html#capturing-dots
# Would like a more robust way to pass arguments to
# .sampleComethDT(), e.g. min_n, mean_fun, sd_fun. Currently,
# if the user includes a nonsense argument in dots then it will
# cause a (difficult to diagnose) error.
dots <- list(...)
# Argument checks
# Load the namespace for the BSgenome
if (!object@BSgenomeName %in% BSgenome::available.genomes()) {
stop(paste0("'", object@BSgenomeName, "' package is not ",
"available from Bioconductor."))
}
if (!requireNamespace(object@BSgenomeName, quietly = TRUE)) {
stop(paste0("'", object@BSgenomeName, "' package is required.\n",
"To install this package, start R and enter:\n",
"source('http://bioconductor.org/biocLite.R')\n",
"biocLite('", object@BSgenomeName, "')"))
} else {
bsgenome <- eval(parse(text = paste0(object@BSgenomeName,
"::", object@BSgenomeName)))
}
# Check epsilon
stopifnot(is.numeric(epsilon) & epsilon > 0 & epsilon < 1)
# TODO: Is this the best way to set default seqlevels? Can't use
# seqlevels = seqlevels(object@PartitionedMethylome) in function
# signature because of 'recursive default argument reference' error.
if (missing(seqlevels)) {
seqlevels <- GenomeInfoDb::seqlevels(object@PartitionedMethylome)
}
if (!all(seqlevels %in% GenomeInfoDb::seqlevels(bsgenome))) {
stop(paste0("Unexpected seqlevels.\n",
paste0(seqlevels[!seqlevels %in%
GenomeInfoDb::seqlevels(bsgenome)],
collapse = ", "), " are not seqlevels of ",
GenomeInfoDb::bsgenomeName(bsgenome)))
}
# Non-CpG methylation is unlikely to be implemented.
message("Currently only simulates CpG methylation.")
# TODO (long term): Support stranded methylomes.
message("Currently only simulates unstranded methylomes.")
# This chunk for handling RNG generation is based on
# stats:::simulate.lm.
if (!exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
runif(1)
}
if (is.null(seed)) {
rng_state <- get(".Random.seed", envir = .GlobalEnv)
} else {
r_seed <- get(".Random.seed", envir = .GlobalEnv)
set.seed(seed)
rng_state <- structure(seed, kind = as.list(RNGkind()))
on.exit(assign(".Random.seed", r_seed, envir = .GlobalEnv))
}
# Simulate nsim methylomes.
if (nsim >= 2) {
message("Simulating ", nsim, " methylomes...")
} else {
message("Simulating ", nsim, " methylome...")
}
# Methylation loci at which to simulate a methylation state.
exclude <- setdiff(GenomeInfoDb::seqlevels(bsgenome), seqlevels)
one_tuples <- findMTuples(bsgenome,
MethInfo("CG"),
size = 1,
exclude = exclude)
# Only want unstranded methylomes
one_tuples <- unstrand(one_tuples[strand(one_tuples) == "+"])
ol <- findOverlaps(one_tuples, object@PartitionedMethylome)
# sufficient_one_tuples are those seqlevels with at least two
# 1-tuples.
sufficient_one_tuples <- elementLengths(
split(one_tuples, seqnames(one_tuples))) > 1L
two_tuples <- endoapply(
split(one_tuples, seqnames(one_tuples))[sufficient_one_tuples],
function(x) {
n <- length(x)
MTuples(GTuples(seqnames(x)[seq_len(n - 1)],
matrix(c(start(x)[seq.int(1, n - 1)],
start(x)[seq.int(2, n)]), ncol = 2),
strand(x)[seq_len(n - 1)],
seqinfo = seqinfo(x)),
methinfo(x))
}
)
two_tuples <- unlist(two_tuples, use.names = FALSE)
# Simulate nsim SimulatedMethylome objects in parallel.
val <- bplapply(seq_len(nsim),
.simulateMethylomeParam,
object = object, epsilon = epsilon, ol = ol,
one_tuples = one_tuples,
two_tuples = two_tuples, dots = dots,
BPPARAM = BPPARAM)
# Ensure "seed" is set as an attribute of the returned value.
names(val) <- paste("sim", seq_len(nsim), sep = "_")
attr(val, "seed") <- rng_state
val
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### show()
###
# TODO (long term)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### getters/setters
###
# TODO (long term)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.