# Copyright (C) 2015 Peter Hickey
#
# This file is part of methsim.
#
# methsim is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# methsim is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with methsim If not, see <http://www.gnu.org/licenses/>.
### =========================================================================
### WGBSParam: An S4 class to store the parameters used to simulate a
### SimulatedBS object.
### -------------------------------------------------------------------------
###
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Design
###
### list(SimulatedMethylome, AveCov, ErrorRate, SequencingType, ReadLength)
### SimulatedMethylome: A SimulatedMethylome object.
### AveCov: The average sequencing coverage to simulate.
### ErrorRate: The error rate (combined sequencing error and
### bisulfite-conversion error) to use in the simulation.
### SequencingType: The sequencing type: 'SE' (single-end) or 'PE'
### (paired-end).
### ReadLength: The read-length.
# TODO: Create a VIRTUAL class that WGBSParam inherits from. Eventually
# RRBSParam, ERRBSParam, etc. will also inherit from this VIRTUAL class.
#' WGBSParam class
#'
#' An S4 class for the parameters used by
#' \code{\link{simulate,WGBSParam-method}}.
#'
#' @include SimulatedMethylome-class.R
#'
#' @aliases WGBSParam
#'
#' @export
setClass("WGBSParam",
slots = list(
SimulatedMethylome = "SimulatedMethylome",
AveCov = "numeric",
ErrorRate = "numeric",
SequencingType = "character",
ReadLength = "integer")
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Validity
###
# TODO: Explicit checks of class types isn't necessary for S4 classes. Leaving
# for now as the conservative option until unit tests are added.
.valid.WGBSParam.SimulatedMethylome <- function(object) {
msg <- NULL
if (!is(object@SimulatedMethylome, "SimulatedMethylome")) {
msg <- Biobase::validMsg(msg, paste0("'SimulatedMethylome' slot must be ",
"a 'SimulatedMethylome' object."))
}
msg
}
.valid.WGBSParam.AveCov <- function(object) {
msg <- NULL
if (!is.numeric(object@AveCov) ||
length(object@AveCov) != 1L ||
isTRUE(object@AveCov <= 0)) {
msg <- Biobase::validMsg(msg, paste0("'AveCov' slot must be a postive ",
"number."))
}
msg
}
.valid.WGBSParam.ErrorRate <- function(object) {
msg <- NULL
if (!is.numeric(object@ErrorRate) ||
length(object@ErrorRate) != 1L ||
object@ErrorRate < 0 ||
object@ErrorRate > 1) {
msg <- Biobase::validMsg(msg, paste0("'ErrorRate' slot must be a number ",
"between 0 and 1."))
}
msg
}
.valid.WGBSParam.SequencingType <- function(object) {
msg <- NULL
if (!is.character(object@SequencingType) ||
length(object@SequencingType) != 1L ||
(!object@SequencingType %in% c("SE", "PE"))) {
msg <- Biobase::validMsg(msg, paste0("'SequencingType' slot must be 'SE' ",
"(single-end) or 'PE' (paired-end)."))
}
}
.valid.WGBSParam.ReadLength <- function(object) {
msg <- NULL
if (!is.integer(object@ReadLength) ||
length(object@ReadLength) != 1L ||
object@ReadLength < 1)
msg <- Biobase::validMsg(msg, paste0("'ReadLength' slot must be a ",
"positive integer."))
}
.valid.WGBSParam <- function(object) {
# Include all .valid.WGBSParam.* functions in this vector
msg <- c(.valid.WGBSParam.SimulatedMethylome(object),
.valid.WGBSParam.AveCov(object),
.valid.WGBSParam.ErrorRate(object),
.valid.WGBSParam.SequencingType(object),
.valid.WGBSParam.ReadLength(object))
if (is.null(msg)) {
return(TRUE)
} else{
msg
}
}
S4Vectors::setValidity2("WGBSParam", .valid.WGBSParam)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Constructor
###
#' @export
WGBSParam <- function(SimulatedMethylome,
AveCov = 30L,
ErrorRate = 0.01,
SequencingType = "SE",
ReadLength = 100L) {
# Avoid problem when user specifies ReadLength as numeric, e.g., 100 vs. 100L.
ReadLength <- as.integer(ReadLength)
# TODO: Argument checks
new("WGBSParam",
SimulatedMethylome = SimulatedMethylome,
AveCov = AveCov,
ErrorRate = ErrorRate,
SequencingType = SequencingType,
ReadLength = ReadLength)
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### seqlevels()
###
setMethod("seqlevels",
"WGBSParam",
function(x) {
seqlevels(x@SimulatedMethylome)
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### seqlengths()
###
setMethod("seqlengths",
"WGBSParam",
function(x) {
seqlengths(x@SimulatedMethylome)
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### simulate()
###
# A helper function called by simulate,WGBSParam-method
.simulateWGBSParam <- function(i, object, seqlevels, simplify, BPPARAM) {
# Need to simulate compute a transition matrix, P, for all components.
# object@MixtureWeights
n_components <- ncol(object@SimulatedMethylome)
# Sample read start sites based on uniform sampling with given
# average sequencing coverage (aveCov).
# TODO: This assumes constant readLength; this code will need
# modification if this assumption is changed.
n_reads <- as.list(trunc(object@AveCov / object@ReadLength *
seqlengths(object)[seqlevels]))
# TODO: Perhaps the number of reads per-chromosome should be
# sampled from a multinomial(sum(n_reads), n_reads)?
# Don't simulate read_start in parallel, e.g., via bpmapply().
# It needlessly complicates things (reproducibility of random
# numbers when generated in parallel is hard) and any speed ups are
# swamped by the running times of other steps in this function.
read_start <- mapply(function(n, seqlength) {
.sampleReadStart(n, seqlength)
}, n = n_reads, seqlength = seqlengths(object)[seqlevels], SIMPLIFY = FALSE)
# Sorting makes things easier to keep track of, and sorting an
# integer vector is easier than sorting more complicated objects (e.g.,
# GRanges objects).
read_start <- bplapply(read_start, sort, BPPARAM = BPPARAM)
# Compute the transition probabilities.
# TODO: The order of the Markov chain (mc_order) is currently hard-coded.
mc_order <- 1L
P <- bplapply(seq_len(n_components),
function(i, object, mc_order) {
.computeP(assay(object@SimulatedMethylome, "MarginalProb",
withDimnames = FALSE)[, i, drop = FALSE],
assay(object@SimulatedMethylome, "LOR",
withDimnames = FALSE)[, i, drop = FALSE],
mc_order)
}, object = object, mc_order = mc_order, BPPARAM = BPPARAM)
# TODO: This is currently not RNG-safe since random
# numbers are generated within the parallel process.
# Find reads that overlap methylation loci and then sample a
# methylation pattern for each such read.
# TODO: Take care if simulate() itself is being run in parallel
# (or at least document that it could spawn heaps of processes).
z <- bplapply(names(read_start), function(seqlevel,
read_start,
object,
P,
simplify) {
# TODO: This may cause warnings (at least when this isn't run
# in parallel, which causes warning()s to be suppressed). These
# warnings will occur if a read runs "off the end" of the
# seqlevel.
rs <- read_start[[seqlevel]]
# Suppress warnings about out-of-bound ranges.
gr <- suppressWarnings(
GRanges(seqlevel,
IRanges(rs, width = object@ReadLength),
seqinfo = seqinfo(object@SimulatedMethylome))
)
# Address the above (possible) warning about out-of-bound ranges.
gr <- trim(gr)
ol <- findOverlaps(gr, object@SimulatedMethylome)
# Find all reads with the same overlaps.
# This assumes that a read sequences contiguous methylation loci, i.e., no
# gaps.
# TODO: Re-write if allowing gaps (i.e., if
# sequencingType = "PE" is implemented).
nh <- countQueryHits(ol)
# Exclude reads with no hits
nh <- nh[nh > 0L]
# Assign reads to components of the mixture
mixture_weights <- as.vector(
unlist(assay(object@SimulatedMethylome, "MixtureWeights",
withDimnames = FALSE)[1, ], use.names = FALSE))
component <- sample(x = n_components,
size = length(nh),
replace = TRUE,
prob = mixture_weights)
hits_dt <- data.table(qh = unique(queryHits(ol)),
fh = na.omit(selectHits(ol, "first")),
nh = nh,
component = component,
key = c("fh", "nh", "component"))
hits_dt <- hits_dt[, .N, by = key(hits_dt)]
# Rcpp can't yet return long vectors nor can mclapply()-based
# functions. We could get around the Rcpp limitation by processing in
# suitably sized batches. However, this won't work for the
# data.table-based portions of this code because it also can't work with
# long vectors. Furthermore, there is likely to be problems with
# returning such a large object when running in parallel. Therefore, we
# throw an error if this occurs. A general solution will be difficult.
if (sum(hits_dt[, nh] * as.numeric(hits_dt[, N])) >
.Machine$integer.max) {
stop(paste0(seqlevel, ": Number of simulated methylation loci ",
"> ", .Machine$integer.max, " (.Machine$integer.max). ",
"Sorry, this is not yet supported. Try reducing the average",
"sequencing coverage ('AveCov' slot of the 'WGBSParam' ",
"object)."))
}
# Simulate N paths for each row of hits_dt, stratified by mixture component.
zz <- .simulatez(hits_dt[, fh],
hits_dt[, nh],
hits_dt[, N],
hits_dt[, component],
assay(object@SimulatedMethylome,
"MarginalProb", withDimnames = FALSE),
P)
# Introduce sequencing error + bisulfite-conversion error i.e.,
# flip elements of z[[i]]$z s.t. Prob(flip) = object@errorRate.
# TODO: RNG at C++ level if possible
.simulateErrorInPlace(zz$z,
runif(length(zz$z)),
object@ErrorRate)
setDT(zz)
zz <- cbind(zz, pos = start(object@SimulatedMethylome)[zz[, h]])
zz[, h := NULL]
setcolorder(zz, c("pos", "readID", "z"))
if (!simplify) {
zz
} else {
return(.makePosAndCounts(zz, size = simplify))
}
}, read_start = read_start, object = object, P = P, simplify = simplify,
BPPARAM = BPPARAM)
# Don't rbindlist(z). Instead, keeping as list will actually save memory (no
# need to retain seqnames for every row) and allow easier parallelisation by
# seqlevel. But must ensure seqlevels are set as names(z).
names(z) <- names(read_start)
seqinfo <- seqinfo(object@SimulatedMethylome)
methinfo <- methinfo(object@SimulatedMethylome)
if (!simplify) {
# Construct SimulatedBS object.
sbs <- new("SimulatedBS",
z = z,
seqinfo = seqinfo,
methinfo = methinfo,
SampleName = paste("sim", i, sep = "_"))
return(sbs)
} else {
seqnames <- Rle(names(z), sapply(lapply(z, "[[", "pos"), nrow))
pos <- do.call(rbind, lapply(z, "[[", "pos"))
counts <- lapply(seq_len(2 ^ simplify), function(i, z) {
matrix(unlist(lapply(lapply(z, "[[", "counts"), "[[", i),
use.names = FALSE), ncol = 1L)
}, z = z)
names(counts) <- MethylationTuples:::.makeMethPatNames(simplify)
counts <- lapply(counts, `colnames<-`, paste("sim", i, sep = "_"))
MethPat(assays = counts,
rowRanges = MTuples(GTuples(seqnames, pos, "*", seqinfo = seqinfo),
methinfo = methinfo))
}
}
# TODO: Should user-messages be suppressible via suppressMessages() or a
# 'verbose' option.
#' Simulate a bisulfite-sequencing experiment.
#'
#' @param object A \code{\link{WGBSParam}} object.
#' @param nsim The number of samples to simulate using the parameters given in
#' \code{object}. Additional samples will be technical replicates.
#' @param seed An object specifying if and how the random number generator
#' should be initialized ('seeded'). For the "WGBSParam" method, either
#' \code{NULL} or an integer that will be used in a call to
#' \code{base::\link[base]{set.seed}} before simulating the samples. If set,
#' the value is saved as the "\code{seed}" attribute of the returned value. The
#' default, \code{NULL}, will not change the random generator state, and return
#' \code{\link{.Random.seed}} as the "\code{seed}" attribute, see 'Value'.
#' @param seqlevels A character vector of
#' \code{GenomeInfoDb::\link[GenomeInfoDb]{seqlevels}} at which to
#' bisulfite-sequencing reads. If missing, the default is to use all available
#' seqlevels.
#' @param simplify An integer specifying whether, and by how much, the object
#' should be simplified, see 'Value'.
#' @param BPPARAM an optional
#' \code{BiocParallel::\link[BiocParallel]{BiocParallelParam}} instance
#' determining the parallel back-end to be used during evaluation.
#'
#' @return The return value depends on the value of \code{simplify}. The
#' default (\code{simplify = 0}) means no simplification and the returned
#' object is a \code{list} of length \code{nsim} containing
#' \code{\link{SimulatedBS}} objects. If \code{simplify} is greater than zero,
#' the returned object is a
#' \code{MethylationTuples::\link[MethylationTuples]{MethPat}} object with
#' \code{\link[MethylationTuples]{size} = simplify} and \code{ncol = simplify}.
#'
#' @note Currently only simulates whole-genome bisulfite-sequencing data.
#'
#' @section Warnings:
#' \itemize{
#' \item Reads are \strong{not} yet simulated for circular seqlevels such as
#' 'chrM' (mitochondrial DNA).
#' \item Only single-end sequencing ('SE') is currently supported.
#' \item This is currently not RNG-safe since random numbers are generated
#' within the parallel process and at the \code{Rcpp} level. Therefore,
#' results may not be reproducible, even given the same \code{seed}.
#' \strong{This is a work in progress and will be fixed.}
#' \item The \code{nsim} simulations are currently simulated in
#' \strong{serial} (but steps of each simulation may be run in parallel).
#' }
#'
#' @export
setMethod("simulate",
"WGBSParam",
function(object,
nsim = 1,
seed = NULL,
seqlevels,
simplify = 0L,
BPPARAM = bpparam()) {
# Argument checks
# Only single-end sequencing currently supported
if (object@SequencingType != "SE") {
stop(paste0("Only single-end sequencing is currently supported.",
"\nPlease modify the 'WGBSParam' object accordingly."))
}
# TODO: Is this the best way to set default seqlevels? Can't use
# seqlevels = seqlevels(object@PartitionedMethylome) in function
# signature because of 'recursive default argument reference' error.
# TODO: Propose a seqlevelsInUse,SummarizedExperiment-method.
valid_seqlevels <- GenomeInfoDb::seqlevelsInUse(
rowRanges(object@SimulatedMethylome))
if (missing(seqlevels)) {
# Only use seqlevels that are "active" in the SimulatedMethylome
# object
seqlevels <- valid_seqlevels
} else {
# Check that supplied seqlevels are valid
if (!all(seqlevels %in% valid_seqlevels)) {
stop(paste0("Unexpected seqlevels.\n",
paste0(seqlevels[!seqlevels %in% valid_seqlevels],
collapse = ", "), " are not seqlevels of ",
"'WGBSParam'."))
}
}
if (simplify < 0 || simplify != as.integer(simplify)) {
stop(paste0("'simplify' must be an integer greater than or equal
to zero."))
}
simplify <- as.integer(simplify)
# TODO: Will need to revisit how seed is set and (pseudo) random
# numbers are generated due to the use of BiocParallel and Rcpp*.
# This chunk for handling RNG generation is based on
# stats:::simulate.lm.
if (!exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
runif(1)
}
if (is.null(seed)) {
rng_state <- get(".Random.seed", envir = .GlobalEnv)
} else {
r_seed <- get(".Random.seed", envir = .GlobalEnv)
set.seed(seed)
rng_state <- structure(seed, kind = as.list(RNGkind()))
on.exit(assign(".Random.seed", r_seed, envir = .GlobalEnv))
}
# TODO (long term): Support stranded data.
warning("Currently only simulates unstranded data.")
# TOOD: Add support for paired-end reads.
warning("Currently only simulates single-end reads.")
# TODO: Fix RNG
warning("Random number generation is not yet reproducible.")
if (nsim >= 2) {
message("Simulating ", nsim, " bisulfite-sequencing samples...")
} else {
message("Simulating ", nsim, " bisulfite-sequencing sample...")
}
# TODO: Circular chromosomes are hard. While the read automatically
# gets wrapped around by GRanges(), it makes subsequent functions,
# e.g., asMethPat(), more complicated. So, for now, I do not allow
# simulation of reads for circular chromosomes.
is_circ <- isCircular(
seqinfo(object@SimulatedMethylome))[seqlevels]
if (any(is_circ)) {
warning(paste0("No reads will be simulated for ",
paste0(seqlevels[is_circ], collapse = ", "),
" (circular seqlevels not yet supported)."))
}
# Remove circular seqlevels from seqlevels.
seqlevels <- seqlevels[!is_circ]
if (length(seqlevels) == 0L) {
# TODO: Return the "empty" object instead of a warning message.
stop("No reads simulated (all seqlevels are circular).")
}
# TODO: Allow simulation in parallel.
# Simulate nsim objects in *serial*.
val <- lapply(seq_len(nsim),
.simulateWGBSParam,
object = object, seqlevels = seqlevels,
simplify = simplify, BPPARAM = BPPARAM)
names(val) <- paste("sim", seq_len(nsim), sep = "_")
if (simplify) {
# Combine MethPat objects into one.
# TODO: Why doesn't do.call(combine, val) work? Check out
# other combine methods, e.g., combine,data.frame-method.
val <- Reduce(combine, val)
}
# Ensure "seed" is set as an attribute of the returned value.
attr(val, "seed") <- rng_state
val
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### getters/setters
###
# TODO (long term)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.