# Copyright (C) 2015 Peter Hickey
#
# This file is part of methsim.
#
# methsim is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# methsim is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with methsim If not, see <http://www.gnu.org/licenses/>.
### =========================================================================
### Sampling utility functions. These functions are not exported.
### -------------------------------------------------------------------------
#' Sample MethLevelDT
#'
#' Sample with replacement from meth_level_dt, stratified by region_type, such
#' that each region is assigned an (average) methylation level.
#' @keywords internal
.sampleMethLevelDT <- function(meth_level_dt, region_type) {
region_type_levels <- levels(region_type)
meth_level <- rep(NA_real_, length(region_type))
for (level in region_type_levels) {
mli <- which(meth_level_dt[, type] == level)
ml <- meth_level_dt[mli, ]
rti <- which(region_type == level)
n <- length(rti)
meth_level[rti] <- ml[sample(nrow(ml),
n,
replace = TRUE,
prob = ml[, N]), beta]
}
meth_level
}
#' Sample from co-methylation distribution.
#'
#' Sample with replacement from cometh_dt, stratified by region_type, such that
#' methylation loci 2-tuple is assigned a level of within-fragment
#' co-methylation (on the log odds-ratio scale i.e., lor). If there are fewer
#' than \code{min_n} observations for that IPD-type combination then the lor is
#' simulated from a Gaussian distribution, with mean given by \code{mean_fun}
#' and standard deviation given by \code{sd_fun} (see Note).
#' @param two_tuples a
#' \code{MethylationTuples::\link[MethylationTuples]{Mtuples}} containing all
#' two-tuples of methylation loci in the genome. Typically generated by
#' \code{MethylationTuples::\link[MethylationTuples]{findMTuples}}.
#' @param partitioned_methylome a \code{\link{PartitionedMethylome}} object.
#' @param min_n the minimum number of observations on a IPD-type combination in
#' order to sample from the empirical distribution as opposed to sampling from
#' the Gaussian(mean_fun(ipd, type), sd_fun(ipd, type)) distribution.
#' @param mean_fun a function specifying the mean LOR for a given IPD-type
#' combination.
#' @param sd_fun a function specifying the standard deviation of the LOR for a
#' given IPD-type combination.
#'
#' @note Good values of \code{mean_fun} and \code{sd_fun} are difficult to
#' estimate because we have little-to-no data on within-fragment co-methylation
#' for large IPDs (approximately $IPD > 200$). Fortunately, at least in the
#' human genome, more than 90\% of adjacent CpG two-tuples have an $IPD < 200$,
#' therefore this choice only affects a minority of loci. The default
#' \code{mean_fun} effectively means that the methylation state of CpGs with a
#' larger IPD are "independent on average"; the default \code{sd_fun} is based
#' on the median standard deviation of the LOR-by-IPD distribution (these
#' distributions are remarkably stable across IPD and region-type, with a value
#' close to 2).
#' Clearly it should be possible to obtain better estimates of these
#' parameters, which is the subject of ongoing research (ideas welcome!).
#'
#' @return A vector of log odds-ratios (LOR), one for each 2-tuple.
#' @keywords internal
.sampleComethDT <- function(two_tuples, cometh_dt, partitioned_methylome,
min_n = 100L, mean_fun = function(ipd, type) 0,
sd_fun = function(ipd, type) 2) {
# If a 2-tuple isn't within a region, then the region_type is (somewhat
# arbitrarily) defined by the region for the first methylation loci in the
# tuple.
within_ol <- findOverlaps(two_tuples, partitioned_methylome, type = "within")
not_within <- which(!seq_len(queryLength(within_ol)) %in%
queryHits(within_ol))
end_ol <- findOverlaps(two_tuples[not_within], partitioned_methylome,
type = "any", select = "first")
mcols(two_tuples)$regionType <-
regionType(partitioned_methylome)[c(subjectHits(within_ol), end_ol)]
region_type <- regionType(partitioned_methylome)
region_type_levels <- levels(region_type)
two_tuples_dt <- data.table(IPD = as.vector(IPD(two_tuples)),
type = regionType(partitioned_methylome)[
c(subjectHits(within_ol), end_ol)])
# Tabulate the number of occurences of each IPD-type combination (n).
two_tuples_reduced_dt <- two_tuples_dt[, list(n = .N), by = list(IPD, type)]
setkey(two_tuples_reduced_dt, IPD, type)
setkey(cometh_dt, IPD, type)
lor_list <- lapply(seq_len(nrow(two_tuples_reduced_dt)),
function(i, two_tuples_reduced_dt, cometh_dt, min_n) {
# Create a table with all the co-methylation data for that IPD-type
# combination
x <- cometh_dt[two_tuples_reduced_dt[i, ]]
# If there are a sufficient number of observations then sample from the
# empirical distribution, otherwise sample from the loess-smoothed
# Gaussian model.
if (isTRUE(x[, sum(N)] > min_n)) {
x[, sample(x = statistic, size = n, replace = TRUE, prob = N)]
} else {
rnorm(n = two_tuples_reduced_dt[i, n],
mean = mean_fun(two_tuples_reduced_dt[i, IPD],
two_tuples_reduced_dt[i, type]),
sd = sd_fun(two_tuples_reduced_dt[i, IPD],
two_tuples_reduced_dt[i, type]))
}
}, two_tuples_reduced_dt, cometh_dt, min_n)
# Extract values from lor_list and insert them appropriately for the
# two_tuples based on IPD-type combination.
two_tuples_dt[, idx := .I]
setkey(two_tuples_dt, IPD, type)
two_tuples_dt[, lor := unlist(x = lor_list, recursive = TRUE,
use.names = FALSE)]
setkey(two_tuples_dt, idx)
two_tuples_dt[, lor]
}
#' Sample PatternFreqsDT
#'
#' Sample with replacement from pattern_freqs_dt, stratified by region_type,
#' such that each region is assigned a vector of haplotype frequencies.
#' @keywords internal
.samplePatternFreqsDT <- function(pattern_freqs_dt, region_type) {
W_names <- grep("^w", colnames(pattern_freqs_dt), value = TRUE)
region_type_levels <- levels(region_type)
W <- matrix(NA_real_, ncol = length(W_names), nrow = length(region_type))
for (level in region_type_levels) {
pfi <- which(pattern_freqs_dt[, type] == level)
pf <- pattern_freqs_dt[pfi, ]
rti <- which(region_type == level)
n <- length(rti)
W[rti, ] <- as.matrix(
pf[sample(nrow(pf),
n,
replace = TRUE,
prob = pf[, N])][, W_names, with = FALSE])
}
W
}
#' Sample read start positions.
#'
#' @param n the number of reads to simulate
#' @param seqlength the length of the chromosome
#'
#' @return A sorted integer vector of read start positions.
#'
#' @note This function can return a read start positions that results in a read
#' running off the end of the chromosome, e.g., if the read start position is
#' 99, the read_length is 10 but the chromosome length is only 100.
#'
#' @keywords internal
.sampleReadStart <- function(n, seqlength) {
# TODO: Should sampling be with or without replacement?
sample(x = seqlength, size = n, replace = TRUE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.