#' Calculate sequence background.
#'
#' For a set of input sequences, calculate the overall sequence background for
#' any k-let size. For very large sequences DNA and RNA sequences (in the billions of bases),
#' please be aware of the much faster and more efficient
#' \code{\link[Biostrings:nucleotideFrequency]{Biostrings::oligonucleotideFrequency()}}.
#' [get_bkg()] can still be used in these cases, though it may take several seconds or
#' minutes to calculate the results (depending on requested k-let sizes).
#'
#' @param sequences \code{\link{XStringSet}} Input sequences. Note that if
#' multiple sequences are present, the results will be combined into one
#' (unless `merge.res = FALSE`).
#' @param k `integer` Size of k-let. Background can be calculated for any
#' k-let size.
#' @param as.prob Deprecated.
#' @param pseudocount `integer(1)` Add a count to each possible k-let. Prevents
#' any k-let from having 0 or 1 probabilities.
#' @param alphabet `character(1)` Provide a custom alphabet to calculate a
#' background for. If `NULL`, then standard letters will be assumed for
#' DNA, RNA and AA sequences, and all unique letters found will be used
#' for `BStringSet` type sequences. Note that letters which are not a part
#' of the standard DNA/RNA/AA alphabets or in the provided alphabet will
#' not be counted in the totals during probability calculations.
#' @param to.meme If not `NULL`, then [get_bkg()] will return the sequence
#' background in MEME Markov Background Model format. Input for this argument
#' will be used for `cat(..., file = to.meme)` within [get_bkg()]. See
#' \url{http://meme-suite.org/doc/bfile-format.html} for a description of
#' the format.
#' @param RC `logical(1)` Calculate the background of the reverse complement
#' of the input sequences as well. Only valid for DNA/RNA.
#' @param list.out Deprecated.
#' @param nthreads `numeric(1)` Run [get_bkg()] in parallel with `nthreads`
#' threads. `nthreads = 0` uses all available threads.
#' Note that no speed up will occur for jobs with only a single sequence.
#' @param merge.res `logical(1)` Whether to merge results from all sequences
#' or return background data for individual sequences.
#' @param window `logical(1)` Determine background in windows.
#' @param window.size `numeric` Window size. If a number between 0 and 1 is
#' provided, the value is calculated as the number multiplied by the sequence
#' length.
#' @param window.overlap `numeric` Overlap between windows. If a number
#' between 0 and 1 is provided, the value is calculated as the number
#' multiplied by the sequence length.
#'
#' @return
#' If `to.meme = NULL`, a `DataFrame` with columns `klet`, `count`,
#' and `probability`. If `merge.res = FALSE`, there will be an additional
#' `sequence` column. If `window = TRUE`, there will be an additional `start`
#' and `stop` columns.
#'
#' If `to.meme` is not `NULL`, then `NULL` is returned, invisibly.
#'
#' @examples
#' ## Compare to Biostrings version
#' library(Biostrings)
#' seqs.DNA <- create_sequences()
#' bkg.DNA <- get_bkg(seqs.DNA, k = 3)
#' bkg.DNA2 <- oligonucleotideFrequency(seqs.DNA, 3, 1, as.prob = FALSE)
#' bkg.DNA2 <- colSums(bkg.DNA2)
#' all(bkg.DNA$count == bkg.DNA2)
#'
#' ## Create a MEME background file
#' get_bkg(seqs.DNA, k = 1:3, to.meme = stdout(), pseudocount = 1)
#'
#' ## Non-DNA/RNA/AA alphabets
#' seqs.QWERTY <- create_sequences("QWERTY")
#' bkg.QWERTY <- get_bkg(seqs.QWERTY, k = 1:2)
#'
#' @references
#'
#' Bailey TL, Elkan C (1994). “Fitting a mixture model by expectation
#' maximization to discover motifs in biopolymers.” *Proceedings of
#' the Second International Conference on Intelligent Systems for
#' Molecular Biology*, **2**, 28-36.
#'
#' @seealso [create_sequences()], [scan_sequences()], [shuffle_sequences()]
#' @author Benjamin Jean-Marie Tremblay, \email{benjamin.tremblay@@uwaterloo.ca}
#' @export
get_bkg <- function(sequences, k = 1:3, as.prob = NULL, pseudocount = 0,
alphabet = NULL, to.meme = NULL, RC = FALSE, list.out = NULL, nthreads = 1,
merge.res = TRUE, window = FALSE, window.size = 0.1, window.overlap = 0) {
# TODO: return.granges (only for when window = TRUE?)
# param check --------------------------------------------
args <- as.list(environment())
all_checks <- character(0)
if (any(k < 1)) {
k_check <- paste0(" * Incorrect 'k': values below 1 are not allowed; found `",
paste0(k[k < 1], collapse = ", "), "`")
all_checks <- c(all_checks, k_check)
}
s4_check <- check_fun_params(list(sequences = args$sequences),
numeric(), logical(), TYPE_S4)
num_check <- check_fun_params(list(k = args$k, pseudocount = args$pseudocount),
c(0, 1), c(FALSE, FALSE), TYPE_NUM)
char_check <- check_fun_params(list(alphabet = args$alphabet), 1,
TRUE, TYPE_CHAR)
logi_check <- check_fun_params(list(RC = args$RC),
numeric(), logical(), TYPE_LOGI)
all_checks <- c(all_checks, char_check, num_check, s4_check, logi_check)
if (length(all_checks) > 0) stop(all_checks_collapse(all_checks))
#---------------------------------------------------------
if (!is.null(as.prob))
warning(wmsg("`as.prob` has been disabled and now does nothing ",
"(both counts and probabilities are now shown)"), immediate. = TRUE)
if (!is.null(list.out))
warning(wmsg("`list.out` has been disabled and now does nothing ",
"(all posisble outputs are now returned as DataFrames)"), immediate. = TRUE)
if (!is.null(to.meme) && window)
stop("`window = TRUE` is not valid if `to.meme` is not `NULL`")
if (!is.null(to.meme) && !merge.res)
stop("`merge.res = FALSE` is not valid if `to.meme` is not `NULL`")
pseudocount <- as.integer(pseudocount)
k <- as.integer(k)
if (RC && seqtype(sequences) %in% c("DNA", "RNA"))
sequences <- c(sequences, reverseComplement(sequences))
if (!is.null(to.meme)) {
if (!all(k == seq_len(k[length(k)])))
stop(wmsg("To create a MEME background file, `all(k == seq_along(k))` must be true"))
}
seq_type <- seqtype(sequences)
no.alph <- FALSE
if (is.null(alphabet)) {
if (!is(sequences, "XStringSet"))
stop("`sequences` must be an `XStringSet` object")
switch(seq_type,
"DNA" = alphabet <- DNA_BASES,
"RNA" = alphabet <- RNA_BASES,
"AA" = alphabet <- AA_STANDARD2,
no.alph <- TRUE)
} else {
if (length(alphabet) == 1) alphabet <- sort_unique_cpp(safeExplode(alphabet))
else alphabet <- sort_unique_cpp(alphabet)
}
seqlens <- width(sequences)
if (min(seqlens) < max(k))
stop("`k` must be less than the length of the shortest sequence")
seq.names <- names(sequences)
if (is.null(seq.names)) seq.names <- as.character(seq_len(length(sequences)))
seqs1 <- as.character(sequences)
seqs <- lapply(seqs1, safeExplode)
if (no.alph) alphabet <- sort_unique_cpp(do.call(c, lapply(seqs, unique)))
alph <- collapse_cpp(alphabet)
if (!window) {
counts <- vector("list", length(k))
names(counts) <- as.character(k)
for (i in seq_along(k)) {
counts[[as.character(k[i])]] <- count_klets_alph_cpp(seqs1, alph, k[i], nthreads)
counts[[as.character(k[i])]] <- do.call(data.frame, counts[[as.character(k[i])]])
if (merge.res) {
counts[[as.character(k[i])]] <- rowSums(counts[[as.character(k[i])]])
names(counts[[as.character(k[i])]]) <- get_klets(alphabet, k[i])
} else {
colnames(counts[[as.character(k[i])]]) <- seq.names
rownames(counts[[as.character(k[i])]]) <- get_klets(alphabet, k[i])
counts[[as.character(k[i])]] <- as.matrix(t(counts[[as.character(k[i])]]))
}
}
if (pseudocount > 0) counts <- lapply(counts, function(x) x + pseudocount)
if (merge.res) {
probs <- lapply(counts, function(x) x / sum(x))
} else {
probs <- lapply(counts, function(x) t(apply(x, 1, function(y) y / sum(y))))
}
counts <- counts[sort_unique_cpp(names(counts))]
probs <- probs[sort_unique_cpp(names(probs))]
if (!is.null(to.meme)) {
if (pseudocount < 1) {
zero.check <- vapply(probs, function(x) any(x == 0), logical(1))
one.check <- vapply(probs, function(x) any(x == 1), logical(1))
if (any(zero.check) || any(one.check))
stop(wmsg("MEME background files do not allow 0 or 1 values, ",
"please try again with a `pseudocount` higher than 0"))
}
out <- character(0)
out <- to_meme_bkg(probs)
cat(out, sep = "\n", file = to.meme)
return(invisible())
}
if (merge.res) {
counts <- unlist(counts)
names(counts) <- vapply(strsplit(names(counts), ".", fixed = TRUE),
function(x) x[2], character(1))
probs <- unlist(probs)
names(probs) <- vapply(strsplit(names(probs), ".", fixed = TRUE),
function(x) x[2], character(1))
res <- data.frame(names(counts), unname(counts), unname(probs), row.names = NULL)
colnames(res) <- c("klet", "count", "probability")
res <- as(res, "DataFrame")
} else {
for (i in seq_along(counts)) {
counts[[i]] <- stack(counts[[i]])
probs[[i]] <- stack(probs[[i]])
}
probs <- do.call(rbind, probs)
counts <- do.call(rbind, counts)
counts[[1]] <- as.character(counts[[1]])
counts[[2]] <- as.character(counts[[2]])
res <- counts
colnames(res) <- c("sequence", "klet", "count")
res$probability <- probs[[3]]
}
} else {
if (merge.res && length(window.size) > 1 || length(window.overlap) > 1) {
stop(wmsg("`window.size` and `window.overlap` must be single values ",
"when `merge.res = TRUE`"), call. = FALSE)
}
window.size <- rep_len(window.size, length(sequences))
window.overlap <- rep_len(window.overlap, length(sequences))
if (any(window.size == 0))
stop("`window.size` must be greater than 0")
window.size[window.size < 1] <- as.integer(window.size[window.size < 1] *
seqlens[window.size < 1])
window.overlap[window.overlap < 1 & window.overlap > 0] <- as.integer(
window.overlap[window.overlap < 1 & window.overlap > 0] *
seqlens[window.overlap < 1 & window.overlap > 0])
if (any(window.size <= window.overlap))
stop("`window.overlap` cannot be larger than or equal to `window.size`")
wins <- mapply(calc_wins, seqlens, window.size, window.overlap,
SIMPLIFY = FALSE)
starts <- lapply(wins, function(x) x$starts)
stops <- lapply(wins, function(x) x$stops)
seqs.split <- mapply(function(x, y, z) slide_windows_cpp(x, y, z, TRUE, nthreads),
seqs1, window.size, window.overlap, SIMPLIFY = FALSE)
split_n <- vapply(seqs.split, length, integer(1))
seqnames_n <- mapply(function(x, y) rep(x, y), seq.names, split_n, SIMPLIFY = FALSE)
seqnames_n <- do.call(c, seqnames_n)
seqs.split2 <- do.call(c, unname(seqs.split))
if (min(nchar(seqs.split2)) < max(k))
stop("`k` must be less than the smallest `window.size`")
starts <- do.call(c, starts)
stops <- do.call(c, stops)
res <- vector("list", length(k))
for (i in seq_along(res)) {
res[[i]] <- count_klets_alph_cpp(seqs.split2, alph, k[i], nthreads)
res[[i]] <- do.call(data.frame, res[[i]])
rownames(res[[i]]) <- get_klets(alphabet, k[i])
colnames(res[[i]]) <- NULL
res[[i]] <- stack(as.matrix(res[[i]]))
colnames(res[[i]]) <- c("klet", "col", "count")
res[[i]]$klet <- as.character(res[[i]]$klet)
res[[i]]$sequence <- seqnames_n[as.integer(res[[i]]$col)]
res[[i]]$start <- starts[as.integer(res[[i]]$col)]
res[[i]]$stop <- stops[as.integer(res[[i]]$col)]
}
res <- do.call(rbind, res)
res <- res[, c("sequence", "start", "stop", "klet", "count")]
if (merge.res) {
if (length(unique(seqlens)) != 1)
stop(wmsg("`merge.res = TRUE` and `window = TRUE` is only valid if ",
"all sequences are of equal length"))
# TODO: find a faster replacement
res <- aggregate(count ~ start + stop + klet, data = res[, -1], sum)
if (pseudocount > 0) res$count <- res$count + pseudocount
res <- by(res, INDICES = list(res$start, nchar(res$klet)), FUN = function(x) {
x$probability <- x$count / sum(x$count) ; x
})
res <- do.call(rbind, res)
rownames(res) <- NULL
res <- res[, c("start", "stop", "klet", "count", "probability")]
} else {
if (pseudocount > 0) res$count <- res$count + pseudocount
# TODO: find a faster replacement
res <- by(res, INDICES = list(res$sequence, res$start, nchar(res$klet)),
FUN = function(x) {
x$probability <- x$count / sum(x$count) ; x
}
)
# Don't understand why this line is needed
res <- res[!vapply(res, is.null, logical(1))]
res <- do.call(rbind, res)
rownames(res) <- NULL
res <- res[, c("sequence", "start", "stop", "klet", "count", "probability")]
}
if (!merge.res)
res <- res[order(res$sequence, res$klet, method = "radix"), ]
else
res <- res[order(res$klet, method = "radix"), ]
res$start <- as.integer(res$start)
res$stop <- as.integer(res$stop)
res <- as(res, "DataFrame")
}
res$probability[is.nan(res$probability)] <- 0
# if (return.granges) {
# if (merge.res) {
# res$seqname <- "1"
# colnames(res)[2] <- "end"
# res <- granges_fun(GenomicRanges::GRanges(res,
# seqlengths = c("1" = max(res$end))))
# } else {
# colnames(res)[1] <- "seqname"
# colnames(res)[3] <- "end"
# res <- granges_fun(GenomicRanges::GRanges(res,
# seqlengths = structure(width(sequences), names = seq.names)))
# }
# }
res
}
to_meme_bkg <- function(counts) {
count.names <- lapply(counts, names)
meme.order <- seq(0, length(counts) - 1)
out <- vector("list", length(counts))
for (i in seq_along(meme.order)) {
out[[i]] <- to_meme_bkg_single(counts[[i]], meme.order[i],
names(counts[[i]]))
}
do.call(c, out)
}
to_meme_bkg_single <- function(counts, meme.order, count.names) {
out <- character(length(counts) + 1)
out[1] <- paste0("# order ", meme.order)
for (i in seq_along(counts)) {
out[i + 1] <- paste0(format(count.names[i], width = 8),
formatC(counts[i], format = "e", digits = 3),
" ")
}
out
}
calc_wins <- function(seqlen, winsize, ovrlp) {
x <- calc_wins_cpp2(seqlen, winsize, ovrlp, TRUE)
if (!length(x))
stop(wmsg("Incorrect window and/or overlap size"), call. = FALSE)
x[[1]] <- x[[1]]
x[[2]] <- x[[2]]
structure(x, names = c("starts", "stops"))
}
calc_wins_old <- function(seqlen, winsize, ovrlp) {
starts <- seq(from = 1, to = seqlen, by = winsize)
if (ovrlp == 0) {
stops <- c(c(1, starts[c(-1, -length(starts))]) + winsize - 1, seqlen)
list(starts = starts, stops = stops)
} else {
starts[-1] <- starts[-1] - ovrlp
starts <- c(starts, starts[length(starts)] + winsize)
if (starts[length(starts)] >= seqlen) starts <- starts[-length(starts)]
stops <- c((starts[-length(starts)] + winsize) - 1, seqlen)
list(starts = starts, stops = stops)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.