#' Sequence-related utility functions.
#'
#' @param alph `character(1)` A single character string with the desired
#' sequence alphabet. If missing, finds the unique letters within each
#' string.
#' @param alph.name `character(1)`, `NULL` Custom alphabet name.
#' @param ambiguity `character(1)`, `NULL` A named vector providing ambiguity codes for
#' the custom alphabet.
#' @param core `character(1)` Core alphabet symbols. If complements are also
#' provided, then only half of the letters should be provided to this argument.
#' @param colours `character`, `NULL` Named vector of core symbol colours.
#' MEME requires hex colours.
#' @param complements `character(1)`, `NULL` Complementary letters to the core symbols.
#' @param complexity.method `character(1)` Complexity algorithm. See
#' [sequence_complexity()].
#' @param file Output file.
#' @param FUN `closure` The function to apply per window. (See `?vapply`.)
#' @param FUN.VALUE The expected return type for `FUN`. (See `?vapply`.)
#' @param lets `character` A character vector where each element will be
#' considered a single unit.
#' @param k `integer(1)` K-let size.
#' @param letter `character(1)` Character to use for masking.
#' @param letter.names `character`, `NULL` Named vector of core symbol names.
#' @param like `character(1)`, `NULL` How to classify the custom alphabet. If not
#' `NULL`, then one of `c("DNA", "RNA", "PROTEIN")`.
#' @param method `character(1)` Shuffling method. One of `c("euler", "linear",
#' "markov")`. See [shuffle_sequences()].
#' @param n `integer(1)` Total size from which to calculate sliding windows.
#' @param nthreads `integer(1)` Number of threads to use. Zero uses all
#' available threads.
#' @param overlap `integer(1)` Overlap size between windows.
#' @param pattern `character(1)` Pattern to mask.
#' @param ranges `GRanges` The ranges to mask. Must be a `GRanges` object
#' from the `GenomicRanges` package.
#' @param RC `logical(1)` Whether to mask the reverse complement of the pattern.
#' @param return.incomp `logical(1)` Whether to return the last window if it is
#' smaller then the requested window size.
#' @param rng.seed `numeric(1)` Set random number generator seed. Since shuffling
#' in [shuffle_sequences()] can occur simultaneously in multiple threads using C++,
#' it cannot communicate
#' with the regular `R` random number generator state and thus requires an
#' independent seed. Since [shuffle_string()] uses the same underlying code
#' as [shuffle_sequences()], it also requires a separate seed even if it is
#' run in serial.
#' @param seqs `XStringSet` Sequences to mask. Cannot be `BStringSet`.
#' @param string `character(1)` A character vector containing a single string,
#' with the exception of [calc_complexity()] where `string` can be a length
#' greater than one.
#' @param trifonov.max.word.size `integer(1)` Maximum word size for use
#' in the Trifonov complexity methods. See [sequence_complexity()].
#' @param window `integer(1)` Window size to slide along.
#'
#' @return
#' For [calc_complexity()]: A vector of `numeric` values.
#'
#' For [calc_windows()]: A `data.frame` with columns `start` and `stop`.
#'
#' For [count_klets()]: A `data.frame` with columns `lets` and `counts`.
#'
#' For [get_klets()]: A `character` vector of k-lets.
#'
#' For [mask_ranges()]: The masked `XStringSet` object.
#'
#' For [mask_seqs()]: The masked `XStringSet` object.
#'
#' For [meme_alph()]: `NULL`, invisibly.
#'
#' For [shuffle_string()]: A single `character` string.
#'
#' For [slide_fun()]: A vector with type `FUN.VALUE`.
#'
#' For [window_string()]: A `character` vector.
#'
#' @examples
#' #######################################################################
#' ## calc_complexity
#' ## Calculate complexity for abitrary strings
#' calc_complexity("GTGCCCCGCGGGAACCCCGC", c = "WoottonFederhen")
#' calc_complexity("GTGCCCCGCGGGAACCCCGC", c = "WoottonFederhenFast")
#' calc_complexity("GTGCCCCGCGGGAACCCCGC", c = "Trifonov")
#' calc_complexity("GTGCCCCGCGGGAACCCCGC", c = "TrifonovFast")
#' calc_complexity("GTGCCCCGCGGGAACCCCGC", c = "DUST")
#'
#' #######################################################################
#' ## calc_windows
#' ## Calculate window coordinates for any value 'n'.
#' calc_windows(100, 10, 5)
#'
#' #######################################################################
#' ## count_klets
#' ## Count k-lets for any string of characters
#' count_klets("GCAAATGTACGCAGGGCCGA", k = 2)
#' ## The default 'k' value (1) counts individual letters
#' count_klets("GCAAATGTACGCAGGGCCGA")
#'
#' #######################################################################
#' ## get_klets
#' ## Generate all possible k-lets for a set of characters
#' get_klets(c("A", "C", "G", "T"), 3)
#' ## Note that each element in 'lets' is considered a single unit;
#' ## see:
#' get_klets(c("AA", "B"), k = 2)
#'
#' #######################################################################
#' ## mask_ranges
#' ## Mask arbitrary ranges
#' if (requireNamespace("GenomicRanges", quiet = TRUE)) {
#' ranges <- GenomicRanges::GRanges("A", IRanges::IRanges(1, 5))
#' seq <- Biostrings::DNAStringSet(c(A = "ATGACTGATTACTTATA"))
#' mask_ranges(seq, ranges, "-")
#' }
#'
#' #######################################################################
#' ## mask_seqs
#' ## Mask repetitive seqeuences
#' data(ArabidopsisPromoters)
#' mask_seqs(ArabidopsisPromoters, "AAAAAA")
#'
#' #######################################################################
#' ## meme_alph
#' ## Create MEME custom alphabet definition files
#' meme_alph("ACm", complements = "TGM", alph.name = "MethDNA",
#' letter.names = c(A = "Adenine", C = "Cytosine", G = "Guanine",
#' T = "Thymine", m = "Methylcytosine", M = "mC:Guanine"),
#' like = "DNA", ambiguity = c(N = "ACGTmM"))
#'
#' #######################################################################
#' ## shuffle_string
#' ## Shuffle any string of characters
#' shuffle_string("ASDADASDASDASD", k = 1)
#'
#' #######################################################################
#' ## slide_fun
#' ## Apply a function to a character vector along sliding windows
#' FUN <- function(x) grepl("[GC]", x)
#' data.frame(
#' Window = window_string("ATGCATCTATGCA", 2, 1),
#' HasGC = slide_fun("ATGCATCTATGCA", FUN, logical(1), 2, 1)
#' )
#'
#' #######################################################################
#' ## window_string
#' ## Get sliding windows for a string of characters
#' window_string("ABCDEFGHIJ", 2, 1)
#'
#' @seealso [create_sequences()], [get_bkg()], [sequence_complexity()],
#' [shuffle_sequences()]
#' @author Benjamin Jean-Marie Tremblay, \email{benjamin.tremblay@@uwaterloo.ca}
#' @name utils-sequence
NULL
#' @rdname utils-sequence
#' @export
calc_complexity <- function(string, complexity.method = c("WoottonFederhen",
"WoottonFederhenFast", "Trifonov", "TrifonovFast", "DUST"), alph = NULL,
trifonov.max.word.size = 7) {
method <- match.arg(complexity.method)
if (is.null(alph)) alph <- ""
trifonov.max.word.size <- as.integer(trifonov.max.word.size)
if (is.na(trifonov.max.word.size) || trifonov.max.word.size <= 0) {
stop("trifonov.max.word.size must be a positive integer")
}
switch(method,
WoottonFederhen = vapply(string,
function(x) wootton_federhen_cpp(x, alph),
numeric(1), USE.NAMES = FALSE),
WoottonFederhenFast = vapply(string,
function(x) wootton_federhen_fast_cpp(x, alph),
numeric(1), USE.NAMES = FALSE),
Trifonov = vapply(string,
function(x) trifonov_cpp(x, trifonov.max.word.size, alph),
numeric(1), USE.NAMES = FALSE),
TrifonovFast = vapply(string,
function(x) trifonov_fast_cpp(x, trifonov.max.word.size, alph),
numeric(1), USE.NAMES = FALSE),
DUST = vapply(string,
function(x) dust_cpp(x),
numeric(1), USE.NAMES = FALSE)
)
}
#' @rdname utils-sequence
#' @export
calc_windows <- function(n, window = 1, overlap = 0, return.incomp = TRUE) {
n <- as.integer(n)
window <- as.integer(window)
overlap <- as.integer(overlap)
if (is.na(n) || n < 1) {
stop("`n` should be a positive integer")
}
if (is.na(window) || window < 1) {
stop("`window` should be a positive integer")
}
if (is.na(overlap) || overlap < 0) {
stop("`overlap` should be a non-negative integer")
}
if (!isTRUEorFALSE(return.incomp) || length(return.incomp) != 1) {
stop("`return.incomp` should be a single boolean value")
}
if (window > n) {
window <- n
overlap <- 0
}
if (overlap >= window) {
stop("`overlap` cannot be greater or equal to `window`")
}
as.data.frame(structure(calc_wins_cpp2(n, window, overlap, return.incomp),
names = c("start", "stop")))
}
#' @rdname utils-sequence
#' @export
count_klets <- function(string, k = 1, alph) {
if (k < 1) stop("k must be greater than 0")
k <- as.integer(k)
if (length(string) != 1) stop("'string' must be a length 1 character vector")
if (nchar(string) < 1) stop("'string' cannot be empty")
if (missing(alph)) {
counts <- count_klets_cpp(string, k, 1)[[1]]
klets <- get_klets_cpp(sort_unique_cpp(safeExplode(string)), k)
} else {
if (length(alph) > 1) stop("'alph' must be a single string")
if (nchar(alph) < 1) stop("'alph' cannot be empty")
counts <- count_klets_alph_cpp(string, alph, k, 1)
klets <- get_klets_cpp(sort_unique_cpp(safeExplode(alph)), k)
}
structure(data.frame(klets, counts, stringsAsFactors = FALSE),
names = c("klets", "counts"))
}
# get_klets(lets, k = 1) --> see utils.cpp
#' @rdname utils-sequence
#' @export
get_klets <- function(lets, k = 1) {
get_klets_cpp(lets, k)
}
granges_fun2 <- function(FUN, env = parent.frame()) {
if (requireNamespace("GenomicRanges", quietly = TRUE)) {
eval(substitute(FUN), envir = env)
} else {
stop(wmsg("The 'GenomicRanges' package must be installed to use mask_ranges(). ",
"[BiocManager::install(\"GenomicRanges\")]"), call. = FALSE)
}
}
#' @rdname utils-sequence
#' @export
mask_ranges <- function(seqs, ranges, letter = "-") {
if (!is(seqs, "XStringSet")) {
stop(wmsg("`seqs` must be an XStringSet object"), call. = FALSE)
}
if (!is(ranges, "GRanges")) {
stop(wmsg("`ranges` must be GRanges object"), call. = FALSE)
}
if (!is.character(letter) || length(letter) != 1) {
stop(wmsg("`letter` must be a single character"), call. = FALSE)
}
ranges <- granges_fun2(GenomicRanges::reduce(ranges))
ranges <- unname(as(ranges, "IRangesList")[names(seqs)])
letterList <- sapply(ranges,
function(x) vapply(width(x),
function(y) collapse_cpp(rep(letter, y)),
character(1)
)
)
Biostrings::replaceAt(seqs, ranges, letterList)
}
#' @rdname utils-sequence
#' @export
mask_seqs <- function(seqs, pattern, RC = FALSE, letter = "-") {
if (!is(seqs, "XStringSet"))
stop("`seqs` must be an `XStringSet` object")
if (length(pattern) > 1 || !is.character(pattern))
stop("`pattern` must be a single character")
alph <- seqtype(seqs)
if (alph == "B")
stop("`mask_seqs()` only works with DNA/RNA/AA sequences")
fix_seqs <- function(seqs, pattern, letter) {
seqs <- lapply(seqs, mask, pattern = pattern)
seqs <- lapply(seqs, injectHardMask, letter = letter)
switch(alph,
DNA = DNAStringSet(seqs),
RNA = RNAStringSet(seqs),
AA = AAStringSet(seqs)
)
}
seqs <- fix_seqs(seqs, pattern, letter)
if (RC) {
pattern <- as.character(switch(alph,
DNA = reverseComplement(DNAString(pattern)),
RNA = reverseComplement(RNAString(pattern)),
stop("`RC = TRUE` is only valid for DNA/RNA sequences")
))
seqs <- fix_seqs(seqs, pattern, letter)
}
seqs
}
#' @rdname utils-sequence
#' @export
meme_alph <- function(core, file = stdout(), complements = NULL, ambiguity = NULL,
like = NULL, alph.name = NULL, letter.names = NULL, colours = NULL) {
alph <- "ALPHABET"
if (!is.null(alph.name)) {
if (length(alph.name) != 1) {
stop("\"alph.name\" must be length 1", call. = FALSE)
}
alph <- paste0(alph, " \"", alph.name, "\"")
}
if (!is.null(like)) {
if (length(like) != 1 || !like %in% c("DNA", "RNA", "PROTEIN")) {
stop("\"like\" must be one of NULL, DNA, RNA, PROTEIN", call. = FALSE)
} else {
alph <- paste0(alph, " ", like, "-LIKE")
}
}
alph <- c(alph, "", "# Core symbols")
if (length(core) != 1) {
stop("\"core\" must be length 1", call. = FALSE)
}
lets <- strsplit(core, "")[[1]]
lets0 <- lets
compl <- NULL
if (!is.null(complements)) {
if (length(complements) != 1) {
stop("\"complements\" must be length 1", call. = FALSE)
}
compl <- strsplit(complements, "")[[1]]
if (length(lets) != length(compl)) {
stop("\"core\" and \"complements\" do not have the same number of letters",
call. = FALSE)
}
}
compl0 <- compl
if (!is.null(letter.names)) {
if (anyNA(letter.names[lets0])) {
stop("Found letters with missing \"letter.names\"", call. = FALSE)
}
lets <- paste(lets, paste0("\"", letter.names[lets0], "\""))
if (!is.null(compl)) {
if (anyNA(letter.names[compl])) {
stop("Found letters with missing \"letter.names\"", call. = FALSE)
}
compl <- paste(compl, paste0("\"", letter.names[compl0], "\""))
}
}
if (!is.null(colours)) {
if (anyNA(colours[lets0])) {
stop("Found letters with missing \"colours\"", call. = FALSE)
}
lets <- paste(lets, colours[lets0])
if (!is.null(compl)) {
if (anyNA(colours[compl0])) {
stop("Found letters with missing \"colours\"", call. = FALSE)
}
compl <- paste(compl, colours[compl0])
}
}
if (!is.null(compl)) {
alph <- c(alph, paste(lets, compl, sep = " ~ "))
} else {
alph <- c(alph, lets)
}
if (!is.null(ambiguity)) {
alph <- c(alph, "", "# Ambiguous symbols")
if (is.null(names(ambiguity)) || anyNA(names(ambiguity))) {
stop("\"ambiguity\" must be a named vector", call. = FALSE)
}
amb <- paste(names(ambiguity), unname(ambiguity), sep = " = ")
alph <- c(alph, amb)
}
cat(alph, file = file, sep = "\n")
invisible(NULL)
}
#' @rdname utils-sequence
#' @export
shuffle_string <- function(string, k = 1, method = c("euler", "linear", "markov"),
rng.seed = sample.int(1e4, 1)) {
method <- match.arg(method, c("euler", "linear", "markov"))
if (length(string) != 1) stop("'string' must be a length 1 character vector")
if (length(k) != 1) stop("'k' must be length 1")
if (k < 1) stop("'k' must be greater than 0")
k <- as.integer(k)
seed <- as.integer(abs(rng.seed))[1]
if (k == 1) {
shuffle_k1_cpp(string, 1, seed)
} else if (k > 1) {
switch(method,
"euler" = shuffle_euler_cpp(string, k, 1, seed),
"linear" = shuffle_linear_cpp(string, k, 1, seed),
"markov" = shuffle_markov_cpp(string, k, 1, seed),
stop("'method' must be one of 'euler', 'linear', 'markov'"))
} else {
stop("k must be greater than 0")
}
}
# calc_gc <- function(string) {
# x <- table(safeExplode(gsub("G", "C", string)))
# unname(x[names(x) == "C"]) / nchar(string)
# }
#' @rdname utils-sequence
#' @export
slide_fun <- function(string, FUN, FUN.VALUE, window = 1, overlap = 0,
return.incomp = TRUE) {
string <- as.character(string)
if (!length(string) || is.na(string) || !nchar(string)) {
stop("`string` must be a non-empty length 1 character vector")
}
window <- as.integer(window)
overlap <- as.integer(overlap)
if (is.na(window) || window < 1) {
stop("`window` should be a positive integer")
}
if (is.na(overlap) || overlap < 0) {
stop("`overlap` should be a non-negative integer")
}
if (!isTRUEorFALSE(return.incomp) || length(return.incomp) != 1) {
stop("`return.incomp` should be a single boolean value")
}
if (window > nchar(string)) {
if (!return.incomp) {
stop(wmsg("`string` is smaller than `window`; set `return.incomp=TRUE`",
" to automatically shorten the window"))
}
window <- nchar(string)
overlap <- 0
}
if (overlap >= window) {
stop("`overlap` cannot be greater or equal to `window`")
}
vapply(
slide_windows_cpp(string, window, overlap, return.incomp),
FUN, FUN.VALUE, USE.NAMES = FALSE
)
}
#' @rdname utils-sequence
#' @export
window_string <- function(string, window = 1, overlap = 0,
return.incomp = TRUE, nthreads = 1) {
string <- as.character(string)
if (any(is.na(string)) || any(!nchar(string))) {
stop("`string` must be a character vector without any blank strings")
}
if (!is.character(string) || length(string) > 1) {
stop("`string` should be a length 1 character vector")
}
if (!nchar(string)) return(string)
window <- as.integer(window)
overlap <- as.integer(overlap)
nthreads <- as.integer(nthreads)
if (is.na(window) || window < 1) {
stop("`window` should be a positive integer")
}
if (is.na(overlap) || overlap < 0) {
stop("`overlap` should be a non-negative integer")
}
if (is.na(nthreads) || nthreads < 0) {
stop("`nthreads` should be a non-negative integer")
}
if (window > nchar(string)) {
window <- nchar(string)
overlap <- 0
}
if (overlap >= window) {
stop("`overlap` cannot be greater or equal to `window`")
}
if (!isTRUEorFALSE(return.incomp) || length(return.incomp) != 1) {
stop("`return.incomp` should be a single boolean value")
}
slide_windows_cpp(string, window, overlap, return.incomp, nthreads)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.