R/spoar.R

Defines functions spoaConsensus.derep spoaConsensus.ShortReadQ spoaConsensus.QualityScaledXStringSet spoaConsensus.ShortRead spoaConsensus.XStringSet spoaConsensus.character spoaConsensus spoaAlign.derep spoaAlign.ShortReadQ spoaAlign.QualityScaledXStringSet spoaAlign.ShortRead spoaAlign.XStringSet spoaAlign.character spoaAlign SpoaAlgo

Documented in SpoaAlgo spoaAlign spoaAlign.character spoaAlign.derep spoaAlign.QualityScaledXStringSet spoaAlign.ShortReadQ spoaAlign.XStringSet spoaConsensus spoaConsensus.character spoaConsensus.derep spoaConsensus.QualityScaledXStringSet spoaConsensus.ShortRead spoaConsensus.ShortReadQ spoaConsensus.XStringSet

## Brendan Furneaux
## May 2021

methods::setClass(
    "SpoaAlgo",
    slots = c(
        algorithm = "character",
        m = "integer",
        n = "integer",
        g = "integer",
        e = "integer",
        q = "integer",
        c = "integer"
    ),
    prototype = list(
        algorithm = NA_character_,
        m = NA_integer_,
        n = NA_integer_,
        g = NA_integer_,
        e = NA_integer_,
        q = NA_integer_,
        c = NA_integer_
    )
)

#' Specify an algorithm for SPOA
#' @param m (non-negative `integer`) score for a match. *Default*: `5L`
#' @param n (non-positive `integer`) score for a mismatch. *Default*: `-4L`
#' @param g (non-positive `integer`) gap opening penalty. *Default*: `-8L`
#' @param e (non-positive `integer`) gap extension penalty. *Default*: same
#' value as `g`
#' @param q (non-positive `integer`) second gap opening penalty.
#' *Default*: same value as `g`
#' @param c (non-positive `integer`) second gap extension penalty.
#' *Default*: same value as `e`
#' @param algorithm (`character` string) alignment mode; one of `"local"`
#' (Smith-Watterman), `"global"` (Needleman-Wunsch), or `"semi.global"`
#' (Overlap). *Default*: `"local"`
#'
#' @return an object of class "`SpoaAlgo`", suitable for passing to
#' [spoaAlign()] or [spoaConsensus()]
#' @rdname spoaAlign
SpoaAlgo <- function(algorithm = c("local", "global", "semi.global"),
    m = 5L, n = -4L, g = -8L, e = g, q = g, c = q) {
    algorithm <- match.arg(algorithm)
    assertthat::assert_that(
        isNonnegativeScalarInt(m),
        isNonpositiveScalarInt(n),
        isNonpositiveScalarInt(g),
        isNonpositiveScalarInt(e),
        isNonpositiveScalarInt(q),
        isNonpositiveScalarInt(c)
    )
    methods::new("SpoaAlgo",
        algorithm = algorithm,
        m = as.integer(m),
        n = as.integer(n),
        g = as.integer(g),
        e = as.integer(e),
        q = as.integer(q),
        c = as.integer(c)
    )
}

#' Align sequences using SPOA (and optionally get the consensus)
#'
#' @param seq (`character` vector, [`Biostrings::XStringSet-class`],
#' [`ShortRead::ShortRead-class`], or [`dada2::derep-class`]) sequences to
#' align.
#' @param algo (`SpoaAlgo` object) parameters for SPOA, generated by
#' [SpoaAlgo()].
#' @param w (`integer` vector) weights for the sequences. *Default*: `1L`
#' @param ... additional parameters passed to methods
#'
#' @return For `spoaConsensus()`, either a `character` string or the
#' appropriate [`Biostrings::XString-class`], depending on the class of `seq`.
#'
#' For `spoaAlign()`, either a
#' `character` vector or a [`Biostrings::MultipleAlignment-class`], depending on
#' the class of `seq`. If `seq` is a `BStringSet` (i.e., an `XStringSet` which
#' is not specifically DNA, RNA, or AA) then the result is also a `BStringSet`,
#' since there is no corresponding "`BMultipleAlignment`" class. If `seq` is a
#' [`ShortRead::ShortRead-class`] object, the output is a
#' [`Biostrings::DNAMultipleAlignment-class`].
#'
#' | **seq**| **spoaConsensus** | **spoaAlign** |
#' |---------|---------------|-----------|
#' |`character(n)`|`character(1)` | `character(n)`|
#' |[`BStringSet`][Biostrings::XStringSet-class]|[`BString`][Biostrings::XString-class]|[`BStringSet`][Biostrings::XStringSet-class]|
#' |[`DNAStringSet`][Biostrings::XStringSet-class]|[`DNAString`][Biostrings::XString-class]|[`DNAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`RNAStringSet`][Biostrings::XStringSet-class]|[`RNAString`][Biostrings::XString-class]|[`RNAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`AAStringSet`][Biostrings::XStringSet-class]|[`AAString`][Biostrings::XString-class]|[`AAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`QualityScaledBStringSet`][Biostrings::QualityScaledXStringSet-class]|[`BString`][Biostrings::XString-class]|[`BStringSet`][Biostrings::XStringSet-class]|
#' |[`QualityScaledDNAStringSet`][Biostrings::QualityScaledXStringSet-class]|[`DNAString`][Biostrings::XString-class]|[`DNAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`QualityScaledRNAStringSet`][Biostrings::QualityScaledXStringSet-class]|[`RNAString`][Biostrings::XString-class]|[`RNAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`QualityScaledAAStringSet`][Biostrings::QualityScaledXStringSet-class]|[`AAString`][Biostrings::XString-class]|[`AAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`ShortReadQ`][ShortRead::ShortReadQ-class]|[`DNAString`][Biostrings::XString-class]|[`DNAMultipleAlignment`][Biostrings::MultipleAlignment-class]|
#' |[`derep`][dada2::derep-class]|`character(1)`| `character(n)`|
#'
#' @details
#'
#' ## Gap penalties
#'
#' The general (convex) gap penalty formula is:
#'
#' `min(g + (i - i) * e, q + (i - 1) * c)`
#'
#' For `q == g` and `c == e` (as is the case unless `q` or `c` is explicitly
#' set), this simplifies to the affine gap penalty:
#'
#' `g + (i - 1) * e`
#'
#' If, additionally, `e == g` (the default), then this is the linear gap
#' penalty:
#'
#' `g * i`
#'
#' ## Weighting
#'
#' Assigning a weight *w* to a sequence is equivalent to including that sequence
#' *w* times; this is primarily useful for dereplicated sequences, where all
#' sequences are unique, and the weight represents their multiplicity.
#'
#' Note that POA is not order-independent, so results may differ when a set of
#' non-unique sequences is dereplicated. For the most predictable results, it
#' is recommended to sort dereplicated sequences in decreasing order of
#' abundance (as in [dada2::derepFastq()]).
#'
#' ## Quality scores
#'
#' If `seq` is an object which includes quality scores
#' ([`Biostrings::QualityScaledXStringSet-class`], [`ShortRead::ShortReadQ`],
#' or [`dada2::derep-class`])
#' then the alignment consensus is weighted using the quality scores. The method
#' used by SPOA uses the integer quality scores as per-position weights
#' analogous to sequence weights; i.e., if two different bases align in the same
#' position, one in a single sequence with quality score 40 and the other in a
#' single sequence with quality score 20, then this is equivalent to the first
#' base occurring at that position in 40 sequences without quality scores, and
#' the second base occurring at that position in 20 sequences without quality
#' scores.
#'
#' It is possible to use a combination of sequence weights and quality scores.
#' In this case, in order for a dereplicated sequence set to give the same
#' results as the non-dereplicated sequences, the quality score for each of the
#' dereplicated sequences should be the mean of the quality scores for the
#' corresponding non-dereplicated sequences. This is the method used to generate
#' quality scores for dereplicated sequences in [dada2::derepFastq()].
#'
#' @examples
#' sequences <- c(
#'     "CATAAAAGAACGTAGGTCGCCCGTCCGTAACCTGTCGGATCACCGGAAAGGACCCGTAAAGTGATAATGAT",
#'     "ATAAAGGCAGTCGCTCTGTAAGCTGTCGATTCACCGGAAAGATGGCGTTACCACGTAAAGTGATAATGATTAT",
#'     "ATCAAAGAACGTGTAGCCTGTCCGTAATCTAGCGCATTTCACACGAGACCCGCGTAATGGG",
#'     "CGTAAATAGGTAATGATTATCATTACATATCACAACTAGGGCCGTATTAATCATGATATCATCA",
#'     "GTCGCTAGAGGCATCGTGAGTCGCTTCCGTACCGCAAGGATGACGAGTCACTTAAAGTGATAAT",
#'     "CCGTAACCTTCATCGGATCACCGGAAAGGACCCGTAAATAGACCTGATTATCATCTACAT"
#' )
#' spoaAlign(sequences)
#' spoaConsensus(sequences)
#' @export
spoaAlign <- function(seq, algo = SpoaAlgo(), ...) {
    UseMethod("spoaAlign")
}

#' @param qual (`list` of `numeric` vectors) per-base quality scores for the
#' sequences. *Default*: none
#' @export
#' @rdname spoaAlign
spoaAlign.character <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
    assertthat::assert_that(methods::is(algo, "SpoaAlgo"))
    checkWeights(seq, w)
    if (length(w) == 1L) w <- rep(w, length(seq))
    msa <- rep(NA_character_, length(seq))
    names(msa) <- names(seq)
    no_na <- !is.na(seq)
    msa[no_na] <- if (!is.null(qual)) {
        checkQuals(seq, qual)
        if (all(vapply(qual, is.integer, TRUE))) {
            spoa_align_intqual(
                seq[no_na],
                qual[no_na],
                algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
                w[no_na]
            )
        } else {
            spoa_align_dblqual(
                seq[no_na],
                qual[no_na],
                algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
                w[no_na]
            )
        }
    } else {
        spoa_align(
            seq[no_na],
            algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
            w[no_na]
        )
    }
    msa
}

#' @export
#' @rdname spoaAlign
spoaAlign.XStringSet <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
    s <- spoaAlign.character(as.character(seq), algo, w, qual, ...)
    matchXMultipleAlignment(s, seq)
}

#' @export
spoaAlign.ShortRead <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
    requireShortRead()
    seq2 <- ShortRead::sread(seq)
    names(seq2) <- as.character(ShortRead::id(seq))
    spoaAlign.XStringSet(seq2, algo, w, qual = NULL, ...)
}

#' @export
#' @rdname spoaAlign
spoaAlign.QualityScaledXStringSet <- function(seq, algo = SpoaAlgo(), w = 1, ...) {
    requireBiostrings()
    s <- spoaAlign.character(as.character(seq), algo, w,
        qual = as.list(methods::as(Biostrings::quality(seq), "IntegerList"))
    )
    matchXMultipleAlignment(s, seq)
}

#' @export
#' @rdname spoaAlign
spoaAlign.ShortReadQ <- function(seq, algo = SpoaAlgo(), w = 1, ...) {
    requireShortRead()
    requireBiostrings()
    s <- methods::as(seq, "QualityScaledDNAStringSet")
    names(s) <- as.character(ShortRead::id(seq))
    spoaAlign.QualityScaledXStringSet(s, algo, w, ...)
}

#' @export
#' @rdname spoaAlign
spoaAlign.derep <- function(seq, algo = SpoaAlgo(), ...) {
    spoaAlign.character(
        names(seq$uniques),
        algo,
        w = seq$uniques,
        qual = apply(seq$quals, 1, stats::na.omit)
    )
}

#' @rdname spoaAlign
#' @export
spoaConsensus <- function(seq, algo = SpoaAlgo(), ...) {
    UseMethod("spoaConsensus")
}

#' @export
#' @rdname spoaAlign
spoaConsensus.character <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
    assertthat::assert_that(methods::is(algo, "SpoaAlgo"))
    checkWeights(seq, w)
    if (length(w) == 1L) w <- rep(w, length(seq))
    no_na <- !is.na(seq)
    if (!is.null(qual)) {
        checkQuals(seq, qual)
        if (all(vapply(qual, is.integer, TRUE))) {
            spoa_consensus_intqual(
                seq[no_na],
                qual[no_na],
                algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
                w[no_na]
            )
        } else {
            spoa_consensus_dblqual(
                seq[no_na],
                qual[no_na],
                algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
                w[no_na]
            )
        }
    } else {
        spoa_consensus(
            seq[no_na],
            algo@algorithm, algo@m, algo@n, algo@g, algo@e, algo@q, algo@c,
            w[no_na]
        )
    }
}

#' @export
#' @rdname spoaAlign
spoaConsensus.XStringSet <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
    s <- spoaConsensus.character(as.character(seq), algo, w, qual, ...)
    matchXString(s, seq)
}

#' @export
#' @rdname spoaAlign
spoaConsensus.ShortRead <- function(seq, algo = SpoaAlgo(), w = 1, qual = NULL, ...) {
    requireShortRead()
    seq2 <- ShortRead::sread(seq)
    names(seq2) <- as.character(ShortRead::id(seq))
    spoaConsensus.XStringSet(seq2, algo, w, qual, ...)
}

#' @export
#' @rdname spoaAlign
spoaConsensus.QualityScaledXStringSet <- function(seq, algo = SpoaAlgo(), w = 1, ...) {
    requireBiostrings()
    if (length(w) == 1L) w <- rep(w, length(seq))
    s <- spoaConsensus.character(
        as.character(seq),
        algo = algo,
        w = w,
        qual = as.list(methods::as(Biostrings::quality(seq), "IntegerList")),
        ...
    )
    matchXString(s, seq)
}

#' @export
#' @rdname spoaAlign
spoaConsensus.ShortReadQ <- function(seq, algo = SpoaAlgo(), w = 1, ...) {
    requireShortRead()
    requireBiostrings()
    spoaConsensus.QualityScaledXStringSet(
        methods::as(seq, "QualityScaledDNAStringSet"),
        algo, w, ...
    )
}

#' @export
#' @rdname spoaAlign
spoaConsensus.derep <- function(seq, algo = SpoaAlgo(), ...) {
    spoaConsensus.character(
        names(seq$uniques),
        algo,
        w = seq$uniques,
        qual = apply(seq$quals, 1, stats::na.omit),
        ...
    )
}
brendanf/spoar documentation built on Dec. 19, 2021, 11:43 a.m.