R/score_wrapper.R

Defines functions scoreThreshold scoreHistogram scoreHistogramSingleSeq scoreProfile scoreSequence scoreStrand scoreDistBf scoreDist

Documented in scoreDist scoreDistBf scoreHistogram scoreHistogramSingleSeq scoreProfile scoreSequence scoreStrand scoreThreshold

#' Score distribution
#'
#' This function computes the score distribution for the given PFM and
#' background. The Score distribution is computed based on an efficient
#' dynamic programming algorithm.
#'
#' @inheritParams motifValid
#' @template templates
#' @return List that contains
#' \describe{
#' \item{scores}{Vector of scores}
#' \item{dist}{Score distribution}
#' }
#'
#' @examples
#'
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the score distribution
#' dp = scoreDist(motif, bg)
#'
#' @export
scoreDist = function(pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)
    motifAndBackgroundValid(pfm, bg)

    scores = .Call(
        motifcounter_scorerange,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )

    dist = .Call(
        motifcounter_scoredist,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )
    return(list(scores = scores, dist = dist))
}


#' Score distribution
#'
#' This function computes the score distribution for a given PFM and
#' a background model.
#'
#' The result of this function is identical to \code{\link{scoreDist}},
#' however, the method employs a less efficient algorithm that
#' enumerates all DNA sequences of the length of the motif.
#' This function is only used for debugging and testing purposes
#' and might require substantial computational
#' resources for long motifs.
#'
#' @inheritParams scoreDist
#' @return List containing
#' \describe{
#' \item{scores}{Vector of scores}
#' \item{dist}{Score distribution}
#' }
#'
#' @seealso \code{\link{scoreDist}}
#' @examples
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the score distribution
#' dp = motifcounter:::scoreDistBf(motif, bg)
#'
scoreDistBf = function(pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)
    motifAndBackgroundValid(pfm, bg)

    scores = .Call(
        motifcounter_scorerange,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )

    dist = .Call(
        motifcounter_scoredist_bf,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )
    return(list(scores = scores, dist = dist))
}

#' Score strand
#'
#' This function computes the per-position
#' score in a given DNA strand.
#'
#' The function returns the per-position scores
#' for the given strand. If the sequence is too short,
#' it contains an empty vector.
#'
#' @inheritParams scoreDist
#' @param seq A DNAString object
#' @return
#' \describe{
#' \item{scores}{Vector of scores on the given strand}
#' }
#'
#' @examples
#'
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the per-position and per-strand scores
#' motifcounter:::scoreStrand(seqs[[1]], motif, bg)
#'
scoreStrand = function(seq, pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)
    motifAndBackgroundValid(pfm, bg)

    # Check class
    stopifnot(is(seq, "DNAString"))

    scores = .Call(
        motifcounter_scoresequence,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        toString(seq),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )
    return(as.numeric(scores))
}

#' Score observations
#'
#' This function computes the per-position and per-strand
#' score in a given DNA sequence.
#'
#' @inheritParams scoreDist
#' @param seq A DNAString object
#' @return List containing
#' \describe{
#' \item{fscores}{Vector of scores on the forward strand}
#' \item{rscores}{Vector of scores on the reverse strand}
#' }
#'
#' @examples
#'
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the per-position and per-strand scores
#' scoreSequence(seqs[[1]], motif, bg)
#'
#' @export
scoreSequence = function(seq, pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)
    motifAndBackgroundValid(pfm, bg)

    # Check class
    stopifnot(is(seq, "DNAString"))

    fscores = scoreStrand(seq, pfm, bg)
    rscores = scoreStrand(seq, revcompMotif(pfm), bg)
    return(list(fscores = fscores, rscores = rscores))
}

#' Score profile across multiple sequences
#'
#' This function computes the per-position and per-strand
#' average score profiles across a set of DNA sequences.
#' It can be used to reveal positional constraints
#' of TFBSs.
#'
#' @inheritParams scoreDist
#' @param seqs A DNAStringSet or DNAString object
#'
#' @return List containing
#' \describe{
#' \item{fscores}{Vector of per-position average forward strand scores}
#' \item{rscores}{Vector of per-position average reverse strand scores}
#' }
#'
#' @examples
#'
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the score profile
#' scoreProfile(seqs, motif, bg)
#'
#' @export
scoreProfile = function(seqs, pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)

    if (is(seqs, "DNAString")) {
        # wrap the sequence up as sequence set
        seqs = DNAStringSet(seqs)
    }

    stopifnot (is(seqs, "DNAStringSet"))

    if (any(lenSequences(seqs) != lenSequences(seqs)[1])) {
        stop(paste(strwrap("All DNAStrings in 'seqs' must be of equal length
            and must not contain 'N's. Please remove sequences that contain
            'N's from 'seqs' and/or trim
            trim the sequences such that they are equally long."),
            collapse = "\n"))
    }
    slen = lenSequences(seqs[1])

    if (slen <= ncol(pfm) - 1) {
        return (list(fscores = integer(0), rscores = integer(0)))
    }

    fscores = vapply(seqs, function(seq, pfm, bg) {
        s = scoreStrand(seq, pfm, bg)
    },
    FUN.VALUE = numeric(slen - ncol(pfm) + 1),
    pfm, bg)
    fscores = rowMeans(as.matrix(fscores))

    rscores = vapply(seqs, function(seq, pfm, bg) {
        s = scoreStrand(seq, revcompMotif(pfm), bg)
    },
    FUN.VALUE = numeric(slen - ncol(pfm) + 1),
    pfm, bg)
    rscores = rowMeans(as.matrix(rscores), na.rm = TRUE)
    return (list(
        fscores = as.vector(fscores),
        rscores = as.vector(rscores)
    ))
}

#' Score histogram on a single sequence
#'
#' This function computes the empirical score
#' distribution by normalizing the observed score histogram
#' for a given sequence.
#'
#'
#' @inheritParams scoreSequence
#' @return List containing
#' \describe{
#' \item{scores}{Vector of scores}
#' \item{dist}{Score distribution}
#' }
#'
#' @examples
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the per-position and per-strand scores
#' motifcounter:::scoreHistogramSingleSeq(seqs[[1]], motif, bg)
#'
scoreHistogramSingleSeq = function(seq, pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)
    stopifnot(is(seq, "DNAString"))
    motifAndBackgroundValid(pfm, bg)

    scores = .Call(
        motifcounter_scorerange,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )

    dist = .Call(
        motifcounter_scorehistogram,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        toString(seq),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )
    result = list(scores = scores, dist = dist)

    return(result)
}


#' Score histogram
#'
#' This function computes the empirical score
#' distribution for a given set of DNA sequences.
#'
#' It can be used to compare the empirical score
#' distribution against the theoretical one (see \code{\link{scoreDist}}).
#'
#' @inheritParams scoreProfile
#' @return List containing
#' \describe{
#' \item{scores}{Vector of scores}
#' \item{dist}{Score distribution}
#' }
#' @examples
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the empirical score histogram
#' scoreHistogram(seqs, motif, bg)
#'
#' @seealso \code{\link{scoreDist}}
#' @export
scoreHistogram = function(seqs, pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)

    if (is(seqs, "DNAString")) {
        # wrap the sequence up as sequence set
        seqs = DNAStringSet(seqs)
    }

    stopifnot(is(seqs, "DNAStringSet"))

    # First, extract the score range
    his = lapply(seqs[1], scoreHistogramSingleSeq, pfm, bg)
    nseq = length(his)
    scores = his[[1]]$scores

    # Then extract the histogram
    his = vapply(seqs, function(seq, pfm, bg) {
        scoreHistogramSingleSeq(seq, pfm, bg)$dist
    }, FUN.VALUE = numeric(length(scores)), pfm, bg)

    freq = rowSums(his)
    result = list(scores = scores, dist = freq)

    return(result)
}

#' Score threshold
#'
#' This function computes the score threshold for a desired
#' false positive probability `alpha`.
#'
#' Note that the returned alpha usually differs slightly
#' from the one that is prescribed using
#' \code{\link{motifcounterOptions}}, because
#' of the discrete nature of the sequences.
#'
#' @inheritParams scoreDist
#' @return List containing
#' \describe{
#' \item{threshold}{Score threshold}
#' \item{alpha}{False positive probability}
#' }
#' @examples
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the score threshold
#' motifcounter:::scoreThreshold(motif, bg)
#'
scoreThreshold = function(pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)

    scoredist = scoreDist(pfm, bg)

    # find quantile
    ind = which(1 - cumsum(scoredist$dist) <= sigLevel())
    if (length(ind) <= 1) {
        stop(paste(strwrap(
            "The significance level 'alpha' is too stringent
            for the given 'pfm'. Motif matches are impossible.
            Increase 'alpha' using 'motifcounterOptions()'."), collapse = "\n"))
    }
    ind = tail(ind, -1)
    alpha = sum(scoredist$dist[ind])

    ind = min(ind)
    threshold = scoredist$scores[ind]

    return(list(threshold = threshold, alpha = alpha))
}

Try the motifcounter package in your browser

Any scripts or data that you put into this service are public.

motifcounter documentation built on Nov. 8, 2020, 5:44 p.m.