R/score_wrapper.R
In motifcounter: R package for analysing TFBSs in DNA sequences

Documented in scoreDist scoreDistBf scoreHistogram scoreHistogramSingleSeq scoreProfile scoreSequence scoreStrand scoreThreshold

#' Score distribution
#'
#' This function computes the score distribution for the given PFM and
#' background. The Score distribution is computed based on an efficient
#' dynamic programming algorithm.
#'
#' @inheritParams motifValid
#' @template templates
#' @return List that contains
#' \describe{
#' \item{scores}{Vector of scores}
#' \item{dist}{Score distribution}
#' }
#'
#' @examples
#'
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the score distribution
#' dp = scoreDist(motif, bg)
#'
#' @export
scoreDist = function(pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)
    motifAndBackgroundValid(pfm, bg)

    scores = .Call(
        motifcounter_scorerange,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )

    dist = .Call(
        motifcounter_scoredist,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )
    return(list(scores = scores, dist = dist))
}


#' Score distribution
#'
#' This function computes the score distribution for a given PFM and
#' a background model.
#'
#' The result of this function is identical to \code{\link{scoreDist}},
#' however, the method employs a less efficient algorithm that
#' enumerates all DNA sequences of the length of the motif.
#' This function is only used for debugging and testing purposes
#' and might require substantial computational
#' resources for long motifs.
#'
#' @inheritParams scoreDist
#' @return List containing
#' \describe{
#' \item{scores}{Vector of scores}
#' \item{dist}{Score distribution}
#' }
#'
#' @seealso \code{\link{scoreDist}}
#' @examples
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the score distribution
#' dp = motifcounter:::scoreDistBf(motif, bg)
#'
scoreDistBf = function(pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)
    motifAndBackgroundValid(pfm, bg)

    scores = .Call(
        motifcounter_scorerange,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )

    dist = .Call(
        motifcounter_scoredist_bf,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )
    return(list(scores = scores, dist = dist))
}

#' Score strand
#'
#' This function computes the per-position
#' score in a given DNA strand.
#'
#' The function returns the per-position scores
#' for the given strand. If the sequence is too short,
#' it contains an empty vector.
#'
#' @inheritParams scoreDist
#' @param seq A DNAString object
#' @return
#' \describe{
#' \item{scores}{Vector of scores on the given strand}
#' }
#'
#' @examples
#'
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the per-position and per-strand scores
#' motifcounter:::scoreStrand(seqs[[1]], motif, bg)
#'
scoreStrand = function(seq, pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)
    motifAndBackgroundValid(pfm, bg)

    # Check class
    stopifnot(is(seq, "DNAString"))

    scores = .Call(
        motifcounter_scoresequence,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        toString(seq),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )
    return(as.numeric(scores))
}

#' Score observations
#'
#' This function computes the per-position and per-strand
#' score in a given DNA sequence.
#'
#' @inheritParams scoreDist
#' @param seq A DNAString object
#' @return List containing
#' \describe{
#' \item{fscores}{Vector of scores on the forward strand}
#' \item{rscores}{Vector of scores on the reverse strand}
#' }
#'
#' @examples
#'
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the per-position and per-strand scores
#' scoreSequence(seqs[[1]], motif, bg)
#'
#' @export
scoreSequence = function(seq, pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)
    motifAndBackgroundValid(pfm, bg)

    # Check class
    stopifnot(is(seq, "DNAString"))

    fscores = scoreStrand(seq, pfm, bg)
    rscores = scoreStrand(seq, revcompMotif(pfm), bg)
    return(list(fscores = fscores, rscores = rscores))
}

#' Score profile across multiple sequences
#'
#' This function computes the per-position and per-strand
#' average score profiles across a set of DNA sequences.
#' It can be used to reveal positional constraints
#' of TFBSs.
#'
#' @inheritParams scoreDist
#' @param seqs A DNAStringSet or DNAString object
#'
#' @return List containing
#' \describe{
#' \item{fscores}{Vector of per-position average forward strand scores}
#' \item{rscores}{Vector of per-position average reverse strand scores}
#' }
#'
#' @examples
#'
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the score profile
#' scoreProfile(seqs, motif, bg)
#'
#' @export
scoreProfile = function(seqs, pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)

    if (is(seqs, "DNAString")) {
        # wrap the sequence up as sequence set
        seqs = DNAStringSet(seqs)
    }

    stopifnot (is(seqs, "DNAStringSet"))

    if (any(lenSequences(seqs) != lenSequences(seqs)[1])) {
        stop(paste(strwrap("All DNAStrings in 'seqs' must be of equal length
            and must not contain 'N's. Please remove sequences that contain
            'N's from 'seqs' and/or trim
            trim the sequences such that they are equally long."),
            collapse = "\n"))
    }
    slen = lenSequences(seqs[1])

    if (slen <= ncol(pfm) - 1) {
        return (list(fscores = integer(0), rscores = integer(0)))
    }

    fscores = vapply(seqs, function(seq, pfm, bg) {
        s = scoreStrand(seq, pfm, bg)
    },
    FUN.VALUE = numeric(slen - ncol(pfm) + 1),
    pfm, bg)
    fscores = rowMeans(as.matrix(fscores))

    rscores = vapply(seqs, function(seq, pfm, bg) {
        s = scoreStrand(seq, revcompMotif(pfm), bg)
    },
    FUN.VALUE = numeric(slen - ncol(pfm) + 1),
    pfm, bg)
    rscores = rowMeans(as.matrix(rscores), na.rm = TRUE)
    return (list(
        fscores = as.vector(fscores),
        rscores = as.vector(rscores)
    ))
}

#' Score histogram on a single sequence
#'
#' This function computes the empirical score
#' distribution by normalizing the observed score histogram
#' for a given sequence.
#'
#'
#' @inheritParams scoreSequence
#' @return List containing
#' \describe{
#' \item{scores}{Vector of scores}
#' \item{dist}{Score distribution}
#' }
#'
#' @examples
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the per-position and per-strand scores
#' motifcounter:::scoreHistogramSingleSeq(seqs[[1]], motif, bg)
#'
scoreHistogramSingleSeq = function(seq, pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)
    stopifnot(is(seq, "DNAString"))
    motifAndBackgroundValid(pfm, bg)

    scores = .Call(
        motifcounter_scorerange,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )

    dist = .Call(
        motifcounter_scorehistogram,
        as.numeric(pfm),
        nrow(pfm),
        ncol(pfm),
        toString(seq),
        getStation(bg),
        getTrans(bg),
        as.integer(getOrder(bg))
    )
    result = list(scores = scores, dist = dist)

    return(result)
}


#' Score histogram
#'
#' This function computes the empirical score
#' distribution for a given set of DNA sequences.
#'
#' It can be used to compare the empirical score
#' distribution against the theoretical one (see \code{\link{scoreDist}}).
#'
#' @inheritParams scoreProfile
#' @return List containing
#' \describe{
#' \item{scores}{Vector of scores}
#' \item{dist}{Score distribution}
#' }
#' @examples
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the empirical score histogram
#' scoreHistogram(seqs, motif, bg)
#'
#' @seealso \code{\link{scoreDist}}
#' @export
scoreHistogram = function(seqs, pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)

    if (is(seqs, "DNAString")) {
        # wrap the sequence up as sequence set
        seqs = DNAStringSet(seqs)
    }

    stopifnot(is(seqs, "DNAStringSet"))

    # First, extract the score range
    his = lapply(seqs[1], scoreHistogramSingleSeq, pfm, bg)
    nseq = length(his)
    scores = his[[1]]$scores

    # Then extract the histogram
    his = vapply(seqs, function(seq, pfm, bg) {
        scoreHistogramSingleSeq(seq, pfm, bg)$dist
    }, FUN.VALUE = numeric(length(scores)), pfm, bg)

    freq = rowSums(his)
    result = list(scores = scores, dist = freq)

    return(result)
}

#' Score threshold
#'
#' This function computes the score threshold for a desired
#' false positive probability `alpha`.
#'
#' Note that the returned alpha usually differs slightly
#' from the one that is prescribed using
#' \code{\link{motifcounterOptions}}, because
#' of the discrete nature of the sequences.
#'
#' @inheritParams scoreDist
#' @return List containing
#' \describe{
#' \item{threshold}{Score threshold}
#' \item{alpha}{False positive probability}
#' }
#' @examples
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#'
#' # Load background
#' bg = readBackground(seqs, 1)
#'
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Compute the score threshold
#' motifcounter:::scoreThreshold(motif, bg)
#'
scoreThreshold = function(pfm, bg) {
    motifValid(pfm)
    stopifnot(is(bg, "Background"))
    validObject(bg)

    scoredist = scoreDist(pfm, bg)

    # find quantile
    ind = which(1 - cumsum(scoredist$dist) <= sigLevel())
    if (length(ind) <= 1) {
        stop(paste(strwrap(
            "The significance level 'alpha' is too stringent
            for the given 'pfm'. Motif matches are impossible.
            Increase 'alpha' using 'motifcounterOptions()'."), collapse = "\n"))
    }
    ind = tail(ind, -1)
    alpha = sum(scoredist$dist[ind])

    ind = min(ind)
    threshold = scoredist$scores[ind]

    return(list(threshold = threshold, alpha = alpha))
}

Any scripts or data that you put into this service are public.

motifcounter documentation built on Nov. 8, 2020, 5:44 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

motifcounter
R package for analysing TFBSs in DNA sequences

R/score_wrapper.R
In motifcounter: R package for analysing TFBSs in DNA sequences

Defines functions scoreThreshold scoreHistogram scoreHistogramSingleSeq scoreProfile scoreSequence scoreStrand scoreDistBf scoreDist

Documented in scoreDist scoreDistBf scoreHistogram scoreHistogramSingleSeq scoreProfile scoreSequence scoreStrand scoreThreshold

Try the motifcounter package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

motifcounter R package for analysing TFBSs in DNA sequences

R/score_wrapper.R In motifcounter: R package for analysing TFBSs in DNA sequences

Defines functions scoreThreshold scoreHistogram scoreHistogramSingleSeq scoreProfile scoreSequence scoreStrand scoreDistBf scoreDist

Documented in scoreDist scoreDistBf scoreHistogram scoreHistogramSingleSeq scoreProfile scoreSequence scoreStrand scoreThreshold

Try the motifcounter package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

motifcounter
R package for analysing TFBSs in DNA sequences

R/score_wrapper.R
In motifcounter: R package for analysing TFBSs in DNA sequences