#' @include codonTable-class.R
#' @include genCode-class.R
#' @include functions.R
#' @import data.table
NULL
setGeneric(
name = "expectedCU",
def = function(cTobject,
gCobject,
subsets,
self,
ribosomal)
{
standardGeneric("expectedCU")
}
)
setMethod(
f = "expectedCU",
signature = c(cTobject = "codonTable", subsets = "list"),
definition = function(cTobject,
gCobject,
subsets,
self,
ribosomal)
{
ns <- length(cTobject@len)
if (!is.list(subsets))
stop(
paste0(
"Subsets must be either a (named) list of logical vectors",
", each of length ",
ns,
", or character vectors of KEGG/eggNOG annotations",
", or codonTable objects (of any length)."
)
)
if (length(subsets) != 0) {
ok <- vapply(subsets, function(x) {
all(is.vector(x, mode = "logical"), length(x) == ns) |
is.vector(x, mode = "character") |
all(inherits(x, "codonTable"), nrow(x) > 0)
}, logical(length = 1))
stopifnot(ok)
nam <- names(subsets)
nsubs <- length(subsets)
if (is.null(nam)) {
nam <- paste("subset", seq_len(nsubs), sep = ".")
} else {
nam[nam == ""] <- paste("subset", seq_len(nsubs)[nam == ""],
sep = ".")
}
names(subsets) <- make.names(nam, unique = TRUE)
}
if (ribosomal) {
ribKOs <- cTobject@KO %in% RPKOs
if (any(ribKOs))
subsets <- c(list(ribosomal = ribKOs), subsets)
}
if (self) {
self_set <- rep.int(TRUE, ns)
subsets <- c(list(self = c(self_set)), subsets)
}
vapply(subsets, function(s) {
if (all(class(s) %in% c("logical", "character")))
.normSetFrequencies(subset(cTobject, s), gCobject)
else if (all(is(s, "codonTable")))
.normSetFrequencies(s, gCobject)
}, numeric(length = nrow(gCobject@ctab)))
}
)
#' Calculate CU measures.
#'
#' Calculate values of the codon usage (CU) measure
#' for every sequence in the given \code{codonTable} object.
#' The following methods are implemented:
#' \code{MILC}, Measure Independent of Length and Composition
#' \href{https://bit.ly/2GkT7qe}{Supek & Vlahovicek (2005)},
#' \code{B}, codon usage bias (B)
#' \href{https://bit.ly/2DRGdeb}{Karlin et al. (2001)},
#' \code{ENC}, effective number of codons (ENC)
#' \href{https://www.ncbi.nlm.nih.gov/pubmed/2110097}{Wright (1990)}.
#' \code{ENCprime}, effective number of codons prime (ENC')
#' \href{https://www.ncbi.nlm.nih.gov/pubmed/12140252}{Novembre (2002)},
#' \code{MCB}, maximum-likelihood codon bias (MCB)
#' \href{https://bit.ly/2GlMRyy}{Urrutia and Hurst (2001)},
#' \code{SCUO}, synonymous codon usage eorderliness (SCUO)
#' \href{https://www.ncbi.nlm.nih.gov/pubmed/15222899}{Wan et al. (2004)}.
#'
#' @param cTobject A \code{codonTable} object.
#' @param subsets A (named) list of logical vectors, the length of each equal
#' to \code{getlen(cTobject)}, i.e. the number of sequences in the set, or
#' character vectors (of any length) containing KEGG/eggNOG annotations,
#' or codonTable objects (of any length).
#' Not used for \code{ENC}, \code{SCUO} and \code{GCB} calculations.
#' @param self Logical, if \code{TRUE} (default), CU statistic is also
#' calculated against the average CU of the entire set of sequences.
#' Not used for \code{ENC}, \code{SCUO} and \code{GCB} calculations.
#' @param ribosomal Logical, if \code{TRUE}, CU statistic is also calculated
#' against the average CU of the ribosomal genes in the sequence set.
#' Not used for \code{ENC} and \code{SCUO} calculations.
#' For GCB calculations, if \code{TRUE}, ribosomal genes are used
#' as a seed, and if \code{FALSE} (default), \code{seed}
#' has to be specified.
#' @inheritParams Biostrings::getGeneticCode
#' @param alt.init logical, whether to use alternative initiation codons.
#' Default is \code{TRUE}.
#' @param stop.rm Logical, whether to remove stop codons. Default is
#' \code{FALSE}.
#' @param filtering Character vector, one of \code{c("none", "soft", "hard")}.
#' Specifies whether sequences shorther than some threshold value of
#' length (in codons), \code{len.threshold}, should be excluded from
#' calculations. If \code{"none"} (default), length of sequences is not
#' checked, if \code{"soft"}, a warrning is printed if there are shorter
#' sequences, and if \code{"hard"}, these sequences are excluded from
#' calculation.
#' @param len.threshold Optional numeric, specifying sequence length,
#' in codons, used for filtering.
#'
#' @return A matrix or a numeric vector with CU measure values.
#' For \code{MILC}, \code{B}, \code{ENCprime}, the matrix has a column
#' with values for every specified subset
#' (\code{subsets}, \code{self}, \code{ribosomal}).
#' A numeric vector for \code{ENC} and \code{SCUO}.
#'
#' @examples
#' # load example DNA sequences
#' exampledir <- system.file("extdata", package = "coRdon")
#' cT <- codonTable(readSet(exampledir))
#'
#' # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#' # In the examples below, MILC values are calculated for all sequences;
#' # B and ENCprime can be caluclated in the same way.
#' # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#'
#' # calculate MILC distance to the average CU of the example DNA sequences
#' milc <- MILC(cT)
#' head(milc)
#'
#' # also calculate MILC distance to the average CU
#' # of ribosomal genes among the example DNA sequences
#' milc <- MILC(cT, ribosomal = TRUE)
#' head(milc)
#'
#' # calculate MILC distance to the average CU
#' # of the first 20 example DNA sequences
#' # (i.e. the first half of the example DNA set)
#' milc <- MILC(cT, self = FALSE,
#' subsets = list(half = c(rep(TRUE, 20), rep(FALSE, 20))))
#'
#' # alternatively, you can specify codonTable as a subset
#' halfcT <- codonTable(codonCounts(cT)[1:20,])
#' milc2 <- MILC(cT, self = FALSE, subsets = list(half = halfcT))
#' all.equal(milc, milc2) # TRUE
#'
#' # filtering
#' MILC(cT, filtering = "hard", len.threshold = 80) # MILC for 9 sequences
#' sum(getlen(cT) > 80) # 9 sequences are longer than 80 codons
#' milc1 <- MILC(cT, filtering = "none") # no filtering
#' milc2 <- MILC(cT, filtering = "soft") # warning
#' all.equal(milc1, milc2) # TRUE
#'
#' # options for genetic code
#' milc <- MILC(cT, stop.rm = TRUE) # don't use stop codons in calculation
#' milc <- MILC(cT, alt.init = FALSE) # don't use alternative start codons
#' milc <- MILC(cT, id_or_name2 = "2") # use different genetic code, for help
#' # see `?Biostrings::GENETIC_CODE`
#'
#' # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#' # In the examples below, ENC values are calculated for all sequences;
#' # SCUO values can be caluclated in the same way.
#' # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#'
#' # calculate ENC
#' enc <- ENC(cT)
#' head(enc)
#'
#' # filtering
#' ENC(cT, filtering = "hard", len.threshold = 80) # ENC for 9 sequences
#' sum(getlen(cT) > 80) # 9 sequences are longer than 80 codons
#' enc1 <- ENC(cT, filtering = "none") # no filtering
#' enc2 <- ENC(cT, filtering = "soft") # warning
#' all.equal(enc1, enc2) # TRUE
#'
#' # options for genetic code
#' enc <- ENC(cT, stop.rm = TRUE) # don't use stop codons in calculation
#' enc <- ENC(cT, alt.init = FALSE) # don't use alternative start codons
#' enc <- ENC(cT, id_or_name2 = "2") # use different genetic code, for help
#' # see `?Biostrings::GENETIC_CODE`
#'
#' @name codonUsage
NULL
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### MILC, Supek and Vlahovicek 2005.
###
#' @rdname codonUsage
#' @export
setGeneric(
name = "MILC",
def = function(cTobject,
subsets = list(),
self = TRUE,
ribosomal = FALSE,
id_or_name2 = "1",
alt.init = TRUE,
stop.rm = FALSE,
filtering = "none",
len.threshold = 80)
{
standardGeneric("MILC")
}
)
#' @rdname codonUsage
setMethod(
f = "MILC",
signature = c(cTobject = "codonTable"),
definition = function(cTobject,
subsets,
self,
ribosomal,
id_or_name2,
alt.init,
stop.rm,
filtering,
len.threshold) {
if (filtering == "hard") {
cTobject <- subset(cTobject, cTobject@len > len.threshold)
} else if (filtering == "soft") {
if (any(cTobject@len < len.threshold))
warning("Some sequences have below-threshold length!")
} else if (filtering == "none")
NULL
gCobject <- genCode(id_or_name2, alt.init, stop.rm)
if (stop.rm) {
cTobject@counts <- cTobject@counts[, gCobject@nostops]
cTobject@len <-
rowSums(cTobject@counts[, gCobject@nostops])
}
len <- cTobject@len
csums <- .countsbyaa(cTobject, gCobject)
fc <- .normFrequencies(cTobject, gCobject)
gc <-
expectedCU(cTobject, gCobject, subsets, self, ribosomal)
cor <-
as.numeric(((csums > 0) %*% (gCobject@deg - 1)) / len - 0.5)
vapply(colnames(gc), function(x) {
rowSums(2 * cTobject@counts * log(t(t(fc) / gc[, x])),
na.rm = TRUE) / len - cor
}, numeric(length = nrow(cTobject@counts)))
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### B, Karlin et al. 2001.
###
#' @rdname codonUsage
#' @export
setGeneric(
name = "B",
def = function(cTobject,
subsets = list(),
self = TRUE,
ribosomal = FALSE,
id_or_name2 = "1",
alt.init = TRUE,
stop.rm = FALSE,
filtering = "none",
len.threshold = 80)
{
standardGeneric("B")
}
)
#' @rdname codonUsage
setMethod(
f = "B",
signature = c(cTobject = "codonTable"),
definition = function(cTobject,
subsets,
self,
ribosomal,
id_or_name2,
alt.init,
stop.rm,
filtering,
len.threshold)
{
if (filtering == "hard") {
cTobject <- subset(cTobject, cTobject@len > len.threshold)
} else if (filtering == "soft") {
if (any(cTobject@len < len.threshold))
warning("Some sequences have below-threshold length!")
} else if (filtering == "none")
NULL
gCobject <- genCode(id_or_name2, alt.init, stop.rm)
if (stop.rm) {
cTobject@counts <- cTobject@counts[, gCobject@nostops]
cTobject@len <-
rowSums(cTobject@counts[, gCobject@nostops])
}
pa <- .countsbyaa(cTobject, gCobject) / cTobject@len
fc <- .normFrequencies(cTobject, gCobject)
gc <-
expectedCU(cTobject, gCobject, subsets, self, ribosomal)
vapply(colnames(gc), function(x) {
dt <- as.data.table(t(abs(t(fc) - gc[, x])))
ba <- vapply(gCobject@cl, function(x)
dt[, Reduce('+', .SD), .SDcols = x],
numeric(length = nrow(dt)))
rowSums(pa * ba, na.rm = TRUE)
}, numeric(length = nrow(cTobject@counts)))
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### MCB, Urutia and Hurst 2001.
###
#' @rdname codonUsage
#' @export
setGeneric(
name = "MCB",
def = function(cTobject,
subsets = list(),
self = TRUE,
ribosomal = FALSE,
id_or_name2 = "1",
alt.init = TRUE,
stop.rm = FALSE,
filtering = "none",
len.threshold = 80)
{
standardGeneric("MCB")
}
)
#' @rdname codonUsage
setMethod(
f = "MCB",
signature = c(cTobject = "codonTable"),
definition = function(cTobject,
subsets,
self,
ribosomal,
id_or_name2,
alt.init,
stop.rm,
filtering,
len.threshold)
{
if (filtering == "hard") {
cTobject <- subset(cTobject, cTobject@len > len.threshold)
} else if (filtering == "soft") {
if (any(cTobject@len < len.threshold))
warning("Some sequences have below-threshold length!")
} else if (filtering == "none")
NULL
gCobject <- genCode(id_or_name2, alt.init, stop.rm)
if (stop.rm) {
cTobject@counts <- cTobject@counts[, gCobject@nostops]
cTobject@len <-
rowSums(cTobject@counts[, gCobject@nostops])
}
deg <- gCobject@deg
csums <- .countsbyaa(cTobject, gCobject)
A <- csums[, deg > 1] > 0
fc <- .normFrequencies(cTobject, gCobject)
gc <-
expectedCU(cTobject, gCobject, subsets, self, ribosomal)
vapply(colnames(gc), function(x) {
dt <- as.data.table(t((t(fc) - gc[, x]) ^ 2 / gc[, x]))
dt[which(is.na(dt), arr.ind = TRUE)] <- 0 # gc > 0
dt[cTobject@counts <= 0] <- 0 # counts > 0
ba <- vapply(gCobject@cl, function(y)
dt[, Reduce('+', .SD), .SDcols = y],
numeric(length = nrow(dt)))
rowSums(ba[, deg > 1] * log10(csums[, deg > 1]),
na.rm = TRUE) / rowSums(A)
}, numeric(length = nrow(cTobject@counts)))
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### ENC' Npvembre 2002.
###
#' @rdname codonUsage
#' @export
setGeneric(
name = "ENCprime",
def = function(cTobject,
subsets = list(),
self = TRUE,
ribosomal = FALSE,
id_or_name2 = "1",
alt.init = TRUE,
stop.rm = TRUE,
filtering = "none",
len.threshold = 80)
{
standardGeneric("ENCprime")
}
)
#' @rdname codonUsage
setMethod(
f = "ENCprime",
signature = c(cTobject = "codonTable"),
definition = function(cTobject,
subsets,
self,
ribosomal,
id_or_name2,
alt.init,
stop.rm,
filtering,
len.threshold)
{
if (filtering == "hard") {
cTobject <- subset(cTobject, cTobject@len > len.threshold)
} else if (filtering == "soft") {
if (any(cTobject@len < len.threshold))
warning("Some sequences have below-threshold length!")
} else if (filtering == "none")
NULL
gCobject <- genCode(id_or_name2, alt.init, stop.rm)
if (stop.rm) {
cTobject@counts <- cTobject@counts[, gCobject@nostops]
cTobject@len <-
rowSums(cTobject@counts[, gCobject@nostops])
}
deg <- gCobject@deg
csums <- .countsbyaa(cTobject, gCobject)
fc <- .normFrequencies(cTobject, gCobject)
gc <-
expectedCU(cTobject, gCobject, subsets, self, ribosomal)
vapply(colnames(gc), function(x) {
# chi squared
dt <- as.data.table(t((t(fc) - gc[, x]) ^ 2 / gc[, x]))
dt[which(is.na(dt), arr.ind = TRUE)] <- 0
chisum <- vapply(gCobject@cl, function(x)
dt[, Reduce('+', .SD), .SDcols = x],
numeric(length = nrow(dt)))
chisq <- csums * chisum
# homozygosity
fa <- as.data.table(t((t(
chisq + csums
) - deg) /
(deg * t(csums - 1))))
fa[csums < 5] <- NA
.effNc(fa, deg)
}, numeric(length = nrow(cTobject@counts)))
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### ENC, Wright 1990.
###
#' @rdname codonUsage
#' @export
setGeneric(
name = "ENC",
def = function(cTobject,
id_or_name2 = "1",
alt.init = TRUE,
stop.rm = TRUE,
filtering = "none",
len.threshold = 80)
{
standardGeneric("ENC")
}
)
#' @rdname codonUsage
setMethod(
f = "ENC",
signature = c(cTobject = "codonTable"),
definition = function(cTobject,
id_or_name2,
alt.init,
stop.rm,
filtering,
len.threshold)
{
if (filtering == "hard") {
cTobject <- subset(cTobject, cTobject@len > len.threshold)
} else if (filtering == "soft") {
if (any(cTobject@len < len.threshold))
warning("Some sequences have below-threshold length!")
} else if (filtering == "none")
NULL
gCobject <- genCode(id_or_name2, alt.init, stop.rm)
if (stop.rm) {
cTobject@counts <- cTobject@counts[, gCobject@nostops]
cTobject@len <-
rowSums(cTobject@counts[, gCobject@nostops])
}
cl <- gCobject@cl
counts <- as.data.table(cTobject@counts)
csums <- .countsbyaa(cTobject, gCobject)
freqs <- lapply(seq_along(cl), function(x)
counts[, .SD / csums[, x], .SDcols = cl[[x]]])
pi <- vapply(freqs, function(x)
rowSums(x ^ 2, na.rm = TRUE),
numeric(length = nrow(counts)))
# remove AA with count <= 1
csums[which(csums <= 1, arr.ind = TRUE)] <- NA
fa <- as.data.table((csums * pi - 1) / (csums - 1))
.effNc(fa, gCobject@deg)
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### SCUO, Wan et al. 2004.
###
#' @rdname codonUsage
#' @export
setGeneric(
name = "SCUO",
def = function(cTobject,
id_or_name2 = "1",
alt.init = TRUE,
stop.rm = FALSE,
filtering = "none",
len.threshold = 80)
{
standardGeneric("SCUO")
}
)
#' @rdname codonUsage
setMethod(
f = "SCUO",
signature = c(cTobject = "codonTable"),
definition = function(cTobject,
id_or_name2,
alt.init,
stop.rm,
filtering,
len.threshold)
{
if (filtering == "hard") {
cTobject <- subset(cTobject, cTobject@len > len.threshold)
} else if (filtering == "soft") {
if (any(cTobject@len < len.threshold))
warning("Some sequences have below-threshold length!")
} else if (filtering == "none")
NULL
gCobject <- genCode(id_or_name2, alt.init, stop.rm)
if (stop.rm) {
cTobject@counts <- cTobject@counts[, gCobject@nostops]
cTobject@len <-
rowSums(cTobject@counts[, gCobject@nostops])
}
cl <- gCobject@cl
counts <- as.data.table(cTobject@counts)
csums <- .countsbyaa(cTobject, gCobject)
freqs <- lapply(seq_along(cl), function(x)
counts[, .SD / csums[, x], .SDcols = cl[[x]]])
Ha <-
vapply(freqs, function(x)
rowSums(-x * log10(x), na.rm = TRUE),
numeric(length = nrow(counts)))
Hmax <- log10(vapply(cl, length, numeric(length = 1)))
Oa <- t((Hmax - t(Ha)) / Hmax)
Fa <- csums / rowSums(csums[, gCobject@deg > 1])
rowSums(Oa * Fa, na.rm = TRUE)
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.