#' @include clonoStats_class.R
NULL
# diversity functions
.nCells <- function(p){
return(sum(p))
}
.nClonotypes <- function(f){
return(sum(f))
}
.shannon <- function(p){
p <- p[which(p > 0)]
p <- p / sum(p)
p <- p[which(p > 0)]
return(-sum(p * log(p)))
}
.normentropy <- function(p){
p <- p[which(p > 0)]
p <- p / sum(p)
p <- p[which(p > 0)]
return(-sum(p * log(p)) / log(length(p)))
}
.invsimpson <- function(p){
p <- p[which(p > 0)]
p <- p / sum(p)
p <- p[which(p > 0)]
return(1 / sum(p^2))
}
.ginisimpson <- function(p){
p <- p[which(p > 0)]
p <- p / sum(p)
p <- p[which(p > 0)]
return(1 - sum(p^2))
}
.chao1 <- function(f){
# f is the table of singletons, doubletons, etc.
# for integer counts, if p = x/sum(x), then f = table(x)
return(sum(f) + f[1]*(f[1]-1) / (2*(f[2]+1)))
}
#' @importFrom stats qnorm
.chaobunge <- function(f, t = 10, conf = 0.95){
# adapted from jipingw/SPECIES::ChaoBunge()
# f is the table of singletons, doubletons, etc.
# for integer counts, if p = x/sum(x), then f = table(x)
f <- f[seq_len(max(c(1, which(f>0))))] # trim trailing 0s
if (t!=round(t)||t<0){
stop("The cutoff t to define less abundant species must be",
" non-negative integer!")
}
if(is.numeric(conf)==FALSE||conf>1||conf<0){
stop("confidence level must be a numerical value between 0 and 1,",
" e.g. 0.95")
}
if(t > length(f)){
warning("The t that defines the abundant/rare species must be no ",
"larger than the most abundant species!","\n",
"We use t=", length(f), " in this calculation!")
t <- length(f)
}
m <- length(f)
############################
A1 <- sum(f[seq_len(t)])
A2 <- sum(f[seq_len(t)])-f[1]
B1 <- sum(seq_len(t) * f[seq_len(t)])
## point estimate (Equation 2, page 533 of Chao and Bunge 2002)
if(t == length(f)){
theta <- 1 - f[1]*sum(seq_len(m)^2*f) / (sum(seq_len(m)*f))^2
chao40 <- sum(f[seq_len(t)[-1]]) / theta
chao4 <- (1/theta-1) * sum(f[seq_len(m)[-1]]) - f[1]+sum(f)
}
if(t < length(f)){
theta <- 1 - f[1]*sum(seq_len(t)^2*f[seq_len(t)]) /
(sum(seq_len(t)*f[seq_len(t)]))^2
chao4 <- sum(f[t+seq_len(length(f)-t)]) +
sum(f[seq_len(t)[-1]]/theta)
chao40 <- sum(f[seq_len(t)[-1]])/theta
}
## duplication standard error(Unnumbered Eq. right below Eq. 2,
## page 534 of Chao and Bunge 2002)
D <- sum(seq_len(t)*seq_len(t)*f[seq_len(t)])
partial3 <- numeric(t)
partial3[1] <- sum(-f[seq_len(t)[-1]] / (1-f[1]*D/B1^2)^2 *
(-(D+f[1])/(B1^2) + (2*f[1]*D/B1^3)))
for (i in seq_len(t)[-1]){
partial3[i] <- 1/(1-f[1]*D/B1^2) - A2/(1-f[1]*D/B1^2)^2 *
(-f[1]*i^2/B1^2+2*f[1]*D*i/B1^3)
}
cova3 <- matrix(0, nrow = t, ncol = t)
for (i in seq_len(t)){
for (j in seq_len(t)){
if(i==j){
cova3[i,j] <- f[i]*(1-f[i]/chao40)
}
if(i!=j){
cova3[i,j] <- -f[i]*f[j]/chao40
}
}
}
SE <- 0
for (i in seq_len(t)){
for (j in seq_len(t)){
SE <- SE + partial3[i] * partial3[j] * cova3[i,j]
}
}
SE <- partial3 %*% cova3 %*% partial3
SE4 <- sqrt(SE[1,1])
##confidence interval
coe <- qnorm(1-(1-conf)/2, 0, 1)
C <- exp(coe*log(1+SE4^2/(chao4-A1)^2)^0.5)
lb <- floor(A1+(chao4-A1)/C)
ub <- ceiling(A1+(chao4-A1)*C)
return(c(est = chao4, CI.lower = lb, CI.upper = ub))
}
#' @title Sample diversity estimation
#' @name calculateDiversity
#' @param ... Additional arguments passed to external calculation methods.
#' @import methods
#' @export
setGeneric(name = "calculateDiversity",
signature = "x",
def = function(x, ...) standardGeneric("calculateDiversity"))
#' @rdname calculateDiversity
#'
#' @description This function uses various methods to estimate the clonotypic
#' diversity of samples based on a matrix of clonotype abundances (samples are
#' columns).
#'
#' @param x A matrix of abundance values where rows are features (clonotypes)
#' and columns are samples. This is created with \code{summarizeClonotypes}
#' using a sparse matrix computed with either \code{EMquant} or
#' \code{CRquant}.
#' @param methods A character vector specifying which diversity measures to use
#' (default = \code{'all'}, see Details).
#'
#' @details Available methods are total cells with appropriate TCR data
#' (\code{'nCells'}, not a diversity measure, but a useful point of
#' comparison), total clonotypes (\code{'nClonotypes'}), Shannon entropy
#' (\code{'shannon'}), Simpson index (\code{'simpson'}), inverse Simpson index
#' (\code{'invsimpson'}), Chao1 richness (\code{'chao1'}), and Chao-Bunge
#' richness (\code{'chaobunge'}). A special value of \code{'all'} is also
#' allowed, which will run all methods listed above.
#'
#' @details The \code{'chao1'} and \code{'chaobunge'} estimates assume all
#' abundances are integers. When this is not the case for the input matrix,
#' \code{k}, all values are multiplied by the \code{scaling_factor} and
#' rounded to the nearest integer. The resulting estimate is then divided by
#' \code{scaling_factor} to return to the original scale. The
#' \code{'shannon'}, \code{'simpson'}, and \code{'invsimpson'} methods work
#' with any input type.
#'
#' @return A matrix of diversity estimates for each sample. Note that the
#' \code{'chaobunge'} method also includes an estimate of the standard error.
#'
#' @examples
#' data('contigs')
#' x <- clonoStats(contigs)
#' calculateDiversity(x)
#'
#' @export
setMethod(f = "calculateDiversity",
signature = signature(x = "clonoStats"),
definition = function(x, methods = c('all','nCells','nClonotypes',
'shannon', 'normentropy',
'invsimpson', 'ginisimpson',
'chao1', 'chaobunge'),
...){
methods <- match.arg(methods, several.ok = TRUE)
if('all' %in% methods){
methods <- c('nCells', 'nClonotypes', 'shannon',
'normentropy', 'invsimpson', 'ginisimpson',
'chao1', 'chaobunge')
}
# check if all counts are integers
ints <- all(clonoAbundance(x) %% 1 == 0)
if(!ints && any(c('chao1','chaobunge') %in% methods)){
warning('Methods "chao1" and "chaobunge" are not valid with ',
'non-integer abundances.')
}
p <- clonoAbundance(x)
f <- clonoFrequency(x)
# loop over methods
results <- lapply(methods, function(m){
if(m == 'nCells'){
return(vapply(seq_len(ncol(p)), function(j){
.nCells(p[,j])
}, FUN.VALUE = 0.0))
}
if(m == 'nClonotypes'){
return(vapply(seq_len(ncol(f)), function(j){
.nClonotypes(f[,j])
}, FUN.VALUE = 0.0))
}
if(m == 'shannon'){
return(vapply(seq_len(ncol(p)), function(j){
.shannon(p[,j])
}, FUN.VALUE = 0.0))
}
if(m == 'normentropy'){
return(vapply(seq_len(ncol(p)), function(j){
.normentropy(p[,j])
}, FUN.VALUE = 0.0))
}
if(m == 'invsimpson'){
return(vapply(seq_len(ncol(p)), function(j){
.invsimpson(p[,j])
}, FUN.VALUE = 0.0))
}
if(m == 'ginisimpson'){
return(vapply(seq_len(ncol(p)), function(j){
.ginisimpson(p[,j])
}, FUN.VALUE = 0.0))
}
if(m == 'chao1'){
return(vapply(seq_len(ncol(f)), function(j){
.chao1(f[,j])
}, FUN.VALUE = 0.0))
}
if(m == 'chaobunge'){
cb <- vapply(seq_len(ncol(f)), function(j){
.chaobunge(f[,j])
}, FUN.VALUE = rep(0.0,3))
rownames(cb) <- c('chaobunge.est','chaobunge.CI.lower',
'chaobunge.CI.upper')
return(cb)
}
})
names(results) <- methods
results <- do.call(rbind, results)
colnames(results) <- colnames(p)
return(results)
})
#' @rdname calculateDiversity
#' @importClassesFrom SingleCellExperiment SingleCellExperiment
#' @importFrom S4Vectors metadata
#' @export
setMethod(f = "calculateDiversity",
signature = signature(x = "SingleCellExperiment"),
definition = function(x, ...){
calculateDiversity(metadata(x)$clonoStats)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.