## Calculate negative sum of NB log-likelihoods
#' @param input_parameter numeric vector of length 2 holding (mu, r): the NB
#' mean before size-factor scaling and the NB size parameter
#' @param x_vec a vector of x values as observations from NB distribution
#' @param sc_vec a vector of size factor s_c in NB(s_c*mu, r), matched
#' element-wise to x_vec
#' @return a single numeric value: minus the sum of the NB log-densities of
#' x_vec, i.e. an objective to be minimized (see nb_size_mle)
#' @importFrom stats dnbinom
#' @noRd
neg_nb_logsum <- function(input_parameter, x_vec, sc_vec){
  mu <- input_parameter[1]
  r <- input_parameter[2]
  ## each observation has its own mean s_c * mu but a shared size r
  -sum(dnbinom(x_vec, size = r, mu = sc_vec * mu, log = TRUE))
}
## Calculate MLE of NB distribution with size factor
## per cell-type per gene
##
## x_vec:  observed counts
## sc_vec: size factors matched element-wise to x_vec
## Returns list(param = c(mu, r), value = negative log-likelihood at param).
nb_size_mle <- function(x_vec, sc_vec){
  ## all-zero counts: skip the optimizer and evaluate the likelihood at a
  ## fixed near-zero mean
  if(sum(x_vec) == 0){
    zero_param <- c(0.001, 0.2)
    return(list(param = zero_param,
                value = neg_nb_logsum(zero_param, x_vec, sc_vec)))
  }
  avg <- mean(x_vec)
  s2 <- var(x_vec)
  init_mu <- avg
  ## moment-based starting value for r, kept inside [0.1, 50]; NA/NaN
  ## (e.g. a single observation, where var() is NA) falls back to 0.1
  tmp_init_r <- avg^2/(s2 - avg)
  init_r <- if(is.na(tmp_init_r) || tmp_init_r < 0.1){
    0.1
  } else{
    min(tmp_init_r, 50)
  }
  ## box-constrained minimization of the negative log-likelihood over (mu, r)
  nb_est <- nlminb(c(init_mu, init_r),
                   objective = neg_nb_logsum,
                   gradient = NULL,
                   lower = c(1e-6, 1e-6),
                   upper = c(1e3, 1e6),
                   x_vec = x_vec,
                   sc_vec = sc_vec)
  return(list(param = nb_est$par, value = nb_est$objective))
}
## Sum of negative NB log-likelihoods over all cell types, for one gene,
## single-batch case.
## The gene's counts and the cell size factors are split by
## `candidate_label`; each of the first `ncluster` groups is fitted with
## nb_size_mle() and the fitted objective values are accumulated.
## NOTE(review): the parameter list and the tail of this function are not
## visible in this view. The body reads `gvec`, `candidate_label`,
## `cell_size_factor` and `ncluster`, which are presumably the arguments --
## confirm against the complete file.
single_batch_one_gene_likelihood <- function(
## per-cluster counts and size factors, both split by the same factor so
## that element k of each list refers to the same cluster
gvec_list <- split(gvec, f = factor(candidate_label))
sc_list <- split(cell_size_factor, f = factor(candidate_label))
## running total of the per-cluster negative log-likelihoods
neg_llk_g <- 0
for(k in seq_len(ncluster)){
gvec_ct <- gvec_list[[k]]
sc_ct <- sc_list[[k]]
## `$value` is the objective value reported by the NB fit for cluster k
neg_llk_g <- neg_llk_g + nb_size_mle(gvec_ct, sc_ct)$value
## Sum of negative NB log-likelihoods over all cell types, for one gene,
## when cells come from several batches.
## For every cluster a batch-common NB fit is compared with the sum of
## batch-specific NB fits through a likelihood ratio test (LRT); the
## batch-specific value is used only when the LRT p-value is below
## `lrt_pval_threshold`.
## Returns a list with `NegLLK` (the accumulated negative log-likelihood)
## and `nReject` (the number of clusters whose LRT p-value is below the
## threshold).
## NOTE(review): part of the parameter list, the end of the call on the
## nb_size_mle() line inside the batch loop, and the closing braces are not
## visible in this view. The body also reads `candidate_label`, `ncluster`,
## `size_ct_list` and `batch_ct_list`, presumably arguments -- confirm
## against the complete file.
multi_batch_one_gene_likelihood <- function(gvec,
lrt_pval_threshold = 0.01){
gvec_ct_list <- split(gvec, f = factor(candidate_label))
## one p-value slot per cluster
lrt_test_pval <- numeric(ncluster)
total_nllk <- 0
for(k in seq_len(ncluster)){
cur_ct_gvec <- gvec_ct_list[[k]]
cur_ct_size <- size_ct_list[[k]]
cur_ct_batch <- batch_ct_list[[k]]
cur_ct_nbatch <- length(unique(cur_ct_batch))
## fit one NB for all batches within this cluster
common_nb_nllk <- nb_size_mle(cur_ct_gvec, cur_ct_size)$value
if(cur_ct_nbatch == 1){
## only one batch in this cluster: nothing to test, p-value recorded as 1
total_nllk <- total_nllk + common_nb_nllk
lrt_test_pval[k] <- 1
} else{
sep_nllk <- 0
gvec_ct_batch_list <- split(cur_ct_gvec, f = cur_ct_batch)
size_ct_batch_list <- split(cur_ct_size, f = cur_ct_batch)
## fit separate NB for each batch within this cluster
for(b in seq_len(cur_ct_nbatch)){
sep_nllk <- sep_nllk + nb_size_mle(gvec_ct_batch_list[[b]],
## likelihood ratio test (LRT) to decide which one to choose
## statistic: 2 * (common NLL - separate NLL); degrees of freedom:
## 2 * (number of batches - 1); upper-tail chi-square probability,
## rounded to 4 decimals before it is compared with the threshold
lrt_test_pval_cur_ct <- round(pchisq(2*(common_nb_nllk - sep_nllk),
2*(cur_ct_nbatch - 1),
lower.tail = FALSE), 4)
lrt_test_pval[k] <- lrt_test_pval_cur_ct
## keep the batch-specific fit only when the test rejects the common model
total_nllk <- total_nllk + ifelse(lrt_test_pval_cur_ct < lrt_pval_threshold,
sep_nllk, common_nb_nllk)
return(list(NegLLK = total_nllk, nReject = sum(lrt_test_pval < lrt_pval_threshold)))
## Calculate the CDI values for one label set
## Accumulates the negative NB log-likelihood over all genes for one
## candidate labeling and derives the AIC- and BIC-type penalized values.
## Two branches: a one-batch path (no batch labels, or a single distinct
## batch) and a multi-batch path that also counts, over genes, the clusters
## for which the batch-specific NB fits were selected.
## NOTE(review): several lines of this function are not visible in this view
## (part of the parameter list, closing braces, the function handed to
## bplapply() and the end of those calls, and the opening of the returned
## value). The body reads `sub_gcmat`, `candidate_label` and
## `cell_size_factor`, presumably arguments -- confirm against the complete
## file.
calculate_CDI_oneset <- function(
batch_label = NULL,
lrt_pval_threshold = 0.01){
## number of clusters before any filtering; this is what enters the penalty
original_ncluster <- length(unique(candidate_label))
#### One-batch scenario
if(is.null(batch_label) | (length(unique(batch_label)) == 1)){
## filter clusters with small number of cells
min_ncell <- min(table(candidate_label))
if(min_ncell < 3){
## indices of cells whose cluster contains more than 2 cells
sub_indx <- c(seq_len(length(candidate_label)))[(candidate_label %in% names(which(table(candidate_label) > 2)))]
candidate_label <- candidate_label[sub_indx]
## NOTE(review): no `drop = FALSE`; a one-row matrix would collapse to a
## vector here -- confirm single-gene input is not expected
sub_gcmat <- sub_gcmat[,sub_indx]
cell_size_factor <- cell_size_factor[sub_indx]
ng <- nrow(sub_gcmat); nc <- ncol(sub_gcmat)
## after filtering, it is possible ncluster < original_ncluster
## for fair comparison, use original_ncluster in penalty
ncluster <- length(unique(candidate_label))
## calculating log-likelihood
rownames(sub_gcmat) <- paste0("g", seq_len(ng))
## split() recycles the ng row names over the column-major matrix, so each
## list element holds the counts of one gene across the cells
sub_gclist <- split(sub_gcmat, f = rownames(sub_gcmat))
## one negative log-likelihood per gene, evaluated through bplapply()
neg_llk_list <- bplapply(sub_gclist,
candidate_label = candidate_label,
ncluster = ncluster,
cell_size_factor = cell_size_factor,
neg_llk <- sum(unlist(neg_llk_list))
## 2 parameters per gene per original cluster
npara <- ng * original_ncluster * 2
#### Multi-batch scenario
} else{
## label of every (cluster, batch) combination
combine_label <- paste0("ct_", candidate_label, "_b_", batch_label)
min_combine_ncell <- min(table(combine_label))
## filter clusters with small number of cells
if(min_combine_ncell < 3){
## indices of cells whose (cluster, batch) combination has more than 2 cells
sub_indx <- c(seq_len(length(combine_label)))[(combine_label %in% names(which(table(combine_label) > 2)))]
candidate_label <- candidate_label[sub_indx]
batch_label <- batch_label[sub_indx]
## NOTE(review): no `drop = FALSE`; see the same remark in the one-batch branch
sub_gcmat <- sub_gcmat[,sub_indx]
cell_size_factor <- cell_size_factor[sub_indx]
ng <- nrow(sub_gcmat); nc <- ncol(sub_gcmat)
## after filtering, it is possible ncluster < original_ncluster
## for fair comparison, use original_ncluster in penalty
ncluster <- length(unique(candidate_label))
rownames(sub_gcmat) <- paste0("g", seq_len(ng))
## batch labels and size factors grouped by cluster, shared by all genes
batch_ct_list <- split(batch_label, f = candidate_label)
size_ct_list <- split(cell_size_factor, f = candidate_label)
## one list element per gene (row names recycled over the matrix)
sub_gclist <- split(sub_gcmat, f = rownames(sub_gcmat))
neg_llk_list <- bplapply(sub_gclist,
candidate_label = candidate_label,
ncluster = ncluster,
batch_ct_list = batch_ct_list,
size_ct_list = size_ct_list,
lrt_pval_threshold = lrt_pval_threshold,
## every per-gene result carries `NegLLK` and `nReject`; both are summed
neg_llk <- sum(unlist(lapply(neg_llk_list, '[[', 'NegLLK')))
total_rej <- sum(unlist(lapply(neg_llk_list, '[[', 'nReject')))
## base parameter count plus one extra pair per rejected (gene, cluster) test
npara <- (ng * original_ncluster + total_rej) * 2
## AIC-type value: 2 * NLL + 2 * npara
## BIC-type value: 2 * NLL + npara * log(number of columns of sub_gcmat)
CDI_AIC = 2*neg_llk + 2*npara,
CDI_BIC = 2*neg_llk + npara*log(nc),
neg_llk_val = neg_llk,
N_cluster = original_ncluster))
#' Size factor of each cell
#' Different cells have different library sizes.
#' This function calculates the size factor of each cell in the UMI count matrix
#' to capture the variation in cell library size.
#' @param X The class of X can be "matrix", "Seurat" object, or "SingleCellExperiment" object.
#' If X is a matrix, it should be a raw UMI count matrix where each row represents a gene, and
#' each column represents a cell. The genes should be those before feature gene selection.
#' If X is a Seurat object or SingleCellExperiment object, users need to specify where the count
#' matrix is stored in count_slot.
#' @param count_slot A string indicating the location of raw UMI count.
#' For Seurat object, it is a slot in "RNA" of "assays";
#' For SingleCellExperiment object, it is a slot in "assays".
#' Each row represents a gene, and each column represents a cell.
#' The genes should be those before feature gene selection.
#' @importFrom matrixStats colMedians
#' @importFrom SingleCellExperiment SingleCellExperiment rowData colData
#' @importFrom SummarizedExperiment assays
#' @importFrom Seurat GetAssayData
#' @importFrom methods is
#' @return A numeric vector indicating the size factor of the cells.
#' This should be one of the inputs of the function calculate_CDI.
#' @examples
#' ng <- 100; nc <- 100
#' set.seed(1)
#' X <- cbind(
#' matrix(
#' c(rnbinom(ng*nc/4, size = 1, mu = 0.1),
#' rnbinom(ng*nc/4, size = 1, mu = 0.5)),
#' nrow = ng,
#' byrow = TRUE),
#' matrix(
#' c(rnbinom(ng*nc/4, size = 1, mu = 1),
#' rnbinom(ng*nc/4, size = 1, mu = 0.5)),
#' nrow = ng,
#' byrow = TRUE))
#' colnames(X) <- paste0('c', seq_len(nc))
#' rownames(X) <- paste0('g', seq_len(ng))
#' ## Input: matrix
#' cell_size <- size_factor(X = X)
#' ## Input: SingleCellExperiment object
#' library(SingleCellExperiment)
#' sim_sce <- SingleCellExperiment(
#' list(count = X),
#' colData = data.frame(Cell_name = colnames(X)),
#' rowData = data.frame(Gene_name = rownames(X)))
#' cell_size <- size_factor(X = sim_sce, count_slot = "count")
#' ## Input: Seurat object
#' library(Seurat)
#' library(SeuratObject)
#' sim_seurat <- CreateSeuratObject(counts = as.data.frame(X))
#' sim_seurat <- AddMetaData(sim_seurat, colnames(X), "Cell_name")
#' cell_size <- size_factor(X = sim_seurat, count_slot = "counts")
#' @export
size_factor <- function(
count_slot = NULL){
# NOTE(review): the first parameter and the end of this function are not
# visible in this view; the body reads `X` -- confirm against the complete file.
# extract counts
gcmat = extract_count(X, count_slot)
# calculate size factors
# zeros are replaced by 0.5 so that the logarithm below is finite
gcmat[gcmat == 0] <- 0.5
# NOTE(review): `nc` is not used in the lines visible here
nc <- ncol(gcmat)
log_gcmat <- log(gcmat)
# per-gene reference: exp of the mean log count, i.e. the geometric mean per row
ref_size <- exp(rowMeans(log_gcmat))
# divide every row (gene) by its reference value
ratio_to_ref <- sweep(gcmat, 1, ref_size, "/")
# size factor of a cell = median of its ratios over genes
cell_size_factor <- colMedians(ratio_to_ref)
#' Clustering Deviance Index (CDI)
#' This function calculates CDI-AIC and CDI-BIC for each candidate set of cell labels.
#' CDI calculates AIC and BIC of cell-type-specific gene-specific NB model for UMI counts,
#' where the cell types are based on each candidate label set,
#' and only the selected subset of genes are considered.
#' Whether to use CDI-AIC or CDI-BIC depends on the goal.
#' We suggest using CDI-BIC to select optimal main cell types and using CDI-AIC
#' to select optimal subtypes, because BIC puts more penalty on the complexity
#' of models (number of clusters).
#' @param X The class of X can be "matrix", "Seurat" object, or "SingleCellExperiment" object.
#' If X is a matrix, it should be a UMI count matrix where each row represents a gene, and
#' each column represents a cell.
#' If X is a Seurat object or SingleCellExperiment object,
#' users need to specify where the count matrix and
#' batch labels are stored in count_slot and batch_slot, respectively.
#' If feature_gene_index is NULL, genes in X should only include feature genes
#' (that are selected by feature_gene_selection function); if feature_gene_index
#' is not NULL, this function will extract a subset of X
#' with genes indexed by feature_gene_index.
#' @param feature_gene_index A vector of unique integers indicating the indices of feature
#' genes. The default value is NULL, which means all genes in X will be used to
#' calculate CDI. The integers in feature_gene_index need to be no greater than
#' the number of genes in X.
#' @param cand_lab_df A vector of cluster labels of the cells or
#' a data frame where each column corresponds to one set of cluster labels of
#' the cells. This (these) label sets can be clustering results obtained by
#' any clustering methods. The length (number of rows) of
#' cand_lab_df should be the same as the number of columns in
#' the count matrix.
#' If the column names of the label set data frame are provided in the format
#' "[ClusteringMethod]_k[NumberOfClusters]", such as "KMeans_K5", `calculate_CDI`
#' will extract the "[ClusteringMethod]" part as the Cluster_method.
#' The clustering method can also be provided in the
#' argument "clustering_method" for each label set.
#' @param cell_size_factor A numeric vector indicating the size factor
#' of the cells. This should be the output of function size_factor.
#' The length of cell_size_factor should be the same as the number of columns
#' in the count matrix.
#' @param batch_label A vector of characters indicating the batch labels of the cells.
#' The length of batch_label should be the same as the number of columns
#' in the count matrix.
#' @param count_slot A string indicating the location of raw UMI count.
#' For Seurat object, it is a slot in "RNA" of "assays";
#' For SingleCellExperiment object, it is a slot in "assays".
#' Each row represents a gene, and each column represents a cell.
#' The genes should be those before feature gene selection.
#' @param batch_slot A string indicating the location of batch labels of cells.
#' For Seurat object, it is a slot in meta.data;
#' For SingleCellExperiment object, it is a slot in "colData".
#' The default value is NULL indicating that there is no batch information available.
#' @param lrt_pval_threshold A numeric value within (0, 1) indicating
#' the p-value threshold for the likelihood ratio test (LRT). If multiple
#' batches exist, within each cluster and each gene, CDI will test whether
#' a batch-common NB model or a batch-specific NB model should be fitted
#' with the LRT. If the p-value is less than this threshold, a batch-specific
#' NB model will be fitted. Otherwise, a batch-common NB model will be fitted.
#' @param clustering_method A vector of characters indicating the corresponding clustering
#' method for each label set. The length of the vector needs to be the same
#' as the number of columns in cand_lab_df.
#' @param BPPARAM A \code{\link{BiocParallelParam}} object from the BiocParallel
#' package. By specifying this argument, users can control how the parallel
#' computing is performed. Default is \code{\link{SerialParam}} which uses a
#' single core.
#' @importFrom SingleCellExperiment SingleCellExperiment rowData colData
#' @importFrom SummarizedExperiment assays
#' @importFrom BiocParallel SerialParam bplapply
#' @importFrom stats nlminb pchisq var
#' @importFrom matrixStats rowMedians
#' @importFrom Seurat GetAssayData FetchData
#' @importFrom methods is
#' @return calculate_CDI returns a data frame with 6 columns. The columns are
#' Label_name (name of each label set), Cluster_method (clustering method), CDI_AIC,
#' CDI_BIC, neg_llk_val (negative log-likelihood), and N_cluster (number of clusters).
#' Each row corresponds to one set of cell labels.
#' @examples
#' ## Simulate count matrix, batch, and cell clustering labels
#' ng <- 100; nc <- 100
#' set.seed(1)
#' # count matrix
#' X <- cbind(
#' matrix(
#' c(rnbinom(ng*nc/4, size = 1, mu = 0.1),
#' rnbinom(ng*nc/4, size = 1, mu = 0.5)),
#' nrow = ng,
#' byrow = TRUE),
#' matrix(
#' c(rnbinom(ng*nc/4, size = 1, mu = 1),
#' rnbinom(ng*nc/4, size = 1, mu = 0.5)),
#' nrow = ng,
#' byrow = TRUE))
#' colnames(X) <- paste0('c', seq_len(nc))
#' rownames(X) <- paste0('g', seq_len(ng))
#' # batch label
#' Batches <- rep(seq_len(2), nc/2)
#' # cell clustering labels
#' Method1_k2 <- rep(seq_len(2), c(nc/2,nc/2))
#' Method1_k3 <- sample(seq_len(3), nc, replace = TRUE)
#' label_df <- data.frame(
#' Method1_k2 = Method1_k2,
#' Method1_k3 = Method1_k3)
#' ## select feature genes (see feature_gene_selection function)
#' selected_genes <- seq_len(30)
#' ## calculate size factor (see size_factor function)
#' size_factor_vec <- rep(1, nc)
#' calculate_CDI(
#' X = X[selected_genes, ],
#' cand_lab_df = label_df,
#' cell_size_factor = size_factor_vec,
#' batch_label = Batches)
#' ## Input: SingleCellExperiment object
#' library(SingleCellExperiment)
#' sim_sce <- SingleCellExperiment(
#' list(count = X),
#' colData = data.frame(
#' Cell_name = colnames(X),
#' batch = Batches),
#' rowData = data.frame(
#' Gene_name = rownames(X)))
#' calculate_CDI(
#' X = sim_sce,
#' feature_gene_index = selected_genes,
#' cand_lab_df = label_df,
#' cell_size_factor = size_factor_vec,
#' count_slot = "count",
#' batch_slot = "batch")
#' ## Input: Seurat object
#' library(Seurat)
#' library(SeuratObject)
#' sim_seurat <- CreateSeuratObject(counts = as.data.frame(X))
#' sim_seurat <- AddMetaData(sim_seurat, colnames(X), "Cell_name")
#' sim_seurat <- AddMetaData(sim_seurat, Batches, "batch")
#' calculate_CDI(
#' X = sim_seurat,
#' feature_gene_index = selected_genes,
#' cand_lab_df = label_df,
#' cell_size_factor = size_factor_vec,
#' count_slot = "counts",
#' batch_slot = "batch")
#' ## parallel computing
#' library(BiocParallel)
#' ## single core
#' bp_object <- SerialParam()
#' ## multi-cores
#' ## bp_object <- MulticoreParam(workers = 2)
#' calculate_CDI(
#' X = X[selected_genes, ],
#' cand_lab_df = label_df,
#' cell_size_factor = size_factor_vec,
#' batch_label = Batches,
#' lrt_pval_threshold = 0.01,
#' clustering_method = NULL,
#' BPPARAM = bp_object)
#' @references Martin Morgan, Valerie Obenchain, Michel Lang, Ryan
#' Thompson and Nitesh Turaga (2021).
#' \url{https://github.com/Bioconductor/BiocParallel}
#' @export
calculate_CDI <- function(
feature_gene_index = NULL,
batch_label = NULL,
count_slot = NULL,
batch_slot = NULL,
lrt_pval_threshold = 0.01,
clustering_method = NULL,
BPPARAM = SerialParam()){
# NOTE(review): several lines of this function are not visible in this view
# (some parameters, a few `if` headers / closing braces, the openings of two
# calls and the end of the body). The body reads `X`, `cand_lab_df` and
# `cell_size_factor`, presumably arguments -- confirm against the complete file.
# extract count and batch label
gcmat <- extract_count(X, count_slot)
batch_label <- extract_batch(X, batch_label, batch_slot)
# batch labels, when present, must provide one entry per column of the count matrix
if(!is.null(batch_label) & length(batch_label) != ncol(gcmat)){
stop("the length of batch_label does not match the number of cells (columns) in X.")
# feature gene indices must not exceed the number of rows of the count matrix
if(max(feature_gene_index) > nrow(gcmat)){
stop("feature_gene_index exceed the number of genes in X.")
} else{
# NOTE(review): no `drop = FALSE`; with a single index the row is returned
# as a vector and as.matrix() then yields a one-column matrix -- confirm
# that a single feature gene is not a supported input
sub_gcmat <- as.matrix(gcmat[feature_gene_index, ])
# if feature genes are not given, use all genes
} else{
sub_gcmat <- as.matrix(gcmat)
# the full inputs are no longer needed once the feature-gene matrix exists
rm(X, gcmat)
## if cand_lab_df is a vector or a data frame with one column
vec_1col_df <- ifelse(is.vector(cand_lab_df), TRUE, dim(cand_lab_df)[2] == 1)
# single label set: the labels are flattened with unlist() and passed on as
# `candidate_label` together with the shared inputs
sub_gcmat = sub_gcmat,
candidate_label = unlist(cand_lab_df),
batch_label = batch_label,
cell_size_factor = cell_size_factor,
lrt_pval_threshold = lrt_pval_threshold))
## if cand_lab_df is a data frame with more than one column
} else {
lab_name <- colnames(cand_lab_df)
# placeholder names "Label1", "Label2", ... are immediately replaced by the
# column names of cand_lab_df
cdi_return_df <- data.frame(Label_name = paste0("Label", seq_len(ncol(cand_lab_df))))
cdi_return_df["Label_name"] <- lab_name
# column names of the form <method>_k<digits> (case-insensitive) provide the
# clustering method as the text before the first underscore
cdi_return_df["Cluster_method"] <- ifelse(
grepl(pattern = "^(\\w+)(_k)(\\d+)$", x = lab_name, ignore.case = TRUE),
unlist(lapply(strsplit(lab_name, "_"), "[", 1)),
# user-supplied clustering methods replace the parsed ones
# NOTE(review): the condition guarding this assignment is not visible here
cdi_return_df["Cluster_method"] <- clustering_method
# evaluate calculate_CDI_oneset for the label sets in cand_lab_df
# NOTE(review): the MARGIN argument of apply() is not visible here
cdi_return <- apply(
X = cand_lab_df,
FUN = calculate_CDI_oneset,
sub_gcmat = sub_gcmat,
batch_label = batch_label,
cell_size_factor = cell_size_factor,
lrt_pval_threshold = lrt_pval_threshold)
# stack the per-label-set results into rows and append them to the label columns
tmp_df <- do.call(rbind.data.frame, cdi_return)
cdi_return_df <- cbind(cdi_return_df, tmp_df)
