#Other metrics
#' entropy
#'
#' @param sce \code{SingleCellExperiment} object, with the integrated data.
#' @param group Character. Name of group/batch variable.
#' Needs to be one of \code{names(colData(sce))}.
#' @param k Numeric. Number of k-nearest neighbours (knn) to use.
#' @param dim_red Character. Name of embeddings to use as subspace for distance
#' distributions. Default is "PCA".
#' @param assay_name Character. Name of the assay to use for PCA.
#' Only relevant if no existing 'dim_red' is provided.
#' Must be one of \code{names(assays(sce))}. Default is "logcounts".
#' @param n_dim Numeric. Number of dimensions to include to define the subspace.
#' @param res_name Character. Appendix of the result score's name
#' (e.g. method used to combine batches).
#'
#' @details The entropy function calculates the Shannon entropy of the group
#' variable within each cell's k-nearest neighbourhood.
#' For balanced batches a Shannon entropy close to 1 indicates high randomness
#' and mixing. For unbalanced batches entropy should be interpreted with
#' caution, but could work as a relative measure in a comparative setting.
#'
#' @return A \code{SingleCellExperiment} with the entropy score within colData.
#' @export
#'
#' @examples
#' library(SingleCellExperiment)
#' sim_list <- readRDS(system.file("extdata/sim50.rds", package = "CellMixS"))
#' sce <- sim_list[[1]][, c(1:15, 400:420, 16:30)]
#'
#' sce <- entropy(sce, "batch", k = 20)
#'
#' @importFrom SingleCellExperiment colData
#' @importFrom BiocNeighbors findKNN
#' @importFrom magrittr %>% set_rownames set_colnames
#' @importFrom purrr %>% map
#' @importFrom BiocGenerics table
#' @importFrom methods is
#' @importFrom SummarizedExperiment colData<-
entropy <- function(sce, group, k, dim_red = "PCA", assay_name = "logcounts",
n_dim = 10, res_name = NULL){
#------------------Check input parameter ---------------------------------#
if( !is(sce, "SingleCellExperiment") ){
stop("Error: 'sce' must be a 'SingleCellExperiment' object.")
}
if( !group %in% names(colData(sce)) ){
stop("Error: 'group' variable must be in 'colData(sce)'")
}
if( !is(colData(sce)[,group], "factor") ){
sce[[group]] <- as.factor(colData(sce)[, group])
}
colData(sce)[, group] <- droplevels(colData(sce)[, group])
if( k >= ncol(sce) ){
warning("'k' exceeds number of cells. Is set to max (all cells).")
k <- ncol(sce) - 1
}
#----------------- determine knn matrix ----------------------------------#
cell_names <- colnames(sce)
names(cell_names) <- cell_names
subspace <- .defineSubspace(sce, assay_name, dim_red, n_dim)
#determine knn
knn <- findKNN(subspace, k = k) %>% map(set_rownames, cell_names)
#group assignment of knn cells for each cell
knn[[group]] <- matrix(
colData(sce)[as.numeric(as.character(knn$index)), group],
nrow = nrow(knn$index)) %>%
set_rownames(cell_names)
n_batch <- length(levels(colData(sce)[,group]))
#----------------- calculate entropy ----------------------------------#
entropy <- apply(knn[[group]], 1, function(x){
freq_batch = table(x)/length(x)
freq_batch_positive = freq_batch[freq_batch > 0]
shannon <- -sum(freq_batch_positive * log(freq_batch_positive))/
log(n_batch)
})
#Add to colData of sce
entropy <- as.data.frame(entropy) %>% set_colnames("entropy")
if( !is.null(res_name) ){
entropy <- as.data.frame(entropy) %>% set_colnames(res_name)
}
stopifnot(all(rownames(entropy) == colnames(sce)))
colData(sce) <- cbind(colData(sce), entropy)
colData(sce)[,colnames(entropy)] <- entropy
sce
}
#' isi
#'
#' @param sce \code{SingleCellExperiment} object, with the integrated data.
#' @param group Character. Name of group/batch variable.
#' Needs to be one of \code{names(colData(sce))}.
#' @param k Numeric. Number of k-nearest neighbours (knn) to use.
#' @param dim_red Character. Name of embeddings to use as subspace for distance
#' distributions. Default is "PCA".
#' @param assay_name Character. Name of the assay to use for PCA.
#' Only relevant if no existing 'dim_red' is provided.
#' @param n_dim Numeric. Number of dimensions to include to define the subspace.
#' @param weight Boolean. If TRUE, batch probabilities to calculate the isi
#' score are weighted by the mean distance of their cells towards the cell
#' of interest. Relevant for metrics: 'isi'.
#' @param res_name Character. Appendix of the result score's name
#' (e.g. method used to combine batches).
#'
#'@details The isi function calculates the inverse Simpson index of the group
#' variable within each cell's k-nearest neighbourhood.
#' The Simpson index describes the probability that two entities are taken at
#' random from the dataset and its inverse represent the effective number of
#' batches in a neighbourhood. The inverse Simpson index has been proposed as a
#' diversity score for batch mixing in single cell RNAseq by Korunsky et al.
#' They provide a distance-based neighbourhood weightening in their Lisi package.
#' Here, we provide a simplified way of weightening probabilitities, if the
#' \code{weight} argument is enabled.
#' @return A \code{SingleCellExperiment} with the entropy score within colData.
#' @export
#'
#' @references
#' Korsunsky I Fan J Slowikowski K Zhang F Wei K et. al. (2018).
#' Fast, sensitive, and accurate integration of single cell data with Harmony.
#' bioRxiv (preprint)
#'
#' @examples
#' library(SingleCellExperiment)
#' sim_list <- readRDS(system.file("extdata/sim50.rds", package = "CellMixS"))
#' sce <- sim_list[[1]][, c(1:15, 400:420, 16:30)]
#'
#' sce <- isi(sce, "batch", k = 20)
#' @importFrom SingleCellExperiment colData
#' @importFrom BiocNeighbors findKNN
#' @importFrom magrittr %>% set_rownames set_colnames set_names
#' @importFrom purrr %>% map
#' @importFrom BiocGenerics table
#' @importFrom methods is
#' @importFrom SummarizedExperiment colData<-
#' @importFrom dplyr bind_rows
isi <- function(sce, group, k, dim_red = "PCA", assay_name = "logcounts",
n_dim = 10, weight = TRUE, res_name = NULL){
#------------------Check input parameter ---------------------------------#
if( !is(sce, "SingleCellExperiment") ){
stop("Error: 'sce' must be a 'SingleCellExperiment' object.")
}
if( !group %in% names(colData(sce)) ){
stop("Error: 'group' variable must be in 'colData(sce)'")
}
if( !is(colData(sce)[,group], "factor") ){
sce[[group]] <- as.factor(colData(sce)[, group])
}
colData(sce)[, group] <- droplevels(colData(sce)[, group])
if( k >= ncol(sce) ){
warning("'k' exceeds number of cells. Is set to max (all cells).")
k <- ncol(sce) - 1
}
#----------------- determine knn matrix ----------------------------------#
cell_names <- colnames(sce)
names(cell_names) <- cell_names
batch_ids <- levels(colData(sce)[,group])
subspace <- .defineSubspace(sce, assay_name, dim_red, n_dim)
#determine knn
knn <- findKNN(subspace, k = k) %>% map(set_rownames, cell_names)
# group assignment of knn cells for each cell
knn[[group]] <- matrix(
colData(sce)[as.numeric(as.character(knn$index)), group],
nrow = nrow(knn$index)) %>%
set_rownames(cell_names)
#----------------- calculate Inverse simpson index -----------------------#
#We simplify the LISI score implemented in harmony by using a
#fixed neighbourhood of knn.
knn[["p"]] <- lapply(batch_ids, function(batch){
idx <- apply(knn[[group]], 1, function(cell){
id <- which(cell %in% batch)
})
if( length(idx) == 0 ){
p <- rep(0, length(cell_names)) %>% set_names(cell_names)
weights <- rep(0, length(cell_names)) %>% set_names(cell_names)
summary <- cbind(p, weights)
}else{
p <- lapply(cell_names, function(cell){
if( length(idx[[cell]]) == 0 ){
summary <- c(0, 0)
}else{
p_1 <- length(idx[[cell]])/k
p_cell <- p_1
weights <- 1/(knn[[2]][cell, idx[[cell]]] + 1)
summary <- c(p_cell, mean(weights))
}
}) %>% bind_cols() %>% t() %>% set_colnames(c("p", "weights"))
}
}) %>% set_names(batch_ids)
#Get distance based weights
knn[["weights"]] <- do.call("cbind", lapply(batch_ids, function(batch){
weights <- knn[["p"]][[batch]][,"weights"]
})) %>% set_colnames(batch_ids)
knn[["weights"]] <- knn[["weights"]]/rowSums(knn[["weights"]]) *
length(batch_ids)
#Calculate p^2
knn[["p2"]] <- do.call("cbind", lapply(batch_ids, function(batch){
p <- knn[["p"]][[batch]][,"p"]
})) %>% set_colnames(batch_ids)
if( weight ){
knn[["p2"]] <-(knn[["p2"]] * knn[["weights"]])^2
}else{
knn[["p2"]] <- knn[["p2"]]^2
}
rownames(knn[["p2"]]) <- cell_names
#Inverse Simpson
knn[["in_simpson"]] <- 1/rowSums(knn[["p2"]])
#Add to colData of sce
in_simpson <- as.data.frame(knn[["in_simpson"]]) %>% set_colnames("isi")
if( !is.null(res_name) ){
in_simpson <- in_simpson %>% set_colnames(res_name)
}
stopifnot(all(rownames(in_simpson) == colnames(sce)))
colData(sce)[,colnames(in_simpson)] <- in_simpson
sce
}
#' mixMetric
#'
#' @param sce \code{SingleCellExperiment} object, with the integrated data.
#' @param group Character. Name of group/batch variable.
#' Needs to be one of \code{names(colData(sce))}.
#' @param k Numeric. Number of k-nearest neighbours (knn) to use.
#' @param dim_red Character. Name of embeddings to use as subspace for distance
#' distributions. Default is "PCA".
#' @param assay_name Character. Name of the assay to use for PCA.
#' Only relevant if no existing 'dim_red' is provided.
#' @param n_dim Numeric. Number of dimensions to include to define the subspace.
#' @param k_pos Position of the cell, which rank to use for scoring,
#' defaults to 5.
#' @param res_name Character. Appendix of the result score's name
#' (e.g. method used to combine batches).
#'
#'@details The mixMetric function implements the mixingMetric function from
#'Seurat (See \code{\link[Seurat]{MixingMetric}}. It takes the median rank of
#'the '__k_pos__ neighbour from each batch as estimation for the data's entropy
#'according to the batch variable. The same result can be assesed using the
#' \code{\link[Seurat]{MixingMetric}} function and a seurat object from the
#' __Seurat__ package.
#'
#' @return A \code{SingleCellExperiment} with the mixing metric within colData.
#' @export
#'
#' @references
#' Stuart T Butler A Hoffman P Hafemeister C Papalexi E et. al. (2019)
#' Comprehensive Integration of Single-Cell Data.
#' Cell.
#'
#' @examples
#' library(SingleCellExperiment)
#' sim_list <- readRDS(system.file("extdata/sim50.rds", package = "CellMixS"))
#' sce <- sim_list[[1]][, c(1:15, 400:420, 16:30)]
#'
#' sce <- mixMetric(sce, "batch", k = 20)
#' @importFrom SingleCellExperiment colData
#' @importFrom BiocNeighbors findKNN
#' @importFrom magrittr %>% set_rownames set_colnames set_names
#' @importFrom stats median
#' @importFrom purrr %>% map
#' @importFrom methods is
#' @importFrom SummarizedExperiment colData<-
#' @importFrom dplyr bind_rows
mixMetric <- function(sce, group, k = 300, dim_red = "PCA",
assay_name = "logcounts", n_dim = 10, k_pos = 5,
res_name = NULL){
#------------------Check input parameter ---------------------------------#
if( !is(sce, "SingleCellExperiment") ){
stop("Error: 'sce' must be a 'SingleCellExperiment' object.")
}
if( !group %in% names(colData(sce)) ){
stop("Error: 'group' variable must be in 'colData(sce)'")
}
if( !is(colData(sce)[,group], "factor") ){
sce[[group]] <- as.factor(colData(sce)[, group])
}
colData(sce)[, group] <- droplevels(colData(sce)[, group])
if( k >= ncol(sce) ){
warning("'k' exceeds number of cells. Is set to max (all cells).")
k <- ncol(sce) - 1
}
#----------------- determine knn matrix ----------------------------------#
cell_names <- colnames(sce)
names(cell_names) <- cell_names
batch_ids <- levels(colData(sce)[,group])
subspace <- .defineSubspace(sce, assay_name, dim_red, n_dim)
#determine knn
knn <- findKNN(subspace, k = k) %>% map(set_rownames, cell_names)
# group assignment of knn cells for each cell
knn[[group]] <- matrix(
colData(sce)[as.numeric(as.character(knn$index)), group],
nrow = nrow(knn$index)) %>%
set_rownames(cell_names)
#----------------- calculate mixing metric -----------------------#
#We implement seurat's mixing metric to prevent dependencies on seurat.
#Get the position of the 5th neighbour within each batch
#Add cell itself to knn list (to reproduce seurat's results -
#reflects not how mm is actually described)
knn[["group_mod"]] <- cbind(colData(sce)[,group], knn[[group]])
knn[["pos"]] <- lapply(batch_ids, function(batch){
idx <- apply(knn[["group_mod"]], 1, function(cell){
id <- which(cell %in% batch)
})
if( !is.list(idx) )
idx <- split(idx, col(idx))
names(idx) <- cell_names
if( length(idx) == 0 ){
pos <- rep(k, length(cell_names)) %>% set_names(cell_names)
}else{
pos <- lapply(cell_names, function(cell){
if( length(idx[[cell]]) < k_pos ){
pos <- k
}else{
pos <-idx[[cell]][k_pos]
}
})
}
}) %>% bind_rows() %>% t() %>% set_colnames(batch_ids)
#Get median
knn[["mm"]] <- lapply(cell_names, function(cell){
cell_med <- median(knn[["pos"]][cell,])
}) %>% bind_rows() %>% t() %>% set_rownames(cell_names) %>%
set_colnames("median")
#Add to colData of sce
mm <- as.data.frame(knn[["mm"]]) %>% set_colnames("mm")
if( !is.null(res_name) ){
mm <- mm %>% set_colnames(res_name)
}
stopifnot(all(rownames(mm) == colnames(sce)))
colData(sce)[,colnames(mm)] <- mm
sce
}
#' locStructure
#'
#' @param sce \code{SingleCellExperiment} object, with the integrated data.
#' @param group Character. Name of group/batch variable.
#' Needs to be one of \code{names(colData(sce))}.
#' @param dim_combined Charactyer. Name of the reduced dimensional
#' representation of the integrated data.
#' Needs to be one of \code{reducedDimNames(sce))}.
#' @param k Numeric. Number of k-nearest neighbours (knn) to use.
#' @param dim_red Character. Name of embeddings to calculate neighbourhoods
#' before integration. Default is "PCA".
#' @param assay_name Character. Name of the assay to use for PCA of the original
#' (not integrated) data. Should not refer to "corrected" counts.
#' @param n_dim Numeric. Number of dimensions to include for the original data.
#' @param n_combined Numeric. Number of dimensions to include for the
#' integrated data.
#' @param res_name Character. Appendix of the result score's name
#' (e.g. method used to combine batches).
#'
#'@details The locStructure function implements the localStructure function
#'from Seurat (See \code{\link[Seurat]{LocalStruct}}. For each group it
#'calculates the k nearest neighbour within PCA space before integration and
#'compares it to the knn within the reduced dimensional representation after
#'integration. The score represents the proportion of overlapping neighbours.
#'The \code{\link[Seurat]{LocalStruct}} function is based on the
#'\code{\link[Seurat]{RunPCA}} function, while here \code{\link[scater]{runPCA}}
#'is used. This can cause small deviance from the
#'\code{\link[Seurat]{LocalStruct}} function, but overall these functions are
#' equivalent.
#'
#' @return A \code{SingleCellExperiment} with the mixing metric within colData.
#' @export
#'
#' @references
#' Stuart T Butler A Hoffman P Hafemeister C Papalexi E et. al. (2019)
#' Comprehensive Integration of Single-Cell Data.
#' Cell.
#'
#' @examples
#' library(SingleCellExperiment)
#' sim_list <- readRDS(system.file("extdata/sim50.rds", package = "CellMixS"))
#' sce <- sim_list[["batch20"]][, c(1:50, 300:350)]
#'
#' sce <- locStructure(sce, "batch", "MNN", k = 20, assay_name = "counts")
#' @importFrom SingleCellExperiment colData
#' @importFrom BiocNeighbors findKNN
#' @importFrom magrittr %>% set_rownames set_colnames set_names
#' @importFrom purrr %>% map
#' @importFrom methods is
#' @importFrom SummarizedExperiment colData<-
#' @importFrom dplyr bind_rows
#' @importFrom scater runPCA
locStructure <- function(sce, group, dim_combined, k = 100, dim_red = "PCA",
assay_name = "logcounts", n_dim = 10, n_combined = 10,
res_name = NULL){
#------------------Check input parameter ---------------------------------#
if( !is(sce, "SingleCellExperiment") ){
stop("Error: 'sce' must be a 'SingleCellExperiment' object.")
}
if( !group %in% names(colData(sce)) ){
stop("Error: 'group' variable must be in 'colData(sce)'")
}
if( k > min(table(droplevels(colData(sce)[, group]))) ){
stop("'k' exceeds number of cells/batch.")
}
if( !assay_name %in% names(assays(sce)) ){
stop("Error: 'assay_name' not found.")
}
if( !dim_combined %in% reducedDimNames(sce) ){
stop("Error: 'dim_combined' not found.")
}
if( ncol(reducedDims(sce)[[dim_combined]]) < n_combined ){
stop("Error: 'n_combined' exceeds the dimensions of 'dim_combined'.")
}
if( !is(colData(sce)[,group], "factor") ){
sce[[group]] <- as.factor(colData(sce)[, group])
}
colData(sce)[, group] <- droplevels(colData(sce)[, group])
#----------------- determine subspace ----------------------------------#
cell_names <- colnames(sce)
names(cell_names) <- cell_names
batch_ids <- levels(colData(sce)[,group])
subspace_com <- reducedDims(sce)[[dim_combined]][, seq_len(n_combined)]
#----------------- calculate local Structure -----------------------#
#We implement seurat's mixing metric to prevent dependencies on seurat.
#Overlap between knn_combined and knn within each group
gr_overlap <- lapply(batch_ids, function(batch){
cell_group <- colnames(sce[,colData(sce)[,group] == batch])
#calculate PCA on these batch cells only
sce_sub <- sce[,cell_group]
sce_sub <- runPCA(sce_sub, ncomponents = n_dim, scale = TRUE,
exprs_values = assay_name, ntop = 2000)
#filter subspace by group
subspace_b <- reducedDims(sce_sub)[["PCA"]]
subspace_com_b <- subspace_com[cell_group, ]
#determine knn
knn <- findKNN(subspace_b, k = k) %>%
map(set_rownames, rownames(subspace_b))
knn_combined <- findKNN(subspace_com_b, k = k) %>%
map(set_rownames, rownames(subspace_com_b))
#determine overlap
n_overlap <- vapply(cell_group, function(cell){
length(intersect(knn[["index"]][cell,],
knn_combined[["index"]][cell,]))/k
}, 1)
}) %>% unlist() %>% data.frame() %>% set_colnames("overlap")
#Add to colData of sce
gr_overlap <- as.data.frame(gr_overlap[cell_names,]) %>%
set_colnames("overlap") %>% set_rownames(cell_names)
if( !is.null(res_name) ){
gr_overlap <- gr_overlap %>% set_colnames(res_name)
}
stopifnot(all(rownames(gr_overlap) == colnames(sce)))
colData(sce)[,colnames(gr_overlap)] <- gr_overlap
sce
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.