#' kBET - k-nearest neighbour batch effect test
#'
#' @description \code{kBET} runs a chi square test to evaluate
#' the probability of a batch effect.
#'
#' @param df dataset (rows: cells, columns: features)
#' @param batch batch id for each cell or a data frame with
#' both condition and replicates
#' @param k0 number of nearest neighbours to test on (neighbourhood size)
#' @param knn an n x k matrix of nearest neighbours for each cell (optional)
#' @param testSize number of data points to test,
#' (10 percent sample size default, but at least 25)
#' @param do.pca perform a pca prior to knn search? (defaults to TRUE)
#' @param dim.pca if do.pca=TRUE, choose the number of dimensions
#' to consider (defaults to 50)
#' @param heuristic compute an optimal neighbourhood size k
#' (defaults to TRUE)
#' @param n_repeat to create a statistics on batch estimates,
#' evaluate 'n_repeat' subsets
#' @param alpha significance level
#' @param adapt In some cases, a number of cells do not contribute
#' to any neighbourhood
#' and this may cause an imbalance in observed and expected batch
#' label frequencies.
#' Frequencies will be adapted if adapt=TRUE (default).
#' @param addTest perform an LRT-approximation to the multinomial
#' test AND a multinomial exact test (if appropriate)
#' @param plot if stats > 10, then a boxplot of the resulting
#' rejection rates is created
#' @param verbose displays stages of current computation (defaults to FALSE)
#' @return list object
#' \enumerate{
#' \item \code{summary} - a rejection rate for the data,
#' an expected rejection rate for random
#' labeling and the significance for the observed result
#' \item \code{results} - detailed list for each tested cells;
#' p-values for expected and observed label distribution
#' \item \code{average.pval} - significance level over the averaged
#' batch label distribution in all neighbourhoods
#' \item \code{stats} - extended test summary for every sample
#' \item \code{params} - list of input parameters and adapted parameters,
#' respectively
#' \item \code{outsider} - only shown if \code{adapt=TRUE}. List of samples
#' without mutual nearest neighbour: \itemize{
#' \item \code{index} - index of each outsider sample)
#' \item \code{categories} - tabularised labels of outsiders
#' \item \code{p.val} - Significance level of outsider batch label distribution
#' vs expected frequencies.
#' If the significance level is lower than \code{alpha},
#' expected frequencies will be adapted}
#' }
#' @return If the optimal neighbourhood size (k0) is smaller than 10, NA is returned.
#' @examples
#' batch <- rep(seq_len(10),each=20)
#' data <- matrix(rpois(n = 50000, lambda = 10)*rbinom(50000,1,prob=0.5), nrow=200)
#'
#' batch.estimate <- kBET(data,batch)
#' @importFrom FNN get.knn
#' @import ggplot2
#' @importFrom stats quantile pchisq
#' @importFrom RColorBrewer brewer.pal
#' @importFrom utils data
#' @include bisect.R
#' @include kBET-utils.R
#' @name kBET
#' @export
kBET <- function(
df, batch, k0 = NULL, knn = NULL,
testSize = NULL, do.pca = TRUE, dim.pca = 50,
heuristic = TRUE, n_repeat = 100,
alpha = 0.05, addTest = FALSE,
verbose = FALSE, plot = TRUE, adapt = TRUE
) {
#create a subsetting mode:
#if (is.data.frame(batch) && dim(batch)[2]>1) {
# cat('Complex study design detected.\n')
# design.names <- colnames(batch)
# cat(paste0('Subset by ', design.names[[2]], '\n'))
# bio <- unlist(unique(batch[[design.names[2]]]))
#drop subset batch
# new.batch <- base::subset(batch,
# batch[[2]]== bio[1])[,!design.names %in% design.names[[2]]]
# sapply(df[batch[[2]]==bio[1],], kBET, new.batch,
# k0, knn, testSize, heuristic,n_repeat, alpha, addTest, verbose)
#}
#preliminaries:
dof <- length(unique(batch)) - 1 #degrees of freedom
if (is.factor(batch)) {
batch <- droplevels(batch)
}
frequencies <- table(batch)/length(batch)
#get 3 different permutations of the batch label
batch.shuff <- replicate(3, batch[sample.int(length(batch))])
class.frequency <- data.frame(class = names(frequencies),
freq = as.numeric(frequencies))
dataset <- df
dim.dataset <- dim(dataset)
#check the feasibility of data input
if (dim.dataset[1] != length(batch) && dim.dataset[2] != length(batch)) {
stop("Input matrix and batch information do not match. Execution halted.")
}
if (dim.dataset[2] == length(batch) && dim.dataset[1] != length(batch)) {
if (verbose) {
cat('Input matrix has samples as columns. kBET needs samples as rows. Transposing...\n')
}
dataset <- t(dataset)
dim.dataset <- dim(dataset)
}
#check if the dataset is too small per se
if (dim.dataset[1]<=10){
if (verbose){
cat("Your dataset has less than 10 samples. Abort and return NA.\n")
}
return(NA)
}
stopifnot(class(n_repeat) == 'numeric', n_repeat > 0)
do_heuristic <- FALSE
if (is.null(k0) || k0 >= dim.dataset[1]) {
do_heuristic <- heuristic
if (!heuristic) {
#default environment size: quarter the size of the largest batch
k0 <- floor(mean(class.frequency$freq)*dim.dataset[1]/4)
} else {
#default environment size: three quarter the size of the largest batch
k0 <- floor(mean(class.frequency$freq)*dim.dataset[1]*0.75)
if (k0 < 10) {
if (verbose){
warning(
"Your dataset has too few samples to run a heuristic.\n",
"Return NA.\n",
"Please assign k0 and set heuristic=FALSE."
)
}
return(NA)
}
}
if (verbose) {
cat(paste0('Initial neighbourhood size is set to ', k0, '.\n'))
}
}
#if k0 was set by the user and is too small & we do not operate on a
#knn graph, abort
#the reason is that if we want to test kBET on knn graph data integration methods,
#we usually face small numbers of nearest neighbours.
if (k0 < 10 & is.null(knn)) {
if (verbose){
warning(
"Your dataset has too few samples to run a heuristic.\n",
"Return NA.\n",
"Please assign k0 and set heuristic=FALSE."
)
}
return(NA)
}
# find KNNs
if (is.null(knn)) {
if (!do.pca) {
if (verbose) {
cat('finding knns...')
tic <- proc.time()
}
#use the nearest neighbour index directly for further use in the package
knn <- get.knn(dataset, k = k0, algorithm = 'cover_tree')$nn.index
} else {
dim.comp <- min(dim.pca, dim.dataset[2])
if (verbose) {
cat('reducing dimensions with svd first...\n')
}
data.pca <- svd(x = dataset, nu = dim.comp, nv = 0)
if (verbose) {
cat('finding knns...')
tic <- proc.time()
}
knn <- get.knn(data.pca$u, k = k0, algorithm = 'cover_tree')
}
if (verbose) {
cat('done. Time:\n')
print(proc.time() - tic)
}
}
#backward compatibility for knn-graph
if(is(knn, "list")){
knn <- knn$nn.index
if (verbose){
cat('KNN input is a list, extracting nearest neighbour index.\n')
}
}
#set number of tests
if (is.null(testSize) || (floor(testSize) < 1 || dim.dataset[1] < testSize)) {
test.frac <- 0.1
testSize <- ceiling(dim.dataset[1]*test.frac)
if (testSize < 25 && dim.dataset[1] > 25) {
testSize <- 25
}
if (verbose) {
cat('Number of kBET tests is set to ')
cat(paste0(testSize, '.\n'))
}
}
#decide to adapt general frequencies
if (adapt) {
# idx.run <- sample.int(dim.dataset[1], size = min(2*testSize, dim.dataset[1]))
outsider <- which(!(seq_len(dim.dataset[1]) %in% knn[, seq_len(k0 - 1)]))
is.imbalanced <- FALSE #initialisation
p.out <- 1
#avoid unwanted things happen if length(outsider) == 0
if (length(outsider) > 0) {
p.out <- chi_batch_test(outsider, class.frequency, batch, dof)
if (!is.na(p.out)){
is.imbalanced <- p.out < alpha
if (is.imbalanced) {
new.frequencies <- table(batch[-outsider])/length(batch[-outsider])
new.class.frequency <- data.frame(class = names(new.frequencies),
freq = as.numeric(new.frequencies))
if (verbose) {
outs_percent <- round(length(outsider) / length(batch) * 100, 3)
cat(paste(
sprintf(
'There are %s cells (%s%%) that do not appear in any neighbourhood.',
length(outsider), outs_percent
),
'The expected frequencies for each category have been adapted.',
'Cell indexes are saved to result list.',
'', sep = '\n'
))
}
} else {
if (verbose) {
cat(paste0('No outsiders found.'))
}
}
} else{
if (verbose) {
cat(paste0('No outsiders found.'))
}
}
}
}
if (do_heuristic) {
#btw, when we bisect here that returns some interval with
#the optimal neihbourhood size
if (verbose) {
cat('Determining optimal neighbourhood size ...')
}
opt.k <- bisect(scan_nb, bounds = c(10, k0), known = NULL, dataset, batch, knn)
#result
if (length(opt.k) > 1) {
k0 <- opt.k[2]
if (verbose) {
cat(paste0('done.\nNew size of neighbourhood is set to ', k0, '.\n'))
}
} else {
if (verbose) {
cat(paste(
'done.',
'Heuristic did not change the neighbourhood.',
sprintf('If results appear inconclusive, change k0=%s.', k0),
'', sep = '\n'
))
}
}
}
#initialise result list
rejection <- list()
rejection$summary <- data.frame(
kBET.expected = numeric(4),
kBET.observed = numeric(4),
kBET.signif = numeric(4)
)
rejection$results <- data.frame(
tested = numeric(dim.dataset[1]),
kBET.pvalue.test = rep(0,dim.dataset[1]),
kBET.pvalue.null = rep(0, dim.dataset[1])
)
#get average residual score
env <- as.vector(cbind(knn[, seq_len(k0 - 1)], seq_len(dim.dataset[1])))
cf <- if (adapt && is.imbalanced) new.class.frequency else class.frequency
rejection$average.pval <- 1 - pchisq(k0 * residual_score_batch(env, cf, batch), dof)
#initialise intermediates
kBET.expected <- numeric(n_repeat)
kBET.observed <- numeric(n_repeat)
kBET.signif <- numeric(n_repeat)
if (addTest) {
#initialize result list
rejection$summary$lrt.expected <- numeric(4)
rejection$summary$lrt.observed <- numeric(4)
rejection$results$lrt.pvalue.test <- rep(0,dim.dataset[1])
rejection$results$lrt.pvalue.null <- rep(0, dim.dataset[1])
lrt.expected <- numeric(n_repeat)
lrt.observed <- numeric(n_repeat)
lrt.signif <- numeric(n_repeat)
#decide to perform exact test or not
if (choose(k0 + dof,dof) < 5e5 && k0 <= min(table(batch))) {
exact.expected <- numeric(n_repeat)
exact.observed <- numeric(n_repeat)
exact.signif <- numeric(n_repeat)
rejection$summary$exact.expected <- numeric(4)
rejection$summary$exact.observed <- numeric(4)
rejection$results$exact.pvalue.test <- rep(0, dim.dataset[1])
rejection$results$exact.pvalue.null <- rep(0, dim.dataset[1])
}
for (i in seq_len(n_repeat)) {
# choose a random sample from dataset (rows: samples, columns: features)
idx.runs <- sample.int(dim.dataset[1], size = testSize)
env <- cbind(knn[idx.runs, seq_len(k0 - 1)], idx.runs)
#env.rand <- t(sapply(rep(dim.dataset[1],testSize), sample.int, k0))
#perform test
if (adapt && is.imbalanced) {
p.val.test <- apply(env, 1, FUN = chi_batch_test,
new.class.frequency, batch, dof)
} else {
p.val.test <- apply(env, 1, FUN = chi_batch_test,
class.frequency, batch, dof)
}
is.rejected <- p.val.test < alpha
#p.val.test <- apply(env, 1, FUN = chi_batch_test, class.frequency,
#batch, dof)
p.val.test.null <- apply(apply(batch.shuff, 2,
function(x, freq, dof, envir) {
apply(envir, 1, FUN = chi_batch_test, freq, x, dof)},
class.frequency, dof, env), 1, mean, na.rm=TRUE)
#p.val.test.null <- apply(env.rand, 1, FUN = chi_batch_test,
#class.frequency, batch, dof)
#summarise test results
kBET.expected[i] <- sum(p.val.test.null < alpha,
na.rm=TRUE) / sum(!is.na(p.val.test.null))
kBET.observed[i] <- sum(is.rejected,na.rm=TRUE) / sum(!is.na(p.val.test))
#compute significance
kBET.signif[i] <-
1 - ptnorm(kBET.observed[i],
mu = kBET.expected[i],
sd = sqrt(kBET.expected[i] * (1 - kBET.expected[i]) / testSize),
alpha = alpha)
#assign results to result table
rejection$results$tested[idx.runs] <- 1
rejection$results$kBET.pvalue.test[idx.runs] <- p.val.test
rejection$results$kBET.pvalue.null[idx.runs] <- p.val.test.null
#compute likelihood-ratio test (approximation for multinomial exact test)
cf <- if (adapt && is.imbalanced) new.class.frequency else class.frequency
p.val.test.lrt <- apply(env, 1, FUN = lrt_approximation, cf, batch, dof)
p.val.test.lrt.null <- apply(apply(batch.shuff, 2,
function(x, freq, dof, envir) {
apply(envir, 1, FUN = lrt_approximation, freq, x, dof)},
class.frequency, dof, env), 1, mean, na.rm=TRUE)
lrt.expected[i] <-
sum(p.val.test.lrt.null < alpha, na.rm=TRUE) / sum(!is.na(p.val.test.lrt.null))
lrt.observed[i] <-
sum(p.val.test.lrt < alpha, na.rm=TRUE) / sum(!is.na(p.val.test.lrt))
lrt.signif[i] <-
1 - ptnorm(lrt.observed[i],
mu = lrt.expected[i],
sd = sqrt(lrt.expected[i] * (1 - lrt.expected[i]) / testSize),
alpha = alpha)
rejection$results$lrt.pvalue.test[idx.runs] <- p.val.test.lrt
rejection$results$lrt.pvalue.null[idx.runs] <- p.val.test.lrt.null
#exact result test consume a fairly high amount of computation time,
#as the exact test computes the probability of ALL
#possible configurations (under the assumption that all batches
#are large enough to 'imitate' sampling with replacement)
#For example: k0=33 and dof=5 yields 501942 possible
#choices and a computation time of several seconds (on a 16GB RAM machine)
if (exists(x = 'exact.observed')) {
if (adapt && is.imbalanced) {
p.val.test.exact <- apply(env, 1, multiNom,
new.class.frequency$freq, batch)
} else {
p.val.test.exact <- apply(env, 1, multiNom,
class.frequency$freq, batch)
}
p.val.test.exact.null <- apply(apply(batch.shuff, 2,
function(x, freq, envir) {
apply(envir, 1, FUN = multiNom, freq, x)},
class.frequency$freq, env), 1, mean, na.rm=TRUE)
# apply(env, 1, multiNom, class.frequency$freq, batch.shuff)
exact.expected[i] <- sum(p.val.test.exact.null < alpha, na.rm=TRUE)/testSize
exact.observed[i] <- sum(p.val.test.exact < alpha, na.rm=TRUE)/testSize
#compute the significance level for the number of rejected data points
exact.signif[i] <-
1 - ptnorm(exact.observed[i],
mu = exact.expected[i],
sd = sqrt(exact.expected[i] * (1 - exact.expected[i]) / testSize),
alpha = alpha)
#p-value distribution
rejection$results$exact.pvalue.test[idx.runs] <- p.val.test.exact
rejection$results$exact.pvalue.null[idx.runs] <- p.val.test.exact.null
}
}
if (n_repeat > 1) {
#summarize chi2-results
CI95 <- c(0.025,0.5,0.975)
rejection$summary$kBET.expected <- c(mean(kBET.expected, na.rm=TRUE) ,
quantile(kBET.expected, CI95,
na.rm=TRUE))
rownames(rejection$summary) <- c('mean', '2.5%', '50%', '97.5%')
rejection$summary$kBET.observed <- c(mean(kBET.observed,na.rm=TRUE) ,
quantile(kBET.observed, CI95,
na.rm=TRUE))
rejection$summary$kBET.signif <- c(mean(kBET.signif,na.rm=TRUE) ,
quantile(kBET.signif, CI95,
na.rm=TRUE))
#summarize lrt-results
rejection$summary$lrt.expected <- c(mean(lrt.expected,na.rm=TRUE) ,
quantile(lrt.expected, CI95,
na.rm=TRUE))
rejection$summary$lrt.observed <- c(mean(lrt.observed,na.rm=TRUE) ,
quantile(lrt.observed, CI95,
na.rm=TRUE))
rejection$summary$lrt.signif <- c(mean(lrt.signif,na.rm=TRUE) ,
quantile(lrt.signif, CI95,
na.rm=TRUE))
#summarize exact test results
if (exists(x = 'exact.observed')) {
rejection$summary$exact.expected <- c(mean(exact.expected,na.rm=TRUE) ,
quantile(exact.expected, CI95,
na.rm=TRUE))
rejection$summary$exact.observed <- c(mean(exact.observed,na.rm=TRUE) ,
quantile(exact.observed, CI95,
na.rm=TRUE))
rejection$summary$exact.signif <- c(mean(exact.signif,na.rm=TRUE) ,
quantile(exact.signif, CI95,
na.rm=TRUE))
}
if (n_repeat < 10) {
cat('Warning: The quantile computation for ')
cat(paste0(n_repeat))
cat(' subset results is not meaningful.')
}
if (plot && exists(x = 'exact.observed')) {
plot.data <- data.frame(class = rep(c('kBET', 'kBET (random)',
'lrt', 'lrt (random)',
'exact', 'exact (random)'),
each = n_repeat),
data = c(kBET.observed, kBET.expected,
lrt.observed, lrt.expected,
exact.observed, exact.expected))
g <- ggplot(plot.data, aes(class, data)) + geom_boxplot() +
theme_bw() + labs(x = 'Test', y = 'Rejection rate') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(g)
}
if (plot && !exists(x = 'exact.observed')) {
plot.data <- data.frame(class = rep(c('kBET', 'kBET (random)',
'lrt', 'lrt (random)'),
each = n_repeat),
data = c(kBET.observed, kBET.expected,
lrt.observed, lrt.expected))
g <- ggplot(plot.data, aes(class, data)) + geom_boxplot() +
theme_bw() + labs(x = 'Test', y = 'Rejection rate') +
scale_y_continuous(limits = c(0,1)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(g)
}
} else {#i.e. no n_repeat
rejection$summary$kBET.expected <- kBET.expected
rejection$summary$kBET.observed <- kBET.observed
rejection$summary$kBET.signif <- kBET.signif
if (addTest) {
rejection$summary$lrt.expected <- lrt.expected
rejection$summary$lrt.observed <- lrt.observed
rejection$summary$lrt.signif <- lrt.signif
if (exists(x = 'exact.observed')) {
rejection$summary$exact.expected <- exact.expected
rejection$summary$exact.observed <- exact.observed
rejection$summary$exact.signif <- exact.signif
}
}
}
} else {#kBET only
for (i in seq_len(n_repeat)) {
# choose a random sample from dataset
#(rows: samples, columns: parameters)
idx.runs <- sample.int(dim.dataset[1], size = testSize)
env <- cbind(knn[idx.runs,seq_len(k0 - 1)], idx.runs)
#perform test
cf <- if (adapt && is.imbalanced) new.class.frequency else class.frequency
p.val.test <- apply(env, 1, chi_batch_test, cf, batch, dof)
#print(dim(env))
is.rejected <- p.val.test < alpha
p.val.test.null <- apply(
batch.shuff, 2,
function(x) apply(env, 1, chi_batch_test, class.frequency, x, dof)
)
# p.val.test.null <- apply(env, 1, FUN = chi_batch_test,
#class.frequency, batch.shuff, dof)
#summarise test results
#kBET.expected[i] <- sum(p.val.test.null < alpha) / length(p.val.test.null)
kBET.expected[i] <- mean(apply(
p.val.test.null, 2,
function(x) sum(x < alpha, na.rm =TRUE) / sum(!is.na(x))
))
kBET.observed[i] <- sum(is.rejected,na.rm=TRUE) / sum(!is.na(p.val.test))
#compute significance
kBET.signif[i] <- 1 - ptnorm(
kBET.observed[i],
mu = kBET.expected[i],
sd = sqrt(kBET.expected[i] * (1 - kBET.expected[i]) / testSize),
alpha = alpha
)
#assign results to result table
rejection$results$tested[idx.runs] <- 1
rejection$results$kBET.pvalue.test[idx.runs] <- p.val.test
rejection$results$kBET.pvalue.null[idx.runs] <- rowMeans(p.val.test.null, na.rm=TRUE)
}
if (n_repeat > 1) {
#summarize chi2-results
CI95 <- c(0.025,0.5,0.975)
rejection$summary$kBET.expected <- c(mean(kBET.expected,na.rm=TRUE),
quantile(kBET.expected,
CI95,na.rm=TRUE))
rownames(rejection$summary) <- c('mean', '2.5%', '50%', '97.5%')
rejection$summary$kBET.observed <- c(mean(kBET.observed,na.rm=TRUE),
quantile(kBET.observed, CI95,
na.rm=TRUE))
rejection$summary$kBET.signif <- c(mean(kBET.signif,na.rm=TRUE),
quantile(kBET.signif, CI95,
na.rm=TRUE))
#return also n_repeat
rejection$stats$kBET.expected <- kBET.expected
rejection$stats$kBET.observed <- kBET.observed
rejection$stats$kBET.signif <- kBET.signif
if (n_repeat < 10) {
cat('Warning: The quantile computation for ')
cat(paste0(n_repeat))
cat(' subset results is not meaningful.')
}
if (plot) {
plot.data <- data.frame(class = rep(c('observed(kBET)',
'expected(random)'),
each = n_repeat),
data = c( kBET.observed,kBET.expected))
g <- ggplot(plot.data, aes(class, data)) +
geom_boxplot() + theme_bw() +
labs(x = 'Test', y = 'Rejection rate') +
scale_y_continuous(limits = c(0,1))
print(g)
}
} else {
rejection$summary$kBET.expected <- kBET.expected[1]
rejection$summary$kBET.observed <- kBET.observed[1]
rejection$summary$kBET.signif <- kBET.signif[1]
}
}
#collect parameters
rejection$params <- list()
rejection$params$k0 <- k0
rejection$params$testSize <- testSize
rejection$params$do.pca <- do.pca
rejection$params$dim.pca <- dim.pca
rejection$params$heuristic <- heuristic
rejection$params$n_repeat <- n_repeat
rejection$params$alpha <- alpha
rejection$params$addTest <- addTest
rejection$params$verbose <- verbose
rejection$params$plot <- plot
#add outsiders
if (adapt) {
rejection$outsider <- list()
rejection$outsider$index <- outsider
rejection$outsider$categories <- table(batch[outsider])
rejection$outsider$p.val <- p.out
}
rejection
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.