#' @importFrom stats pchisq pnorm
#a wrapper for kBET to fix a neighbourhood size
scan_nb <- function(x,df,batch, knn) {
res <- kBET(df = df, batch = batch, k0 = x, knn = knn, testSize = NULL,
heuristic = FALSE, n_repeat = 10, alpha = 0.05,
addTest = FALSE, plot = FALSE, verbose = FALSE, adapt = FALSE)
result <- res$summary
result$kBET.observed[1]
}
#the residual score function of kBET
residual_score_batch <- function(knn.set, class.freq, batch) {
#knn.set: indices of nearest neighbours
#empirical frequencies in nn-environment (sample 1)
#ignore NA entries (which may arise from subsampling a knn-graph)
if (all(is.na(knn.set))){ #if all values of a neighbourhood are NA
return(NA)
}
else{
freq.env <- table(batch[knn.set[!is.na(knn.set)]])/length(!is.na(knn.set))
full.classes <- rep(0, length(class.freq$class))
full.classes[ class.freq$class %in% names(freq.env)] <- freq.env
exp.freqs <- class.freq$freq
#compute chi-square test statistics
sum((full.classes - exp.freqs)^2/exp.freqs)
}
}
#which batch has the largest deviance (and is underrepresented)
max_deviance_batch <- function(knn.set, class.freq, batch) {
#knn.set: indices of nearest neighbours
#empirical frequencies in nn-environment (sample 1)
if (all(is.na(knn.set))){ #if all values of a neighbourhood are NA
return(NA)
}
else{
freq.env <- table(batch[knn.set[!is.na(knn.set)]])/length(!is.na(knn.set))
full.classes <- rep(0, length(class.freq$class))
full.classes[ class.freq$class %in% names(freq.env)] <- freq.env
exp.freqs <- class.freq$freq
#compute chi-square test statistics
allScores <- (full.classes - exp.freqs)/exp.freqs
batch[which(allScores == min(allScores))]
}
}
#the core function of kBET
chi_batch_test <- function(knn.set, class.freq, batch, df) {
#knn.set: indices of nearest neighbours
#empirical frequencies in nn-environment (sample 1)
if (all(is.na(knn.set))){ #if all values of a neighbourhood are NA
return(NA)
}
else{
freq.env <- table(batch[knn.set[!is.na(knn.set)]])
full.classes <- rep(0, length(class.freq$class))
full.classes[ class.freq$class %in% names(freq.env)] <- freq.env
exp.freqs <- class.freq$freq*length(knn.set)
#compute chi-square test statistics
chi.sq.value <- sum((full.classes - exp.freqs)^2/exp.freqs)
result <- 1 - pchisq(chi.sq.value, df) #p-value for the result
if (is.na(result)) { #I actually would like to now when 'NA' arises.
return(NA)
} else {
result
}
}
}
lrt_approximation <- function(knn.set, class.freq, batch, df) {
#knn.set: indices of nearest neighbours
#empirical frequencies in nn-environment (sample 1)
if (all(is.na(knn.set))){ #if all values of a neighbourhood are NA
return(NA)
}
else{
obs.env <- table(batch[knn.set[!is.na(knn.set)]]) #observed realisations of each category
freq.env <- obs.env/sum(obs.env) #observed 'probabilities'
full.classes <- rep(0, length(class.freq$class))
obs.classes <- class.freq$class %in% names(freq.env)
#for stability issues (to avoid the secret division by 0): introduce
#another alternative model where the observed probability
#is either the empirical frequency or 1/(sample size) at minimum
if (length(full.classes) > sum(obs.classes)) {
dummy.count <- length(full.classes) - sum(obs.classes)
full.classes[obs.classes] <- obs.env / (sum(obs.env) + dummy.count)
pmin <- 1/(sum(obs.env) + dummy.count)
full.classes[!obs.classes] <- pmin
} else {
full.classes[ obs.classes] <- freq.env
}
exp.freqs <- class.freq$freq #expected 'probabilities'
#compute likelihood ratio of null and alternative hypothesis,
#test statistics converges to chi-square distribution
full.obs <- rep(0, length(class.freq$class))
full.obs[obs.classes] <- obs.env
lrt.value <- -2*sum(full.obs * log(exp.freqs/full.classes))
result <- 1 - pchisq(lrt.value, df) #p-value for the result
if (is.na(result)) { #I actually would like to now when 'NA' arises.
return(NA)
} else {
result
}
}
}
#truncated normal distribution distribution function
ptnorm <- function(x,mu,sd, a=0, b=1, alpha = 0.05, verbose = FALSE){
#this is the cumulative density of the truncated normal distribution
#x ~ N(mu, sd^2), but we condition on a <= x <= b
if (!is.na(x)){
if (a > b) {
warning("Lower and upper bound are interchanged.")
tmp <- a
a <- b
b <- tmp
}
if (sd <= 0 || is.na(sd)) {
if (verbose) {
warning("Standard deviation must be positive.")
}
if (alpha <= 0) {
stop("False positive rate alpha must be positive.")
}
sd <- alpha
}
if (x < a || x > b) {
warning("x out of bounds.")
cdf <- as.numeric(x > a)
} else {
alp <- pnorm((a - mu) / sd)
bet <- pnorm((b - mu) / sd)
zet <- pnorm((x - mu) / sd)
cdf <- (zet - alp) / (bet - alp)
}
cdf
} else {
return(NA)
}
}
#wrapper for the multinomial exact test function
multiNom <- function(x, y, z) {
z.f <- factor(z)
tmp <- multinomial.test(as.numeric(table(z.f[x])), y)
tmp$p.value
}
#significance test for pcRegression (two levels)
correlate.fun_two <- function(rot.data, batch, batch.levels) {
#rot.data: some vector (numeric entries)
#batch: some vector (categoric entries)
a <- lm(rot.data ~ batch)
result <- numeric(2)
result[1] <- summary(a)$r.squared #coefficient of determination
result[2] <- summary(a)$coefficients[2,4] #p-value (significance level)
t.test.result <- t.test(rot.data[batch == batch.levels[1]],
rot.data[batch == batch.levels[2]], paired = FALSE)
result[3] <- t.test.result$p.value
result
}
#significance test for pcRegression (more than two levels)
correlate.fun_gen <- function(rot.data, batch){
#rot.data: some vector (numeric covariate)
#batch: some vector (categoric covariate)
a <- lm(rot.data ~ batch)
result <- numeric(2)
result[1] <- summary(a)$r.squared #coefficient of determination
F.test.result <- aov(rot.data ~ batch)
F.test.summary <- summary(F.test.result)
result[2] <- summary(a)$coefficients[2,4] #p-value (significance level)
result[3] <- F.test.summary[[1]]$'Pr(>F)'[1] #p-value of the one-way anova test
result
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.