#' @importFrom stats t.test wilcox.test p.adjust median na.omit
NULL
#' @title Compute knn using the fnn algorithm
#'
#' @description This function is a wrapper around FNN package
#' functionality to speed up the KNN process. It uses KD trees as default,
#' along with k set to 100. Selection of K will vary based on the dataset.
#' See k.selection.R.
#' @param cell.df the cell data frame used as input
#' @param input.markers markers to be used as input for knn
#' @param k the number of nearest neighbors to identify
#' @return nn: list of 2, nn.index: index of knn (columns) for each cell (rows)
#' nn.dist: euclidean distance of each k-nearest neighbor
#' @examples
#' Fnn(wand.combined, input.markers)
#' @export
Fnn <- function(cell.df, input.markers, k = 100) {
message("finding k-nearest neighbors")
# Edge case (rflann with kd-tree doesn't have it)
if(k >= nrow(cell.df)) {
stop("k must be less than the total number of data points")
}
if(k < 1) {
stop("Please select k greater than zero")
}
input <- cell.df[,input.markers]
# Using the rflann package
#nn <- Neighbour(query = input, ref = input, k = k + 1)
# Using the FNN package
nn <- FNN::get.knn(data = input, k = k)
nn.index <- nn[[1]]
nn.dist <- nn[[2]]
message("k-nearest neighbors complete")
return(list(nn.index = nn.index, nn.dist = nn.dist))
}
#' @title Performs a series of statistical tests on the batch of cells
#' of interest.
#'
#' @description This function performs the statistics across the nearest
#' neighborhoods, and is one of the main workhorses within the scone.values
#' function
#'
#' @param basal tibble of cells corresponding to the unstimulated condition
#' @param stim a tibble of cells corresponding to the stimulated condition
#' @param fold a string that specifies the use of "median" or "mean" when
#' calculating fold change
#' @param stat.test a string that specifies Mann-Whitney U test (mwu) or T test
#' (t) for q value calculation
#' @param stim.name a string corresponding to the name of the stim being tested
#' compared to basal
#' @return result: a named vector corresponding to the results of the
#' "fold change" and mann-whitney u test
RunStatistics <- function(basal,
stim,
fold = "median",
stat.test = "mwu",
stim.name) {
# Edge case of a knn consisting of only one of the two conditions
# More common in messier datasets
if(nrow(basal) == 0 | nrow(stim) == 0) {
return(rep(NA, times = 2*ncol(basal) + 1))
}
# Fold change between unstim and stim (output is named vector)
if(fold == "median") {
fold <- apply(stim, 2, median) - apply(basal, 2, median)
} else if (fold == "mean") {
fold <- apply(stim, 2, mean) - apply(basal, 2, mean)
} else {
stop("please select median or mean to be used as input for raw
change calculation")
}
# Mann-Whitney U test or T test
if(stat.test == "mwu") {
qvalue <- vapply(seq_len(ncol(basal)), function(j) {
p <- wilcox.test(basal[[j]], stim[[j]])$p.value
return(p)
}, FUN.VALUE = double(1))
} else if (stat.test == 't') {
qvalue <- vapply(seq_len(ncol(basal)), function(j) {
p <- t.test(basal[[j]], stim[[j]])$p.value
return(p)
}, FUN.VALUE = double(1))
} else {
stop("please select Mann-Whitney U test (mwu) or T test (t) for input")
return()
}
# Naming the vectors
# qvalue is not yet a named vector, so its named here
names(qvalue) <- names(fold)
# specifying
names(qvalue) <- paste(names(qvalue), stim.name, "qvalue", sep = ".")
# specifying
names(fold) <- paste(names(fold), stim.name, "change", sep = ".")
# Get the unstim and stim thresholds done
fraction.cond2 <- nrow(stim)/sum(nrow(basal), nrow(stim))
names(fraction.cond2) <- paste(stim.name, "fraction.cond.2", sep = ".")
result <- c(qvalue, fold, fraction.cond2)
return(result)
}
#' @title Runs a t test on the medians or means of multiple donors for the
#' same condition
#'
#' @description This function is for the instance that multiple donors are
#' being compared against each other within the k-nearest neighborhood of
#' interest. The mean value of the markers of interest is calculated across
#' the donors, such that each data point for the subsequent t-test represents
#' a marker for a danor.
#' @param basal tibble that contains unstim for a knn including donor identity
#' @param stim tibble that contains stim for a knn including donor identity
#' @param stim.name string of the name of the current stim being tested
#' @param donors vector of strings corresponding to the designated names
#' of the donors
#' @return result: a named vector of p values (soon to be q values) from the
#' t test done on each marker
MultipleDonorStatistics <- function(basal, stim, stim.name, donors) {
# get the means of all the donors
basal.stats <- tibble()
stim.stats <- tibble()
for(i in donors) {
basal.curr <- basal[basal$donor == i,] %>%
.[!(colnames(.) %in% "donor")]
stim.curr <- stim[stim$donor == i,] %>%
.[!(colnames(.) %in% "donor")]
basal.mean <- apply(basal.curr, 2, mean)
stim.mean <- apply(stim.curr, 2, mean)
basal.stats <- rbind(basal.stats, basal.mean)
stim.stats <- rbind(stim.stats, stim.mean)
}
# assumes "donor" always placed at end!
colnames(basal.stats) <- colnames(basal)[-ncol(basal)]
colnames(stim.stats) <- colnames(stim)[-ncol(basal)]
# T testing (only if there's no NA in the dataset)
if(nrow(na.omit(basal.stats)) == length(donors) & # Watch the newline here
nrow(na.omit(stim.stats)) == length(donors)) {
result <- vapply(seq_len(ncol(basal.stats)), function(i) {
t.test(basal.stats[[i]], stim.stats[[i]])$p.value
}, FUN.VALUE = double(1))
} else {
result <- rep(NA, times = ncol(basal.stats))
}
names(result) <- paste(colnames(basal.stats),
stim.name,
"replicate.qvalue",
sep = ".")
return(result)
}
#' @title Corrects all p values for multiple hypotheses, sets threshold for
#' which change values should be reported
#'
#' @description Given the number of comparisons we make across k-nearest
#' neighborhoods, which is far more than that of disjoint subsetting, this
#' step is important given that there is an increased likelihood that some
#' statistically significant differences will occur by chance.
#' @param cells tibble of change values, p values, and fraction condition 2
#' @param threshold a q value below which the change values will be reported
#' for that cell for that param. If no change is desired, this is set to 1.
#' @return inputted p values, adjusted and therefore described as "q values"
QCorrectionThresholding <- function(cells, threshold) {
# Break apart the result
fold <- cells[,grep("change$", colnames(cells))]
qvalues <- cells[,grep("qvalue$", colnames(cells))]
ratio <- cells[,grep("cond2$", colnames(cells))]
# rest <- cells[,!(colnames(cells) %in% colnames(qvalues))]
# P value correction
qvalues <- apply(qvalues, 2, function(x) p.adjust(x, method = "BH")) %>%
as.tibble
# Thresholding the raw change
if(threshold < 1) {
names <- colnames(fold)
fold <- lapply(seq_len(ncol(fold)), function(i) {
curr <- fold[[i]]
curr <- ifelse(qvalues[[i]] < threshold, curr, 0)
}) %>% do.call(cbind, .) %>%
as.tibble()
colnames(fold) <- names
}
#Bring it all together
result <- bind_cols(qvalues, fold, ratio)
return(result)
}
#' @title Get the KNN density estimaion
#' @description Obtain a density estimation derived from the original manifold,
#' avoiding the lossiness of lower dimensional embeddings
#' @param nn.matrix A list of 2, where the first is a matrix of nn indices,
#' and the second is a matrix of nn distances
#' @return a vector where each element is the KNN-DE for that given cell,
#' ordered by row number, in the original input matrix of cells x features
#' @examples
#' ex.knn <- Fnn(wand.combined, input.markers, k = 30)
#' GetKnnDe(ex.knn)
#' @export
GetKnnDe <- function(nn.matrix) {
# Distances
nn.dist <- nn.matrix[[2]]
# KNN-DE
mean.dist <- apply(nn.dist, 1, mean)
density <- 1/mean.dist
return(density)
}
#' @title Make list of cells by features for each KNN member
#' @description Takes the KNN function output and the cell data, and
#' makes list where each element is a matrix of cells in the KNN and features.
#' @param cell.data tibble of cells by features
#' @param nn.matrix list of 2. First element is cells x 100 nearest neighbor
#' indices. Second element is cells x 100 nearest neighbor distances
#' @return a list where each element is the cell number from the
#' original cell.data tibble and a matrix of cells x feautures for its KNN
#' @examples
#' ex.knn <- Fnn(wand.combined, input.markers, k = 30)
#' knn.list <- MakeKnnList(wand.combined, ex.knn)
#' @export
MakeKnnList <- function(cell.data, nn.matrix) {
# Unpack the KNN output
nn.index <- nn.matrix[[1]]
# The list
knn.list <- lapply(seq_len(nrow(nn.index)), function(i) {
cell.data[nn.index[i,],]
})
return(knn.list)
}
#' @title Master function for per-knn statistics functionality, integrating the
#' other non-exported functions within this script.
#'
#' @description This function is run following the KNN computation
#' and respective cell grouping. The function also contains a progress ticker
#' that allows one to determine how much time left in this function.
#' @param nn.matrix a matrix of cell index by nearest neighbor index, with
#' values being cell index of the nth nearest neighbor
#' @param cell.data tibble of cells by features
#' @param scone.markers vector of all markers to be interrogated via
#' statistical testing
#' @param unstim an object (used so far: string, number)
#' specifying the "basal" condition
#' @param threshold a number indicating the p value the raw change should
#' be thresholded by.
#' @param fold a string that specifies the use of "median" or "mean" when
#' calculating fold change
#' @param stat.test string denoting Mann Whitney U test ("mwu") or T test ("t)
#' @param multiple.donor.compare a boolean that indicates whether t test
#' across multiple donors should be done
#' @return result: tibble of raw changes and p values for each feature of
#' interest, and fraction of cells with condition 2
#' @examples
#' ex.nn <- Fnn(wand.combined, input.markers)
#' SconeValues(ex.nn, wand.combined, funct.markers, "basal")
#' @export
SconeValues <- function(nn.matrix,
cell.data,
scone.markers,
unstim,
threshold = 0.05,
fold = "median",
stat.test = "mwu",
multiple.donor.compare = FALSE) {
message("running per-knn statistics")
# Get the donor names if you need it
if(multiple.donor.compare == TRUE) {
if(is.na(match("donor", cell.data))) {
stop("Multiple donors not found. Please change
multiple.donor.compare to FALSE")
}
donors <- unique(cell.data$donor)
}
# Get all stim.names
conditions <- unique(cell.data$condition)
stims <- conditions[!(conditions %in% unstim)]
# Unpack the nn matrix
nn.dist <- nn.matrix[[2]]
nn.matrix <- nn.matrix[[1]] # overwrite to be used in rest of this
# Get the KNN density estimation (1 / avg distance to knn)
avg.dist <- apply(nn.dist, 1, mean)
knn.density <- 1/avg.dist
# Variables to be used for progress tracker within the loop
percent <- nrow(cell.data) %/% 10
final <- lapply(stims, function(s) {
# Process each column in the nn matrix
# This makes a list of vecors corresponding qvalue and fold change
count <- 0
result <- lapply(seq_len(nrow(nn.matrix)), function(i) {
# A tracker
if(i %% percent == 0) {
count <<- count + 10
message(paste(count, "percent complete", sep = " "))
}
# Index the nn matrix
curr <- cell.data[nn.matrix[i,],]
basal <- curr[curr$condition == unstim,] %>% .[,scone.markers]
stim <- curr[curr$condition == s,] %>% .[,scone.markers]
# Fold change cmoparison and Mann-Whitney U test,
# along with "fraction condition 2"
output <- RunStatistics(basal, stim, fold, stat.test, s)
# Note that this will overwrite the initial output (for now)
if(multiple.donor.compare == TRUE) {
nn.donors <- unique(curr$donor)
# Multiple donor testing used if all donors are in each knn
if(length(unique(nn.donors)) < length(donors)) {
# Check length
donor.output <- rep(NA, times = length(scone.markers))
} else {
basal.d <- curr[curr$condition == unstim,] %>%
.[,c(scone.markers, "donor")]
stim.d <- curr[curr$condition == s,] %>%
.[,c(scone.markers, "donor")]
donor.output <- MultipleDonorStatistics(basal.d,
stim.d,
s,
donors)
}
# Name the donor t test vector
# names(donor.output) <- paste(scone,
# s,
# "donor.t.test.qvalue",
# sep = ".")
output <- c(output, donor.output)
}
return(output)
})
# Melt the list together into a tibble and return it
result <- do.call(rbind, result) %>%
as.tibble()
return(result)
})
# Returning an error message for the Zunder dataset,
# but not the Wanderlust dataset
final <- do.call(cbind, final) %>%
as.tibble()
# Do the p value correction, followed by thresholding if you're going to
frac <- final[, grep("fraction", colnames(final))]
final <- QCorrectionThresholding(final, threshold)
final <- bind_cols(final, frac)
# Add the density estimation
final$density <- knn.density
message("per-knn statistics complete")
# Change the only "character" column to a numeric column
return(final)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.