Nothing
#' @title assessGrouping
#'
#' @description Assess grouping of samples assigned to the same category
#' relative to random.
#'
#' @param distances Data frame object with at least three columns where the
#' first three columns are sample 1 name, sample 2 name, and the distance
#' between them.
#' @param annotations Data frame object with at least two columns where the
#' first two columns are sample name and the category of the sample for
#' grouping. Sample names must match sample 1 and sample 2 names in distances
#' data frame.
#' @param measurement The measurement for comparison between cases and controls
#' and statistical analysis ("mean", "max", or "min). Default "mean"
#' @param output A string denoting what information will be returned: either a
#' list of test and control measurement distances ("measurements"), the p-value
#' of the Kolmogorov-Smirnov test comparing test and control distributions
#' ("KS.pvalue"), or a ggplot object plotting the test and control distributions
#' ("plot"). Default "KS.pvalue"
#' @param ctrl_iterations The number of iterations to test for the control
#' distribution; an integer. Default 10000.
#'
#' @return \item{output = "KS.pvalue"}{the p-value of the Kolmogorov-Smirnov
#' test comparing test and control distributions}
#' \item{output = "plot"}{a ggplot object plotting the test and control
#' distributions}
#' \item{output = "measurements"}{a list or test and control measurement
#' distances}
#'
#' @importFrom magrittr '%>%'
#' @importFrom dplyr group_by filter summarize
#' @importFrom stats ks.test
#' @importFrom ggplot2 ggplot aes stat_ecdf theme_classic theme ylab xlab
#'
#' @examples
#' ## create random distance data frame
#' dist<-expand.grid(letters, letters)
#' dist$distance<-rnorm(nrow(dist))
#' annot<-data.frame(sample<-letters, category<- rep(1:13, 2))
#' ## get KS p-value
#' assessGrouping(dist, annot)
#' ## get plot of test vs control distributions
#' assessGrouping(dist, annot,
#' output = "plot")
#'
#' @export
assessGrouping <- function(distances,
annotations,
measurement = "mean",
output = "KS.pvalue",
ctrl_iterations = 10000) {
if (measurement != "mean" & measurement !=
"max" & measurement != "min") {
stop("measurement argument must equal 'mean', 'max', or 'min'
for statistical analysis.")
}
if (output != "measurements" & output != "plot" & output != "KS.pvalue") {
stop("output argument must equal 'measurements', 'plot, or 'KS.pvalue")
}
if (ctrl_iterations < 20) {
stop("ctrl_iterations argument is too small; consider increasing
iteration count")
}
if (ncol(distances) < 3) {
stop("The first three columns of the distances data frame should be
sample 1 name, sample 2 name, and the distance between them")
}
if (ncol(annotations) < 2) {
stop("The first two columns of the annotations data frame should be
sample name and the category of the sample for grouping")
}
colnames(distances)[c(1, 2, 3)] <- c("Var1", "Var2", "dist")
colnames(annotations)[c(1, 2)] <- c("sample", "category")
# filter for distances with proper annotations
test_distances <- dplyr::filter(distances,
.data$Var1 %in% annotations$sample &
.data$Var2 %in% annotations$sample)
test_distances<-test_distances[!duplicated(t(apply(test_distances[,c(1, 2)],
1, sort))), ] # remove duplicate pairs
test_distances <- dplyr::filter(test_distances, .data$Var1 != .data$Var2)
test_categories <- dplyr::filter(annotations, sample %in% distances$Var1 |
sample %in% distances$Var2)
if (nrow(test_distances) < 2 | nrow(test_categories) < 2) {
stop("Sample names in the distance data frame do not match sample names
in annotations data frame.")
}
# get categories for which there is at least two samples
unique_categories <- test_categories %>%
dplyr::group_by(.data$category) %>%
dplyr::summarise(no_rows = length(.data$category),
.groups = "drop") %>% dplyr::filter(.data$no_rows > 1)
test_output <- lapply(unique_categories$category,
function(i) {
subset_samples <- dplyr::filter(test_categories,
.data$category == i)$sample %>%
as.character()
subset_distances <- dplyr::filter(test_distances,
.data$Var1 %in% subset_samples &
.data$Var2 %in% subset_samples)$dist
if (measurement == "mean") {
return(mean(subset_distances))
} else if (measurement == "max") {
return(max(subset_distances))
} else {
return(min(subset_distances))
}
}) %>% unlist()
ctrl_output <- lapply(seq_len(ctrl_iterations),
function(i) {
subset_distances <- sample(test_distances$dist,
sample(unique_categories$no_rows, 1))
if (measurement == "mean") {
return(mean(subset_distances))
} else if (measurement == "max") {
return(max(subset_distances))
} else {
return(min(subset_distances))
}
}) %>% unlist()
if (output == "measurements") {
return(list(test_output = test_output,
ctrl_output = ctrl_output))
} else if (output == "KS.pvalue") {
return(suppressWarnings(ks.test(test_output, ctrl_output,
alternative = "greater")$p.value))
} else if (output == "plot") {
group <- c(rep("ctrl", length(ctrl_output)),
rep("test", length(test_output)))
dat_plot <- data.frame(KSD = c(ctrl_output, test_output),
group = group)
plot <- ggplot(dat_plot, aes(x = .data$KSD, group = group,
color = group)) +
stat_ecdf(size = 1, na.rm = TRUE) +
theme_classic() + theme(legend.position = "top") +
ylab("cummulative distribution") +
xlab("distance") + theme(legend.position = NULL)
return(plot)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.