#' Generic multivariateAnalysisR function
#' @rdname multivariateAnalysisR
#'
#' @description Performs multivariate analysis across specified clusters in datasets
#' @import ggplot2 RColorBrewer
#'
#' @param significanceLevel double value for testing significance in ANOVA test
#' @param patternKeys list of strings indicating pattern subsets from seuratobj to be analyzed
#' @param seuratobj Seurat Object Data containing patternKeys in meta.data
#' @param dictionaries list of dictionaries indicating clusters to be compared
#' @param customNames list of custom names for clusters in corresponding order
#' @param exclusive boolean value for determining interpolation between params in clusters
#' @param exportFolder name of folder to store exported graphs and CSV files
#' @param ANOVAwidth width of ANOVA png
#' @param ANOVAheight height of ANOVA png
#' @param CIwidth width of CI png
#' @param CIheight height of CI png
#' @param CIspacing spacing between each CI in CI graph
#'
#' @return a sorted list of ANOVA and CI results; ANOVA and Confidence Intervals are visualized and exported in both PNG and CSV
#' @export
#'
multivariateAnalysisR <- function(
significanceLevel = 0.05, # double value for testing significance in ANOVA test
patternKeys, # list of strings indicating pattern subsets from seuratobj to be analyzed
seuratobj, # Seurat Object Data containing patternKeys in meta.data
dictionaries, # list of dictionaries indicating clusters to be compared
customNames = NULL, # list of custom names for clusters in corresponding order
exclusive = TRUE, # boolean value for determining interpolation between params in clusters
exportFolder = "", # name of folder to store exported graphs
ANOVAwidth = 1000, # width of ANOVA png
ANOVAheight = 1000, # height of CI png
CIwidth = 1000, # width of CI png
CIheight = 1000, # height of CI png
CIspacing = 1 # spacing between each CI in CI graph
){
getSpecificData <- function(key, seuratobj) {
expr <- parse(text = key)
result <- eval(expr, envir = seuratobj@meta.data)
return (result)
}
getIndividualIndices <- function(key, values, seuratobj) {
result <- c()
subset <- getSpecificData(key = key, seuratobj = seuratobj)
for (value in values) {
indices <- which(subset == value)
result <- unique(c(result, indices))
}
return (result)
}
getFinalIndices <- function(exclusive, seuratobj, dictionary) {
allLists <- list()
allListsIndices <- 1
for (key in names(dictionary)) {
values <- dictionary[[key]]
individualIndices <- getIndividualIndices(key = key, values = values, seuratobj = seuratobj)
allLists[[allListsIndices]] <- individualIndices
allListsIndices <- allListsIndices + 1
}
if (exclusive == TRUE) {
result <- Reduce(intersect, allLists)
} else {
result <- unique(unlist(allLists))
}
return (result)
}
getPatternFromIndices <- function(indices, patternKey, seuratobj) {
result <- getSpecificData(key = patternKey, seuratobj = seuratobj)
return (result[indices])
}
ANOVA_find_mean <- function(patternList) {
combined_vector <- unlist(patternList)
result <- mean(combined_vector)
return (result)
}
ANOVA_find_ssb <- function(patternList, totalMean) {
numGroups <- length(patternList)
result <- 0
for (i in 1 : numGroups) {
groupMean <- mean(patternList[[i]])
result <- result + length(patternList[[i]]) * (groupMean - totalMean)^2
}
return (result)
}
ANOVA_find_ssw <- function(patternList) {
groupMeans <- sapply(patternList, mean)
result <- 0
for (i in seq_along(patternList)) {
groupValues <- patternList[[i]]
groupMean <- groupMeans[i]
squaredDiff <- sum((groupValues - groupMean)^2)
result <- result + squaredDiff
}
return(result)
}
# ANOVA is one-way for this case, as we are representing different dictionaries as one big unidirectional change rather than a change across two independent factors
ANOVA <- function(significanceLevel = 0.05, patternKey, exclusive, seuratobj, dictionaries) {
patternList <- list()
patternListIndices <- 1
for (dictionary in dictionaries) {
dictIndices <- getFinalIndices(exclusive = exclusive, seuratobj = seuratobj, dictionary = dictionary)
dictPatterns <- getPatternFromIndices(indices = dictIndices, patternKey = patternKey, seuratobj = seuratobj)
patternList[[patternListIndices]] <- dictPatterns
patternListIndices <- patternListIndices + 1
}
# ANOVA analysis begins
# step 1: find total mean
totalMean <- ANOVA_find_mean(patternList)
# step 2: find sum of squares between groups (ssb)
ssb <- ANOVA_find_ssb(patternList, totalMean)
# step 3: find sum of squares within groups (ssw)
ssw <- ANOVA_find_ssw(patternList)
# step 4: find degrees of freedom (df) for between groups (dfB) and within groups (dfW)
k <- length(patternList)
N <- sum(sapply(patternList, length))
dfB <- k - 1
dfW <- N - k
# step 5: find mean square between groups (msb) and the mean square within groups (msw)
msb <- ssb / dfB
msw <- ssw / dfW
# step 6: find F-statistic, p-value, and boolean value for significance
FStats <- msb / msw
pValue <- 1 - pf(FStats, dfB, dfW)
isSignificant <- if (pValue < significanceLevel) TRUE else FALSE
# return result
return (c(FStats, pValue, isSignificant))
}
CI <- function(patternKey, exclusive, seuratobj, dictionary1, dictionary2) {
# for dictionary1
pattern1DictIndices <- getFinalIndices(exclusive = exclusive, seuratobj = seuratobj, dictionary = dictionary1)
pattern1DictPatterns <- getPatternFromIndices(indices = pattern1DictIndices, patternKey = patternKey, seuratobj = seuratobj)
# for dictionary2
pattern2DictIndices <- getFinalIndices(exclusive = exclusive, seuratobj = seuratobj, dictionary = dictionary2)
pattern2DictPatterns <- getPatternFromIndices(indices = pattern2DictIndices, patternKey = patternKey, seuratobj = seuratobj)
# CI analysis begins
# Welch’s t-test is used as it is designed to be more reliable when the two samples have unequal variances and/or unequal sample sizes
# step 1: perform Welch's t-test
welchResults <- t.test(pattern1DictPatterns, pattern2DictPatterns, var.equal=FALSE) # var.equal=TRUE if variances are approximately equal
# step 2: get confidence interval (95%)
CILowerBound <- welchResults$conf.int[1]
CIUpperBound <- welchResults$conf.int[2]
# flip CI to keep it positively directional
if (mean(c(CILowerBound, CIUpperBound)) < 0) {
CILowerBound <- -CILowerBound
CIUpperBound <- -CIUpperBound
}
if (CILowerBound > CIUpperBound) {
temp <- CIUpperBound
CIUpperBound <- CILowerBound
CILowerBound <- temp
}
# return result
return (c(CILowerBound, CIUpperBound))
}
multivariateNaming <- function(index, customNames) {
if ((index < 1) || (index > length(customNames))) {
return (paste("cluster", index, sep = ""))
} else {
return (customNames[index])
}
}
# check basic conditions
errorMsg <- ""
if (significanceLevel < 0) {
errorMsg <- paste(errorMsg, "significanceLevel must be non-negative.\n", sep="")
}
if (length(patternKeys) < 1) {
errorMsg <- paste(errorMsg, "patternKeys must have at least one pattern.\n", sep="")
}
if (length(dictionaries) < 2) {
errorMsg <- paste(errorMsg, "dictionaries must have at least two clusters to be compared.\n", sep="")
}
if (!is.logical(exclusive)) {
errorMsg <- paste(errorMsg, "exclusive must be a boolean value.\n", sep="")
}
if (errorMsg != "") {
stop(cat(errorMsg), "Invalid parameter(s). Please fix and try again.")
}
# begin
grossList <- list()
for (patternKey in patternKeys) {
# ANOVA
patternANOVA <- ANOVA(
significanceLevel = significanceLevel,
patternKey = patternKey,
exclusive = exclusive,
seuratobj = seuratobj,
dictionaries = dictionaries
)
# CI
patternCI <- list()
# looping through all pairs
dictionariesLength <- length(dictionaries)
for (firstIndex in 1 : (dictionariesLength - 1)) {
for (secondIndex in (firstIndex + 1) : dictionariesLength) {
pair1Name <- multivariateNaming(firstIndex, customNames)
pair2Name <- multivariateNaming(secondIndex, customNames)
pairCI <- CI(
patternKey = patternKey,
exclusive = exclusive,
seuratobj = seuratobj,
dictionary1 = dictionaries[[firstIndex]],
dictionary2 = dictionaries[[secondIndex]]
)
pairCIresults <- list(
"pair1Name" = pair1Name,
"pair2Name" = pair2Name,
"CI" = pairCI
)
patternCI <- c(patternCI, list(pairCIresults))
}
}
patternResults <- list("patternKey" = patternKey,
"ANOVA" = patternANOVA,
"CI" = patternCI)
grossList <- c(grossList, list(patternResults))
}
# reorder in increasing order (why not decreasing: ggplot reverses)
sorted_indices <- order(sapply(grossList, function(x) x[["ANOVA"]][1]), decreasing = FALSE)
sortedGrossList <- grossList[sorted_indices]
# ggplotting ANOVA
# step 1: initialization
name <- sapply(sortedGrossList, function(x) x$patternKey)
value <- sapply(sortedGrossList, function(x) x[["ANOVA"]][1])
pValue <- sapply(sortedGrossList, function(x) x[["ANOVA"]][2]) # for CSV export
significance <- sapply(sortedGrossList, function(x) x[["ANOVA"]][3])
color <- ifelse(significance == 1, "blue", "red")
# Create a data frame
df <- data.frame(
name = factor(name, levels = unique(name)),
value = value,
color = color
)
# Create barplot
ggplot(df, aes(x = name, y = value)) +
geom_bar(stat = 'identity', aes(fill = color), position = 'dodge') +
coord_flip() +
labs(title = "Multivariate Analysis - ANOVA",
x = "Patterns",
y = "F-statistic") +
scale_fill_manual(name = "Significance",
values = c("red" = "red", "blue" = "blue"),
labels = c(
"red" = paste("p > ", significanceLevel, sep = ""),
"blue" = paste("p < ", significanceLevel, sep = ""))
) +
theme_minimal() +
theme(
axis.line = element_line(linewidth = 0.5, linetype = "solid"),
plot.title = element_text(size = 20, face = "bold"),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12)
)
# save ANOVA plot
ANOVA_file_path <- file.path(getwd(), exportFolder, "multivariateAnalysisR_ANOVA.png")
ggsave(filename = ANOVA_file_path, plot = last_plot(), width = ANOVAwidth, height = ANOVAheight, units = "px", limitsize = FALSE, device = "png", bg = "white")
# ggplotting CI
group <- unlist(lapply(sortedGrossList, function(x) rep(x$patternKey, times = length(x$CI))))
group <- factor(group, levels = unique(group)) # convert 'group' to a factor with original order
ci_lower <- unlist(
lapply(sortedGrossList, function(x) {
unlist(
lapply(x$CI, function(y) {
rep(y$CI[1], times = length(y$CI[1]))
})
)
})
)
ci_upper <- unlist(
lapply(sortedGrossList, function(x) {
unlist(
lapply(x$CI, function(y) {
rep(y$CI[2], times = length(y$CI[2]))
})
)
})
)
lengthSortedGrossListCI <- length(sortedGrossList[[1]]$CI)
y_pos <- seq(length(group) + length(sortedGrossList) - 1) * CIspacing # y-axis positions for each group and dividing lines
dividing_lines_indices <- seq(length(sortedGrossList) - 1) * (lengthSortedGrossListCI + 1)
dividing_lines <- y_pos[dividing_lines_indices]
y_pos <- y_pos[-dividing_lines_indices]
if (lengthSortedGrossListCI < 3) {
colorPalette <- switch(lengthSortedGrossListCI, "red", c("red", "blue"))
} else {
colorPalette <- grDevices::rainbow(lengthSortedGrossListCI) # different color-generating tools may be used for better distinctiveness
}
colors <- rep(colorPalette, length(sortedGrossList))
labels <- group # identify which group (patternKey) the CI refers to
color_labels <- unlist(
lapply(sortedGrossList, function(x) {
unlist(
lapply(x$CI, function(y) {
paste(y$pair1Name, y$pair2Name, sep = "-")
})
)
})
)
# Create a data frame
data <- data.frame(
group,
ci_lower,
ci_upper,
y_pos,
colors,
labels
)
# Create the horizontal plot
ggplot_obj <- ggplot(data, aes(y = y_pos, x = (ci_lower + ci_upper) / 2)) +
geom_errorbarh(aes(xmin = ci_lower, xmax = ci_upper, color = colors), height = 0.2, size = 1.5) +
geom_text(aes(label = labels), vjust = -1.6, size = 4, fontface = "bold") +
scale_color_manual(name = "Color Labels",
values = colors,
labels = color_labels) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") + # Add a reference line at x = 0
labs(title = "Multivariate Analysis - Confidence Intervals",
x = "Confidence Intervals",
y = "Patterns") +
theme_minimal() +
theme(
axis.line = element_line(linewidth = 0.5, linetype = "solid"),
plot.title = element_text(size = 20, face = "bold"),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12)
)
# dividing lines per pattern
ggplot_obj <- ggplot_obj +
geom_hline(yintercept = dividing_lines, linetype = "dashed", color = "black", size = 0.5)
# save CI plot
CI_file_path <- file.path(getwd(), exportFolder, "multivariateAnalysisR_CI.png")
ggsave(filename = CI_file_path, plot = last_plot(), width = CIwidth, height = CIheight, units = "px", limitsize = FALSE, device = "png", bg = "white")
# CSV export
# ANOVA
ANOVA_significance <- ifelse(significance == 1, "TRUE", "FALSE")
ANOVA_CSV_df <- data.frame(
"Pattern" = rev(name),
"F-statistic value" = rev(value),
"p-value" = rev(pValue),
"Statistically Significant" = rev(ANOVA_significance)
)
ANOVA_export_path <- file.path(getwd(), exportFolder, "multivariateAnalysisR_ANOVA.csv")
write.csv(ANOVA_CSV_df, ANOVA_export_path, row.names = FALSE)
# CI
color_labels_pair1name <- unlist(
lapply(sortedGrossList, function(x) {
unlist(
lapply(x$CI, function(y) {
y$pair1Name
})
)
})
)
color_labels_pair2name <- unlist(
lapply(sortedGrossList, function(x) {
unlist(
lapply(x$CI, function(y) {
y$pair2Name
})
)
})
)
CI_CSV_df <- data.frame(
"Pattern" = rev(group),
"CI Lower Value" = rev(ci_lower),
"CI Higher Value" = rev(ci_upper),
"CI Cluster 1" = color_labels_pair1name,
"CI Cluster 2" = color_labels_pair2name
)
CI_export_path <- file.path(getwd(), exportFolder, "multivariateAnalysisR_CI.csv")
write.csv(CI_CSV_df, CI_export_path, row.names = FALSE)
return(sortedGrossList)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.