#' PamParamSelection
#' @description Runs PAM with differing numbers of partitions and returns
#' validation metrics.
#' @param dataset A transcriptomics dataset. Preferably filtered first.
#' First columns should be gene names. All other columns should be expression
#' levels. Not needed if an argument to distance is given.
#' @param distance A distance matrix. If a distance matrix has already been
#' created (such as by using the\code{\link{DistanceGen}} function), the matrix
#' can be passed to this function to save time. If a distance matrix is not
#' provided then it will be generated by the function.
#' @param k A numeric vector giving the number of clusters to be evaluated.
#' @param metric The distance metric to be used to calculate the distances
#' between genes. See parallelDist::parDist for all accepted arguments. Also
#' allows the option of 'abs.correlation'. Not used if a distance matrix is
#' provided.
#' @param scale Logical. If TRUE then each gene will be scaled
#' @param nthreads The number of threads to be used for parallel computations.
#' Warning! Increasing this value will massively increase RAM usage.
#' If NULL then the maximum number of threads available will be used.
#' @examples
#' k.options <- seq(2,10)
#' pam.validation <- PamParamSelection(Laurasmappings, k=k.options, nthreads = 2)
#' @export
PamParamSelection <- function(dataset = NULL, distance = NULL, k = c(2, 5, 10),
metric = "euclidean", scale = TRUE, nthreads = 4){
i <- NULL
if (is.null(dataset) == TRUE & is.null(distance) == TRUE) {
# Check that either a dataset or a distance matrix has been provided
stop("Either a transcriptomics dataset or a distance matrix needs to be provided!")
}
if (is.null(nthreads) == TRUE) {
# Set the threads to maximum if NULL is given as argument for nthreads
nthreads <- parallel::detectCores()
}
# Load the dopar binary operator from foreach package
`%dopar%` <- foreach::`%dopar%`
if (is.null(distance) == TRUE) {
dataset.sc <- CircadianTools::GeneScale(dataset, scale = scale)
# Calculate distance matrix
distance <- CircadianTools::DistanceGen(dataset = dataset.sc,
metric = metric, nthreads = nthreads)
}
cl <- parallel::makeForkCluster(nthreads) # Create cluster for parallelism
doParallel::registerDoParallel(cl)
result.df <- foreach::foreach(i = k, .combine = rbind) %dopar% {
clust <- cluster::pam(distance, k = i) # Run PAM
# Calculate Dunn index
dunn <- clValid::dunn(distance, clust$cluster)
# Calculate connectivity
connect <- (clValid::connectivity(distance, clust$cluster))
# Calculate silhouette / silhouette width
silhoutte_values <- cluster::silhouette(clust)
silhouette <- mean(silhoutte_values[, 3])
data.frame(i, dunn, connect, silhouette, "PAM") # Make row of result.df
}
parallel::stopCluster(cl)
# Define column headings for result.df
colnames(result.df) <- c("k", "Dunn", "Connectivity", "Silhouette",
"Method")
return(result.df)
}
#' AgglomParamSelection
#' @description Runs agglomerative hierarchical clustering with differing
#' numbers of partitions and returns validation metrics.
#' @param dataset A transcriptomics dataset. Preferably filtered first. First
#' columns should be gene names. All other columns should be expression levels.
#' Not needed if an argument to distance is given.
#' @param distance A distance matrix. If a distance matrix has already been
#' created (such as by using the \code{\link{DistanceGen}} function), the
#' matrix can be passed to this function to save time. If a distance matrix is
#' not provided then it will be generated by the function.
#' @param k A numeric vector giving the number of clusters to be evaluated.
#' @param metric The distance metric to be used to calculate the distances
#' between genes. See parallelDist::parDist for all accepted arguments. Also
#' allows the option of 'abs.correlation'. Not used if a distance matrix is
#' provided.
#' @param scale Logical. If TRUE then each gene will be scaled
#' @param nthreads The number of threads to be used for parallel computations.
#' If NULL then the maximum number of threads available will be used.
#' @examples
#' k.options <- seq(2,10)
#' hclust.validation <- AgglomParamSelection(Laurasmappings, k=k.options,
#' nthreads = 2)
#' @export
AgglomParamSelection <- function(dataset = NULL, distance = NULL,
k = c(2, 5, 10), scale = TRUE, metric = "euclidean",
nthreads = 4) {
i <- NULL
if (is.null(dataset) == TRUE & is.null(distance) == TRUE) {
# Check that either a dataset or a distance matrix has been provided
stop("Either a transcriptomics dataset or a distance matrix needs to be provided!")
}
if (is.null(nthreads) == TRUE) {
# Set the threads to maximum if none is specified
nthreads <- parallel::detectCores()
}
# Load the dopar binary operator from foreach package
`%dopar%` <- foreach::`%dopar%`
if (is.null(distance) == TRUE) {
# Center / scale each gene
dataset.sc <- CircadianTools::GeneScale(dataset, scale = scale)
distance <- DistanceGen(dataset = dataset.sc, metric = metric,
nthreads = nthreads)
}
clust <- stats::hclust(distance) # Run hclustering
cl <- parallel::makeForkCluster(nthreads) # Create cluster for parallelism
doParallel::registerDoParallel(cl)
result.df <- foreach::foreach(i = k, .combine = rbind) %dopar% {
cluster <- stats::cutree(clust, k = i) # Cut tree
# Calculate Dunn index
dunn <- clValid::dunn(distance, cluster)
# Calculate connectivity
connect <- (clValid::connectivity(distance, cluster))
# Calculate silhouette / silhouette width
silhoutte_values <- cluster::silhouette(cluster, distance)
silhouette <- mean(silhoutte_values[, 3])
# Make row of result.df
data.frame(i, dunn, connect, silhouette, "Agglomerative")
}
parallel::stopCluster(cl)
# Define column headings for result.df
colnames(result.df) <- c("k", "Dunn", "Connectivity", "Silhouette",
"Method")
return(result.df)
}
#' ClusterParamSelection
#' @description Calculates validation metrics for different clustering methods
#' and different numbers of partitions. The validation metrics are plotted.
#' @param dataset A transcriptomics dataset. Preferably filtered first. First
#' columns should be gene names. All other columns should be expression levels.
#' @param distance A distance matrix. If a distance matrix has already been
#' created (such as by using the \code{\link{DistanceGen}} function), the matrix
#' can be passed to this function to save time. If a distance matrix is not
#' provided then it will be generated by the function.
#' @param k A numeric vector giving the number of clusters to be evaluated.
#' @param method The clustering method(s) to be used. Multiple methods can be
#' considered by providing a character vector. Currently accepts 'pam',
#' 'agglom' and 'diana'.
#' @param metric The distance metric to be used to calculate the distances
#' between genes. See parallelDist::parDist for all accepted arguments. Also
#' allows the option of 'abs.correlation'. Not used if a distance matrix is
#' provided.
#' @param nthreads The number of threads to be used for parallel computations.
#' If NULL then the maximum number of threads available will be used.
#' @param save.plot Logical. If TRUE then saves the plots generated.
#' @param save.df Logical. If TRUE then saves the validation metric results as a
#' .csv file.
#' @param scale Logical. If TRUE then each gene will be scaled.
#' @param path The directory to be used for saving plots and the validation
#' metric results to. Uses the name of the dataset object appended with
#' '_validation' if this argument is not specified.
#' @examples
#' k.options <- seq(2,10)
#' hclust.validation <- ClusterParamSelection(Laurasmappings, k=k.options,
#' nthreads = 2)
#' @export
ClusterParamSelection <- function(dataset = NULL, distance = NULL,
k = c(2, 5, 10), method = c("pam", "agglom", "diana"),
metric = "euclidean", scale = TRUE, nthreads = 2,
save.plot = TRUE, save.df = TRUE, path = NULL) {
Dunn <- NULL
Method <- NULL
Connectivity <- NULL
Silhouette <- NULL
if (is.null(dataset) == TRUE & is.null(distance) == TRUE) {
# Check that either a dataset or a distance matrix has been provided
stop("Either a transcriptomics dataset or a distance matrix needs to be provided!")
}
if (is.null(path) == TRUE) {
# If a filename isn't specified then the name of the dataframe object is used
path <- deparse(substitute(dataset))
# Add _validation to directory
path <- paste(path, "_validation", sep = "")
}
if (dir.exists(path) == FALSE) {
dir.create(path) # Create directory if it doesn't already exist
}
# Initialize dataframe to hold validation results
validation.df <- data.frame()
if (is.null(distance) == TRUE) {
dataset.sc <- CircadianTools::GeneScale(dataset = dataset,
scale = scale)
distance <- CircadianTools::DistanceGen(dataset = dataset.sc,
metric = metric, nthreads = nthreads)
}
if ("pam" %in% method == TRUE) {
pam.results <- CircadianTools::PamParamSelection(distance = distance,
k = k, nthreads = nthreads)
# Add pam validation results if pam is specified in methods
validation.df <- rbind(validation.df, pam.results)
}
if ("agglom" %in% method == TRUE) {
hclust.results <- CircadianTools::AgglomParamSelection(distance =
distance, k = k, nthreads = nthreads)
# Add agglom validation results if hclust is specified in methods
validation.df <- rbind(validation.df, hclust.results)
}
if ("diana" %in% method == TRUE) {
diana.results <- CircadianTools::DianaParamSelection(distance =
distance, k = k, nthreads = nthreads)
# Add diana validation results if diana is specified in methods
validation.df <- rbind(validation.df, diana.results)
}
# Vector of colours used in package
colours.vector <- c("#008dd5", "#ffa630", "#ba1200", "#840032", "#412d6b")
# Select number of colours required
colours.vector <- colours.vector[1:length(method)]
#### Dunn ####
p <- ggplot2::ggplot(ggplot2::aes(x = k, y = Dunn, color = Method),
data = validation.df) + ggplot2::geom_line(size = 1.2)
p <- p + ggplot2::theme_bw() + ggplot2::ggtitle("Dun Index Plot")
p <- p + ggplot2::scale_colour_manual(values = colours.vector)
p <- p + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 1))
print(p)
if (save.plot == TRUE) {
ggplot2::ggsave("dunn_index.png", plot = p, path = path, width = 10,
height = 4.5, units = "in") # Save the plot
}
#### Connectivity ####
p <- ggplot2::ggplot(ggplot2::aes(x = k, y = Connectivity, color = Method),
data = validation.df) + ggplot2::geom_line(size = 1.2)
p <- p + ggplot2::theme_bw() + ggplot2::ggtitle("Connectivity Plot")
p <- p + ggplot2::scale_colour_manual(values = colours.vector)
p <- p + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 1))
print(p)
if (save.plot == TRUE) {
ggplot2::ggsave("connectivity_plot.png", plot = p, path = path,
width = 10, height = 4.5, units = "in") # Save the plot
}
#### Silhouette ####
p <- ggplot2::ggplot(ggplot2::aes(x = k, y = Silhouette, color = Method),
data = validation.df) + ggplot2::geom_line(size = 1.2)
p <- p + ggplot2::theme_bw() + ggplot2::ggtitle("Silhouette Plot")
p <- p + ggplot2::scale_colour_manual(values = colours.vector)
p <- p + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 1))
print(p)
if (save.plot == TRUE) {
ggplot2::ggsave("silhouette_plot.png", plot = p, path = path,
width = 10, height = 4.5, units = "in") # Save the plot
}
if (save.df == TRUE) {
# Save the validation results as .csv file
df.filename <- paste(path, "/results.csv", sep = "")
utils::write.csv(validation.df, file = df.filename, row.names = FALSE)
}
return(validation.df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.