# ---------------------------------------------------------------------------- #
#
# Functions which implement internal cluster validation metrics
#
# ---------------------------------------------------------------------------- #
# avgSilhouette
#
# For a clustering solution for any k, compute the average silhouette
# width as an internal measure of cluster validity
#
# @param clusters Vector where values are cluster assignments for samples
# @param dmatrix Dissimilarity matrix as generated by the
# \code{dist()} function
#
# @return Numeric value
avgSilhouette <- function(clusters, dmatrix) {
stats <- summary(cluster::silhouette(clusters, dmatrix))
return(stats[["avg.width"]])
}
# clusterValidity
#
# For a given value of k, an hclust object and the corresponding dissimilarity
# matrix, cut the tree at height \code{k} and compute internal measures of
# cluster validity. For now, we compute only the Average Silhouette width
# of the partition.
#
# @param k Numeric value giving number of clusters, used to cut the dendrogram
# from an \code{hclust} object
# @param hclust_obj hclust object
# @param dmatrix Dissimilarity matrix as produced by the \code{dist} function
#
# @return Data frame with one row
internalClusterValidity <- function(k, hclust_obj, dmatrix) {
clusters <- cutree(hclust_obj, k)
data.frame("k" = k,
"Average_Silhouette" = avgSilhouette(clusters, dmatrix))
}
getK <- function(ft_mat, optimal_clusters) {
if (optimal_clusters) {
ft_mat %>%
clusterValidityPerK() %>%
dplyr::slice(which.max(Average_Silhouette))
} else {
ft_mat %>%
clusterValidityPerK() %>%
dplyr::filter(k == 2)
}
}
# clusterValidityPerK
#
# Given an hclust object and the corresponding dissimilarity
# matrix, cut the tree at every possible height \code{k} and compute internal
# measures of cluster validity. For now, we compute only the Average Silhouette
# width of the partition.
#
# @param ft_mat Data frame or matrix, feature matrix as generated by
# \code{\link{summarizePeaks}} or \code{\link{binarizePeaks}}
#
# @return Data frame with one row per value of k
clusterValidityPerK <- function(ft_mat) {
dmatrix <- dist(ft_mat)
hclust_obj <- hclust(dmatrix, method = "complete")
k <- seq(2, (nrow(ft_mat) - 1))
lapply(k, internalClusterValidity, hclust_obj, dmatrix) %>%
dplyr::bind_rows()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.