Nothing
########################################################################################################################
## clustering.R
## created: 2012-06-18
## creator: Yassen Assenov
## ---------------------------------------------------------------------------------------------------------------------
## Definition and implementation of clustering algorithms and visualization.
########################################################################################################################
## G L O B A L S #######################################################################################################
## Dissimilarity metrics used in clustering
DIST.METRICS <- c(
"correlation" = "correlation-based",
"manhattan" = "Manhattan distance",
"euclidean" = "Euclidean distance")
## Clustering algorithms applied
ALGORITHMS <- c("hierarchical" = "hierarchical")
## Linkage methods used in hierarchical clustering
AGGLOMERATIONS <- c("average" = "average", "complete" = "complete", "median" = "median-based")
## C L A S S ###########################################################################################################
#' RnBeadClustering Class
#'
#' Storage class for the results of a clustering algorithm applied on an \code{\linkS4class{RnBSet}} dataset.
#'
#' @section Slots:
#' \describe{
#' \item{dissimilarity}{Dissimilarity metric used in the form of a one-element \code{character} vector.}
#' \item{dimensionality}{Dimensionality of the clustered points in the form of a one-element \code{integer} vector.}
#' \item{algorithm}{Clustering algorithm (and optionally, type) as a \code{character} vector of length 1 or 2.}
#' \item{result}{Resulting object after applying the clustering algorithm on a dataset.}
#' \item{assignments}{Cluster assignments for the samples in the dataset as a matrix. Row names in this matrix are
#' sample identifiers, and each column is dedicated to partitioning into \emph{k} clusters for a fixed \emph{k}.}
#' \item{silhouettes}{\code{numeric} vector of mean silhouette values for each tested value of \emph{k}.}
#' }
#'
#' @section Methods and Functions:
#' \describe{
#' \item{\code{samples}}{Gets the identifiers of all samples used in the clustering.}
#' }
#'
#' @name RnBeadClustering-class
#' @rdname RnBeadClustering-class
#' @aliases initialize,RnBeadClustering-method
#' @author Yassen Assenov
#' @exportClass RnBeadClustering
setClass("RnBeadClustering",
representation(
dissimilarity = "character",
dimensionality = "integer",
algorithm = "character",
result = "list",
assignments = "matrix",
silhouettes = "numeric"),
package = "RnBeads")
## M E T H O D S #######################################################################################################
setValidity("RnBeadClustering",
function(object) {
x <- object@dissimilarity
if (!(is.character(x) && length(x) == 1 && (!is.na(x)))) {
return("invalid value for dissimilarity; expected single character")
}
if (!(x %in% names(DIST.METRICS))) {
return("unsupported value for dissimilarity")
}
x <- object@dimensionality
if (!(is.integer(x) && length(x) == 1 && isTRUE(x > 0))) {
return("invalid value for dimensionality; expected single positive integer")
}
x <- object@algorithm
if (!(is.character(x) && (length(x) %in% c(1, 2)) && (!any(is.na(x))))) {
return("invalid value for algorithm; expected character")
}
if (!(x[1] %in% names(ALGORITHMS))) {
return("unsupported value for algorithm")
}
if (length(x) == 2) {
if (!(x[2] %in% names(AGGLOMERATIONS))) {
return("unsupported value for algorithm; unsupported linkage method")
}
}
if (!(length(object@result) != 0)) {
return("invalid value for result")
}
x <- object@assignments
if (!(is.integer(x) && is.matrix(x) && (!any(is.na(x))))) {
return("invalid value for assignments")
}
## TODO: Validate that colnames(x) are "integer" values in the range 2:nrow(x)
y <- object@silhouettes
if (!(is.numeric(y) && length(y) == ncol(x))) {
return("invalid value for silhouettes")
}
if (!identical(colnames(x), names(y))) {
return("mismatch between assignments and silhouettes")
}
TRUE
}
)
########################################################################################################################
setMethod("initialize", "RnBeadClustering",
function(.Object, dissimilarity, dimensionality, algorithm, result, assignments, silhouettes) {
.Object@dissimilarity <- dissimilarity
.Object@dimensionality <- dimensionality
.Object@algorithm <- algorithm
.Object@result <- result
.Object@assignments <- assignments
.Object@silhouettes <- silhouettes
validObject(.Object)
.Object
}
)
########################################################################################################################
if (!isGeneric("samples")) {
setGeneric("samples", function(object) standardGeneric("samples"))
}
#' @rdname samples-methods
#' @export
setMethod("samples", signature(object = "RnBeadClustering"),
function(object) {
rownames(object@assignments)
}
)
## F U N C T I O N S ###################################################################################################
## dist.correlation
##
## Computes correlation-based dissimilarities between the row vectors of the given matrix.
##
## @param x Matrix to use in calculating dissimilarities.
## @return Object of class \code{"dist"}.
## @author Yassen Assenov
dist.correlation <- function(x) {
res <- as.dist(1 - cor(t(x), use = "pairwise.complete.obs"))
#take care of the following type of warnings that occur for invariable rows:
# Warning message:
# In cor(t(x), use = "pairwise.complete.obs") :
# the standard deviation is zero
res[is.na(res)] <- 1
return(res)
}
########################################################################################################################
## get.silhouette
##
## Computes the average silhouette value of the specified clustering using the given distance metric.
##
## @param x \code{integer} vector with k different integer cluster codes.
## @param dist Distance metric to use.
## @return Average among the silhouette values of all points in \code{x}.
## @author Yassen Assenov
get.silhouette <- function(x, dist) {
mean(silhouette(x, dist = dist)[, "sil_width"])
}
########################################################################################################################
## plot.heatmap.rand
##
## Creates a heatmap visualizing the given Rand indices.
##
## @param tbl Matrix of computed adjusted Rand indices. All values are expected to be between -1 and 1.
## @param fname File name to contain the generated plot. If this is \code{NA} (default), the heatmap is plotted to the
## currently active graphics device.
## @param report Report to host the generated image. This parameter is used only when \code{fname} is not \code{NA}. If
## provided, this parameter must be an instance of class \code{Report}.
##
## @author Yassen Assenov
plot.heatmap.rand <- function(tbl, fname = NA, report = NULL) {
tbl.melt <- melt(tbl, varnames = c("x", "y"))
tbl.melt[[1]] <- factor(as.character(tbl.melt[[1]]), levels = rev(rownames(tbl)))
tbl.melt[[2]] <- factor(as.character(tbl.melt[[2]]), levels = colnames(tbl))
colnames(tbl.melt)[3] <- "rindex"
if (!is.na(fname)) {
width <- 2 + ncol(tbl) * 0.4 + 1.5
height <- 3 + nrow(tbl) * 0.4
rplot <- createReportPlot(fname, report, width = width, height = height)
}
colors.g <- rnb.getOption("colors.3.gradient")
pp <- ggplot(tbl.melt) + aes_string("y", "x") + coord_fixed() + labs(x = NULL, y = NULL, fill = "Rand index") +
geom_tile(aes_string(fill = "rindex"), color = "white") +
scale_fill_gradient2(limits = c(-1, 1), low = colors.g[1], mid = colors.g[2], high = colors.g[3]) +
scale_x_discrete(expand = c(0, 0)) + scale_y_discrete(expand = c(0, 0)) +
theme(axis.ticks = element_blank(), axis.text.x = element_text(angle = 90, hjust = 1),
plot.margin = unit(c(0.5, 1.5, 0.5, 0.5), c("lines", "in", "lines", "lines")),
legend.justification = c(0, 0.5), legend.position = c(1, 0.5))
print(pp)
if (!is.na(fname)) {
return(invisible(off(rplot)))
}
}
########################################################################################################################
#' rnb.execute.clustering
#'
#' Performs hierarchical clustering on the samples of the given dataset using multiple distance metrics and
#' agglomeration methods for a single given region type.
#'
#' @param rnb.set Methylation dataset as an object of type inheriting \code{\linkS4class{RnBSet}}.
#' @param region.type the clustering is performed on methylation levels from regions of that type.
#' see \code{\link{rnb.region.types}} for possible values.
#' @return List of clustering results, whereby each element is an object of type \code{\linkS4class{RnBeadClustering}}.
#' In case clustering cannot be performed, the return value is \code{NULL}. Reasons for a failure include, among
#' others, the case when \code{rnb.set} contains less than 3 samples, or undefined distances between a pair of
#' samples due to (too many) missing values in the respective methylation matrix.
#'
#' @examples
#' \donttest{
#' library(RnBeads.hg19)
#' data(small.example.object)
#' results <- rnb.execute.clustering(rnb.set.example, "promoters")
#' # List applied dissimilarity metrics
#' sapply(results, slot, "dissimilarity")
#' # List applied clustering algorithms
#' str(lapply(results, slot, "algorithm"))
#' }
#' @author Yassen Assenov
#' @export
rnb.execute.clustering <- function(rnb.set, region.type = "sites") {
if (!inherits(rnb.set, "RnBSet")) {
stop("invalid value for rnb.set")
}
X <- t(meth(rnb.set, type = region.type))
if (nrow(X) < 3) {
## Too few samples; cannot compute silhouette values
rnb.status(c("Skipped clustering on", region.type, ": too few samples"))
return(NULL)
}
if (rnb.getOption("exploratory.clustering") == "top") {
N <- max(rnb.getOption("exploratory.clustering.top.sites"))
if (N < ncol(X)) {
X <- X[, order(colVars(X, na.rm = TRUE), decreasing = TRUE)[1:N], drop = FALSE]
}
rm(N)
}
results <- list()
for (dist.metric in names(DIST.METRICS)) {
if (dist.metric == "correlation") {
dmatrix <- dist.correlation(X)
} else {
dmatrix <- dist(X, method = dist.metric)
}
for (agglomeration in names(AGGLOMERATIONS)) {
result <- tryCatch(hclust(dmatrix, method = agglomeration), error = function(e) { NULL })
if (is.null(result)) {
logger.warning(c("Failed clustering on", region.type, "( using", dist.metric, ")"))
return(NULL)
}
cluster.assignments <- as.matrix(cutree(result, k = c(2:(nrow(X) - 1))))
silhouette.values <- apply(cluster.assignments, 2, get.silhouette, dist = dmatrix)
attr(result, "class") <- NULL # remove S3 class information in order to store it in RnBeadClustering
result <- new("RnBeadClustering", dist.metric, ncol(X), c("hierarchical", agglomeration), result,
cluster.assignments, silhouette.values)
results <- c(results, result)
}
rnb.status(c("Performed clustering on", region.type, "using", dist.metric, "as a distance metric"))
}
return(results)
}
########################################################################################################################
#' rnb.execute.clustering.all
#'
#' Performs hierarchical clustering on the samples of the given dataset using multiple distance metrics and
#' agglomeration methods for all suggested site and region types.
#'
#' @param rnb.set Methylation dataset as an object of type inheriting \code{\linkS4class{RnBSet}}.
#' @return List of list of clustering results; each element corresponds to one region type and is a list of objects
#' of type \code{\linkS4class{RnBeadClustering}}.
#'
#' @seealso \code{\link{rnb.execute.clustering}} for performing clustering using a single site or region type.
#'
#' @author Fabian Mueller
#' @export
rnb.execute.clustering.all <- function(rnb.set) {
if (!inherits(rnb.set, "RnBSet")) {
stop("invalid value for rnb.set")
}
result <- list()
if (rnb.getOption("analyze.sites")) {
result[["sites"]] <- rnb.execute.clustering(rnb.set, "sites")
}
for (rtype in rnb.region.types.for.analysis(rnb.set)) {
clust.res <- rnb.execute.clustering(rnb.set, rtype)
if (is.null(clust.res)){
result[rtype] <- list(NULL)
} else {
result[[rtype]] <- clust.res
}
}
return(result)
}
########################################################################################################################
### Just a helper function to not copy and paste code
rnb.section.clustering.add.heatmap <- function(report, X, fname, cluster.rows, clust.result, sample.ids, locus.colors,
sample.colors = NULL) {
dist.metric <- clust.result@dissimilarity
agglomeration <- clust.result@algorithm[2]
rplot <- createReportPlot(fname, report, width = 7.2, height = 7.2,
create.pdf = rnb.getOption("exploratory.clustering.heatmaps.pdf"), high.png = 200)
cresult <- clust.result@result
attr(cresult, "class") <- "hclust"
cresult <- as.dendrogram(cresult)
meth.colors <- get.methylation.color.panel()
meth.breaks <- seq(0, 1, length.out = length(meth.colors) + 1L)
heatmap.parameters <- list(x = X, Rowv = FALSE, Colv = cresult, dendrogram = "column", scale = "none",
breaks = meth.breaks, col = meth.colors, trace = "none",
density.info = "density", key.title = NA, key.xlab = expression(beta), key.ylab = "Density")
if (cluster.rows) {
## Plotting sites/regions, attempt to cluster rows
heatmap.parameters$labRow <- NA
heatmap.parameters$margins <- c(4, 1)
if (dist.metric == "correlation") {
distfun <- dist.correlation
} else {
distfun <- function(x) { dist(x, method = dist.metric) }
}
cc <- tryCatch(hclust(distfun(X), method = agglomeration), error = function(err) {
## Row dendrogram cannot be drawn
logger.warning(c("Could not cluster rows of the heatmap with method", dist.metric, "and linkage",
agglomeration, ". Discarding row dendrogram"))
return(NULL)
}
)
if (!is.null(cc)) {
heatmap.parameters$Rowv <- as.dendrogram(cc)
heatmap.parameters$dendrogram <- "both"
}
} else {
## Plotting quantiles
heatmap.parameters$labRow <- rownames(X)
heatmap.parameters$margins <- c(4, 3)
}
if(any(nchar(sample.ids)>10)){
heatmap.parameters$margins[1] <- 10
}
heatmap.parameters$labCol <- sample.ids
heatmap.parameters$RowSideColors <- locus.colors
heatmap.parameters$ColSideColors <- sample.colors
tryCatch(suppressWarnings(do.call(heatmap.2, heatmap.parameters)), error = function(err) {
print(rnb.message.plot("Heatmap could not be created"))
list()
}
)
return(off(rplot))
}
########################################################################################################################
#' rnb.section.clustering
#'
#' Creates a report section dedicated to sample clustering.
#'
#' @param report Report on methylation profiles to contain the clustering section. This must be an object of
#' type \code{\linkS4class{Report}}.
#' @param rnb.set Methylation dataset as an object of type inheriting \code{\linkS4class{RnBSet}}.
#' @param clust.results Results of applying hierarchical clustering as an object of type
#' \code{\linkS4class{RnBeadClustering}}, or a non-empty list of such objects.
#' @param site.reg.info Information about traits to be visualized and tested for associations with clusters, as
#' returned by \code{\link{get.site.and.region.types}}.
#' @param section.started Flag indicating if the clustering section in the given report is already created.
#' @return The modified report.
#'
#' @author Yassen Assenov, Fabian Mueller
#' @noRd
rnb.section.clustering <- function(report, rnb.set, clust.results, site.reg.info, section.started = FALSE) {
rnb.require("mclust")
beta.values <- meth.matrices(rnb.set)
if (inherits(clust.results, "RnBeadClustering")) {
clust.results <- list(clust.results)
}
if (!(is.list(clust.results) && length(clust.results) != 0 &&
all(sapply(unlist(clust.results,recursive=FALSE), inherits, "RnBeadClustering")))) {
stop("invalid value for clust.results; expected list of list of RnBeadClustering objects")
}
if (!all(sapply(names(clust.results), function(rr) { all(sapply(clust.results[[rr]],
FUN=function(x){identical(samples(x), colnames(beta.values[[rr]]))})) }))) {
stop("invalid value for clust.results; does not match rnb.set")
}
dist.metrics <- unique(sapply(unlist(clust.results,recursive=FALSE), slot, "dissimilarity"))
algorithms <- lapply(clust.results, FUN=function(x){
lapply(x, slot, "algorithm")
})
if (!all(sapply(unlist(algorithms,recursive=FALSE), length) == 2)) {
stop("unsupported clustering algorithms")
}
agglomerations <- unique(sapply(unlist(algorithms,recursive=FALSE), "[", 2))
if (any(sapply(clust.results,FUN=function(x){length(dist.metrics) * length(agglomerations) != length(x)}))) {
stop("invalid value for clust.results; list is incomplete")
}
if (rnb.getOption("logging") && logger.isinitialized() == FALSE) {
logger.start(fname = NA) # initialize console logger
}
logger.start("Clustering Section")
## Define sample IDs and coloring
sample.ids <- samples(rnb.set)
pheno.columns <- names(rnb.sample.groups(rnb.set,rnb.getOption("exploratory.columns")))
if (length(pheno.columns) != 0) {
pheno.colors <- rnb.pheno2colors(pheno(rnb.set)[, pheno.columns])
if (length(pheno.columns) == 1) {
pheno.colors$colors <- as.matrix(pheno.colors$colors)
colnames(pheno.colors$colors) <- pheno.columns
pheno.colors$mapping <- list(pheno.colors$mapping)
}
scolors <- 1:length(pheno.columns)
} else {
pheno.colors <- NULL
scolors <- 0L
}
locus.columns <- unique(unlist(lapply(site.reg.info,FUN=function(x){colnames(x)})))
lcolors <- 0L
if (length(locus.columns) != 0) {
lcolors <- 1:length(locus.columns)
}
reg.types <- intersect(names(clust.results),names(beta.values))
## Create heatmaps
description <- "The figure below shows clustering of samples using several algorithms and distance metrics."
if (section.started) {
rnb.add.paragraph(report, description)
} else {
report <- rnb.add.section(report, "Clustering", description)
}
probe.order <- lapply(beta.values,FUN=function(x){
order(rowVars(x,na.rm = TRUE), decreasing = TRUE)
})
logger.start("Generating Heatmaps")
report.plots <- list(quant=list(),loc=list())
topprobes <- c(0L, rnb.getOption("exploratory.clustering.top.sites"))
locus.colors <- list()
for (ri in 1:length(reg.types)) {
rt <- reg.types[ri]
if (rt %in% names(site.reg.info)) {
cur.annot <- colnames(site.reg.info[[rt]])
locus.colors[[rt]] <- lapply(locus.columns,FUN=function(aa){
if (aa %in% cur.annot){
get.site.and.region.types.colors(site.reg.info,rt,aa)
} else {
NULL
}
})
names(locus.colors[[rt]]) <- locus.columns
} else {
locus.colors[[rt]] <- NULL
}
logger.start(c("Region type:",rt))
clust.res <- clust.results[[rt]]
for (tprobes in topprobes){
tprobes.trunc <- tprobes #truncate if tprobes exceeds the number of sites/regions
if (tprobes == 0) {
X <- apply(beta.values[[rt]], 2, quantile, probs = seq(0, 1, length = 101), na.rm = TRUE)
rownames(X)[1:nrow(X) %% 10 != 1] <- ""
} else {
tprobes.trunc <- min(c(tprobes,length(probe.order[[rt]])))
X <- beta.values[[rt]][probe.order[[rt]][1:tprobes.trunc], ]
}
for (i in 1:length(clust.res)){
clust.result <- clust.res[[i]]
dist.metric <- clust.result@dissimilarity
agglomeration <- clust.result@algorithm[2]
fname.base <- paste(ri, dist.metric, agglomeration, sep = "_")
for (si in scolors) {
sample.colors <- NULL
locus.colors.cur <- NULL
fname <- fname.base
if (si != 0) {
fname <- paste(fname.base, si, sep = "_")
sample.colors <- pheno.colors$colors[, si]
}
if (tprobes != 0){
fname.base.2 <- paste("heatmapLoc",fname,sep="_")
fname <- fname.base.2
for (li in lcolors) {
if (li != 0) {
fname <- paste(fname.base.2, li, sep = "_")
locus.colors.cur <- locus.colors[[rt]][[locus.columns[li]]][probe.order[[rt]][1:tprobes.trunc]]
}
fname <- paste(fname, tprobes, sep = "_")
rplot <- rnb.section.clustering.add.heatmap(report,X,fname,TRUE,clust.result,sample.ids,locus.colors.cur,sample.colors)
report.plots[["loc"]] <- c(report.plots[["loc"]],rplot)
}
} else {
fname <- paste("heatmapQuant",fname,sep="_")
rplot <- rnb.section.clustering.add.heatmap(report,X,fname,FALSE,clust.result,sample.ids,NULL,sample.colors)
report.plots[["quant"]] <- c(report.plots[["quant"]],rplot)
}
}
}
}
logger.completed()
}
setting.names.common <- list(
"Site/region level" = reg.types,
"Dissimilarity metric" = DIST.METRICS[dist.metrics],
"Agglomeration strategy (linkage)" = AGGLOMERATIONS[agglomerations])
names(setting.names.common[["Site/region level"]]) <- 1:length(reg.types)
setting.names <- setting.names.common
if (max(scolors) != 0) {
sam.colors <- colnames(pheno.colors$colors)
names(sam.colors) <- 1:ncol(pheno.colors$colors)
setting.names[["Sample color based on"]] <- sam.colors
rm(sam.colors)
}
txt.clustering <- paste0("Hierarchical clustering of samples based on ",
ifelse(rnb.getOption("exploratory.clustering") == "all", "all methylation values",
paste(max(rnb.getOption("exploratory.clustering.top.sites")), "most variable loci")), ". The heatmap displays ")
description <- c(txt.clustering, "methylation percentiles per sample. ",
"The legend for sample coloring can be found in the figure below.")
report <- rnb.add.figure(report, description, report.plots[["quant"]], setting.names)
if (max(lcolors) != 0) {
lcolors <- locus.columns
names(lcolors) <- 1:length(locus.columns)
setting.names[["Site/region color based on"]] <- lcolors
}
setting.names[["Visualize"]] <- paste(topprobes[-1], "most variable loci")
names(setting.names[["Visualize"]]) <- topprobes[-1]
description <- c(txt.clustering, "only selected sites/regions with the highest variance across all samples. ",
"The legend for locus and sample coloring can be found in the figure below.")
report <- rnb.add.figure(report, description, report.plots[["loc"]], setting.names)
logger.status(c("Created", length(report.plots[["quant"]]) + length(report.plots[["loc"]]),
"heatmaps based on the clustering results"))
logger.completed()
## Add color legends
logger.start("Adding Color Legends")
if (!is.null(pheno.colors) || length(locus.columns) != 0) {
add.legend <- function(s.index, legend.colors) {
screen(s.index)
par(mar = c(0, 0, 0, 0) + 0.1)
plot.new()
legend("center", legend = names(legend.colors), fill = legend.colors, bty = "n", ncol = 1)
}
ns <- length(scolors)
nl <- length(lcolors)
## TODO: Simplify this weird parameter computations
param.mat <- c()
if (max(scolors) == 0) {
param.mat <- cbind(0,1:nl)
} else {
for (i in 1:ns){
if (nl == 0) {
param.mat <- rbind(param.mat,cbind(i,0))
} else {
param.mat <- rbind(param.mat,cbind(i,1:nl))
}
}
}
report.plots <- list()
for (ri in 1:length(reg.types)) {
rt <- reg.types[ri]
for (i in 1:(dim(param.mat)[1])) {
fname <- paste("heatmapLegend",ri, param.mat[i,1],param.mat[i,2], sep = "_")
rplot <- createReportPlot(fname, report, width = 7.2, height = 1.8, high.png = 200)
screens <- rbind(c(0, 0.5, 0, 1), c(0.5, 1, 0, 1))
s.indices <- split.screen(screens)
if (param.mat[i,2] != 0){
leg.loc <- attr(site.reg.info[[rt]],"legend")[[lcolors[param.mat[i,2]]]]
if (!is.null(leg.loc)){
#convert color gradient
if (!is.character(leg.loc)){
if (leg.loc == 2){
colors.grad <- rnb.getOption("colors.gradient")
vals <- round(site.reg.info[[rt]][,lcolors[param.mat[i,2]]],2)
leg.loc <- colors.grad[1:2]
names(leg.loc) <- c(min(vals),max(vals))
} else {
stop("Invalid legend")
}
}
add.legend(s.indices[1],leg.loc)
}
}
if (param.mat[i,1] != 0) {
add.legend(s.indices[2], pheno.colors$mapping[[param.mat[i,1]]])
}
close.screen(all.screens = TRUE)
off(rplot)
report.plots <- c(report.plots,rplot)
}
}
if (max(lcolors) == 0) {
lcolors <- "n.a."
names(lcolors) <- 0
}
if (max(scolors) == 0) {
scolors <- "n.a."
names(scolors) <- 0
} else {
scolors <- colnames(pheno.colors$colors)
names(scolors) <- 1:ncol(pheno.colors$colors)
}
regs <- reg.types
names(regs) <- 1:length(reg.types)
setting.names.leg <- list(
"Site/region level" = regs,
"Sample color based on" = scolors,
"Site/region color based on" = lcolors)
description <- c("Probe and sample colors used in the heatmaps in the previous figures.")
report <- rnb.add.figure(report, description, report.plots, setting.names.leg)
}
rm(scolors, description, probe.order, topprobes, tprobes, X)
logger.completed()
## Create plots of silhouette values
logger.start("Estimating Optimal Numbers of Clusters")
refText <- c("Rousseeuw, P. J. (1987) Silhouettes: A graphical aid to the interpretation and validation of ",
"cluster analysis. <i>Journal of Computational and Applied Mathematics</i>, <b>20</b>, 53-65")
report <- rnb.add.reference(report, refText)
description <- c("Using the average silhouette value as a measure of cluster assignment ",
rnb.get.reference(report, refText), ", it is possible to infer the number of clusters produced by each of the ",
"studied methods. The figure below shows the corresponding mean silhouette value for every observed ",
"separation into clusters.")
report <- rnb.add.section(report, "Identified Clusters", description, level = 3)
## TODO: Support also different lengths of silhouettes
colnames.sil <- lapply(reg.types,FUN=function(rt){
as.character(AGGLOMERATIONS[sapply(algorithms[[rt]], "[", 2)])
})
names(colnames.sil) <- reg.types
silhouettes <- lapply(reg.types,FUN=function(rt){
sils <- as.matrix(sapply(clust.results[[rt]], slot, "silhouettes"))
colnames(sils) <- colnames.sil[[rt]]
return(sils)
})
names(silhouettes) <- reg.types
algorithm.names <- lapply(algorithms,FUN=function(algs){
sapply(algs, function(a.name) {
base::ifelse(length(a.name) == 2, paste0(a.name[1], " (", a.name[2], " linkage)"), a.name)
})
})
clusters.identified.inner <- data.frame(
"Metric" = character(0),
"Algorithm" = character(0),
"Clusters" = integer(0),
stringsAsFactors = FALSE)
clusters.identified <- list()
report.plots.all <- list()
for (rt in 1:length(reg.types)){
rt <- reg.types[ri]
clusters.identified.cur <- clusters.identified.inner
report.plots <- lapply(dist.metrics, function(dist.metric) {
cinds <- (sapply(clust.results[[rt]], slot, "dissimilarity") == dist.metric)
for (i in which(cinds)) {
cn <- as.integer(rownames(silhouettes[[rt]])[which.max(silhouettes[[rt]][, i])])
clusters.identified.cur[nrow(clusters.identified.cur) + 1, "Metric"] <<- DIST.METRICS[dist.metric]
clusters.identified.cur[nrow(clusters.identified.cur), "Algorithm"] <<- algorithm.names[[rt]][i]
clusters.identified.cur[nrow(clusters.identified.cur), "Clusters"] <<- cn
}
dframe <- data.frame(
"clusters" = rep(as.integer(rownames(silhouettes[[rt]])), sum(cinds)),
"silhouette" = as.vector(silhouettes[[rt]][, cinds]),
"linkage" = rep(colnames(silhouettes[[rt]])[cinds], each = nrow(silhouettes[[rt]])), check.names = FALSE)
fname <- paste("silhouette", ri, dist.metric, sep = "_")
rplot <- createReportPlot(fname, report, width = 7, height = 5)
pp <- ggplot(dframe, aes_string(x = "clusters", y = "silhouette", colour = "linkage", group = "linkage")) +
scale_colour_manual(values = rnb.getOption("colors.category")) +
labs(x = paste("Number of clusters"), y = paste("Silhouette value")) + geom_line()
print(pp)
off(rplot)
rplot
})
report.plots.all <- c(report.plots.all,report.plots)
clusters.identified <- c(clusters.identified,list(clusters.identified.cur))
}
names(clusters.identified) <- 1:length(reg.types)
description <- c("Line plot visualizing mean silhouette values of the clustering algorithm outcomes for each ",
"applicable value of <i>K</i> (number of clusters).")
report <- rnb.add.figure(report, description, report.plots.all, setting.names.common[1:2])
description <- "The table below summarizes the number of clusters identified by the algorithms."
rnb.add.paragraph(report, description)
report <- rnb.add.tables(report, clusters.identified, setting.names.common[1], row.names = FALSE)
logger.status("Estimated number of clusters based on mean silhouette value")
logger.completed()
logger.start("Overlapping Clusters with Sample Traits")
if (!is.null(pheno.colors)) {
report <- rnb.add.section(report, "Clusters and Traits", NULL, level = 3)
## Compute Rand indices to test for associations between clusterings and traits
randIndices.all <- list()
csv.files <- sapply(reg.types, function(x) { as.character(NA) })
for (ri in 1:length(reg.types)){
rt <- reg.types[ri]
randIndices <- matrix(as.double(NA), nrow = nrow(clusters.identified[[ri]]), ncol = ncol(pheno.colors$colors))
dimnames(randIndices) <- list(clusters.identified[[ri]][, "Algorithm"], colnames(pheno.colors$colors))
dist.metrics <- DIST.METRICS[sapply(clust.results[[rt]], slot, "dissimilarity")]
for (i in 1:nrow(clusters.identified[[ri]])) {
j <- which(dist.metrics == clusters.identified[[ri]][i, "Metric"] &
algorithm.names[[rt]] == clusters.identified[[ri]][i, "Algorithm"])
cl.assignments <- clust.results[[rt]][[j]]@assignments[, as.character(clusters.identified[[ri]][i, "Clusters"])]
for (j in colnames(randIndices)) {
tr.assignments <- as.integer(as.factor(pheno(rnb.set)[, j]))
valids <- (!is.na(tr.assignments))
randIndices[i, j] <- mclust::adjustedRandIndex(cl.assignments[valids], tr.assignments[valids])
}
}
randIndices.all <- c(randIndices.all,list(randIndices))
rm(dist.metrics, i, j, cl.assignments, tr.assignments, valids)
## Save the table to an external file
rIndices <- cbind(clusters.identified[[ri]][, c("Metric", "Algorithm")], randIndices)
csv.files[rt] <- paste("adjusted_rand_indices_",ri,".csv",sep="")
fname.full <- file.path(rnb.get.directory(report, "data", absolute = TRUE), csv.files[rt])
utils::write.csv(rIndices, file = fname.full, row.names = FALSE)
csv.files[rt] <- paste(rnb.get.directory(report, "data"), csv.files[rt], sep = "/")
logger.status(c("Computed adjusted rand indices and saved to", csv.files[rt]))
}
names(randIndices.all) <- reg.types
refText <- c("Hubert, L. and Arabie, P. (1985) Comparing partitions. ",
"<i>Journal of Classification</i>, <b>2</b>(1), 193-218")
report <- rnb.add.reference(report, refText)
txt <- c("The figure below shows associations between clusterings and the examined traits. Associations are ",
"quantified using the adjusted Rand index ", rnb.get.reference(report, refText), ". Rand indices near 1 ",
"indicate high agreement while values close to -1 indicate seperation. The full table of all computed ",
"indices is stored in the following comma separated files:")
rnb.add.paragraph(report, txt)
## Add a list of links to the created CSV files
txt <- as.list(paste("<a href=\"", csv.files, "\">", names(csv.files), " (csv)</a>", sep = ""))
rnb.add.list(report, txt)
rm(csv.files, ri, rt, randIndices, rIndices, fname.full, refText, txt)
## Create heatmaps of adjusted Rand indices
report.plots.all <- list()
for (ri in 1:length(reg.types)){
rt <- reg.types[ri]
report.plots <- lapply(names(DIST.METRICS), function(dist.metric) {
fname <- paste("heatmap", "rindex", ri, dist.metric, sep = "_")
cinds <- which(DIST.METRICS[dist.metric] == clusters.identified[[ri]][, "Metric"])
return(plot.heatmap.rand(t(randIndices.all[[rt]][cinds, ]), fname, report))
})
report.plots.all <- c(report.plots.all,report.plots)
}
description <- c("Heatmap visualizing Rand indices computed between sample traits (rows) and clustering ",
"algorithm outcomes (columns).")
report <- rnb.add.figure(report, description, report.plots.all, setting.names.common[1:2])
}
logger.completed()
logger.completed()
return(report)
}
########################################################################################################################
#' rnb.step.clustering
#'
#' Performs hierarchical clustering on the samples of the given dataset and generates a corresponding section in the
#' report.
#'
#' @param rnb.set Methylation dataset as an object of type inheriting \code{\linkS4class{RnBSet}}.
#' @param report Report on methylation profiles to contain the clustering section. This must be an object of type
#' \code{\linkS4class{Report}}.
#' @return The modified report.
#'
#' @seealso \code{\link{rnb.execute.clustering}}, \code{\link{rnb.section.clustering}}
#' @author Yassen Assenov
#' @noRd
rnb.step.clustering <- function(rnb.set, report) {
if (!inherits(rnb.set, "RnBSet")) {
stop("invalid value for rnb.set")
}
if (!inherits(report, "Report")) {
stop("invalid value for report")
}
rnb.step.clustering.internal(rnb.set, report, get.site.and.region.types(rnb.set))
}
rnb.step.clustering.internal <- function(rnb.set, report, rinfo) {
logger.start("Sample Clustering")
## Perform clustering algorithms
logger.start("Agglomerative Hierarchical Clustering")
clust.results <- rnb.execute.clustering.all(rnb.set)
logger.completed()
## Handle the cases when clusterings on some or all of the methylation matrices failed
clust.failed <- sapply(clust.results, is.null)
clust.edited <- sum(clust.failed) != 0
if (clust.edited) {
failed.single <- sum(clust.failed) == 1
failed.all <- sum(clust.failed) == length(clust.results)
txt <- "The clustering algorithm was not completed"
if (failed.all) {
txt <- c(txt, ifelse(failed.single, ".", " for all requested levels of methylation."))
} else {
txt <- c(txt, " for the following requested level", ifelse(failed.single, "", "s"), " of methylation - ")
txt <- c(txt, paste("<b>", names(clust.results)[clust.failed], "</b>", sep = "", collapse = ", "), ".")
}
if (length(samples(rnb.set)) < 3) {
txt <- c(txt, " Note that clustering is performed only when the dataset contains at least 3 samples.")
} else {
txt <- c(txt, " The most likely reason for this error is inability to calculate a distance between a pair ",
"of samples due to (too many) missing values in the respective methylation ",
ifelse(failed.single, "matrix", "matrices"), ".")
}
report <- rnb.add.section(report, "Clustering", txt)
if (failed.all) {
logger.completed()
return(report)
}
clust.results <- clust.results[!clust.failed]
rm(failed.single, failed.all, txt)
}
rm(clust.failed)
## Generate clustering section in the report
report <- rnb.section.clustering(report, rnb.set, clust.results, rinfo, clust.edited)
logger.completed()
return(report)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.