# == title
# Consensus partition
#
# == param
# -data A numeric matrix where subgroups are found by columns.
# -top_value_method A single top-value method. Available methods are in `all_top_value_methods`.
# Use `register_top_value_methods` to add a new top-value method.
# -top_n Number of rows with top values. The value can be a vector with length > 1. When n > 5000,
# the function only randomly sample 5000 rows from top n rows. If ``top_n`` is a vector, paritition
# will be applied to every values in ``top_n`` and consensus partition is summarized from all partitions.
# -partition_method A single partitioning method. Available methods are in `all_partition_methods`.
# Use `register_partition_methods` to add a new partition method.
# -max_k Maximal number of subgroups to try. The function will try for ``2:max_k`` subgroups
# -k Alternatively, you can specify a vector k.
# -sample_by Should randomly sample the matrix by rows or by columns?
# -p_sampling Proportion of the submatrix which contains the top n rows to sample.
# -partition_repeat Number of repeats for the random sampling.
# -partition_param Parameters for the partition method which are passed to ``...`` in a registered partitioning method. See `register_partition_methods` for detail.
# -anno A data frame with known annotation of samples. The annotations will be plotted in heatmaps and the correlation
# to predicted subgroups will be tested.
# -anno_col A list of colors (color is defined as a named vector) for the annotations. If ``anno`` is a data frame,
# ``anno_col`` should be a named list where names correspond to the column names in ``anno``.
# -scale_rows Whether to scale rows. If it is ``TRUE``, scaling method defined in `register_partition_methods` is used.
# -verbose Whether print messages.
# -mc.cores Multiple cores to use. This argument will be removed in future versions.
# -cores Number of cores, or a ``cluster`` object returned by `parallel::makeCluster`.
# -prefix Internally used.
# -.env An environment, internally used.
# -help Whether to print help messages.
#
# == details
# The function performs analysis in following steps:
#
# - calculate scores for rows by top-value method,
# - for each top_n value, take top n rows,
# - randomly sample ``p_sampling`` rows from the top_n-row matrix and perform partitioning for ``partition_repeats`` times,
# - collect partitions from all individual partitions and summarize a consensus partition.
#
# == return
# A `ConsensusPartition-class` object. Simply type object in the interactive R session
# to see which functions can be applied on it.
#
# == seealso
# `run_all_consensus_partition_methods` runs consensus partitioning with multiple top-value methods
# and multiple partitioning methods.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# set.seed(123)
# m = cbind(rbind(matrix(rnorm(20*20, mean = 1, sd = 0.5), nr = 20),
# matrix(rnorm(20*20, mean = 0, sd = 0.5), nr = 20),
# matrix(rnorm(20*20, mean = 0, sd = 0.5), nr = 20)),
# rbind(matrix(rnorm(20*20, mean = 0, sd = 0.5), nr = 20),
# matrix(rnorm(20*20, mean = 1, sd = 0.5), nr = 20),
# matrix(rnorm(20*20, mean = 0, sd = 0.5), nr = 20)),
# rbind(matrix(rnorm(20*20, mean = 0.5, sd = 0.5), nr = 20),
# matrix(rnorm(20*20, mean = 0.5, sd = 0.5), nr = 20),
# matrix(rnorm(20*20, mean = 1, sd = 0.5), nr = 20))
# ) + matrix(rnorm(60*60, sd = 0.5), nr = 60)
# res = consensus_partition(m, partition_repeat = 10, top_n = c(10, 20, 50))
# res
consensus_partition = function(data,
top_value_method = "ATC",
top_n = NULL,
partition_method = "skmeans",
max_k = 6,
k = NULL,
sample_by = "row",
p_sampling = 0.8,
partition_repeat = 50,
partition_param = list(),
anno = NULL,
anno_col = NULL,
scale_rows = NULL,
verbose = TRUE,
mc.cores = 1, cores = mc.cores,
prefix = "",
.env = NULL,
help = cola_opt$help) {
if(help && !missing(data)) {
if(identical(subset, Inf) && ncol(data) > 500) {
qqcat_wrap("You have quite a lot of columns in the matrix. For reducing the runtime, you can use the function `consensus_partition_by_down_sampling()` to apply to a subset of column. The classification of unselected columns are inferred from the classes of the selected columns. Set the argument 'help = FALSE' to turn off this message.\n")
}
}
if(missing(data)) {
data = .env$data
} else {
data = as.matrix(data)
}
if(max_k < 2) {
stop_wrap("max_k should be no less than 2.")
}
if(max_k > 10) {
if(help) {
qqcat_wrap("It is not recommended to set `max_k` larger than 10. Users are suggested to use `hierarchical_partition()` function to obtain more subgroups. Set the argument `help` to FALSE to turn off this message.\n")
}
}
t = system.time(res <- .consensus_partition(
data = data,
top_value_method = top_value_method,
top_n = top_n,
partition_method = partition_method,
k = if(is.null(k)) 2:max_k else k,
p_sampling = p_sampling,
sample_by = sample_by,
partition_repeat = partition_repeat,
partition_param = partition_param,
anno = anno,
anno_col = anno_col,
scale_rows = scale_rows,
verbose = verbose,
cores = cores,
prefix = prefix,
.env = .env))
res@hash = digest(res)
res@running_time = t[["elapsed"]]
if(verbose) {
tc = Sys.time()
tf = format(tc + structure(t[["elapsed"]], units = "secs", class = "difftime") - tc)
qqcat("@{prefix}* @{top_value_method}:@{partition_method} used @{tf}.\n")
}
# test if k = 2 is stable and correlates to column mean
# if(help) {
# if(is_stable_k(res, k = 2)) {
# cl = get_classes(res, k = 2)[, "class"]
# le = unique(cl)
# cm_diff = rowMeans(get_matrix(res[, cl == le[1], drop = FALSE])) - colMeans(get_matrix(res[, cl == le[2]]))
# tb = table(sign(cm_diff))
# if(max(cm) - min(cm) > 1e-10) {
# p <- t.test(cm[cl == le[1]], cm[cl == le[2]])$p.value
# if(p < 0.01) {
# message_wrap(qq("Classification with 2 groups is stable and it significantly (p-value = @{p}) correlates to the column mean of the matrix. You might have system-level batch in the matrix. Consider to transform your matrix, such as by quantile normalization. Set `help = FALSE` to turn it off."))
# }
# }
# }
# }
return(res)
}
.consensus_partition = function(data,
top_value_method = "MAD",
top_n = NULL,
partition_method = "kmeans",
k = 2:6,
p_sampling = 0.8,
sample_by = c("row", "column"),
partition_repeat = 50,
partition_param = list(),
anno = NULL,
anno_col = NULL,
scale_rows = NULL,
verbose = TRUE,
cores = 1,
prefix = "",
.env = NULL) {
# .env is used to store shared matrix because the matrix will be used multiple times in
# run_all_consensus_partition_methods() and hierarchical_partition() and we don't want to
# copy it multiple times
# `column_index` is specifically for hierarchical_partition() where in each level only a subset
# of columns in the original matrix is used
# .env$column_index is only used to pass secret column_index parameter,
# the real column_index is stored as a slot in this object.
if(is.null(.env)) {
if(is.data.frame(data)) data = as.matrix(data)
# if(is.null(rownames(data))) rownames(data) = seq_len(nrow(data))
.env = new.env(parent = emptyenv())
.env$data = data
.env$column_index = seq_len(ncol(data))
} else if(is.null(.env$data)) {
if(is.data.frame(data)) data = as.matrix(data)
# if(is.null(rownames(data))) rownames(data) = seq_len(nrow(data))
.env$data = data
.env$column_index = seq_len(ncol(data))
} else if(is.null(.env$column_index)) {
data = .env$data
.env$column_index = seq_len(ncol(data))
} else {
data = .env$data
}
if(is.null(.env$row_index)) {
.env$row_index = seq_len(nrow(data))
}
data = data[.env$row_index, .env$column_index, drop = FALSE]
# process the annotation
if(!is.null(anno)) {
if(is.atomic(anno)) {
known_nm = deparse(substitute(anno))
anno = data.frame(anno)
colnames(anno) = known_nm
if(!is.null(anno_col)) {
anno_col = list(anno_col)
names(anno_col) = known_nm
}
}
if(nrow(anno) != length(.env$column_index)) {
stop_wrap("nrow of `anno` should be the same as ncol of the matrix.")
}
}
if(is.null(anno_col)) {
anno_col = lapply(anno, ComplexHeatmap:::default_col)
} else {
if(ncol(anno) == 1 && is.atomic(anno_col)) {
anno_col = list(anno_col)
names(anno_col) = colnames(anno)
} else if(is.null(names(anno_col))) {
if(length(anno_col) == ncol(anno)) {
names(anno_col) = colnames(anno)
} else {
anno_col = lapply(anno, ComplexHeatmap:::default_col)
}
}
for(nm in names(anno)) {
if(is.null(anno_col[[nm]])) {
anno_col[[nm]] = ComplexHeatmap:::default_col(anno[[nm]])
}
}
}
if(is.null(anno)) {
anno_col = NULL
}
if(!is.null(anno_col)) {
anno_col = lapply(anno_col, function(x) {
if(is.atomic(x)) {
x = x[order(names(x))]
}
x
})
}
if(verbose) qqcat("@{prefix}* run @{top_value_method}:@{partition_method} on a @{nrow(data)}x@{ncol(data)} matrix.\n")
k = sort(k)
l = k <= ncol(data)
if(sum(l) != length(k)) {
qqcat("@{prefix}* Following k (@{paste(k[!l], collapse=', ')}) are removed.\n")
}
k = k[l]
if(length(k) == 0) {
stop_wrap("There is no valid k.\n")
}
partition_fun = get_partition_method(partition_method, partition_param)
get_top_value_fun = get_top_value_method(top_value_method)
if(identical(get_top_value_method(top_value_method), ATC)) {
config_ATC(cores = cores)
if(verbose && cores > 1) qqcat("@{prefix}* set @{cores} cores for ATC()\n")
}
l = abs(rowSds(data)) < 1e-6
if(any(l)) {
if(verbose) qqcat("@{prefix}* remove @{sum(l)} rows with zero standard deviations.\n")
data = data[!l, , drop = FALSE]
.env$row_index = .env$row_index[!l]
}
# also since one top value metric will be used for different partition methods,
# we cache the top values for repetitive use
# they are only used when column_index and row_index are not changed
if(is.null(.env$all_top_value_list)) {
if(verbose) qqcat("@{prefix}* calculating @{top_value_method} values.\n")
all_top_value = get_top_value_fun(data)
all_top_value[is.na(all_top_value)] = -Inf
.env$all_top_value_list = list()
.env$all_top_value_list[[top_value_method]] = all_top_value
} else if(is.null(.env$all_top_value_list[[top_value_method]])) {
if(verbose) qqcat("@{prefix}* calculating @{top_value_method} values.\n")
all_top_value = get_top_value_fun(data)
all_top_value[is.na(all_top_value)] = -Inf
.env$all_top_value_list[[top_value_method]] = all_top_value
} else {
if(verbose) qqcat("@{prefix}* @{top_value_method} values have already been calculated. Get from cache.\n")
all_top_value = .env$all_top_value_list[[top_value_method]]
}
if(is.null(top_n)) {
top_n = find_top_n(all_top_value)
if(is.na(top_n)) top_n = round(length(all_top_value)*0.1)
top_n = min(top_n, round(length(all_top_value)*0.1))
}
top_n = round(top_n)
l = top_n <= nrow(data)
if(sum(l) != length(top_n)) {
qqcat("@{prefix}* Following top_n (@{paste(top_n[!l], collapse = ', ')}) are removed.\n")
}
top_n = top_n[l]
if(length(top_n) == 0) {
# stop_wrap("There is no valid top_n.\n")
top_n = nrow(data)
}
if(is.null(scale_rows)) {
scale_rows = TRUE
}
if(scale_rows) {
scale_method = attr(partition_fun, "scale_method")
if("z-score" %in% scale_method) {
if(verbose) qqcat("@{prefix}* rows are scaled before sent to partition, method: 'z-score' (x - mean)/sd\n")
# data = t(scale(t(data)))
row_mean = rowMeans(data)
row_sd = rowSds(data)
data = (data - row_mean)/row_sd
} else if("min-max" %in% scale_method) {
if(verbose) qqcat("@{prefix}* rows are scaled before sent to partition, method: 'min-max' (x - min)/(max - min)\n")
row_min = rowMins(data)
row_max = rowMaxs(data)
row_range = row_max - row_min
data = (data - row_min)/row_range
} else {
scale_rows = FALSE
}
attr(scale_rows, "scale_method") = scale_method
}
# now we do repetitive clustering
param = data.frame(top_n = numeric(0), k = numeric(0), n_row = numeric(0))
partition_list = list()
n_cores = get_nc(cores)
sample_by = match.arg(sample_by)[1]
for(i in seq_len(length(top_n))) {
ind = order(all_top_value, decreasing = TRUE)[ 1:top_n[i] ]
if(length(ind) > 5000) {
ind = sample(ind, 5000)
if(verbose) qqcat("@{prefix}* get top @{top_n[i]} (randomly sampled 5000) rows by @{top_value_method} method\n")
} else {
if(verbose) qqcat("@{prefix}* get top @{top_n[i]} rows by @{top_value_method} method\n")
}
if(verbose && n_cores > 1) qqcat("@{prefix} - @{partition_method} repeated for @{partition_repeat} times by @{sample_by}-sampling (p = @{p_sampling}) from top @{top_n[i]} rows (@{n_cores} cores).\n")
registerDoParallel(cores)
lt <- foreach(j = seq_len(partition_repeat)) %dorng% {
param = data.frame(top_n = numeric(0), k = numeric(0), n_row = numeric(0))
partition_list = list()
if(sample_by == "row") {
ind_sub = sample(ind, round(p_sampling*length(ind)))
mat = data[ind_sub, , drop = FALSE]
column_ind_sub = seq_len(ncol(data))
} else if(sample_by == "column") {
column_ind_sub = sample(ncol(data), round(p_sampling*ncol(data)))
mat = data[ind, , drop = FALSE]
}
for(y in k) {
if(interactive() && verbose && n_cores == 1) {
msg = qq("@{prefix} [k = @{y}] @{partition_method} repeated for @{j}@{ifelse(j %% 10 == 1, 'st', ifelse(j %% 10 == 2, 'nd', ifelse(j %% 10 == 3, 'rd', 'th')))} @{sample_by}-sampling (p = @{p_sampling}) from top @{top_n[i]} rows.")
cat(strrep("\r", nchar(msg)))
cat(msg)
}
partition_list = c(partition_list, list(list(partition_fun(mat, y, column_ind_sub))))
param = rbind(param, data.frame(top_n = top_n[i], k = y, n_row = nrow(mat), n_col = ncol(mat), stringsAsFactors = FALSE))
}
return(list(param = param, partition_list = partition_list))
}
stopImplicitCluster()
for(i in seq_along(lt)) {
param = rbind(param, lt[[i]]$param)
partition_list = c(partition_list, lt[[i]]$partition_list)
}
if(interactive() && verbose && n_cores == 1) cat("\n")
}
construct_consensus_object = function(param, partition_list, k, prefix = " - ", verbose = TRUE) {
partition_list = do.call("c", partition_list)
partition_list = cl_ensemble(list = partition_list)
if(verbose) qqcat("@{prefix}merge @{length(partition_list)} (@{partition_repeat}x@{length(top_n)}) partitions into a single ensemble object.\n")
if(sample_by == "row") {
partition_consensus = cl_consensus(partition_list)
} else {
partition_consensus = cl_consensus2(partition_list, k)
}
# note: number of class_ids may be less than k
class_ids = as.vector(cl_class_ids(partition_consensus))
# adjust the class labels according to the tightness of each subgroup
if(verbose) qqcat("@{prefix}adjust the class labels according to the mean intra-group distance.\n")
ri = order(all_top_value, decreasing = TRUE)[1:max(param[, "top_n"])]
if(length(ri) > 5000) ri = sample(ri, 5000)
mean_dist = tapply(seq_len(ncol(data)), class_ids, function(ind) {
n = length(ind)
if(n == 1) {
return(Inf)
}
sum(dist(t(data[ri, ind, drop = FALSE]))^2)/(n*(n-1)/2)
})
if(length(mean_dist) < k) {
mean_dist_foo = structure(rep(Inf, k - length(mean_dist)), names = setdiff(seq_len(k), class_ids))
mean_dist = c(mean_dist, mean_dist_foo)
}
map = structure(names = names(mean_dist)[order(mean_dist)], names(mean_dist))
unclassfied = setdiff(1:k, map)
if(length(unclassfied)) {
map = c(map, structure(unclassfied, names = unclassfied))
}
class_ids = as.numeric(map[as.character(class_ids)])
if(verbose) qqcat("@{prefix}calculate global membership from all partitions.\n")
membership_mat = cl_membership(partition_consensus)
class(membership_mat) = "matrix"
if(ncol(membership_mat) < k) {
membership_mat = cbind(membership_mat, matrix(0, nrow = nrow(membership_mat), ncol = k - ncol(membership_mat)))
}
map2 = structure(names(map), names = map)
membership_mat = membership_mat[, as.numeric(map2[as.character(1:k)])]
colnames(membership_mat) = paste0("p", 1:ncol(membership_mat))
rownames(membership_mat) = colnames(data)
attr(membership_mat, "n_of_classes") = NULL
attr(membership_mat, "is_cl_hard_partition") = NULL
if(verbose) qqcat("@{prefix}adjust class labels for every single partition.\n")
class_ids_by_top_n = tapply(seq_along(partition_list), param$top_n, function(ind) {
if(sample_by == "row") {
partition_consensus = cl_consensus(cl_ensemble(list = partition_list[ind]))
} else {
partition_consensus = cl_consensus2(cl_ensemble(list = partition_list[ind]), k)
}
ci = as.vector(cl_class_ids(partition_consensus))
map = relabel_class(ci, class_ids, full_set = 1:k)
as.numeric(map[as.character(ci)])
})
# adjust class labels in each membership matrix to fit to the consensus class labels
membership_each = do.call("cbind", lapply(seq_along(partition_list), function(i) {
x = partition_list[[i]]
class_ids = class_ids_by_top_n[[as.character(param$top_n[i])]]
class = as.vector(cl_class_ids(x))
map = relabel_class(class, class_ids, full_set = 1:k)
class = as.numeric(map[as.character(class)])
as.integer(class)
}))
rownames(membership_each) = rownames(membership_mat)
if(verbose) qqcat("@{prefix}calculate consensus matrix.\n")
# consensus_mat = matrix(1, nrow = nrow(membership_mat), ncol = nrow(membership_mat))
# for(i in seq_len(nrow(membership_each)-1)) {
# for(j in (i+1):nrow(membership_each)) {
# consensus_mat[i, j] = sum(membership_each[i, ] == membership_each[j, ] + 0)/ncol(membership_each)
# consensus_mat[j, i] = consensus_mat[i, j]
# }
# }
membership_each2 = membership_each
membership_each2[is.na(membership_each2)] = as.integer(0)
consensus_mat = get_consensus_matrix(membership_each2)
rownames(consensus_mat) = rownames(membership_mat)
colnames(consensus_mat) = rownames(membership_mat)
class_df = data.frame(
class = class_ids,
entropy = apply(membership_mat, 1, entropy),
stringsAsFactors = FALSE
)
rownames(class_df) = colnames(data)
if(length(unique(class_ids)) == 1) {
class_df$silhouette = rep(0, length(class_ids))
} else {
class_df$silhouette = silhouette(class_ids, dist(t(consensus_mat)))[, "sil_width"]
}
if(verbose) qqcat("@{prefix}calculate statistics for the consensus partition.\n")
if(length(class_df$silhouette)*0.05 > 1) {
l = class_df$silhouette >= quantile(class_df$silhouette, 0.05)
} else {
l = rep(TRUE, length(class_df$silhouette))
}
stat = list(
ecdf = ecdf(consensus_mat[lower.tri(consensus_mat)]),
"1-PAC" = 1 - PAC(consensus_mat[l, l, drop = FALSE]),
mean_silhouette = mean(class_df$silhouette),
concordance = concordance(membership_each, class_ids),
cophcor = cophcor(consensus_mat),
aPAC = aPAC(consensus_mat),
FCC = FCC(consensus_mat[l, l, drop = FALSE])
)
# loc = cmdscale(dist(t(data)), k = 2)
# stat$mean_group_dist_2PC = mean_group_dist(loc, class_ids)
# loc = cmdscale(dist(t(data)), k = 3)
# stat$mean_group_dist_3PC = mean_group_dist(loc, class_ids)
return(list(
class_df = class_df,
membership = membership_mat,
consensus = consensus_mat,
param = param,
membership_each = membership_each,
stat = stat
))
}
object_list = lapply(k, function(y) {
l = param$k == y
top_n_level = unique(param[l, "top_n"])
if(verbose) qqcat("@{prefix}* wrap results for k = @{y}\n")
construct_consensus_object(param[l, ], partition_list[l], y, verbose = FALSE)
})
names(object_list) = as.character(k)
rm(partition_list)
gc(verbose = FALSE, reset = TRUE)
## adjust class labels for different k should also be adjusted
if(verbose) qqcat("@{prefix}* adjust class labels between different k.\n")
reference_class = object_list[[1]]$class_df$class
for(i in seq_along(k)[-1]) {
class_df = object_list[[i]]$class_df
class = class_df[, "class"]
map = relabel_class(class, reference_class, full_set = 1:(k[i]))
l = duplicated(map) | duplicated(map, fromLast = TRUE)
l = l & names(map) != names(which.max(table(class)[names(map)[l]]))
unmapped = setdiff(names(map), map)
if(any(l)) {
map[l] = unmapped[1]
}
map2 = structure(names(map), names = map)
object_list[[i]]$class_df$class = as.numeric(map[as.character(class)])
# the class label for the global membership matrix needs to be adjusted
object_list[[i]]$membership = object_list[[i]]$membership[, as.numeric(map2[as.character(1:k[i])]) ]
colnames(object_list[[i]]$membership) = paste0("p", 1:k[i])
# the class label for the individual membership needs to be adjusted
odim = dim(object_list[[i]]$membership_each)
object_list[[i]]$membership_each = as.numeric(map[as.character(object_list[[i]]$membership_each)])
dim(object_list[[i]]$membership_each) = odim
reference_class = object_list[[i]]$class_df$class
}
# an additional metric for determine "best k"
ak = sapply(object_list, function(obj) {
f = obj$stat$ecdf
x = seq(0, 1, length = 1000)
n = length(x)
sum((x[2:n] - x[1:(n-1)])*f(x[2:n]))
})
names(ak) = NULL
delta_k = ak
for(i in seq_along(k)[-1]) {
delta_k[i] = (ak[i] - ak[i-1])/ak[i-1]
}
for(i in seq_along(object_list)) {
object_list[[i]]$stat$area_increased = delta_k[i]
}
n_sample = ncol(data)
# for(method in c("Rand", "cRand", "NMI", "KP", "FM", "Jaccard", "purity", "PS")) {
for(method in c("Rand", "Jaccard")) {
for(i in seq_along(k)) {
if(i == 1) {
cl1 = rep(1, n_sample)
} else {
cl1 = object_list[[i - 1]]$class_df$class
}
cl2 = object_list[[i]]$class_df$class
object_list[[i]]$stat[[method]] = cl_agreement(as.cl_hard_partition(cl1), as.cl_hard_partition(cl2), method = method)[[1]]
}
}
# process the annotations so it can be shared in `run_all_consensus_partition_methods()` and `hierarchical_partitions()`
res = ConsensusPartition(object_list = object_list, k = k, n_partition = partition_repeat * length(top_n) * length(k),
partition_method = partition_method, top_value_method = top_value_method, top_n = top_n, top_value_list = all_top_value,
anno = anno, anno_col = anno_col, scale_rows = scale_rows, sample_by = sample_by,
column_index = .env$column_index, row_index = .env$row_index, .env = .env)
.env$column_index = NULL
return(res)
}
# == title
# Print the ConsensusPartition object
#
# == param
# -object A `ConsensusPartition-class` object.
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
setMethod(f = "show",
signature = "ConsensusPartition",
definition = function(object) {
# fix older version where there was no sample_by slot
error = try(object@sample_by, silent = TRUE)
if(inherits(error, "try-error")) {
object@sample_by = "row"
}
qqcat("A 'ConsensusPartition' object with k = @{paste(object@k, collapse = ', ')}.\n")
qqcat(" On a matrix with @{length(object@row_index)} rows and @{length(object@column_index)} columns.\n")
top_n_str = object@top_n
qqcat(" Top rows (@{paste(top_n_str, collapse = ', ')}) are extracted by '@{object@top_value_method}' method.\n")
qqcat(" Subgroups are detected by '@{object@partition_method}' method.\n")
qqcat(" Performed in total @{object@n_partition} partitions by @{object@sample_by} resampling.\n")
best_k = suggest_best_k(object, help = FALSE)
if(is.na(best_k)) {
qqcat(" There is no best k.\n")
} else {
qqcat(" Best k for subgroups seems to be @{best_k}.\n")
}
qqcat("\n")
qqcat("Following methods can be applied to this 'ConsensusPartition' object:\n")
txt = showMethods(classes = "ConsensusPartition", where = topenv(), printTo = FALSE)
txt = grep("Function", txt, value = TRUE)
fname = gsub("Function: (.*?) \\(package.*$", "\\1", txt)
print(fname)
})
# == title
# Plot the empirical cumulative distribution (eCDF) curve of the consensus matrix
#
# == param
# -object A `ConsensusPartition-class` object.
# -... Other arguments.
#
# == details
# It plots eCDF curve for each k.
#
# This function is mainly used in `collect_plots` and `select_partition_number` functions.
#
# == value
# No value is returned.
#
# == seealso
# See `stats::ecdf` for a detailed explanation of the empirical cumulative distribution function.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola)
# plot_ecdf(golub_cola["ATC", "skmeans"])
setMethod(f = "plot_ecdf",
signature = "ConsensusPartition",
definition = function(object, ...) {
omar = par("mar")
par(mar = c(4.1, 4.1, 1, 1))
on.exit(par(mar = omar))
plot(NULL, xlim = c(0, 1), ylim = c(0, 1), xlab = "consensus value (x)", ylab = "P(X <= x)")
for(i in seq_along(object@k)) {
consensus_mat = get_consensus(object, k = object@k[i])
f = ecdf(consensus_mat[lower.tri(consensus_mat)])
x = seq(0, 1, length = 100)
y = f(x)
x = c(0, x)
y = c(0, y)
lines(x, y, col = i)
}
legend("bottomright", pch = 15, legend = paste0("k = ", object@k), col = seq_along(object@k))
})
# == title
# Several plots for determining the optimized number of subgroups
#
# == param
# -object A `ConsensusPartition-class` object.
# -mark_best Whether mark the best k in the plot.
# -all_stats Whether to show all statistics that were calculated. Used internally.
#
# == details
# There are following plots made:
#
# - eCDF of the consensus matrix under each k, made by `plot_ecdf,ConsensusPartition-method`,
# - `PAC` score,
# - mean sihouette score,
# - the `concordance` for each partition to the consensus partition,
# - area increase of the area under the ECDF of consensus matrix with increasing k,
# - Rand index for current k compared to k - 1,
# - Jaccard coefficient for current k compared to k - 1,
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola)
# select_partition_number(golub_cola["ATC", "skmeans"])
setMethod(f = "select_partition_number",
signature = "ConsensusPartition",
definition = function(object, mark_best = TRUE, all_stats = FALSE) {
op = par(no.readonly = TRUE)
m = get_stats(object, all_stats = all_stats)
if(all_stats) STAT_USED = STAT_ALL
m = m[, colnames(m) %in% c(STAT_USED, "area_increased", "Rand", "Jaccard"), drop = FALSE]
nm = colnames(m)
n = ncol(m)
nr = floor(sqrt(n)+1)
par(mfrow = c(nr, ceiling((n+1)/nr)), mar = c(4, 4, 1, 1))
plot_ecdf(object, lwd = 1)
best_k = suggest_best_k(object, help = FALSE)
if(is.na(best_k)) best_k = -1
for(i in seq_len(ncol(m))) {
if(mark_best) {
l = object@k == best_k
} else {
l = rep(FALSE, length(object@k))
}
plot(object@k, m[, i], type = "b", xlab = "k", ylab = nm[i])
if(any(l)) {
points(object@k, m[, i],
pch = ifelse(l, 16, 1),
col = ifelse(l, "red", "black"))
}
}
par(xpd = NA, mar = c(4, 2, 1, 1))
plot(c(0, 1), c(0, 1), type = "n", axes = FALSE, ann = FALSE)
# if(max(object@k) <= 6) {
# text(x = 0, y = 1,
# "Suggested rules:
# Jaccard index < 0.95;
# If 1-PAC >= 0.90,
# take the maximum k;
# else take the k with higest votes of
# 1. max 1-PAC,
# 2. max mean silhouette,
# 3. max concordance.
# ", cex = 1.2, adj = c(0, 1))
# } else {
# text(x = 0, y = 1,
# "Suggested rules:
# Jaccard index < 0.95;
# take the k with higest votes of
# 1. max 1-PAC,
# 2. max mean silhouette,
# 3. max concordance.
# ", cex = 1.2, adj = c(0, 1))
# }
if(mark_best) legend("topleft", pch = 16, col = "Red", legend = "best k", cex = 1.5)
par(op)
})
# == title
# Heatmap of the consensus matrix
#
# == param
# -object A `ConsensusPartition-class` object.
# -k Number of subgroups.
# -internal Used internally.
# -anno A data frame of annotations for the original matrix columns.
# By default it uses the annotations specified in `consensus_partition` or `run_all_consensus_partition_methods`.
# -anno_col A list of colors (color is defined as a named vector) for the annotations. If ``anno`` is a data frame,
# ``anno_col`` should be a named list where names correspond to the column names in ``anno``.
# -show_row_names Whether plot row names on the consensus heatmap (which are the column names in the original matrix)
# -show_column_names Whether show column names.
# -row_names_gp Graphics parameters for row names.
# -simplify Internally used.
# -... other arguments.
#
# == details
# For row i and column j in the consensus matrix, the value of corresponding x_ij
# is the probability of sample i and sample j being in a same group from all partitions.
#
# There are following heatmaps from left to right:
#
# - probability of the sample to stay in the corresponding group
# - silhouette scores which measure the distance of an item to the second closest subgroups.
# - predicted subgroups
# - consensus matrix.
# - more annotations if provided as ``anno``
#
# One thing that is very important to note is that since we already know the consensus subgroups from consensus
# partition, in the heatmap, only rows or columns within the group is clustered.
#
# == value
# No value is returned.
#
# == seealso
# `membership_heatmap,ConsensusPartition-method`
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola)
# consensus_heatmap(golub_cola["ATC", "skmeans"], k = 3)
setMethod(f = "consensus_heatmap",
signature = "ConsensusPartition",
definition = function(object, k, internal = FALSE,
anno = object@anno, anno_col = get_anno_col(object),
show_row_names = FALSE, show_column_names = FALSE, row_names_gp = gpar(fontsize = 8),
simplify = FALSE, ...) {
if(missing(k)) stop_wrap("k needs to be provided.")
if(inherits(object, "DownSamplingConsensusPartition")) {
class_df = get_classes(object, k, reduce = TRUE)
} else {
class_df = get_classes(object, k)
}
class_ids = class_df$class
consensus_mat = get_consensus(object, k)
mat_col_od = column_order_by_group(factor(class_ids, levels = sort(unique(class_ids))), consensus_mat)
membership_mat = get_membership(object, k)
if(simplify) {
ht_list = NULL
} else {
ht_list = Heatmap(membership_mat, name = "Prob", cluster_columns = FALSE, show_row_names = FALSE,
width = unit(5, "mm")*k, col = colorRamp2(c(0, 1), c("white", "red")),
show_column_names = !internal) +
Heatmap(class_df$silhouette, name = "Silhouette", width = unit(5, "mm"),
show_row_names = FALSE, col = colorRamp2(c(0, 1), c("white", "purple")),
show_column_names = !internal)
}
ht_list = ht_list + Heatmap(class_ids, name = "Class", col = cola_opt$color_set_2,
show_row_names = FALSE, width = unit(5, "mm"),
show_column_names = !internal)
ht_list = ht_list + Heatmap(consensus_mat, name = "Consensus", show_row_names = FALSE, show_row_dend = FALSE,
col = colorRamp2(c(0, 1), c("white", "blue")), row_order = mat_col_od, column_order = mat_col_od,
cluster_rows = FALSE, cluster_columns = FALSE, show_column_names = show_column_names, ...)
if(!is.null(anno)) {
if(is.atomic(anno)) {
anno_nm = deparse(substitute(anno))
anno = data.frame(anno)
colnames(anno) = anno_nm
if(!is.null(anno_col)) {
anno_col = list(anno_col)
names(anno_col) = anno_nm
}
} else if(ncol(anno) == 1) {
if(!is.null(anno_col)) {
if(is.atomic(anno_col)) {
anno_col = list(anno_col)
names(anno_col) = colnames(anno)
}
}
}
if(is.null(anno_col))
ht_list = ht_list + rowAnnotation(df = anno, show_annotation_name = !internal)
else {
ht_list = ht_list + rowAnnotation(df = anno, col = anno_col, show_annotation_name = !internal)
}
}
if(show_row_names && !is.null(rownames(consensus_mat))) {
ht_list = ht_list + rowAnnotation(rn = anno_text(rownames(consensus_mat), gp = row_names_gp))
}
if(internal) {
column_title = NULL
} else {
column_title = qq("consensus @{object@partition_method} with @{k} groups from @{object@n_partition/length(object@k)} partitions")
}
draw(ht_list, main_heatmap = "Consensus", column_title = column_title,
show_heatmap_legend = !internal, show_annotation_legend = !internal)
})
# == title
# Heatmap of membership in each partition
#
# == param
# -object A `ConsensusPartition-class` object.
# -k Number of subgroups.
# -internal Used internally.
# -anno A data frame of annotations for the original matrix columns.
# By default it uses the annotations specified in `consensus_partition` or `run_all_consensus_partition_methods`.
# -anno_col A list of colors (color is defined as a named vector) for the annotations. If ``anno`` is a data frame,
# ``anno_col`` should be a named list where names correspond to the column names in ``anno``.
# -show_column_names Whether show column names in the heatmap (which is the column name in the original matrix).
# -column_names_gp Graphics parameters for column names.
# -... Other arguments.
#
# == details
# Each row in the heatmap is the membership in one single partition.
#
# Heatmap is split on rows by ``top_n``.
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# data(golub_cola)
# membership_heatmap(golub_cola["ATC", "skmeans"], k = 3)
setMethod(f = "membership_heatmap",
signature = "ConsensusPartition",
definition = function(object, k, internal = FALSE,
anno = object@anno, anno_col = get_anno_col(object),
show_column_names = FALSE, column_names_gp = gpar(fontsize = 8), ...) {
if(missing(k)) stop_wrap("k needs to be provided.")
if(inherits(object, "DownSamplingConsensusPartition")) {
class_df = get_classes(object, k, reduce = TRUE)
} else {
class_df = get_classes(object, k)
}
class_ids = class_df$class
membership_mat = get_membership(object, k)
col_fun = colorRamp2(c(0, 1), c("white", "red"))
membership_each = get_membership(object, k, each = TRUE)
membership_each = t(membership_each)
mat_col_od = column_order_by_group(factor(class_ids, levels = sort(unique(class_ids))), membership_each)
col = cola_opt$color_set_1[1:k]
if(is.null(anno)) {
bottom_anno = NULL
} else {
if(is.atomic(anno)) {
anno_nm = deparse(substitute(anno))
anno = data.frame(anno)
colnames(anno) = anno_nm
if(!is.null(anno_col)) {
anno_col = list(anno_col)
names(anno_col) = anno_nm
}
} else if(ncol(anno) == 1) {
if(!is.null(anno_col)) {
if(is.atomic(anno_col)) {
anno_col = list(anno_col)
names(anno_col) = colnames(anno)
}
}
}
if(is.null(anno_col)) {
bottom_anno = HeatmapAnnotation(df = anno,
show_annotation_name = !internal, annotation_name_side = "right")
} else {
bottom_anno = HeatmapAnnotation(df = anno, col = anno_col,
show_annotation_name = !internal, annotation_name_side = "right")
}
}
param = get_param(object, k, unique = FALSE)
top_n_level = unique(param$top_n)
suppressWarnings(n_row_col <- structure(brewer.pal(length(top_n_level), "Accent"), names = top_n_level))
ht = Heatmap(as.character(param$top_n), name = "top_n", col = n_row_col,
width = unit(10, "mm"), show_row_names = FALSE, show_heatmap_legend = FALSE,
show_column_names = FALSE) +
Heatmap(membership_each, name = "Class", show_row_dend = FALSE, show_column_dend = FALSE, col = cola_opt$color_set_2,
column_title = ifelse(internal, "", qq("membership heatmap, k = @{k}")),
column_order = mat_col_od, cluster_columns = FALSE,
row_split = factor(param$top_n, levels = sort(unique(param$top_n))),
cluster_row_slices = FALSE,
top_annotation = HeatmapAnnotation(Prob = membership_mat,
Class = class_ids, col = c(list(Class = cola_opt$color_set_2), Prob = col_fun),
show_annotation_name = !internal),
bottom_annotation = bottom_anno,
show_column_names = show_column_names, column_names_gp = column_names_gp,
row_title = NULL
)
if(internal) {
row_title = NULL
} else {
row_title = qq("@{round(object@n_partition/length(object@k)/length(top_n_level))} x @{length(top_n_level)} random samplings")
}
draw(ht, main_heatmap = "Class", row_title = row_title,
show_heatmap_legend = FALSE, show_annotation_legend = !internal)
param2 = get_param(object, k)
if(!internal) {
for(i in seq_along(top_n_level)) {
decorate_heatmap_body("top_n", slice = i, {
grid.text(qq("top @{top_n_level[i]} rows"), rot = 90, gp = gpar(fontsize = 10))
})
}
}
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.