## Getter functions ============================================================
## get functions for components of seqArchR's result object
## These keep such accesses in other functions agnostic to how they are
## maintained in the seqArchR result object
#' @title Get functions for seqArchR result object
#'
#' @description Basis vectors' information for the selected iteration.
#'
#' @param res seqArchR result object.
#' @param iter Choose the iteration of seqArchR result to get from.
#'
#' @return get_clBasVec A list with two elements `nBasisVectors` (integer) and
#' `basisVectors` (matrix).
#'
#' @examples
#' res <- readRDS(system.file("extdata", "example_seqArchRresult.rds",
#' package = "seqArchR", mustWork = TRUE))
#'
#' k <- get_clBasVec_k(res=res, iter=2)
#'
#' bMat <- get_clBasVec_m(res=res, iter=2)
#'
#' ## cluster labels of sequences from final clustering
#' scLab <- get_seqClLab(res=res, iter=2)
#'
#' @export
get_clBasVec <- function(res, iter){
return(res$clustBasisVectors[[iter]])
}
## =============================================================================
#' @describeIn get_clBasVec Get the number of basis vectors (clusters)
#' at the selected iteration.
#'
#' @return get_clBasVec_k The number of basis vectors (integer).
#'
#' @export
get_clBasVec_k <- function(res, iter){
return(res$clustBasisVectors[[iter]]$nBasisVectors)
}
## =============================================================================
#' @describeIn get_clBasVec The basis vectors matrix at the selected
#' iteration. Note that eatures along rows.
#'
#' @return get_clBasVec_m The basis vectors' matrix with features along the
#' rows of the matrix.
#'
#' @export
get_clBasVec_m <- function(res, iter){
return(res$clustBasisVectors[[iter]]$basisVectors)
}
## =============================================================================
#' @describeIn get_clBasVec Get the cluster IDs for each sequence. Note that
#' order of sequences here is as per the input.
#'
#' @return get_seqClLab A character vector denoting the cluster IDs for
#' each sequence.
#'
#' @seealso \code{\link{seqs_str}}
#' @export
get_seqClLab <- function(res, iter = NULL){
if(is.null(iter)){
return(res$clustSol$seqsClustLabels)
}else{
return(res$seqsClustLabels[[iter]])
}
}
## =============================================================================
#' @title Get sequences from the seqArchR result object
#' @description Wrapper to fetch sequences from the seqArchR result object as
#' character
#'
#' @param res seqArchR result object
#'
#' @param iter Specify the iteration of seqArchR result. If set to NULL
#' (the default), the original set of sequences (`seqArchRresult$rawSeqs`) is
#' returned.
#'
#' @param cl Specify the cluster number. Sequences belonging to this cluster in
#' iteration `iter` of seqArchR result are returned as character. When `iter` is
#' NULL, this is treated as denoting the cluster number in seqArchR's final
#' clustering solution (`seqArchRresult$clustSol$clusters`).
#'
#' @param ord Specify TRUE if sequences are ordered by clusters. The original
#' ordering of the sequences can be fetched by setting `iter` to NULL and `ord`
#' to FALSE.
#'
#' @details Setting iter to NULL will fetch sequences as per the final
#' clustering solution of seqArchR (`clustSol$clusters`). When `iter` is not
#' NULL, use `cl` to further choose a particular cluster. When `cl` is NULL,
#' the set of sequences returned can be ordered by clusters with `ord = TRUE`.
#' Using `ord = FALSE` fetches the sequences by their original order.
#'
#' @return The selected DNA sequences from the DNAStringSet object as a
#' character vector.
#'
#' @export
#'
#' @examples
#'
#' res <- system.file("extdata", "example_seqArchRresult.rds",
#' package = "seqArchR", mustWork = TRUE)
#'
#' # Fetch sequences from 2nd cluster of seqArchR's final solution
#' ans <- seqArchR::seqs_str(readRDS(res), iter=NULL, cl=2)
#'
#' # Fetch all sequences ordered by the final clustering
#' ans <- seqArchR::seqs_str(readRDS(res), iter=NULL, cl=NULL, ord=TRUE)
#'
#' # Fetch sequences belonging to first cluster in seqArchR's first iteration
#' ans <- seqArchR::seqs_str(readRDS(res), iter=1, cl=1)
#'
#'
seqs_str <- function(res, iter = NULL, cl = NULL, ord = FALSE){
## when iter is NULL, return sequences belonging to a cluster (specified
## by cl) in the final solution
if(is.null(iter) && is.null(cl) && !ord){
return(as.character(res$rawSeqs))
}
if(is.null(iter) && is.null(cl) && ord){
use_ord <- unlist(res$clustSol$clusters)
return(as.character(res$rawSeqs[use_ord]))
}
if(is.null(iter) && !is.null(cl)){
cl_mem <- res$clustSol$clusters[[cl]]
return(as.character(res$rawSeqs[cl_mem]))
}
if(!is.null(iter) && !is.null(cl)){
clust_list <- get_seqs_clust_list(get_seqClLab(res, iter))
cl_mem <- clust_list[[cl]]
return(as.character(res$rawSeqs[cl_mem]))
}
if(!is.null(iter) && is.null(cl) && ord){
clust_list <- get_seqs_clust_list(get_seqClLab(res, iter))
use_ord <- unlist(clust_list)
return(as.character(res$rawSeqs[use_ord]))
}
}
## =============================================================================
# @title Assign samples to clusters
#
# @param samplesMatrix
#
# @return A list that can be assigned as an element in globClustAssignments
#
.get_cluster_memberships_per_run <- function(samplesMatrix, iChunksColl = NA,
iChunkIdx = NA, test_itr = NA,
oChunkIdx = NA) {
.assert_seqArchR_featSampMatrix(samplesMatrix, feat = FALSE)
nClusters <- nrow(samplesMatrix)
returnClusterAsList <- vector("list", nClusters)
## Fetch the cluster memberships for each sample (along the columns)
clustMemberships <- apply(samplesMatrix, 2, which.max)
##
if (length(unique(clustMemberships)) != nClusters) {
cli::cli_alert_warning(
c("Basis vector that got no sequences assigned: ",
setdiff( unique(clustMemberships), seq_len(nClusters))))
}
return(clustMemberships)
}
## =============================================================================
.adjustSampleMemberships <- function(old_mem, samplesMatrix, has_overfit){
## Assign all overfitted samples back to cluster 1
new_mem <- old_mem
##
unoverfit <- setdiff(unique(old_mem), has_overfit)
toassign <- utils::tail(sort(unoverfit, decreasing = FALSE), 1)
for(i in has_overfit){
## simply assigning 1 is problematic if/when 1 is among the overfit
## and/or the only one overfit. This has earlier resulted in an error.
## Instead assign the lowest cluster ID which is not overfit
new_mem[which(old_mem == i)] <- toassign
}
names(new_mem) <- NULL
## At this point new memberships for samples can contain clust ids
## such 1, 4 and 5 because 2 and 3 are overfit. (i.e. they are not
## consecutive). We need to get them to be consecutive
new_mem_elem <- sort(unique(new_mem))
for(i in seq_along(new_mem_elem)){
new_mem[new_mem == new_mem_elem[i]] <- i
}
##
new_mem_labs <- unique(new_mem)
stopifnot((length(unique(old_mem)) - length(has_overfit))
== length(new_mem_labs))
if(length(new_mem_labs) == 1) stopifnot(unique(new_mem_labs) == 1)
##
new_mem
}
## =============================================================================
## This function detects if, for a given number of factors, overfitting has
## occurred.
##
## Return value: the outlier clust ids or numeric(0) if no outlier
##
## Input args:
## -- samplesMat
## -- clusterMemberships
## -- minSeqs (This can be a different value than that passed via
## seqArchR config)
## --
.detect_overfitting <- function(samplesMat, clustMemberships, minSeqs,
sizeThreshold = 10){
# put factors along columns
samplesMat <- t(samplesMat)
len_clusts <- unlist(lapply(seq_len(ncol(samplesMat)), function(x){
length(which(clustMemberships == x))
}))
out_cl_idx_size <- which(len_clusts <= sizeThreshold)
qual_cl_idx <- which(len_clusts < minSeqs & len_clusts > sizeThreshold)
## Update:
## -- Keep only IQR overfitted outliers when the qualified clusters are
## among the ones in between.
## -- For any in-between outliers, assign them back to the first cluster,
## instead ofthe next maximum loading cluster
## -- If the m qualifying clusters are from n-m to n, they can be
## considered for either IQR or range-outlier case
## --
out_cl_idx_other <- NULL
out_cl_idx <- out_cl_idx_size
if(length(qual_cl_idx) > 0){
out_cl_idx_other <- .compare_clusters(samplesMat, clustMemberships,
qual_cl_idx, zscore_thresh = 5)
out_cl_idx <- unique(union(out_cl_idx_other, out_cl_idx_size))
}
## Account for when all available clusters qualify to be checked for
## overfitting cases
if(length(out_cl_idx) == length(len_clusts)){
out_cl_idx <- setdiff(out_cl_idx, 1)
}
if(length(out_cl_idx) > 0) return(out_cl_idx)
##
return(intersect(1,2))
}
## =============================================================================
.compare_clusters <- function(samplesMat, clustMemberships, qual_cl_idx,
zscore_thresh = 5){
clustwise_matlist <- lapply(seq_len(ncol(samplesMat)), function(x){
temp_mat <- matrix(samplesMat[clustMemberships == x,],
ncol = ncol(samplesMat), byrow = FALSE)
temp_mat
})
# Adjust qualifying IDs based on Update in compare_clusters func
# -- Allow any ID for IQR check
out_clust_iqr <- .compare_iqr_or_range(clustwise_matlist, qual_cl_idx,
zscore_thresh = zscore_thresh, iqr = TRUE)
# out_clust_range <- NULL
out_clust_range <- .compare_iqr_or_range(clustwise_matlist, qual_cl_idx,
zscore_thresh = zscore_thresh, iqr = FALSE)
##
## Those that qualify as outliers by IQR comparison, will also
## qualify by range comparison. But for those which do not qualify
## by IQR but by range, we need further checks for them.
## -- where does their box lie,
## -- near the maximum? Not an outlier
## -- near others? outlier
##
check_these <- setdiff(out_clust_range, out_clust_iqr)
if(length(check_these) > 0){
out_clust_range <- .add_check_range_outliers(check_these,
clustwise_matlist)
}
##
out_clust_size <- NULL
## This is commented and out_clust_size is set NULL because this is
## already handled above. See detect_overfitting/sizeThreshold
##
return(unique(c(out_clust_iqr, out_clust_range, out_clust_size)))
}
## =============================================================================
.add_check_range_outliers <- function(check_these, clustwise_matlist){
## get boxplot.stats
all_stats_max <- lapply(clustwise_matlist, function(x){
tempx <- lapply(seq_len(ncol(x)), function(y){
max(as.vector(x[,y]))
})
unlist(tempx)
})
all_stats_midpoints <- lapply(clustwise_matlist, function(x){
tempx <- lapply(seq_len(ncol(x)), function(y){
tempy <- grDevices::boxplot.stats(as.vector(x[,y]))
sum(tempy$stats[c(2,4)])/2
})
unlist(tempx)
})
#####
all_stats_midpoint_nl <- unlist(all_stats_midpoints)
#####
temp <- vapply(check_these, function(x){
## use midpoints to compare locations of boxes
check_midpoint <- all_stats_midpoints[[x]][x]
distWithMax <- abs(check_midpoint - all_stats_max[[x]][x])
distWithMedianMidpoints <- check_midpoint -
stats::median(all_stats_midpoint_nl)
if(distWithMax > distWithMedianMidpoints){
x
}else{
NA
}
}, FUN.VALUE = numeric(1))
out_cl_idx <- temp[which(!is.na(temp))]
##
return(out_cl_idx)
}
## =============================================================================
## Discard clusters with length smaller than 10 sequences
## This func is called when IQR and range fail to identify any outliers
## In which case if the IQRs are all zeros (or very small), that is sure shot
## overfitting occurrence
.compare_size <- function(clustwise_matlist, qual_cl_idx, threshold = 0.01){
all_iqr <- lapply(clustwise_matlist, function(x){
## get column-wise IQRs
matrixStats::colIQRs(x)
})
## detect if all IQRs for a cluster are 0
if_all_zeros <- unlist(lapply(all_iqr, function(x){
ifelse(all(!(x > threshold)), TRUE, FALSE)
## enter here if all zeros is detected
}))
## Alternatively, simply check the number of samples in this cluster,
## if < 10, discard.
## -- This is already handled in a function hierarchical above this func.
## See detect_overfitting/sizeThreshold
out_cl_idx <- intersect(which(if_all_zeros), qual_cl_idx)
return(out_cl_idx)
}
## =============================================================================
## return indices of outliers by comparing ranges or IQRs
## This func is a joint version of earlier separate funcs for each case
.compare_iqr_or_range <- function(clustwise_matlist, qual_cl_idx,
zscore_thresh = 5, iqr = TRUE){
ncl <- ncol(clustwise_matlist[[1]])
all_range_or_iqr <- lapply(clustwise_matlist, function(x){
## get column-wise ranges
apply(apply(as.matrix(x), MARGIN = 2, range), MARGIN = 2, diff)
})
if(iqr){
all_range_or_iqr <- lapply(clustwise_matlist, function(x){
## get column-wise IQRs
matrixStats::colIQRs(x)
})
}
all_range_or_iqr_vec <- unlist(all_range_or_iqr)
##
all_range_or_iqr_mad <- stats::mad(all_range_or_iqr_vec)
range_or_iqr_zscore <-
(all_range_or_iqr_vec -
stats::median(all_range_or_iqr_vec))/all_range_or_iqr_mad
##
out_idx <- which(range_or_iqr_zscore > zscore_thresh)
if(length(out_idx) > 0){
clust_id <- ceiling(out_idx / ncl)
return(intersect(clust_id, qual_cl_idx))
}else{
# do nothing
}
return(NULL)
}
## =============================================================================
.assign_samples_to_clusters <- function(clusterMembershipsVec, nClusters,
iChunksColl, iChunkIdx) {
##
returnClusterAsList <- lapply(seq_len(nClusters), function(x){
iChunksColl[[iChunkIdx]][clusterMembershipsVec == x]
})
return(returnClusterAsList)
}
## =============================================================================
#' @title Collate sequence IDs from an existing clustering according to a new,
#' given clustering of the existing clusters
#'
#' @description Collate sequences original divided across n clusters into a
#' new set of m clusters. These `m` clusters obtained by clustering the
#' original `n` clusters.
#' Assume a collection of 100 sequences across seven existing
#' clusters. These seven clusters are collated to obtain three resulting
#' clusters. Collating 100 sequences distributed across the seven clusters into
#' the resulting three clusters can be achieved with collate clusters.
#'
#' @param to_clust A list giving clustering of factors. In other words this is
#' the clustering of clusters of sequences given in `orig_clust`
#' @param orig_clust A list of sequence IDs in existing clusters
#'
#' @return A list with sequence IDs collated by the specified clustering
#' @export
#'
#' @examples
#'
#'
#'
#' set.seed(123)
#' n <- 7; nn <- 100
#' orig_clust_labels <- ceiling(n * runif(nn))
#' orig_clust <- seqArchR::get_seqs_clust_list(orig_clust_labels)
#'
#' to_clust <- list(c(1,4), c(2,3,5), c(6,7))
#'
#' collate_clusters(to_clust = to_clust, orig_clust = orig_clust)
#'
collate_clusters <- function(to_clust, orig_clust) {
if (is.null(to_clust)) {
## When/if factor to_clust is not performed, the
## orig_clust variable is directly assigned
return(orig_clust)
}else{
.assert_seqArchR_globClustAssignments(orig_clust)
##
nClusters <- length(to_clust)
clustSizes <- unlist(lapply(to_clust, length))
##
collClAssign <- lapply(seq_len(nClusters), function(x){
temp <- unlist(lapply(to_clust[[x]], function(y){
orig_clust[[y]]
}))
temp
})
##
return(collClAssign)
}
}
## =============================================================================
.msg_print <- function(vec_or_list){
if(is.list(vec_or_list)){
return(paste(vec_or_list, collapse = "\n"))
}
if(is.vector(vec_or_list)){
return(paste(vec_or_list, collapse = ", "))
}
}
## =============================================================================
.msg_pstr <- function(..., flg){
if(flg){
message(...)
}
}
## =============================================================================
## @title Update labels of sequences in a cluster
## @param oldSeqsClustLabels
##
## @param collatedClustAssignments
## @param flags List. Flags variable from seqArchR config
##
## @return newSeqsClustLabels Vector os clustLabels
.update_cluster_labels <- function(oldSeqsClustLabels,
collatedClustAssignments) {
## Important: The collatedClustAssignments variable is a list where each
## element holds the indices of sequences falling in the same cluster,
## like so: For a set of 200 sequences, collatedClustAssignments is
## [[1]]
## [1] 1 2 6 11 20 23 32 46 50 52 63 68 71 72 75
## 80 82 83 85 86 87 88 92 99 102 104 105
##
## [[2]]
## [1] 3 5 7 8 10 12 13 16 17 18 19 22 25 30 31
## 36 37 38 39 41 43 45 47 49 51 53 54
## --
## This is different than the globClustAssignment variable which will hold
## the indices of the factors that are combined into one cluster.
## For instance, when there are 25 factors/clusters which have combined
## to give 5 clusters, globClustAssignments will hold something like this:
## [[1]]
## [1] "3" "13" "11" "16" "6"
##
## [[2]]
## [1] "15" "1" "8" "25" "18"
##
.assert_seqArchR_seqsClustLabels(oldSeqsClustLabels)
.assert_seqArchR_globClustAssignments(collatedClustAssignments)
nClusters <- length(collatedClustAssignments)
## Use numerics w/ as.character and pre-sort to have them in the
## order that will be returned by levels (in get_seqs_clusters_in_list fn)
candidateClustLabels <- sort(as.character(seq_len(nClusters)))
newSeqsClustLabels <- oldSeqsClustLabels
for (i in seq_len(nClusters)) {
needUpdateIdx <- collatedClustAssignments[[i]]
newSeqsClustLabels[needUpdateIdx] <-
vapply(newSeqsClustLabels[needUpdateIdx], function(x) {
paste0(candidateClustLabels[i])}, character(1)
)
}
##
.assert_seqArchR_seqsClustLabels(newSeqsClustLabels)
return(newSeqsClustLabels)
}
## =============================================================================
# @title Prepare chunks out of given set of sequences (sequence IDs)
# @param total_set set of sequences to be chunked
# @param reqdChunkSize the given chunk size
# @param flags Specify the flags from the config param set for seqArchR
#
# @return A list with chunks of sequences (sequence IDs) as its elements
.prepare_chunks <- function(total_set, reqdChunkSize) {
## total_set is the set of seq_ids to be chunked (not array indices)
if (is.null(total_set)) {
stop("Preparing chunks, 'total_set' is NULL")
}
chunkLength <- length(total_set)
if (chunkLength == 0) {
stop("Preparing chunks, length of 'total_set' is 0")
}
.assert_seqArchR_chunkSize_independent(reqdChunkSize)
##
## When chunkLength (i.e., total sequences) < reqdChunkSize, the else
## condition, return the totalSet as the only chunk.
if (chunkLength > reqdChunkSize) {
chunkStarts <- seq(1, chunkLength, by = reqdChunkSize)
chunkEnds <- seq(reqdChunkSize, chunkLength, by = reqdChunkSize)
if (length(chunkStarts) > length(chunkEnds)) {
if ((chunkLength - chunkStarts[length(chunkStarts)]) >
round(0.5*reqdChunkSize)) {
chunkEnds <- append(chunkEnds, chunkLength)
} else {
chunkStarts <- chunkStarts[-length(chunkStarts)]
chunkEnds[length(chunkEnds)] <- chunkLength
}
}
##
preparedChunks <- lapply(seq_along(chunkStarts), function(x){
total_set[seq(chunkStarts[x],chunkEnds[x],by=1)]
})
##
} else {
preparedChunks <- vector("list", 1)
preparedChunks[[1]] <- total_set
}
if (length(preparedChunks) < 1) {
stop("Preparing chunks, length was 0")
}
return(preparedChunks)
}
## =============================================================================
# @title Handle clustering of NMF factors
#
# @param globFactorsMat Specify the NMF factors as a matrix with the individual
# factors along the columns.
#
# @param distMethod Specify the distance measure to be used. Default is cosine
# measure.
#
# @param clustMethod Specify hc. This is the only option available.
#
# @param linkage Specify one of linkage options for hclust.
#
# @param flags Specify the flags from the config param set for seqArchR.
#
# @param returnOrder
#
# @param useCutree By setting this arg to TRUE, the cutree version of
# get_clusters func is used. This func uses silhouette and/or
# Calinski-Harabasz index. See additional details there. Note: currently, this
# is the only avaiable func so useCutree should beset to TRUE always, FALSE
# will not work.
#
# @param minClusters Passed to get_clusters function. See explanation there.
#
# @param parentChunks
#
# @return List with clustered factors
#
# Change on 2020-12-28:
# - change default value for distMethod to 'euclid',
# - change default value for linkage to 'ward.D',
# - remove returnOrder arg. not needed; this was useful during development
# -
# Change on 2021-06-03:
# - Added back returnOrder argument
#
.handle_clustering_of_factors <- function(globFactorsMat,
clustMethod = "hc",
linkage = "average",
distMethod = "cosangle",
flags = list(debugFlag = FALSE,
verboseFlag = FALSE,
plotVerboseFlag = FALSE,
timeFlag = FALSE),
useCutree = TRUE,
minClusters = 2,
parentChunks = NULL,
returnOrder = FALSE,
...){
##
.assert_seqArchR_featSampMatrix(globFactorsMat, feat = TRUE)
.assert_seqArchR_flags(flags)
dbg <- flags$debugFlag
vrbs <- flags$verboseFlag
##
gfDisMat <- .compute_factor_distances(globFactorsMat, distMethod)
if(clustMethod == "hc"){
if(distMethod == "euclid") use_threshold <- 3
if(distMethod == "cor") use_threshold <- 0.75
as_dist_mat <- stats::as.dist(gfDisMat)
temp_hclust <- stats::hclust(as_dist_mat, method = linkage)
ord <- temp_hclust$order
.msg_pstr("Cluster order by ", linkage," linkage w/ ",
distMethod, " distance:", flg=dbg)
.msg_pstr("New order: ", .msg_print(ord), flg=dbg)
if(dbg){
invisible(lapply(seq_len(length(ord)-1), function(k){
.msg_pstr(paste(paste(ord[k], ord[k+1], sep=", "), "=",
gfDisMat[ord[k], ord[k+1]], collapse="\n"),
flg=dbg)
}))
}
.msg_pstr("New order: ", .msg_print(ord), flg=dbg)
if(returnOrder) return(temp_hclust)
.msg_pstr("=== Fetching clusters === ", flg=dbg)
if(useCutree){
clustList <- .get_clusters_from_hc_using_cutree(
hcObj = temp_hclust, distMat = as_dist_mat,
hStep = 0.05, parentChunks = parentChunks,
minClusters = minClusters,
distThreshold = use_threshold,
verbose = flags$debugFlag, ...)
}
if(length(clustList) == 1){
clustList <- lapply(seq_len(max(clustList[[1]])),
function(x){x})
}
.msg_pstr("== DONE ==", "\nClustList: ", .msg_print(clustList), flg=dbg)
return(clustList)
}
}
## =============================================================================
.unfurl_nodeList <- function(nodeList, vec_ver = FALSE, vrbs=FALSE){
##
.check_list_prop(nodeList)
##
if(vec_ver){
element_lengths <- unlist(lapply(nodeList, length))
}else{
element_lengths <- unlist(lapply(nodeList, function(elem){
ifelse(is.list(elem), length(elem), 1)
}))
}
if(any(element_lengths != 1)){
.msg_pstr("Needs unfurling...", flg=vrbs)
new_list <- vector("list", sum(element_lengths))
iter1 <- 0
iter2 <- 1
while(iter2 <= length(nodeList)){
if((vec_ver && length(nodeList[[iter2]]) == 1) ||
(!vec_ver && !is.list(nodeList[[iter2]]))) {
iter1 <- iter1 + 1
new_list[[iter1]] <- nodeList[[iter2]]
}else{
iter1 <- iter1 + 1
stIdx <- iter1
enIdx <- (iter1 + length(nodeList[[iter2]]) - 1)
new_list[stIdx:enIdx] <- nodeList[[iter2]]
iter1 <- enIdx
}
iter2 <- iter2 + 1
} ## while ends
return(new_list)
}else{
.msg_pstr("No unfurling...", flg=vrbs)
return(nodeList)
}
}
## =============================================================================
# @title Order cutree result by hierarchical clustering order
#
# @description Make clusters obtained by cutree to follow the
# hierarchical clustering order
#
# @param cutree_res Result object of cutree
# @param hcorder The ordering from hierarchical clustering that is accessible
# in its result object
#
# @return A list of clusters, including each clusters' elements, ordered by
# their appearance in the hierarchical clustering order
#
# @examples
#
# @export
fetch_cutree_by_hc_order <- function(clust_list, cutree_res = NULL, hcorder){
if(!is.null(cutree_res)){
names(cutree_res) <- NULL
clust_list <- lapply(unique(cutree_res),
function(x){which(cutree_res == x)})
}
## Arrange clusters themselves by the hc_order
starters <- lapply(clust_list, function(x){x[[1]]})
starters_pos <- unlist(lapply(unlist(starters), function(x){
which(x == hcorder)
}))
use_idx <- sort(starters_pos, index.return = TRUE)
clust_list <- lapply(seq_along(use_idx$ix), function(x){
clust_list[[use_idx$ix[x]]]
})
# For each ID in list elements, get their position in the hcorder vector
cl_order_idx <- unlist(lapply(unlist(clust_list), function(x){
which(x == hcorder)
}))
## Arrange each cluster elements by hc_order
cl_lens <- unlist(lapply(clust_list, length))
cl_ends <- cumsum(cl_lens)
cl_starts <- c(1, 1+cl_ends[seq(length(cl_ends)-1)])
stopifnot(tail(cl_ends, 1) == length(hcorder))
## w/o for loop
clust_list_new <- lapply(seq_along(cl_starts), function(x){
adjust_idx <- cl_order_idx[cl_starts[x]:cl_ends[x]]
if(x > 1){
adjust_idx <- cl_order_idx[ cl_starts[x]:cl_ends[x] ] - cl_ends[x-1]
}
clust_list[[x]][order(adjust_idx)]
})
stopifnot(length(unlist(clust_list_new)) == length(hcorder))
stopifnot(length(unique(unlist(clust_list_new))) == length(hcorder))
return_vec <- rep(0, length(hcorder))
return_vec <- lapply(seq_along(clust_list_new), function(x){
return_vec
})
return(clust_list_new)
}
############################## CUTREE VERSION ##################################
## This works best/is best referred with ward.D linkage
# @title Get clusters out of hierarchical clustering object using cutree.
# Used internally by seqArchR.
#
# @description Clusters from a hierarchical clustering object are obtained
# by using cutree at different heights of the tree. The optimum number of
# clusters are decided based on the average silhouette value of the clustering.
#
# @param hcObj The hierarchical clustering object as returned by
# \code{\link[stats]{hclust}}.
# @param distMat The distance matrix that was used by hclust, a
# \code{\link[stats]{dist}} object.
# @param hStep Numeric. The step size used to increment height values for
# cutree. Default value is 0.05.
# @param parentChunks List. Specify the factor numbers in the previous
# iteration of seqArchR that factors in the current iteration resulted from.
# Default value is NULL.
# @param keepSiblingsUncollated Logical. Specify TRUE if all clusters from a
# single parent chunk should not be collated, even when recommended so based
# on silhouette value computation. Default value FALSE. i.e. if they get
# collated, it is left as collated.
# @param enableSwitchSilToCH Logical. When and if the clustering result has
# singletons, using the average silhouette value may not be reliable to decide
# on the optimum number of clusters. In this case, if this argument is TRUE,
# the Calinski-Harabasz index is used instead. Setting it to FALSE, the
# switch is disabled, i.e., average silhouette value is used. Default is FALSE.
# @param minClusters Integer. Specify the minimum number of clusters to be
# fetched from the HAC solution.
# @param distThreshold Numeric. Specify a threshold of units of distance for
# considering any two elements as close enough to be merged in the same
# cluster. The default value is specified for Euclidean distance.
# @param verbose Logical. Specify TRUE for verbose output.
#
# @importFrom stats cutree
# @importFrom fpc calinhara
# @importFrom cluster silhouette
#
## Addtional comments:
## Keeping the min to 0 (as below) leads to as many clusters as
## the number of elements. This errors when at mean[, "sil_width"]
## cut_heights <- seq(0, max(hcObj$height), by = hStep)
##
## Important: Address the question of whether any merging is required?
## -- We could use information from seqArchR iteration
## -- Obtain the cutree-using-h clustering here.
## -- Is it combining consecutive factors, like 1-2-3, 3-4 etc.?
## a. This could mean that it is undoing the clusters
## identified by seqArchR in a previous iteration.
## b. But, if 1 came from chunk 1 and 2-3 came from chunk 2,
## and if the clustering results in combining 1-2, then this
## is a plausible merge and should be left alone.
## c. So, we use a parentChunks variable that notes the parent
## chunk for each of the current factors. This info can be
## used to make this decision unambiguously.
##
## Update: 2020-12-13: We currently let these get combined if
## HAC+cutree deems it fine, and we recommend that the clusters at the
## iteration of seqArchR can be left uncollated for reference.
## The final stage will then hold the best possible set of clusters
## guided by silhouette value. Now handled by logical argument
## keepSiblingsUncollated.
##
## Update: 2020-12-24: With singletons in a clustering, using
## silhouette values could be problematic, because it arbitrarily
## assigns s(i) = 0 for a singleton cluster. This can lead to
## smaller average silhouette value, artificially making that
## clustering look bad. Thus, if and when we detect such a case,
## test using the Calinski-Harabasz (CH) index automatically.
## See argument `enableSwitchSilToCH`.
.get_clusters_from_hc_using_cutree <- function(hcObj, distMat, hStep = 0.05,
parentChunks = NULL,
keepSiblingsUncollated = FALSE,
enableSwitchSilToCH = FALSE,
minClusters = 2,
## number of clusters in a
## previous iteration on
## which collation was performed
distThreshold = 0.75,
## the default value for
## - euclidean distance: 3
## - correlation distance: 0.75
verbose = FALSE){
##
if(minClusters < 2){
warning("minClusters < 2. Setting it to 2")
minClusters <- 2
}
.msg_pstr("minClusters is ", minClusters, flg=verbose)
##
if(min(distMat) > distThreshold){
clust_list <- lapply(hcObj$order, function(x) x)
.msg_pstr("No element pairs close enough by given dist threshold: ",
distThreshold, flg=verbose)
.msg_pstr("#Clusters: ", length(clust_list), flg=verbose)
return(clust_list)
}else{
##
cut_heights <- seq(min(hcObj$height), max(hcObj$height), by = hStep)
clust_list <- .get_clusts_sil_or_ch(cut_heights, hcObj, distMat,
minClusters, use_sil = TRUE, verbose = verbose)
##
clust_list <- fetch_cutree_by_hc_order(clust_list = clust_list,
hcorder = hcObj$order)
##
.msg_pstr("#Clusts using sil.vals:", length(clust_list), flg=verbose)
.msg_pstr(.msg_print(clust_list), flg=verbose)
clust_list_lengths <- unlist(lapply(clust_list, length))
if(enableSwitchSilToCH && any(clust_list_lengths == 1)){
.msg_pstr("But singleton(s) present: clusters ",
paste(which(clust_list_lengths == 1), collapse= " "), flg=verbose)
.msg_pstr("Using Calinski-Harabasz index instead", flg=verbose)
### Calinski-Harabasz Index
clust_list <- .get_clusts_sil_or_ch(cut_heights, hcObj, distMat,
minClusters, use_sil = FALSE, verbose = verbose)
}
## number of clusters decided, get final clustering result
.msg_pstr("Final #Clusters: ", length(clust_list), flg=verbose)
.msg_pstr(.msg_print(clust_list), flg=verbose)
##
if(!is.null(parentChunks) && keepSiblingsUncollated){
clust_list <- .check_and_uncollate_siblings(clust_list,
parentChunks, verbose)
}
.msg_pstr("#Clusters: ", length(clust_list), flg=verbose)
.msg_pstr(paste(clust_list), flg=verbose)
return(clust_list)
}
}
## =============================================================================
# .detect_just_for_sake_clust <- function(cheight_idx, cl_list, vrbs=FALSE){
# upd_cl_list <- cl_list
# cl_lens <- lapply(cl_list, length)
# if(cheight_idx == 1 && length(which(cl_lens == 2)) == 1){
# .msg_pstr("just for sake clustering detected!", flg=vrbs)
# upd_cl_list <- .unfurl_nodeList(cl_list, vec_ver=TRUE, vrbs)
# }
# upd_cl_list
# }
## =============================================================================
.check_and_uncollate_siblings <- function(clust_list, parentChunks, verbose){
.msg_pstr("Uncollating siblings, in case...", flg=verbose)
.msg_pstr("Checking parent chunks", flg=verbose)
childrenPerParent <- lapply(unique(parentChunks), function(x){
which(parentChunks == x)
})
updated_clust_list <- lapply(seq_along(clust_list), function(x){
parents <- unique(parentChunks[clust_list[[x]]])
thisX <- clust_list[[x]]
if(length(parents) == 1 && length(thisX) > 1){
## if cluster elements come from same parent chunk of previous
## iteration + now check if all elements in this cluster
## are the only children of that parent chunk (in other words
## nothing got separated and combined with other factors)
childrenThisParent <- childrenPerParent[[as.integer(parents)]]
if(identical(as.numeric(childrenThisParent), as.numeric(thisX))){
.msg_pstr("Parents are identical", flg=verbose)
## Update: 2020-12-13
## See update above.
## With keepSiblingsUncollated as FALSE, this check is not
## performed.
##
## Split the merge back into separate clusters
return(lapply(childrenThisParent, function(y){y}))
}else{
.msg_pstr("returning thisX", flg=verbose)
return(thisX)
}
}
else{
## cluster elements come from different parent chunks
## (of previous iteration)
return(thisX)
}
})
clust_list <- .unfurl_nodeList(updated_clust_list, vec_ver=FALSE)
clust_list
}
## =============================================================================
## This works best/is best referred with ward.D linkage
# @title Get clusters out of hierarchical clustering using either silhouette
# value or Calinski-Harabasz index.
# Used internally by seqArchR.
#
# @description Clusters from a hierarchical clustering object are obtained
# by using cutree at different heights of the tree. The optimum number of
# clusters are decided based on the average silhouette value of the clustering.
#
# @param hcObj The hierarchical clustering object as returned by
# \code{\link[stats]{hclust}}.
# @param distMat The distance matrix that was used by hclust, a
# \code{\link[stats]{dist}} object.
# @param hStep Numeric. The step size used to increment height values for
# cutree. Default value is 0.05.
# @param parentChunks List. Specify the factor numbers in the previous
# iteration of seqArchR that factors in the current iteration resulted from.
# Default value is NULL.
# @param keepSiblingsUncollated Logical. Specify TRUE if all clusters from a
# single parent chunk should not be collated, even when recommended so based
# on silhouette value computation. Default value FALSE. i.e. if they get
# collated, it is left as collated.
# @param enableSwitchSilToCH Logical. When and if the clustering result has
# singletons, using the average silhouette value may not be reliable to decide
# on the optimum number of clusters. In this case, if this argument is TRUE,
# the Calinski-Harabasz index is used instead. Setting it to FALSE, the
# switch is disabled, i.e., average silhouette value is used. Default is FALSE.
# @param minClusters Integer. Specify the minimum number of clusters to be
# fetched from the HAC solution.
# @param distThreshold Numeric. Specify a threshold of units of distance for
# considering any two elements as close enough to be merged in the same
# cluster. The default value is specified for Euclidean distance.
# @param verbose Logical. Specify TRUE for verbose output.
#
# @importFrom stats cutree
# @importFrom fpc calinhara
# @importFrom cluster silhouette
#
.get_clusts_sil_or_ch <- function(cut_heights, hcObj, distMat, minClusters,
use_sil = TRUE, verbose = FALSE){
measure_cut_h <- unlist(lapply(cut_heights, function(x){
foo_try <- stats::cutree(hcObj, h = x)
names(foo_try) <- NULL
if(length(unique(foo_try)) >= minClusters){
if(use_sil){
sils <- cluster::silhouette(foo_try, dist = distMat)
retVal <- base::mean(sils[, "sil_width"])
}else{
retVal <- fpc::calinhara(distMat, foo_try)
}
retVal
}else return(-100)
}))
## multiple matches, first match index is returned with which.max
cheight_idx <- which.max(measure_cut_h)
score_str <- ifelse(use_sil, "sil score", "CH index")
.msg_pstr("Max.", score_str, "at index =", cheight_idx, ", h:",
cut_heights[cheight_idx], flg=verbose)
## number of clusters decided, get final clustering result
cut_result <- stats::cutree(hcObj, h = cut_heights[cheight_idx])
names(cut_result) <- NULL
clust_list <- lapply(seq_along(unique(cut_result)),
function(x) which(cut_result == x))
clust_list
}
## =============================================================================
.regularizeMat <- function(basisMat, topN = 10){
basisMat2 <- apply(basisMat, MARGIN = 2, function(x){
sorted <- sort(x, decreasing = TRUE)
pickTopN <- sorted[seq_len(topN)]
threshold <- pickTopN[topN]
x[x < threshold] <- 0.0
x
})
return(basisMat2)
}
## =============================================================================
#' @title Collate raw clusters at the chosen iteration of seqArchR result
#'
#' @description We use hierarchical clustering for reordering/collating raw
#' clusters from seqArchR's given iteration.
#'
#' @param result The seqArchR result object.
#'
#' @param iter Specify clusters at which iteration of seqArchR are to be
#' reordered/collated. Default is the last iteration of the seqArchR result
#' object.
#'
#' @param clust_method Specify 'hc' for hierarchical clustering. Currently, only
#' hierarchical clustering is supported.
#'
#' @param aggl_method One of linkage values as specified for hierarchical
#' clustering with \code{\link[stats]{hclust}}. Default is 'ward.D'.
#'
#' @param dist_method Distance measure to be used with hierarchical clustering.
#' Available options are "euclid" (default), "cor" for correlation, "cosangle"
#' for cosine angle.
#'
#' @param regularize Logical. Specify TRUE if regularization is to be performed
#' before comparison. Default is FALSE. Also see argument 'topN'.
#'
#' @param topn Use only the top N dimensions of each basis vector for
#' comparing them. Note that since each basis vector has 4L or 16L (mono- or
#' dinucleotides) dimensions, each dimension is a combination of nucleotide and
#' its position in the sequence. This argument selects the top N dimensions of
#' the basis vector. This is ignored when argument 'regularize' is FALSE.
#'
#' @param collate Logical. Specify TRUE if collation using
#' hierarchical agglomerative clustering is to be performed, otherwise FALSE.
#'
#' @param return_order Logical. Use this argument when you want hierarchical
#' clustering to be performed but not collation of clusters. Therefore,
#' setting return_order to TRUE will return the hierarchical clustering
#' object itself. This enables custom downstream processing/analysis.
#'
#' @param flags Pass the flags object similar to the flags in configuration
#' of the seqArchR result object.
#'
#' @param ... ignored
#'
#' @return When `collate` is TRUE, a list with the following elements is
#' returned:
#' \describe{
#' \item{basisVectorsCLust}{A list storing collation information of the
#' basis vectors, i.e, IDs of basis vectors that were collated into one.}
#' \item{clusters}{A list of sequences in each collated cluster.}
#' \item{seqClustLabels}{Cluster labels for all sequences according to the
#' collated clustering.}
#' }
#'
#' When 'collate' is FALSE,
#' it returns the already existing basis vectors, each as singleton clusters.
#' The sequence cluster labels and sequence clusters are also handled
#' accordingly. All are available as part of the same list as the earlier case.
#'
#' When 'return_order' is set to TRUE, the hierarchical clustering result is
#' returned instead.
#'
#' @examples
#' res <- readRDS(system.file("extdata", "example_seqArchRresult.rds",
#' package = "seqArchR", mustWork = TRUE))
#'
#' # While the default settings for collation use Euclidean distance and
#' # ward.D agglomeration, one can choose to use different settings, say,
#' # correlation distance and complete linkage, and also regularizing to use
#' # only top 50 dimensions (nucleotide-positions combinations)
#' collated_res <- collate_seqArchR_result(result = res, iter = 2,
#' aggl_method = "complete", dist_method = "cor",
#' regularize = TRUE, topn = 50)
#'
#' names(collated_res)
#'
#' @importFrom stats hclust dist
#' @importFrom utils tail
#' @export
collate_seqArchR_result <- function(result,
iter = length(result$seqsClustLabels),
clust_method = "hc", aggl_method = "ward.D",
dist_method = "euclid", regularize = FALSE,
topn = 50, collate = TRUE, return_order = FALSE,
flags = list(debugFlag = FALSE, verboseFlag = TRUE),
...) {
##
dbg <- flags$debugFlag
vrbs <- flags$verboseFlag
stopifnot(iter > 0)
basisMat <- get_clBasVec_m(result, iter=iter)
clust_lab <- get_seqClLab(result, iter=iter)
##
## assert that topn value supplied is in the valid range
if(!(topn > 0 && topn <= nrow(basisMat))){
stop("Selected topn value ", topn," is outside expected range [",
.msg_print(c(1, nrow(basisMat))), "]")
}
if(!is.logical(regularize) && !is.logical(collate)){
stop("Arguments 'regularize' and 'collate' expect a logical ",
" value (TRUE/FALSE), found otherwise")
}
##
if(regularize) basisMat <- .regularizeMat(basisMat, topn)
##
clust_lab_pre <- NULL
if(iter > 1) clust_lab_pre <- get_seqClLab(result, iter=iter-1)
parentChunks <- .get_parent_chunks(clust_lab, clust_lab_pre, iter, dbg)
##
if(collate){
.msg_pstr("Collating clusters", flg=dbg)
factorsClustering <-
.handle_clustering_of_factors(basisMat, clustMethod = clust_method,
linkage = aggl_method, distMethod = dist_method,
flags = flags, parentChunks = parentChunks, ...)
.msg_pstr("Factor clustering done, returning:", flg=dbg)
.msg_pstr(paste("Returning: ", .msg_print(factorsClustering)), flg=dbg)
}else{
## Do not collate, but prepare the return object
nFactors <- get_clBasVec_k(result, iter)
factorsClustering <- vector("list", nFactors)
factorsClustering <- lapply(seq_len(nFactors), function(x){x})
.msg_pstr("No factor clustering, returning:", flg=dbg)
.msg_pstr(.msg_print(factorsClustering), flg=dbg)
}
if(return_order){
.msg_pstr("Reordering clusters", flg=dbg)
## Do not collate just reorder, but prepare the return object
nFactors <- get_clBasVec_k(result, iter)
factorsClustering <- vector("list", nFactors)
factorsClustering <- lapply(seq_len(nFactors), function(x){x})
temp <- .handle_clustering_of_factors(basisMat,
clustMethod = clust_method,
linkage = aggl_method,
distMethod = dist_method,
returnOrder = return_order,
flags = flags,
parentChunks = parentChunks, ...)
.msg_pstr("No collation of clusters, returning order:", flg=dbg)
.msg_pstr(.msg_print(factorsClustering), flg=dbg)
return(temp)
}
##
seqClusters <- get_seqs_clust_list(clust_lab)
clusters <- collate_clusters(factorsClustering, seqClusters)
clustLabels <- .update_cluster_labels(oldSeqsClustLabels = clust_lab,
collatedClustAssignments = clusters)
cluster_sol <- list(basisVectorsClust = factorsClustering,
clusters = clusters, seqsClustLabels = clustLabels)
return(cluster_sol)
}
## =============================================================================
.get_parent_chunks <- function(clust_lab, clust_lab_pre, iter, flg){
## Prepare info on parent chunks for each cluster/factor at given iteration
## We get this info from the seqsClustLabels
parentChunks <- NULL
if(iter > 1){
clustsThisIter <- sort(unique(clust_lab))
parentChunks <- unlist(lapply(clustsThisIter, function(x){
relSeqsIds <- which(clust_lab == x)
unique(clust_lab_pre[relSeqsIds])
}))
}
parentChunks
}
## =============================================================================
#' @title Retrieve sequence clusters as a list from the sequence labels
#'
#' @description Given the sequence cluster labels from the seqArchR result
#' object, returns the clusters separated as a list.
#'
#' @param seqs_clust_lab Sequences with cluster labels as in the seqArchR
#' result object.
#'
#' @return A list holding sequence IDs belonging in each cluster.
#'
#' @examples
#'
#' clustLabels <- sample(seq_len(4), 50, replace = TRUE)
#' print(clustLabels)
#' get_seqs_clust_list(clustLabels)
#'
#' @export
get_seqs_clust_list <- function(seqs_clust_lab){
## check that labels are not empty/NULL
.assert_seqArchR_seqsClustLabels(seqs_clust_lab)
clusterLevels <- levels(as.factor(seqs_clust_lab))
seqs_clusters_as_a_list <- lapply(seq_along(clusterLevels),
function(x){
which(seqs_clust_lab ==
clusterLevels[x])
})
return(seqs_clusters_as_a_list)
}
## =============================================================================
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.