R/gs_cluster.R

Defines functions cluster_markov

Documented in cluster_markov

#' Markov Clustering (MCL) for community detection
#'
#' This function implements the Markov Clustering (MCL) algorithm for finding community
#' structure, in an analogous way to other existing algorithms in `igraph`.
#'
#' This implementation has been driven by the nice explanations provided in
#' * https://sites.cs.ucsb.edu/~xyan/classes/CS595D-2009winter/MCL_Presentation2.pdf
#' * https://medium.com/analytics-vidhya/demystifying-markov-clustering-aeb6cdabbfc7
#' * https://github.com/GuyAllard/markov_clustering (python implementation)
#'
#' More info on the MCL: https://micans.org/mcl/index.html, and
#' https://micans.org/mcl/sec_description1.html
#'
#' @references van Dongen, S.M., Graph clustering by flow simulation (2000) PhD thesis,
#' Utrecht University Repository - https://dspace.library.uu.nl/handle/1874/848
#' @references  Enright AJ, van Dongen SM, Ouzounis CA, An efficient algorithm for
#' large-scale detection of protein families (2002) Nucleic Acids Research, Volume 30,
#' Issue 7, 1 April 2002, Pages 1575–1584, https://doi.org/10.1093/nar/30.7.1575
#'
#' @param g The input graph object
#' @param add_self_loops Logical, whether to add self-loops to the matrix by
#' setting the diagonal to `loop_value`
#' @param loop_value Numeric, the value to use for self-loops
#' @param mcl_expansion Numeric, cluster expansion factor for the Markov clustering
#' iteration - defaults to 2
#' @param mcl_inflation Numeric, cluster inflation factor for the Markov clustering
#' iteration - defaults to 2
#' @param allow_singletons Logical; if `TRUE`, single isolated vertices are allowed
#' to form their own cluster. If set to `FALSE`, all clusters of size = 1 are
#' grouped in one cluster (to be interpreted as background noise).
#' @param max_iter Numeric value for the maximum number of iterations for the
#' Markov clustering
#' @param return_node_names Logical, if the graph is named and set to `TRUE`, returns
#' the node names.
#' @param return_esm Logical, controlling whether the equilibrium state matrix should be returned
#'
#' @return This function returns a `communities` object, containing the numbers of
#' the assigned membership (in the slot `membership`). Please see the
#' [igraph::communities()] manual page for additional details
#'
#' @export
#'
#' @examples
#' library("igraph")
#' g <- make_full_graph(5) %du% make_full_graph(5) %du% make_full_graph(5)
#' g <- add_edges(g, c(1, 6, 1, 11, 6, 11))
#' cluster_markov(g)
#' V(g)$color <- cluster_markov(g)$membership
#' plot(g)
cluster_markov <- function(g,
                           add_self_loops = TRUE,
                           loop_value = 1,
                           mcl_expansion = 2,
                           mcl_inflation = 2,
                           allow_singletons = TRUE,
                           max_iter = 100,
                           return_node_names = TRUE,
                           return_esm = FALSE) {
  # g must be a graph
  if (!is(g, "igraph")) {
    stop("You need to provide an igraph object as input")
  }

  stopifnot(is.logical(add_self_loops))
  stopifnot(loop_value >= 0)
  stopifnot(mcl_expansion > 1)
  stopifnot(mcl_inflation > 1)
  stopifnot(loop_value >= 0)
  stopifnot(max_iter > 0)

  # graph to adjacency matrix
  adj_mat <- igraph::as_adjacency_matrix(g)
  adj_mat <- as.matrix(adj_mat) # to enforce non-sparse matrix

  converged <- FALSE

  # Initialize self-loops
  if (add_self_loops) {
    diag(adj_mat) <- loop_value
  }

  # Normalize (sum by col must be 1)
  adj_mat_norm <- t(adj_mat / colSums(adj_mat))

  cur_mat_norm <- adj_mat_norm
  for (i_mcl in seq_len(max_iter)) {
    # message(i_mcl)
    last_mat <- cur_mat_norm

    exp_mat <- cur_mat_norm %^% mcl_expansion
    inf_mat <- exp_mat^mcl_inflation
    # inf_mat_norm <- t(inf_mat/colSums(inf_mat))
    inf_mat_norm <- apply(inf_mat, MARGIN = 2, FUN = function(matcol) {
      matcol / sum(matcol)
    })

    ## TODO: optional pruning?
    # idea: inspect matrix and set small values directly to zero (assume they would have reached there eventually anyways).
    if (identical(inf_mat_norm, last_mat)) {
      converged <- TRUE
      break
    }

    cur_mat_norm <- inf_mat_norm
  }

  if (converged & is.na(cur_mat_norm[1, 1])) {
    stop("An error occurred after convergence - maybe you set `add_self_loops` to FALSE?")
  }

  # getting the attractors - non-zero elements of the matrix diagonal
  clu_attractors <- which(diag(cur_mat_norm) > 0)
  # store the clusters
  clusters <- vector(mode = "list", length(clu_attractors))

  # the nodes in the same row as each attractor form a cluster
  for (i in seq_along(clu_attractors)) {
    cur_att <- clu_attractors[i]
    cur_members <- which(cur_mat_norm[cur_att, ] > 0)
    clusters[[i]] <- cur_members
  }

  # chop off the identical ones
  clusters <- unique(clusters)

  # from clusters sets to membership as label
  clu_memberships <- rep(NA, nrow(adj_mat))
  for (i in seq_along(clusters)) {
    this_cluster <- clusters[[i]]
    clu_memberships[this_cluster] <- i
  }

  # handling the singletons
  if (!allow_singletons) {
    dub <- duplicated(clu_memberships) + duplicated(clu_memberships, fromLast = TRUE)
    n_singletons <- sum(dub == 0)
    clu_memberships[dub == 0] <- 0
    # reshifting to not having gaps in the cluster numbers
    clu_memberships[dub != 0] <- clu_memberships[dub != 0] - n_singletons
  }

  res <- list()
  if (return_node_names & igraph::is_named(g)) {
    res$names <- V(g)$name
  }
  res$vcount <- vcount(g)
  res$algorithm <- "mcl"
  res$iterations <- i_mcl - 1
  res$membership <- clu_memberships
  if (return_esm) {
    res$esm <- cur_mat_norm
  }

  class(res) <- "communities" # to take advantage of the goodies for printing and co

  return(res)
}


# nice to test on the set of igraph examples - say, louvain
federicomarini/GeneTonic documentation built on Oct. 10, 2024, 8:49 p.m.