R/gs_distances.R

Defines functions create_jaccard_matrix create_kappa_matrix

Documented in create_jaccard_matrix create_kappa_matrix

#' Compute the kappa matrix for enrichment results
#'
#' Compute the kappa matrix for enrichment results, as a measure of overlap
#'
#' @param res_enrich A `data.frame` object, storing the result of the functional
#' enrichment analysis. See more in the main function, [GeneTonic()], to see the
#' formatting requirements.
#' @param gtl A `GeneTonic`-list object, containing in its slots the arguments
#' specified above: `dds`, `res_de`, `res_enrich`, and `annotation_obj` - the names
#' of the list _must_ be specified following the content they are expecting
#' @param n_gs Integer value, corresponding to the maximal number of gene sets to
#' be included (from the top ranked ones). Defaults to the number of rows of
#' `res_enrich`
#' @param gs_ids Character vector, containing a subset of `gs_id` as they are
#' available in `res_enrich`. Lists the gene sets to be included, additionally to
#' the ones specified via `n_gs`. Defaults to NULL.
#'
#' @return A matrix with the kappa scores between gene sets
#'
#' @seealso [gs_mds()]
#'
#' @export
#'
#' @examples
#'
#' # res_enrich object
#' data(res_enrich_macrophage, package = "GeneTonic")
#' res_enrich <- shake_topGOtableResult(topgoDE_macrophage_IFNg_vs_naive)
#'
#' kmat <- create_kappa_matrix(res_enrich[1:200, ])
#' dim(kmat)
create_kappa_matrix <- function(res_enrich,
                                gtl = NULL,
                                n_gs = nrow(res_enrich),
                                gs_ids = NULL) {
  if (!is.null(gtl)) {
    checkup_gtl(gtl)
    dds <- gtl$dds
    res_de <- gtl$res_de
    res_enrich <- gtl$res_enrich
    annotation_obj <- gtl$annotation_obj
  }

  n_gs <- min(n_gs, nrow(res_enrich))

  gs_to_use <- unique(
    c(
      res_enrich$gs_id[seq_len(n_gs)], # the ones from the top
      gs_ids[gs_ids %in% res_enrich$gs_id] # the ones specified from the custom list
    )
  )

  enrich2list <- lapply(gs_to_use, function(gs) {
    go_genes <- res_enrich[gs, "gs_genes"]
    go_genes <- unlist(strsplit(go_genes, ","))
    return(go_genes)
  })
  names(enrich2list) <- res_enrich[gs_to_use, "gs_id"]

  # creating binary matrix with gs-gene occurrency
  all_genes <- unique(unlist(enrich2list))
  binary_mat <- matrix(0,
    nrow = length(gs_to_use),
    ncol = length(all_genes),
    dimnames = list(names(enrich2list), all_genes)
  )
  # fill in the occurrency per gene set
  for (i in seq_along(gs_to_use)) {
    current_genes <- enrich2list[[i]]
    binary_mat[i, colnames(binary_mat) %in% current_genes] <- 1
  }

  # creating Kappa Matrix
  kappa_matrix <- matrix(0,
    nrow = nrow(binary_mat), ncol = nrow(binary_mat),
    dimnames = list(rownames(binary_mat), rownames(binary_mat))
  )
  diag(kappa_matrix) <- 1
  N <- length(gs_to_use)

  for (i in seq_len(N - 1)) {
    for (j in (i + 1):N) {
      genes_i <- enrich2list[[i]]
      genes_j <- enrich2list[[j]]

      set_both <- length(intersect(genes_i, genes_j))
      set_none <- sum(!all_genes %in% genes_i & !all_genes %in% genes_j)
      set_only_i <- sum(all_genes %in% genes_i & !all_genes %in% genes_j)
      set_only_j <- sum(!all_genes %in% genes_i & all_genes %in% genes_j)

      tot <- sum(set_none, set_only_i, set_only_j, set_both)

      observed <- (set_both + set_none) / tot
      chance <- (set_both + set_only_i) * (set_both + set_only_j) + (set_only_j + set_none) * (set_only_i + set_none)
      chance <- chance / tot^2
      kappa_matrix[j, i] <- kappa_matrix[i, j] <- (observed - chance) / (1 - chance)
    }
  }

  return(kappa_matrix)
}





#' Compute the overlap matrix for enrichment results
#'
#' Compute the overlap matrix for enrichment results, based on the Jaccard Index
#' between each pair of sets
#'
#' @param res_enrich A `data.frame` object, storing the result of the functional
#' enrichment analysis. See more in the main function, [GeneTonic()], to see the
#' formatting requirements.
#' @param gtl A `GeneTonic`-list object, containing in its slots the arguments
#' specified above: `dds`, `res_de`, `res_enrich`, and `annotation_obj` - the names
#' of the list _must_ be specified following the content they are expecting
#' @param n_gs Integer value, corresponding to the maximal number of gene sets to
#' be included (from the top ranked ones). Defaults to the number of rows of
#' `res_enrich`
#' @param gs_ids Character vector, containing a subset of `gs_id` as they are
#' available in `res_enrich`. Lists the gene sets to be included, additionally to
#' the ones specified via `n_gs`. Defaults to NULL.
#' @param return_sym Logical, whether to return the symmetrical matrix or just the
#' upper triangular - as needed by [enrichment_map()], for example.
#'
#' @return A matrix with the kappa scores between gene sets
#'
#' @seealso [gs_mds()], [enrichment_map()]
#'
#' @export
#'
#' @examples
#'
#' # res_enrich object
#' data(res_enrich_macrophage, package = "GeneTonic")
#' res_enrich <- shake_topGOtableResult(topgoDE_macrophage_IFNg_vs_naive)
#'
#' jmat <- create_jaccard_matrix(res_enrich[1:200, ])
#' dim(jmat)
create_jaccard_matrix <- function(res_enrich,
                                  gtl = NULL,
                                  n_gs = nrow(res_enrich),
                                  gs_ids = NULL,
                                  return_sym = FALSE) {
  if (!is.null(gtl)) {
    checkup_gtl(gtl)
    dds <- gtl$dds
    res_de <- gtl$res_de
    res_enrich <- gtl$res_enrich
    annotation_obj <- gtl$annotation_obj
  }

  n_gs <- min(n_gs, nrow(res_enrich))

  gs_to_use <- unique(
    c(
      res_enrich$gs_id[seq_len(n_gs)], # the ones from the top
      gs_ids[gs_ids %in% res_enrich$gs_id] # the ones specified from the custom list
    )
  )

  enrich2list <- lapply(gs_to_use, function(gs) {
    go_genes <- res_enrich[gs, "gs_genes"]
    go_genes <- unlist(strsplit(go_genes, ","))
    return(go_genes)
  })
  names(enrich2list) <- res_enrich[gs_to_use, "gs_id"]

  n <- length(enrich2list)
  overlap_matrix <- matrix(NA, nrow = n, ncol = n)
  rownames(overlap_matrix) <- colnames(overlap_matrix) <- names(enrich2list)

  for (i in seq_len(n)) {
    # no need to work on full mat, it is simmetric
    for (j in i:n) {
      overlap_matrix[i, j] <-
        overlap_jaccard_index(
          unlist(enrich2list[gs_to_use[i]]),
          unlist(enrich2list[gs_to_use[j]])
        )
    }
  }

  if (return_sym) {
    # symmetryze :)
    overlap_matrix[lower.tri(overlap_matrix)] <- t(overlap_matrix)[lower.tri(overlap_matrix)]
  }
  return(overlap_matrix)
}
federicomarini/GeneTonic documentation built on Oct. 10, 2024, 8:49 p.m.