# R/convertIdentifiers.R
#
# Defines functions .prep_babelgene_table

#' Converts internal feature identifiers in a GeneSetDb to a set of new ones.
#'
#' The various GeneSetDb data providers (MSigDb, KEGG, etc). limit the
#' identifier types that they return. Use this function to map the given
#' identifiers to whichever type you like.
#'
#' For best results, provide your own identifier mapping reference, but we
#' provide a convenience wrapper around the [babelgene::orthologs()] function to
#' change between identifier types and species.
#'
#' When there are multiple target id's for the source id, they will all be
#' returned. When there is no target id for the source id, the source feature
#' will be dropped.
#'
#' @section Custom Mapping:
#' You need to provide a data.frame via the `xref` parameter that has a column
#' for the current identifiers and another column for the target identifiers.
#' The columns are specified by the `from` and `to` parameters, respectively.
#'
#' @section Convenience identifier and species mapping:
#' If you don't provide a data.frame, you can provide a species name. We will
#' rely on the `{babelgene}` package for the conversion, so you will have to
#' provide a species name that it recognizes.
#'
#' @section Species and Identifier Conversion via babelgene:
#' We plan to provide a quick wrapper to babelgene's ortholog mapping function
#' to make identifier conversion easier through this function. You can track
#' this in [sparrow issue #2](https://github.com/lianos/sparrow/issues/2).
#'
#' @export
#' @param x The GeneSetDb with identifiers to convert
#' @param from,to If you are doing identifier and/or species conversion using
#'   babelgene, `to` is the species you want to convert to, and `from` is the
#'   species of `x`. If you are only doing id type conversion within the same
#'   species, specify the current species in `from`.
#'   If you are providing a data.frame map of identifiers in `xref`, `to` is
#'   the name of the column that holds the new identifiers, and `from` is the
#'   name of the column that holds the current identifiers.
#' @param id.type If you are using babelgene conversion, this specifies the
#'   type of identifier you want to convert to. It can be any of `"ensembl"`,
#'   `"entrez"`, or `"symbol"`.
#' @param xref a data.frame used to map current identifiers to target ones.
#' @param extra.cols a character vector of columns from `xref` to add to the
#'   features of the new GeneSetDb. If you want to keep the original identifiers
#'   of the remapped features, include `"original_id"` as one of the values
#'   here.
#' @param allow.cartesian a boolean used to temporarily set the
#'   `datatable.allow.cartesian` global option. If you are doing a 1:many
#'   map of your identifiers, the internal merge may trigger data.table's
#'   cartesian-join error. You can temporarily turn this option/error off by
#'   setting `allow.cartesian = TRUE`. The option will be restored to its
#'   "pre-function call" value `on.exit`.
#' @param min_support,top Parameters used in the internal call to
#'   [babelgene::orthologs()]
#' @param ... pass through args (not used)
#' @return A new GeneSetDb object with converted identifiers. We try to retain
#'   any metadata in the original object, but no guarantees are given. If
#'   `id_type` was stored previously in the collectionMetadata, that will be
#'   dropped.
#' @examples
#' # You can convert the identifiers within a GeneSetDb to some other type
#' # by providing a "translation" table. Check out the unit tests for more
#' # examples.
#' gdb <- exampleGeneSetDb() # this has no symbols in it
#'
#' # Define a silly conversion table.
#' xref <- data.frame(
#'   current_id = featureIds(gdb),
#'   new_id = paste0(featureIds(gdb), "_symbol"))
#' gdb2 <- convertIdentifiers(gdb, from = "current_id", to = "new_id",
#'                            xref = xref, extra.cols = "original_id")
#' geneSet(gdb2, name = "BIOCARTA_AGPCR_PATHWAY")
#'
#' # Convert entrez to ensembl id's using babelgene
#' \dontrun{
#' # The conversion functionality via babelgene isn't yet implemented, but
#' # will look like this.
#'
#' # 1. convert the human entrez identifiers to ensembl
#' gdb.ens <- convertIdentifiers(gdb, "human", id.type = "ensembl")
#'
#' # 2. convert the human entrez to mouse entrez
#' gdb.entm <- convertIdentifiers(gdb, "human", "mouse", id.type = "entrez")
#'
#' # 3. convert the human entrez to mouse ensembl
#' gdb.ensm <- convertIdentifiers(gdb, "human", "mouse", id.type = "ensembl")
#' }
setGeneric(
  name = "convertIdentifiers",
  # Every method shares this argument list; see the roxygen block above for
  # what each argument means.
  def = function(x, from = NULL, to = NULL,
                 id.type = c("ensembl", "entrez", "symbol"),
                 xref = NULL, extra.cols = NULL, allow.cartesian = FALSE,
                 min_support = 3, top = TRUE, ...)
    standardGeneric("convertIdentifiers"),
  # Dispatch on the container type only.
  signature = "x"
)

#' @describeIn convertIdentifiers converts identifiers in a BiocSet
setMethod(
  f = "convertIdentifiers",
  signature = signature(x = "BiocSet"),
  definition = function(x, from = NULL, to = NULL,
                        id.type = c("ensembl", "entrez", "symbol"),
                        xref = NULL, extra.cols = NULL,
                        allow.cartesian = FALSE,
                        min_support = 3, top = TRUE, ...) {
    # Placeholder: BiocSet conversion has no implementation, so every call
    # fails immediately.
    stop("Not yet implemented")
  }
)

#' @describeIn convertIdentifiers converts identifiers in a GeneSetDb
setMethod("convertIdentifiers", c(x = "GeneSetDb"),
function(x, from = NULL, to = NULL,
         id.type = c("ensembl", "entrez", "symbol"),
         xref = NULL, extra.cols = NULL,
         allow.cartesian = FALSE,
         min_support = 3, top = TRUE, ...) {
  # Only touch the global data.table option when the caller explicitly passed
  # `allow.cartesian`, and restore the previous value on exit. The value goes
  # straight into `options()`, so it must be a single non-missing logical.
  if (!missing(allow.cartesian)) {
    assert_logical(allow.cartesian, any.missing = FALSE, len = 1L)
    dt.opts <- options(datatable.allow.cartesian = allow.cartesian)
    on.exit(options(dt.opts), add = TRUE)
  }
  if (is.null(xref)) {
    # User is attempting to do an automated identifier conversion via
    # babelgene, which is not implemented yet. The planned flow is to match
    # `id.type`, build the lookup table with `.prep_babelgene_table()`, and
    # take `xref`, `from` and `to` from its result. Until that exists, all we
    # can do is bail out.
    stop("Identifer & species conversion via babelgene coming soon, ",
         "cf. the 'Species and Identifier Conversion' section ",
         "`?convertIdentifiers`")
  }
  assert_multi_class(xref, c("data.frame", "data.table", "tbl"))
  if (ncol(xref) < 2L) {
    stop("The xref conversion table needs at least two columns")
  }
  # Default to the first two columns of `xref` as the (from, to) pair.
  if (is.null(from)) from <- colnames(xref)[1L]
  if (is.null(to)) to <- colnames(xref)[2L]
  assert_string(from)
  assert_string(to)

  # columns to transfer over from `xref` ---------------------------------------
  take.cols <- c(from, to)
  assert_character(extra.cols, null.ok = TRUE)
  # "original_id" is not a column of `xref`; it is a flag asking us to keep the
  # pre-conversion identifiers in the result.
  keep.original <- "original_id" %in% extra.cols
  if (length(extra.cols) > 0L) {
    take.cols <- c(take.cols, setdiff(extra.cols, "original_id"))
  }
  assert_subset(take.cols, colnames(xref))
  if (is.data.table(xref)) {
    xref <- xref[, take.cols, with = FALSE]
  } else {
    xref <- xref[, take.cols, drop = FALSE]
  }
  xref[[from]] <- as.character(xref[[from]])
  xref[[to]] <- as.character(xref[[to]])
  if (to == "feature_id") {
    # Avoid a name clash with the `feature_id` column of `x@db` in the merge.
    setnames(xref, to, "fid.new")
    to <- "fid.new"
  }
  xref <- unique(xref, by = c(from, to))

  # remap the features ---------------------------------------------------------
  # The merge keeps only features that have an entry in `xref`; unmapped
  # features are dropped.
  db <- merge(x@db, xref, by.x = "feature_id", by.y = from,
              suffixes = c(".original", ""))
  setnames(db, "feature_id", "original_id")
  setnames(db, to, "feature_id")

  # handle non std eval NOTE in R CMD check when using `:=` mojo
  N <- n <- active <- name <- original_id <- feature_id <- NULL

  # Drop rows whose target identifier is missing or empty *before* checking
  # whether anything is left, so that an xref that maps everything to NA / ""
  # produces the informative error below instead of an empty GeneSetDb input.
  db <- db[!is.na(feature_id) & nchar(feature_id) > 0]
  if (nrow(db) == 0) {
    stop("None of the identifers in your original GeneSetDb match to the ",
         "ones you are trying to convert to")
  }
  db <- unique(db, by = c("collection", "name", "feature_id"))

  # rebuild the GeneSetDb ------------------------------------------------------
  # Carry over gene set level columns, dropping `N`, `n` and `active` so they
  # are not handed to the constructor.
  gs.dt <- merge(db, geneSets(x, as.dt = TRUE), by = c("collection", "name"))
  gs.dt[, N := NULL]
  gs.dt[, n := NULL]
  gs.dt[, active := NULL]
  if (!keep.original) {
    gs.dt[, original_id := NULL]
  }

  out <- GeneSetDb(gs.dt)
  # Any stored `id_type` no longer describes the converted identifiers.
  out@collectionMetadata <- x@collectionMetadata[name != "id_type"]
  out
})

#' Internal helper function to handle bookkeeping tasks involved in enabling
#' species conversion from within convertIdentifiers
#'
#' TODO: Implement species conversion book keeping code for convertIdentifiers
#'
#' NOTE: this is currently a stub. The body consists only of a commented-out
#' sketch, so calling the function returns `NULL`.
#'
#' The parameter descriptions below are taken from the commented-out sketch
#' and describe the *intended* use -- confirm them once the function is
#' actually implemented.
#'
#' @param ids the identifiers to look up; the sketch passes them as the first
#'   argument to `babelgene::orthologs()`.
#' @param species the species name handed to `babelgene::orthologs()`
#'   (e.g. `"rat"` in the sketch).
#' @param id.type the identifier type to convert to (unused in the sketch so
#'   far).
#' @param is.human logical; the sketch forwards it as the `human` argument of
#'   `babelgene::orthologs()`.
#' @param min_support,top forwarded to `babelgene::orthologs()` in the sketch.
#' @return Currently `NULL`.
#' @noRd
.prep_babelgene_table <- function(ids, species, id.type, is.human,
                                  min_support, top) {
  # Everything below is a disabled sketch of the planned implementation; none
  # of it runs.
  #
  # if (FALSE) {
  #   x <- exampleGeneSetDb()
  #   ids <- featureIds(x)
  #   species <- "rat"
  #   id.type <- "ensembl"
  #   is.human <- TRUE
  #   min_support <- 3
  #   top <- TRUE
  #
  #   ids <- c("P2ry12", "Trem2")
  #   is.human <- FALSE
  #
  #   ids <- c("ENSMUSG00000036353", "ENSMUSG00000023992")
  #   species <- "rat"
  #   is.human <- FALSE
  # }
  # # orthologs will always return a data.frame with the first 3 columns
  # # being human info: human_symbol, human_entrez, human_ensembl
  # if (!is.human && species != "human") {
  #   # If your query isn't from or to human, you have to stick human in the
  #   # middle
  #   # 1. map query id's to human
  #   human <- babelgene::orthologs(ids, species, human = FALSE)
  #   # 2. map human ids to target
  #   #    (`xxx` is a placeholder for the human ids obtained in step 1)
  #   xmap <- babelgene::orthologs(xxx, species, human = FALSE)
  # }
  #
  # xmap <- babelgene::orthologs(ids, species, human = is.human,
  #                              min_support = min_support, top = top)
}
# lianos/sparrow documentation built on Dec. 8, 2024, 2:19 a.m.