#' @name GeneToSymbol
#' @inherit AcidGenerics::GeneToSymbol description return title
#' @note Updated 2023-09-26.
#'
#' @details
#' For some organisms, gene identifiers and gene names do not map 1:1 (e.g.
#' *Homo sapiens* and *Mus musculus*). Refer to the `format` argument
#' here in the documentation for approaches that deal with this issue.
#'
#' @inheritParams AcidRoxygen::params
#' @inheritParams params
#'
#' @param format `character(1)`.
#' Formatting method to apply:
#'
#' - `"makeUnique"`: *Recommended.* Apply `make.unique` to the `geneName`
#' column. Gene names are made unique, while the identifiers remain
#' unmodified. `NA` gene names will be renamed to `"unannotated"`.
#' - `"1:1"`: For gene names that map to multiple gene identifiers, select
#' only the first annotated gene identifier. Incomplete elements with
#' `NA` gene name will be removed will be removed with an internal
#' `complete.cases` call.
#' - `"unmodified"`: Return `geneId` and `geneName` columns unmodified, in
#' long format. Incomplete elements with `NA` gene name will be removed
#' with an internal `complete.cases` call.
#'
#' @param ... Arguments pass through to `DFrame` method.
#'
#' @seealso [makeGeneToSymbol()].
#'
#' @examples
#' data(GRanges, package = "AcidTest")
#'
#' ## DFrame ====
#' df <- S4Vectors::DataFrame(
#' "geneId" = c(
#' "ENSG00000228572.7",
#' "ENSG00000182378.14"
#' ),
#' "geneName" = c(
#' "AL954722.1",
#' "PLCXD1"
#' )
#' )
#' x <- GeneToSymbol(df)
#' print(x)
#'
#' ## GRanges ====
#' object <- GRanges
#' x <- GeneToSymbol(object)
NULL
## Updated 2023-09-16.
`GeneToSymbol,DFrame` <- # nolint
function(object,
format = c("makeUnique", "1:1", "unmodified"),
quiet = FALSE) {
assert(
hasColnames(object),
isFlag(quiet)
)
format <- match.arg(format)
meta <- metadata(object)
meta[["format"]] <- format
cols <- c("geneId", "geneName")
if (!isSubset(cols, colnames(object))) {
colnames(object) <- camelCase(colnames(object), strict = TRUE)
}
assert(
isSubset(cols, colnames(object)),
hasRows(object)
)
object <- as(object, "DFrame")
object <- object[, cols, drop = FALSE]
object <- decode(object)
assert(allAreAtomic(object))
object <- unique(object)
## Remove incomplete elements, except for "makeUnique" mode.
if (!identical(format, "makeUnique")) {
keep <- complete.cases(object)
if (!all(keep)) {
## e.g. applies to Ensembl Mus musculus GRCm39 104.
meta[["dropped"]] <- which(!keep)
if (isFALSE(quiet)) {
n <- sum(!keep)
alertWarning(sprintf(
"Dropping %d %s without defined gene symbol.",
n,
ngettext(
n = n,
msg1 = "identifier",
msg2 = "identifiers"
)
))
}
object <- object[keep, , drop = FALSE]
}
}
assert(hasRows(object))
## Enforce coercion of integer gene identifiers (e.g. NCBI).
if (is.integer(object[["geneId"]])) {
object[["geneId"]] <- as.character(object[["geneId"]])
}
switch(
EXPR = format,
"makeUnique" = {
## Ensure we arrange by gene identifier prior to sanization.
object <- object[order(object), , drop = FALSE]
## Replace any "NA" gene names with "unannotated".
if (anyNA(object[["geneName"]])) {
object[["geneName"]][
which(is.na(object[["geneName"]]))
] <- "unannotated"
}
## Inform the user about how many symbols multi-map.
## Don't count "unannotated" genes as true duplicates here.
## Note that `duplicated()` doesn't work on Rle.
dupes <- duplicated(object[["geneName"]])
if (any(dupes)) {
dupes <- setdiff(
x = sort(unique(object[["geneName"]][dupes])),
y = "unannotated"
)
if (hasLength(dupes)) {
meta[["dupes"]] <- dupes
if (isFALSE(quiet)) {
n <- length(dupes)
alertInfo(sprintf(
"%d non-unique gene %s detected: %s.",
n,
ngettext(
n = n,
msg1 = "symbol",
msg2 = "symbols"
),
toInlineString(dupes, n = 10L, class = "val")
))
}
}
}
object[["geneName"]] <- make.unique(object[["geneName"]])
},
"1:1" = {
assert(all(complete.cases(object)))
alert(paste(
"Returning 1:1 mappings using oldest",
"gene identifier per symbol."
))
x <- split(x = object, f = object[["geneName"]])
assert(is(x, "SplitDFrameList"))
x <- SplitDataFrameList(lapply(
X = x,
FUN = function(x) {
idx <- order(
x = x[["geneId"]],
decreasing = FALSE,
na.last = TRUE
)
head(x[idx, , drop = FALSE], n = 1L)
}
))
object <- do.call(what = rbind, args = x)
},
"unmodified" = {
if (isFALSE(quiet)) {
alertInfo(paste(
"Returning with unmodified gene symbols",
"{.emph (may contain duplicates)}."
))
}
}
)
assert(
is(object, "DFrame"),
all(complete.cases(object)),
hasNoDuplicates(object[["geneId"]]),
msg = "Failed to generate GeneToSymbol object."
)
object <- object[order(object), , drop = FALSE]
meta[["date"]] <- Sys.Date()
meta[["packageVersion"]] <- .pkgVersion
meta <- meta[sort(names(meta))]
metadata(object) <- meta
new(Class = "GeneToSymbol", object)
}
## Updated 2023-04-26.
`GeneToSymbol,GRanges` <- # nolint
function(object, ...) {
df <- as(object, "DFrame")
metadata(df) <- metadata(object)
GeneToSymbol(df, ...)
}
#' @rdname GeneToSymbol
#' @export
setMethod(
f = "GeneToSymbol",
signature = signature(object = "DFrame"),
definition = `GeneToSymbol,DFrame`
)
#' @rdname GeneToSymbol
#' @export
setMethod(
f = "GeneToSymbol",
signature = signature(object = "GRanges"),
definition = `GeneToSymbol,GRanges`
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.