#' Annotates rows of a data.frame with geneset membership from a GeneSetDb
#'
#' This is helpful when you don't have a monsterly sized GeneSetDb. There will
#' be as many new columns added to `x` as there are active genesets in `gdb`.
#'
#' @export
#'
#' @param x A data.frame with genes/features in rows
#' @param gdb A [GeneSetDb()] object with geneset membership
#' @param x.ids The name of the column in `x` that holds the feautre
#' id's in `x` that match the feature_id's in `gdb`, or a vector
#' of id's to use for each row in `x` that represent these.
#' @param ... parameters passed down into [incidenceMatrix()]
#' @return Returns the original `x` with additional columns: each is a
#' logical vector that indicates membership for genesets defined in
#' `gdb`.
#'
#' @examples
#' vm <- exampleExpressionSet()
#' gdb <- exampleGeneSetDb()
#' mg <- seas(vm, gdb, design = vm$design, contrast = 'tumor')
#' lfc <- logFC(mg)
#' annotated <- annotateGeneSetMembership(lfc, gdb, 'feature_id')
#'
#' ## Show only genes that are part of 'HALLMARK_ANGIOGENESIS' geneset
#' angio <- subset(annotated, `c2;;BIOCARTA_AGPCR_PATHWAY`)
annotateGeneSetMembership <- function(x, gdb, x.ids=NULL, ...) {
stopifnot(is(x, 'data.frame'))
stopifnot(is(gdb, 'GeneSetDb'))
if (is.null(x.ids)) {
## Guess the column of x that has featureIds by identifying the column of
## x that has the highest percent membership in gdb@featureIdMap
membership <- sapply(x, function(col) {
if (!is.character(col)) 0 else mean(col %in% featureIdMap(gdb)$x.id)
})
x.ids <- names(x)[which.max(membership)]
}
if (!is.character(x.ids)) {
stop("character expected for x.ids")
}
if (length(x.ids) == 1L && is.character(x[[x.ids]])) {
## This is a column in df that has the gene IDs that `x` expects
x.ids <- x[[x.ids]]
}
if (nrow(x) != length(x.ids)) {
stop("x.ids must be a vector of gene ids, or a column name in x of these")
}
im <- incidenceMatrix(gdb, x.ids, ...)
storage.mode(im) <- 'logical'
cbind(x, t(im))
}
#' @describeIn geneSets Returns the number of genesets in a GeneSetDb
setMethod("length", "GeneSetDb", function(x) nrow(geneSets(x)))
#' (Re)-map geneset IDs to the rows in an expression object.
#'
#' @description `conform`-ing, a `GeneSetDb` to a target expression
#' object is an important step required prior to perform any type of GSEA. This
#' function maps the featureIds used in the GeneSetDb to the elements of a
#' target expression object (ie. the rows of an expression matrix, or the
#' elements of a vector of gene-level statistics).
#'
#' After `conform`-ation, each geneset in the `GeneSetDb` is flagged
#' as active (or inactive) given the number of its features that are
#' successfully mapped to `target` and the minimum and maximum number of
#' genes per geneset required as specified by the `min.gs.size` and
#' `max.gs.size` parameters, respectively.
#'
#' Only genesets that are marked with `active = TRUE` will be used in any
#' downstream gene set operations.
#'
#' @section Related Functions:
#'
#' * [unconform()]: Resets the conformation mapping.
#' * [is.conformed()]: If `to` is missing, looks for evidence that `conform` has
#' been called (at all) on `x`. If `to` is provided, specifically checks that
#' `x` has been conformed to the target object `to`.
#'
#' @rdname conform
#'
#' @param x The GeneSetDb
#' @param target The expression object/matrix to conform to. This could also
#' just be a character vector of IDs.
#' @param unique.by If there are multiple rows that map to the identifiers used
#' in the genesets, this is a means to pick the single row for that ID
#' @param min.gs.size Ensure that the genesets that make their way to the
#' `GeneSetDb@@table` are of a minimum size
#' @param max.gs.size Ensure that the genesets that make their way to the
#' `GeneSetDb@@table` are smaller than this size
#' @param match.tolerance Numeric value between \[0,1\]. If the fraction of
#' `feature_id`s used in `x` that match `rownames(y)` is below
#' this number, a warning will be fired.
#' @param ... moar args
#'
#' @return A [GeneSetDb()] that has been matched/conformed to an expression
#' object target `y`.
#'
#' @examples
#' es <- exampleExpressionSet()
#' gdb <- exampleGeneSetDb()
#' head(geneSets(gdb))
#' gdb <- conform(gdb, es)
#' ## Note the updated values `active` flag, and n (the number of features
#' ## mapped per gene set)
#' head(geneSets(gdb))
setMethod("conform", c(x="GeneSetDb"),
function(x, target, unique.by=c('none', 'mean', 'var'),
min.gs.size=2L, max.gs.size=Inf, match.tolerance=0.25, ...) {
unique.by <- match.arg(unique.by)
if (unique.by != 'none') {
stop("`unique.by` must be 'none' for now")
}
# if (min.gs.size == 1) {
# stop("Doing GSEA with 1 gene doesn't really make sense, does it?")
# }
if (max.gs.size < min.gs.size) {
stop("max.gs.size must be larger than min.gs.size")
}
## We are allowing y to be a character vector of featureIds here
if (is.vector(target) && is.character(target)) {
target <- matrix(1L, nrow=length(target), ncol=1L,
dimnames=list(target, NULL))
}
if (!any(sapply(.valid.x, function(claz) is(target, claz)))) {
stop("Illegal type of expression object to conform to")
}
# x <- copy(x)
fm <- featureIdMap(x, as.dt=TRUE)
fm$x.idx <- match(fm$x.id, rownames(target))
fraction.match <- mean(!is.na(fm$x.idx))
if (fraction.match <= match.tolerance) {
warning("Fraction of gene set IDs that match rownames in the expression ",
"object are low: ", sprintf("%.2f%% ", fraction.match * 100),
immediate.=TRUE)
}
if (fraction.match == 0) {
stop("None of the rownames of the expression object match the featureIds ",
"for the genesets")
}
otable <- x@table
ntable <- x@db[, {
f <- feature_id
xref <- fm[list(f)]
n <- sum(!is.na(xref$x.idx))
active <- n >= min.gs.size && n <= max.gs.size
list(active=active, N=.N, n=n)
}, by=c('collection', 'name')]
## Add back columns from original x@table that you just nuked
kosher <- all.equal(
otable[, list(collection, name)],
ntable[, list(collection, name)])
stopifnot(kosher)
add.me <- setdiff(colnames(otable), colnames(ntable))
for (col in add.me) ntable[, (col) := otable[[col]]]
x@table <- ntable
inactive <- !x@table$active
if (any(inactive)) {
msg <- paste("Deactivating %d gene sets because conformation of GeneSetDb",
"to the target creates gene sets smaller than %s or greater",
"than %s\n")
msg <- sprintf(msg, sum(inactive), as.character(min.gs.size),
as.character(max.gs.size))
warning(msg, immediate.=TRUE)
}
x@featureIdMap <- fm
x
})
#' @export
#' @rdname conform
setMethod("unconform", "GeneSetDb", function(x, ...) {
x@table <- transform(x@table, active=FALSE)
x@featureIdMap <- transform(x@featureIdMap, x.idx=NA_integer_)
x
})
#' @export
#' @describeIn conform Checks to see if GeneSetDb `x` is conformed to a target
#' object `to`
#' @param to the object to test conformation to
is.conformed <- function(x, to) {
if (!is(x, 'GeneSetDb')) {
stop("Only works on the GeneSetDb")
}
if (missing(to)) {
ans <- any(!is.na(featureIdMap(x, as.dt=TRUE)$x.idx))
} else {
## Verify that gsd is properly conformed to x
fm <- subset(featureIdMap(x, as.dt=TRUE), !is.na(x.idx))
to.ids <- if (is.character(to))
to
else if (is.vector(to))
names(to)
else
rownames(to)
if (!is.character(to.ids)) {
stop("featureIds unsuccessfully extracted from `to`")
}
ans <- nrow(fm) > 0 && all(to.ids[fm$x.idx] == fm$x.id)
}
ans
}
#' Creates a 1/0 matrix to indicate geneset membership to target object.
#'
#' Generates an inidcator matrix to indicate membership of genes (columns)
#' to gene sets (rows). If `y` is provided, then the incidence is mapped
#' across the entire feature-space of `y`.
#'
#' @export
#'
#' @param x A [GeneSetDb()]
#' @param y (optional) A target (expression) object `x` is (or can be)
#' conformed to
#' @param ... parameters passed down into [conform()].
#' @return incidence matrix with nrows = number of genesets and columns are
#' featureIDs. If `y` is passed in, the columns of the returned value
#' match the rows of `y`.
#'
#' @examples
#' vm <- exampleExpressionSet()
#' gdb <- exampleGeneSetDb()
#' im <- incidenceMatrix(gdb)
#' imv <- incidenceMatrix(gdb, vm)
incidenceMatrix <- function(x, y, ...) {
stopifnot(is(x, 'GeneSetDb'))
gs <- NULL
if (missing(y)) {
val <- 'feature_id'
ynames <- unique(x@db$feature_id)
ncol <- length(ynames)
} else {
val <- 'x.idx'
x <- conform(x, y, ...)
if (is.vector(y)) {
ynames <- y
ncol <- length(y)
} else {
ynames <- rownames(y)
ncol <- nrow(y)
}
}
gs <- geneSets(x, as.dt=TRUE)
dimnames <- list(encode_gskey(gs), ynames)
out <- matrix(0L, nrow(gs), ncol, dimnames=dimnames)
for (i in seq_len(nrow(gs))) {
fids <- featureIds(x, gs$collection[i], gs$name[i], value=val)
out[i, fids] <- 1L
}
out
}
#' Interrogate "active" status of a given geneset.
#'
#' Returns the `active` status of genesets, which are specified by
#' their collection,name compound keys. This function is vectorized and
#' supports query of multiple gene sets at a time. If a requested
#' collection,name gene set doesn't exist, this throws an error.
#'
#' @export
#' @param x [GeneSetDb()]
#' @param i collection of geneset(s)
#' @param j name of geneset(s) (must be same length as `i`.
#' @return logical indicating if geneset is active. throws an error if
#' any requested geneset does not exist in `x`.
#' @examples
#' dge.stats <- exampleDgeResult()
#' y <- exampleExpressionSet(do.voom = FALSE)
#' gdb <- conform(exampleGeneSetDb(), y, min.gs.size = 10)
#' # size 9 geneset:
#' geneSet(gdb, "c2", "BYSTRYKH_HEMATOPOIESIS_STEM_CELL_IL3RA")
#' is.active(gdb, "c2", "BYSTRYKH_HEMATOPOIESIS_STEM_CELL_IL3RA")
#' # geneset with >100 genes
#' is.active(gdb, "c7", "GSE3982_MAC_VS_NEUTROPHIL_LPS_STIM_DN")
is.active <- function(x, i, j) {
stopifnot(is(x, 'GeneSetDb'))
stopifnot(is.character(i), is.character(j), length(i) == length(j))
gsx <- geneSets(x, active.only=FALSE, as.dt=TRUE)[list(i,j), nomatch=NA]
res <- gsx$active
isna <- is.na(res)
if (any(isna)) {
unk <- paste(i[isna], j[isna], sep=":", collapse=",")
stop(sprintf("Unknown genesets: %s", unk))
}
res
}
#' @describeIn subsetByFeatures subset GeneSetDb by feature id's
setMethod("subsetByFeatures", c(x="GeneSetDb"),
function(x, features, value=c('feature_id', 'x.id', 'x.idx'), ...) {
value <- match.arg(value)
## some good old data.table voodoo going on inside here
unk.f <- setdiff(features, featureIds(x, value=value))
if (length(unk.f)) {
warning(length(unk.f), "/", length(features), " do not exist in GeneSetDb")
features <- setdiff(features, unk.f)
}
dat <- merge(x@db, featureIdMap(x, as.dt=TRUE), by='feature_id')
hits <- unique(dat[dat[[value]] %in% features, list(collection, name)])
gs.all <- geneSets(x, active.only=FALSE, as.dt=TRUE)
keep <- rep(FALSE, nrow(gs.all))
gs.idx <- gs.all[hits, which=TRUE]
if (length(gs.idx)) {
keep[gs.idx] <- TRUE
}
x[keep]
})
#' @rdname featureIds
setMethod("featureIds", c(x="GeneSetDb"),
function(x, i, j, value=c('feature_id', 'x.id', 'x.idx'),
active.only=is.conformed(x), ...) {
if (missing(value)) {
value <- if (is.conformed(x)) 'x.id' else 'feature_id'
}
value <- match.arg(value, c('feature_id', 'x.id', 'x.idx'))
if (missing(i) && missing(j)) {
## User isn't asking about any particular collection, but just wants all
## features in the GeneSetDb as a whole ... OK(?)
fmap <- x@featureIdMap
if (is.conformed(x) && active.only) {
fmap <- fmap[!is.na(x.idx)]
}
out <- unique(fmap, by=value)[[value]]
return(out[!is.na(out)])
}
if (!isSingleCharacter(i)) {
stop("collection (i) must be length 1 character vectors")
}
if (missing(j)) {
whole.collection <- TRUE
} else {
if (!isSingleCharacter(j)) {
stop("gene set name (j) must be length 1 character vectors")
}
whole.collection <- FALSE
}
gs <- geneSets(x, active.only=active.only, as.dt=TRUE)
gs <- gs[, key(gs), with=FALSE]
gs <- gs[list(i)]
if (nrow(gs) == 0L) {
stop("There are no ", if (active.only) "active " else NULL,
"genesets in collection: ", i)
}
if (whole.collection) {
db <- unique(x@db[gs], by='feature_id')
} else {
## I am purposefully not using `hasGeneSet` here for performance reasons
## hasGeneSet(x, i, j, as.error=TRUE)
db <- x@db[list(i, j)]
if (is.na(db$feature_id[1L])) {
msg <- sprintf("collection=%s, name=%s does not exist in GeneSetDb db",
i, j)
stop(msg)
}
}
fid.map <- featureIdMap(x, as.dt=TRUE)[db$feature_id]
if (is.conformed(x) && active.only) {
fid.map <- fid.map[!is.na(x.idx)]
}
fid.map[[value]]
})
#' @describeIn featureIdMap extract featureIdMap from a GeneSetDb
#' @template asdt-param
setMethod("featureIdMap", c(x="GeneSetDb"), function(x, as.dt = FALSE) {
out <- x@featureIdMap
if (!as.dt) out <- setDF(copy(out))
out
})
#' Replacing the featureIdMap blows away the "conform"-ation status of x
#'
#' This method ensures that there is only one feature_id <-> x.id mapping value.
#' Note that this does not mean that this enforces a 1:1 mapping, it's just
#' that the same mapping is not listed more than once.
#'
#' @noRd
setReplaceMethod('featureIdMap', 'GeneSetDb', function(x, value) {
if (!is.data.frame(value)) {
stop("data.frame/table required for featureIdMap<-")
}
if (!ncol(value) == 2) {
stop("featureIdMap must be a 2 column data.frame")
}
if (!all(x@db$feature_id %in% value[[1]])) {
stop("Some @db$feature_id's are not in first column of new featureIdMap")
}
value <- as.data.table(value)
setnames(value, c('feature_id', 'x.id'))
value <- unique(value, by=c('feature_id', 'x.id'))
setkeyv(value, 'feature_id')
value[, x.idx := NA_integer_]
x@featureIdMap <- value
unconform(x)
})
#' @describeIn geneSets return all genesets from a GeneSetDb
setMethod("geneSets", c(x="GeneSetDb"),
function(x, active.only=is.conformed(x), ... , as.dt=FALSE) {
out <- if (active.only[1L]) x@table[active == TRUE] else x@table
if (!as.dt) out <- setDF(copy(out))
out
})
#' @rdname geneSet
#' @param collection using `i` as the parameter for "collection" isn't intuitive
#' so if speficially set this paramter, it will replace the value for `i`.
#' @param name the same for the `collection`:`i` parameter relationship, but for
#' `j`:`name`.
#' @examples
#' gdb <- exampleGeneSetDb()
#' geneSet(gdb, "c2", "KOMMAGANI_TP63_GAMMA_TARGETS")
#' geneSet(gdb, collection = "c2", name = "KOMMAGANI_TP63_GAMMA_TARGETS")
#' geneSet(gdb, name = "KOMMAGANI_TP63_GAMMA_TARGETS")
setMethod("geneSet", c(x="GeneSetDb"),
function(x, i, j, active.only=is.conformed(x), with.feature.map=FALSE, ...,
collection = NULL, name = NULL, as.dt = FALSE) {
if (!is.null(collection) && missing(i)) i <- collection
if (!is.null(name) && missing(j)) j <- name
if (missing(i) && is.null(collection)) {
stopifnot(isSingleCharacter(j))
gs <- geneSets(x, as.dt = TRUE)[name == j]
if (nrow(gs) != 1L) {
stop("Cannot resolve geneset using only `name` == `'", j, "'`")
}
i <- gs[["collection"]]
} else if (missing(j) && is.null(name)) {
stopifnot(isSingleCharacter(i))
gs <- geneSets(x, as.dt = TRUE)[collection == i]
if (nrow(gs) != 1L) {
stop("Cannot resolve geneset using only `collection` == `'", i, "'`")
}
j <- gs[["name"]]
}
stopifnot(isSingleCharacter(i), isSingleCharacter(j))
fids <- featureIds(x, i, j, value='feature_id', active.only=active.only, ...)
info <- geneSets(x, active.only=FALSE, as.dt=TRUE)[list(i, j)]
info <- info[, c("collection", "name", "active", "N", "n"), with=FALSE]
## Fetch information from x@db. Extra information per feature are stored here
dbx <- x@db[list(i,j,fids)]
out <- cbind(info[rep(1L, nrow(dbx))], dbx[, -(1:2), with=FALSE])
## Add featureIdMap info
if (with.feature.map) {
fminfo <- featureIdMap(x, as.dt=TRUE)[list(fids)]
out <- cbind(out, fminfo[, -1L, with=FALSE])
}
if (!as.dt) out <- setDF(copy(out))
out
})
#' Subset GeneSetDb to only include specified genesets.
#'
#' This is a utility function that is called by \code{[.GeneSetDb} and is not
#' exported because it is not meant for external use.
#'
#' DEBUG: If `keep` is all FALSE, this will explode. What does an empty
#' GeneSetDb look like, anyway? Something ...
#'
#' We want to support a better, more fluent subsetting of GeneSetDb objects.
#' See Issue #12 (https://github.com/lianos/multiGSEA/issues/12)
#'
#' @param x a [GeneSetDb()]
#' @param keep logical vector as long as `nrow(geneSets(x, active.only=FALSE)`
#' @return a `GeneSetDb` that has only the results for the specified
#' genesets.
#' @examples
#' gdb.all <- exampleGeneSetDb()
#' gs <- geneSets(gdb.all)
#' gdb <- gdb.all[gs$collection %in% c("c2", "c7")]
subset.GeneSetDb <- function(x, keep) {
stopifnot(is(x, 'GeneSetDb'))
if (all(keep == FALSE)) {
stop("Cannot subset GeneSetDb down to empty (`keep` is all FALSE)")
}
nr <- nrow(geneSets(x, active.only=FALSE, as.dt=TRUE))
if (!is.logical(keep) && length(keep) != nr) {
stop("The `keep` vector is FUBAR'd")
}
## 1. Remove rows from x@table
## 2. Remove rows in x@db that belong to collection,name that do not exist
## due to (1)
## 3. remove entries in x@featureIdMap for features that no longer exist in
## updated db from (2)
## 4. Update x@collectionMetadata to:
## a. remove all metadata for collections that are completely gone
## b. update remaining collection,count entries for remaining collections
## 1
keep.table <- x@table[keep]
## 2
gs.keys <- keep.table[, key(keep.table), with=FALSE]
setkeyv(gs.keys, key(keep.table))
keep.db <- x@db[gs.keys, nomatch=0] ## only keep entries in db in gs.keys
## 3
keep.featureIdMap <- subset(x@featureIdMap, feature_id %in% keep.db$feature_id)
## 4a
keep.cm <- subset(x@collectionMetadata, collection %in% keep.db$collection)
## 4b
# NOTE: remove count collectionMetadata
# cc <- keep.table[, list(name='count', value=.N), by='collection']
# setkeyv(cc, key(keep.cm))
## Currently (data.table v1.9.4( there's nothing I can do to make i.value a
## list element and this `set` mojo doesn't work either
# value <- i.value <- NULL # silence R CMD check NOTEs
# suppressWarnings(keep.cm[cc, value := list(i.value)])
## update.idxs <- keep.cm[cc, which=TRUE]
## val.idx <- which(colnames(keep.cm) == 'value')
## for (idx in seq_along(update.idxs)) {
## set(keep.cm, update.idxs[idx], val.idx, list(cc$value[idx]))
## }
out <- .GeneSetDb(table=keep.table,
db=keep.db,
featureIdMap=keep.featureIdMap,
collectionMetadata=keep.cm)
out
}
#' Subset whole genesets from a GeneSetDb
#' @exportMethod [
#' @param x GeneSetDb
#' @param i a logical vector as long as `nrow(geneSets(x))` indicating which
#' geneSets to keep
#' @param j ignored
#' @param ... pass through arguments
#' @return GeneSetDb `x` with a subset of the genesets it came in with.k
#' @param drop ignored
setMethod("[", "GeneSetDb", function(x, i, j, ..., drop = FALSE) {
if (length(list(...)) > 0) {
stop("parameters in '...' not supported")
}
if (!missing(drop)) {
warning("drop argument ignored", immediate.=TRUE)
}
if (!missing(j)) {
warning("j is ignored", immediate.=TRUE)
}
if (!is.logical(i) && length(i) != nrow(geneSets(x))) {
stop("i must be a logical vector as long as nrow(geneSets(x))")
}
subset.GeneSetDb(x, i)
})
## -----------------------------------------------------------------------------
## Functions over collections
#' Check if a collection exists in the \code{GeneSetDb}
#'
#' @export
#' @param x A [GeneSetDb()]
#' @param collection character vector of name(s) of the collections to query
#' @param as.error logical if TRUE, this will error instead of returning FALSE
#' @return logical indicating if this collection exists
#' @examples
#' gdb <- exampleGeneSetDb()
#' hasGeneSetCollection(gdb, "c2")
#' hasGeneSetCollection(gdb, "unknown collection")
hasGeneSetCollection <- function(x, collection, as.error=FALSE) {
stopifnot(is(x, 'GeneSetDb'))
stopifnot(is.character(collection))
meta.idxs <- match(collection,
collectionMetadata(x, as.dt=TRUE)$collection)
gsc.exists <- !is.na(meta.idxs)
if (!all(gsc.exists) && as.error) {
bad <- paste(" * ", collection[!gsc.exists], collapse='\n', sep='')
stop("The following collections to not exist:\n", bad)
}
gsc.exists
}
#' Check to see if the GeneSetDb has a collection,name GeneSet defined
#'
#' @export
#' @param x GeneSetDb
#' @param collection character indicating the collection
#' @param name character indicating the name of the geneset
#' @param as.error If `TRUE`, a test for the existance of the geneset will throw
#' an error if the geneset does not exist
#' @return logical indicating whether or not the geneset is defined.
#'
#' @examples
#' gdb <- exampleGeneSetDb()
#' hasGeneSet(gdb, c('c2', 'c7'), c('BIOCARTA_AGPCR_PATHWAY', 'something'))
hasGeneSet <- function(x, collection, name, as.error=FALSE) {
gs.exists <- !is.na(.gsd.row.index(x, collection, name))
if (as.error && !all(gs.exists)) {
msg <- "The follwing %d gene sets do not exist: \n%s"
not <- paste(collection[!gs.exists], name[!gs.exists],
sep=":", collapse="\n")
stop(sprintf(msg, sum(!gs.exists), not))
}
gs.exists
}
#' @describeIn collectionMetadata Returns metadata for all collections
setMethod("collectionMetadata",
c(x="GeneSetDb", collection="missing", name="missing"),
function(x, collection, name, as.dt=FALSE) {
out <- x@collectionMetadata
if (!as.dt) {
warning("The collectionMetadata has a list column ('value'). This ",
"may not play well with data.frame based operation. Consider ",
"returning as a data.table by setting `as.dt = TRUE`",
immediate. = TRUE)
out <- setDF(copy(out))
}
out
})
#' @describeIn collectionMetadata Returns all metadata for a specific collection
setMethod("collectionMetadata",
c(x="GeneSetDb", collection="character", name="missing"),
function(x, collection, name, as.dt=FALSE) {
stopifnot(isSingleCharacter(collection))
hasGeneSetCollection(x, collection, as.error=TRUE)
out <- x@collectionMetadata[collection]
if (!as.dt) {
warning("The collectionMetadata has a list column ('value'). This ",
"may not play well with data.frame based operation. Consider ",
"returning as a data.table by setting `as.dt = TRUE`",
immediate. = TRUE)
out <- setDF(copy(out))
}
out
})
#' @describeIn collectionMetadata Returns the `name` metadata value for a given
#' `collection`.
setMethod("collectionMetadata",
c(x="GeneSetDb", collection="character", name="character"),
function(x, collection, name, as.dt=FALSE) {
stopifnot(isSingleCharacter(collection))
stopifnot(isSingleCharacter(name))
hasGeneSetCollection(x, collection, as.error=TRUE)
.col <- collection
.name <- name
cmd <- x@collectionMetadata
idx <- cmd[list(.col, .name), which=TRUE]
if (is.na(idx[1L])) {
msg <- sprintf("metadata not defined for collection:%s, varname:%s",
.col, .name)
stop(msg)
}
cmd$value[[idx]]
})
#' @describeIn collectionMetadata returns the URL for a geneset
setMethod("geneSetURL", c(x = "GeneSetDb"), function(x, i, j, ...) {
stopifnot(is.character(i), is.character(j), length(i) == length(j))
collections <- unique(i)
col.exists <- hasGeneSetCollection(x, collections)
url.fns <- Map(collections, col.exists, f = function(col, exists) {
if (exists) {
geneSetCollectionURLfunction(x, col)
} else {
function(x, y, ...) NA_character_
}
})
mapply(i, j, FUN = function(col, name, ...) {
url.fns[[col]](col, name, gdb = x, ...)
})
})
#' @describeIn geneSetCollectionURLfunction returns the gene set collection
#' url function from a GeneSetDb
#' @param x The GeneSetDb
#' @param i The collection to get the url function from
#' @param ... pass through arguments (not used)
#' @return the function that maps collection,name combinations to an
#' informational URL.
#' @examples
#' gdb <- exampleGeneSetDb()
#' geneSetCollectionURLfunction(gdb, "c2", "BIOCARTA_AGPCR_PATHWAY")
setMethod("geneSetCollectionURLfunction", "GeneSetDb", function(x, i, ...) {
stopifnot(isSingleCharacter(i))
fn.dt <- x@collectionMetadata[list(i, 'url_function'), nomatch = 0]
if (nrow(fn.dt) == 0) {
## validObject(x) : this call is slow and should never be FALSE anyway
stop(sprintf("No url_function for collection '%s' found", i))
}
if (nrow(fn.dt) > 1) {
## validObject(x) : this call is slow and should never be FALSE anyway
stop(sprintf("Multiple url_function defined for collection '%s' found", i))
}
fn <- fn.dt$value[[1L]]
if (test_string(fn)) {
fn <- get_function(fn)
}
if (!is.function(fn)) {
## validObject(x) : this call is slow and should never be FALSE anyway
stop(sprintf("The URL function for collection '%s' is missing", i))
}
fn
})
#' @describeIn geneSetCollectionURLfunction sets the gene set collection url
#' function for a `GeneSetDb : Collection` combination.
#' @param value the function to set as the geneset url function for the given
#' collection `i`. This can be an actual function object, or the (string)
#' name of the function to pull out of "the ether"
#' (`"pkgname::functionname"` can work, too). The latter is preferred as
#' it results in smaller serialized GeneSetDb objects.
setReplaceMethod("geneSetCollectionURLfunction", "GeneSetDb",
function(x, i, value) {
valid <- function(v) {
v <- get_function(v)
if (!isTRUE(is.function(v))) return(FALSE)
args <- formalArgs(v)
# We didn't specify previously that the names of the arguments had to be
# collection and name, so don't check those. Just require >= 3, and that
# one of them is "...
if (length(args) < 3 || !"..." %in% args) {
warning("geneSetURL functions must take at least two named arguments, ",
"and must also handle `...`")
return(FALSE)
}
url.test <- tryCatch(v("a", "b", x), error = function(e) NULL)
if (!test_string(url.test)) {
warning("geneSetURL does not return a string when called")
return(FALSE)
}
TRUE
}
# We explicitly strip the calling environment associated with the function
# as this can result in huge objects. Sometimes saveRDS'ing small GeneSetDb
# objects resulted in multi 100's of MB's worth of an *.rds file
#
# If the function's envirnment has a `.fn.local.vars` character vector, then
# only those arguments are kept in the environment, otherwise we only keep
# objects that are character vectors
if (is.function(value)) {
warning("Storing actual functions as a url_function inflates serialized ",
"consider storing the name of the function instead")
environment(value) <- new.env(parent = parent.env(environment(value)))
}
added <- addCollectionMetadata(x, i, 'url_function', value, valid)
added
})
#' @describeIn collectionMetadata sets the feature id type for a collection
setReplaceMethod("featureIdType", "GeneSetDb", function(x, i, value) {
valid <- function(v) is(v, 'GeneIdentifierType')
addCollectionMetadata(x, i, 'id_type', value, valid)
})
#' @describeIn collectionMetadata retrieves the feature id type for a collection
setMethod("featureIdType", "GeneSetDb", function(x, i, ...) {
x@collectionMetadata[list(i, 'id_type')]$value[[1L]]
})
#' @section Adding arbitrary collectionMetadata:
#'
#' Adds arbitrary metadata to a gene set collection of a GeneSetDb
#'
#' Note that this is not a replacement method! You must catch the returned
#' object to keep the one with the updated `collectionMetadata`. Although this
#' function is exported, I imagine this being used mostly through predefined
#' replace methods that use this as a utility function, such as the replacement
#' methods `featureIdType<-`, `geneSetURLfunction<-`, etc.
#'
#' ```
#' gdb <- getMSigGeneSetDb('H')
#' gdb <- addCollectionMetadata(gdb, 'H', 'foo', 'bar')
#' ```
#'
#' @export
#' @rdname collectionMetadata
#'
#' @param x [GeneSetDb()]
#' @param xcoll The collection name
#' @param xname The name of the metadata variable
#' @param value The value of the metadata variable
#' @param validate.value.fn If a function is provided, it is run on
#' `value` and msut return `TRUE` for addition to be made
#' @param allow.add If `FALSE`, this xcoll,xname should be in the
#' `GeneSetDb` already, and this will fail because something is
#' deeply wrong with the world
#' @return The updated `GeneSetDb`.
addCollectionMetadata <- function(x, xcoll, xname, value,
validate.value.fn=NULL, allow.add=TRUE) {
stopifnot(is(x, 'GeneSetDb'))
stopifnot(is.character(xcoll))
if (!hasGeneSetCollection(x, xcoll)) {
stop("GeneSetDb does not have collection: ", xcoll)
}
if (is.function(validate.value.fn)) {
if (!isTRUE(validate.value.fn(value))) {
stop(sprintf("Invalid value used to update %s,%s", xcoll, xname))
}
}
# update or replace
idx <- x@collectionMetadata[list(xcoll, xname), which=TRUE]
if (is.na(idx)) {
if (!allow.add) {
msg <- sprintf("%s,%s metadata does not exist in the GeneSetDb, but it",
"should be there. Your GeneSetDb is hosed")
stop(msg)
}
# the variable you want to enter here is not there yet, so we create an
# empty, single-row data.table that will be added to the current metadata
add.me <- data.table(
collection = xcoll,
name = xname,
value = list(value))
cm <- rbind(x@collectionMetadata, add.me)
setkeyv(cm, key(x@collectionMetadata))
x@collectionMetadata <- cm
} else {
# Need to use list(list()) because data.table uses list(.) to look for
# values to assign to columns by reference.
# https://stackoverflow.com/a/22536321/83761
set(x@collectionMetadata, i = idx, j = "value", value = list(list(value)))
}
x
}
#' Add metadata at the geneset level.
#'
#' This function adds/updates columns entries in the `geneSets(gdb)` table.
#' If there already are defined meta values for the columns of `meta` in `x`,
#' these will be updated with the values in `meta`.
#'
#' TODO: should this be a setReplaceMethod, Issue #13 (?)
#' https://github.com/lianos/multiGSEA/issues/13
#'
#' @export
#'
#' @param x a `GeneSetDb` object
#' @param meta a `data.frame`-like object with `"collection"`, `"name"`, and
#' an arbitrary amount of columns to add as metadata for the genesets.
#' @param ... not used yet
#' @return the updated `GeneSetDb` object `x`.
#' @examples
#' gdb <- exampleGeneSetDb()
#' meta.info <- transform(
#' geneSets(gdb)[, c("collection", "name")],
#' someinfo = sample(c("one", "two"), nrow(gdb), replace = TRUE))
#' gdb <- addGeneSetMetadata(gdb, meta.info)
addGeneSetMetadata <- function(x, meta, ...) {
stopifnot(is(x, "GeneSetDb"))
k <- key(x@table)
stopifnot(is(meta, "data.frame"))
stopifnot(all(k %in% colnames(meta)))
special <- c("N", "n", "active")
special <- intersect(special, colnames(meta))
if (length(special)) {
warning("Ignoring the following protected columns in the meta table:\n",
paste(special, collapse = ","), immediate. = TRUE)
}
meta <- as.data.table(meta)
setkeyv(meta, k)
mdt <- unique(meta, by = k)
if (nrow(mdt) != nrow(meta)) {
stop("You have duplicate collection,name entries in the updated `meta` ",
"data.frame. This is currently not allowed.")
}
# handle non std eval NOTE in R CMD check when using `:=` mojo
.idx. <- NULL
xtable <- copy(x@table)[, .idx. := 1:.N]
xref <- xtable[mdt]
bad.meta <- is.na(xref$N)
if (any(bad.meta)) {
warning(sum(bad.meta), " unknown gene sets in the meta update, ignoring.",
immediate. = TRUE)
xref <- xref[!bad.meta]
meta <- meta[!bad.meta]
}
mcols <- setdiff(colnames(meta), c(k, special))
if (length(mcols)) {
idxs <- xref[[".idx."]]
for (mc in mcols) {
vals <- meta[[mc]]
xtable[idxs, (mc) := vals]
}
}
stopifnot(
all.equal(xtable[, list(collection, name)], x@table[, list(collection, name)]),
all.equal(xtable$N, x@table$N))
xtable[[".idx."]] <- NULL
x@table <- xtable
x
}
#' Combines two GeneSetDb objects together
#'
#' @importMethodsFrom BiocGenerics combine
#' @exportMethod combine
#' @param x a GeneSetDb object
#' @param y a GeneSetDb object
#' @param ... more things
#' @return a new GeneSetDb that contains all genesets from `x` and `y`
#' @examples
#' gdb1 <- exampleGeneSetDb()
#' gdb2 <- GeneSetDb(exampleGeneSetDF())
#' gdb <- combine(gdb1, gdb2)
setMethod("combine", c(x = "GeneSetDb", y = "GeneSetDb"), function(x, y, ...) {
## Combine the db and featureIdMap(s)
db <- rbindlist(list(x@db, y@db), use.names=TRUE, fill=TRUE)
db <- unique(db, by=c('collection', 'name', 'feature_id'))
db <- setkeyv(db, key(x@db))
fms <- list(featureIdMap(x, as.dt=TRUE), featureIdMap(y, as.dt=TRUE))
fm <- rbindlist(fms, use.names=TRUE, fill=TRUE)
# ensure that a feature_id entry maps to only one x.id entry
# DEBUG: Is this uniquification necessary?
fm <- unique(fm, by=c('feature_id', 'x.id'))
fm[, x.idx := NA_integer_] ## blow out any `conform`-ation information
setkeyv(fm, 'feature_id')
cmeta <- rbind(x@collectionMetadata, y@collectionMetadata)
cmeta <- unique(cmeta, by=key(x@collectionMetadata))
setkeyv(cmeta, key(x@collectionMetadata))
out <- .GeneSetDb(db=db, featureIdMap=fm, table=init.gsd.table.from.db(db),
collectionMetadata=cmeta)
# Transfer over any extra metadata (columns) of the @table slots from
# the two inputs incase the user stored extra data at the geneset level
# in them.
add.gs.cols <- setdiff(
c(names(x@table), names(y@table)),
names(out@table))
if (length(add.gs.cols)) {
gs.keys <- key(out@table)
# If there were duplicate collection,name entries among the two GeneSetDb
# objects, let's pull them out here. If both share the same extra meta
# data names, x will win.
x.cols <- c(gs.keys, intersect(add.gs.cols, names(x@table)))
y.cols <- c(gs.keys, intersect(add.gs.cols, names(y@table)))
duped <- merge(
x@table[, x.cols, with = FALSE],
y@table[, y.cols, with = FALSE],
by = gs.keys,
suffixes = c("", ".axeme"))
# now just mash together the rest.
gs <- rbindlist(list(x@table, y@table), use.names = TRUE, fill = TRUE)
gs <- gs[, union(x.cols, y.cols), with = FALSE]
setkeyv(gs, gs.keys)
if (nrow(duped)) {
# duplicate collection,name gene sets in x and y create repeated rows
# in gs. Those are removed and replaced with `duped`
duped <- duped[, !grepl(".axeme", names(duped)), with = FALSE]
gs <- rbindlist(list(duped, gs[!duped]), use.names = TRUE, fill = TRUE)
setkeyv(gs, gs.keys)
}
out@table <- merge(out@table, gs, by = gs.keys, all.x = TRUE)
}
out
})
#' @importMethodsFrom BiocGenerics nrow
#' @exportMethod nrow
#' @describeIn geneSets return number of genesets in GeneSetDb
setMethod("nrow", "GeneSetDb", function(x) nrow(geneSets(x, as.dt=TRUE)))
#' Checks equality (feature parity) between GeneSetDb objects
#'
#' @export
#'
#' @method all.equal GeneSetDb
#' @param target The reference `GeneSetDb` to compare against
#' @param current The `GeneSetDb` you wan to compare
#' @param features.only Only compare the "core" columns of `target@db`
#' and `target@@table`. It is possible that you added additional columns
#' (to keep track of symbols in `target@@db`, for instance) that you
#' want to ignore for the purposes of the equality test.
#' @param ... moar args.
#' @return `TRUE` if equal, or \code{character} vector of messages if not.
all.equal.GeneSetDb <- function(target, current, features.only = TRUE, ...) {
stopifnot(is(target, 'GeneSetDb'))
stopifnot(is(current, 'GeneSetDb'))
msg <- TRUE
dbt <- setkeyv(copy(target@db), c('collection', 'name', 'feature_id'))
gst <- geneSets(target, active.only=FALSE, as.dt=TRUE)
dbc <- setkeyv(copy(current@db), key(dbt))
gsc <- geneSets(current, active.only=FALSE, as.dt=TRUE)
proto <- new("GeneSetDb")
if (features.only) {
dbt <- dbt[, names(proto@db), with=FALSE]
dbc <- dbc[, names(proto@db), with=FALSE]
gst <- gst[, names(geneSets(proto, as.dt=TRUE)), with=FALSE]
gsc <- gsc[, names(gst), with=FALSE]
}
msg <- all.equal(dbt, dbc)
if (!isTRUE(msg)) {
return(msg)
}
msg <- all.equal(gst, gsc)
if (!isTRUE(msg) || features.only) {
return(msg)
}
all.equal(collectionMetadata(target, as.dt=TRUE),
collectionMetadata(current, as.dt=TRUE))
}
# Conversion Methods -----------------------------------------------------------
#' Convert a GeneSetDb to other formats.
#'
#' @description
#' As awesome as a GeneSetDb is, you might find a time when you'll need your
#' gene set information in an other format. To do that, we provide the
#' following functions:
#'
#' * `as(gdb, "BiocSetf')`: convert to a [BiocSet::BiocSet()].
#' * `as(gdb, "GeneSetCollection")`: Convert to a
#' [GSEABase::GeneSetCollection()] object.
#' * `as.data.(table|frame)(gdb)`: Perhaps the most natural format to convert to in
#' order to save locally and examine outside of Bioconductor's GSEA universe,
#' but not many other tools accept gene set definitions in this format.
#' * `as.list(gdb)`: A named list of feature identifiers. This is the format
#' that many of the limma gene set testing methods use
#'
#' @details
#' The `as.*` functions accept a `value` parameter which indicates the type of
#' IDs you want to export in the conversion:
#'
#' * `"feature_id"`: The ID used as originally entered into the `GeneSetDb`.
#' * `"x.idx"`: Only valid if the GeneSetDb `x` has been `conform`-ed to an
#' expession container. This option will export the features as the integer
#' rows of the expression container.
#' * `"x.id"`: Only valid if the GeneSetDb `x` has been `conform`-ed. The
#' target expression container might use feature identifiers that are
#' different than what is in the GeneSetDb. If an active featureMap is
#' set on the GeneSetDb, this will convert the original feature identifiers
#' into a different target space (entrez to ensembl, for instance). Using
#' this option, the features will be provided in the target space.
#'
#' @export
#' @rdname conversion
#' @name conversion
#' @aliases as.data.frame as.list as.data.table
#' @method as.data.table GeneSetDb
#'
#' @param x A `GeneSetDb` object
#' @param keep.rownames included here just for consistency with
#' `data.table::as.data.table`, but it is not used
#' @param value The value type to export for the feature ids, defaults to
#' `"feature_id"`.
#' @param active.only If the `GeneSetDb` is conformed, do you want to only
#' return the features and genests that match target and are "active"?
#' @param ... pass through arguments (not used)
#' @return a converted `GeneSetDb`
#'
#' @examples
#' es <- exampleExpressionSet()
#' gdb <- conform(exampleGeneSetDb(), es)
#' bs <- as(gdb, "BiocSet")
#' gdf <- as.data.frame(gdb)
#' gdb <- conform(gdb, es)
#' gdfi <- as.data.frame(gdb, value = 'x.idx')
#' gdl <- as.list(gdb)
as.data.table.GeneSetDb <- function(x, keep.rownames = FALSE,
value = c('feature_id', 'x.id', 'x.idx'),
active.only = is.conformed(x), ...) {
stopifnot(is(x, 'GeneSetDb'))
value <- match.arg(value)
if (!is.conformed(x) && value %in% c('x.id', 'x.idx')) {
stop("must use value='feature_id' for non-conformed GeneSetDb'")
}
fid.map <- featureIdMap(x, as.dt=TRUE)
if (is.conformed(x) && active.only) {
# x.idx <- NULL # silence R CMD check NOTEs
fid.map <- fid.map[!is.na(x.idx)]
}
gs <- copy(geneSets(x, active.only=active.only, as.dt=TRUE))
gene2cat <- merge(x@db, fid.map, by='feature_id')
gene2cat <- gene2cat[!is.na(gene2cat[[value]]),]
gene2cat$finalId <- gene2cat[[value]]
# collection <- name <- finalId <- feature_id <- NULL # silence R CMD check NOTEs
out <- gene2cat[, list(collection, name, feature_id=finalId)]
more.cols <- setdiff(names(gene2cat),
c(names(out), names(fid.map), 'finalId'))
if (length(more.cols)) {
for (col in more.cols) {
out[, (col) := gene2cat[[col]]]
}
}
setkeyv(out, key(x@db))
out[gs[, list(collection, name)]]
}
#' @describeIn conversion convert a GeneSetDb to data.frame
#' @export
#' @method as.data.frame GeneSetDb
#' @param row.names,optional included here for consistency with `as.data.frame`
#' generic function definition, but these are not used.
as.data.frame.GeneSetDb <- function(x, row.names = NULL, optional = NULL,
value = c('feature_id', 'x.id', 'x.idx'),
active.only = is.conformed(x), ...) {
value <- match.arg(value)
setDF(as.data.table(x, value = value, active.only = active.only, ...))
}
#' Split and conserve ordering
#'
#' Using base::split will turn f into a factor and won't preserve the ordering
#' of the elements in f. This function preserves the split order in f
#'
#' @noRd
csplit <- function(x, f) {
f <- as.character(f)
ff <- factor(f, levels=unique(f))
split(x, ff)
}
#' @noRd
#' @method as.list GeneSetDb
#' @export
as.list.GeneSetDb <- function(x, value = c('feature_id', 'x.id', 'x.idx'),
active.only = is.conformed(x), nested = FALSE,
...) {
stopifnot(is(x, 'GeneSetDb'))
value <- match.arg(value)
df <- as.data.frame(x, value = value, active.only = active.only)
if (nested) {
colls <- unique(df$collection)
## Using the "naive split" call converts xdf$name to a factor and doesn't
## preserve ordering
out <- sapply(colls, function(coll) {
xdf <- df[df[['collection']] == coll,,drop=FALSE]
csplit(xdf$feature_id, xdf$name)
}, simplify=FALSE)
} else {
df$key <- encode_gskey(df)
out <- csplit(df$feature_id, df$key)
}
out
}
#' @noRd
#' @importClassesFrom GSEABase GeneSetCollection
#' @importFrom GSEABase GeneSetCollection GeneSet NullIdentifier
setAs("GeneSetDb", "GeneSetCollection", function(from) {
gs <- geneSets(from, as.dt=TRUE)
n.coll <- length(unique(gs$collection))
## We explicitly set the key type after subsetting here in the event that
## a 0 row data.table is returned -- this isn't keyed in 1.9.4, which seems
## like a bug
cmd <- collectionMetadata(from, as.dt=TRUE)
org <- subset(cmd, name == 'organism')
id.type <- subset(cmd, name == 'id_type')
setkeyv(id.type, 'collection')
setkeyv(org, 'collection')
gsl <- lapply(seq_len(nrow(gs)), function(i) {
name <- gs$name[i]
coll <- gs$collection[i]
idt <- id.type[coll]$value[[1]]
if (!is(idt, 'GeneIdentifierType')) {
idt <- NullIdentifier()
}
ids <- featureIds(from, coll, name, 'feature_id')
xorg <- org[coll]$value[[1]]
if (is.null(xorg)) {
xorg <- ""
}
set.name <- name
if (n.coll > 1L) {
set.name <- paste0(coll, ';', set.name)
}
GeneSet(ids, setName=set.name, geneIdType=idt, organism=xorg)
})
gsc <- GeneSetCollection(gsl)
gsc
})
#' @noRd
#' @importClassesFrom BiocSet BiocSet
setAs("GeneSetDb", "BiocSet", function(from) {
# we can transfer a lot of things over, like the geneSetURL's and other
# collectionMetadata, but for now let's just get the basics going
# Append collectio names or no? The BiocSet doesn't have this concept, and
# primary keys there are just the set names.
#
# Let's check if the geneset names only are unique, in which case we won't
# include the collection prefix, otherwise we must.
rm.prefix <- !any(duplicated(geneSets(from)$name))
gs.list <- as.list(from)
if (rm.prefix) {
names(gs.list) <- sub(".*?;;", "", names(gs.list))
}
BiocSet::BiocSet(gs.list)
})
## -----------------------------------------------------------------------------
## indexing
#' Identify the row number of the geneset being queried.
#'
#' The method only allows querying by single length character vectors
#'
#' @noRd
#'
#' @param x A GeneSetDb
#' @param i the collection of the geneset
#' @param j the name of the geneset
#'
#' @return The row index of the geneset under question. If i,j do not match
#' a given geneset, then NA is returned.
.gsd.row.index <- function(x, i, j) {
stopifnot(is(x, 'GeneSetDb'), is.character(i), is.character(j))
geneSets(x, active.only=FALSE, as.dt=TRUE)[list(i, j), which=TRUE]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.