##' @title Update a gene-centric msdb object for GREAT-style enrichment analysis using a specified CRISPR annotation.
##'
##' @description Update a gene-centric `GeneSetDb` object for GREAT-style enrichment analysis using a specified annotation.
##'
##' Often, pooled screening libraries are constructed such that the gene targets of interest are associated
##' with variable numbers of semi-independent screen signals (associated with, e.g., sets of alternative promoters
##' or cis regulatory units). Such an arrangement is often unavoidable but produces to complications when
##' performing gene set enrichent analyses. This function conforms a standard `GeneSetDb` object to appropriately
##' consider this form of ultiple testing during ontological enrichment analyses according to the GREAT strategy
##' outlined by [McLean et al. (2009)](https://doi.org/10.1038/nbt.1630).
##'
##' Operationally, this means that genewise sets in the provided object will be translated to the corresponding
##' `geneSymbol` sets provided in the annotation file.
##'
##' @param annotation an annotation object returned by \code{ct.prepareAnnotation()}.
##' @param gsdb A gene-centric \code{GeneSetDb} object to conform to the relevant peakwise dataset.
##' @param minsize Minimum number of targets required to consider a geneset valid for analysis.
##' @param ... Additional arguments to be passed to `ct.prepareAnnotation()`.
##' @return A new \code{GeneSetDb} object with the features annotated genewise to pathways.
##' @examples
##' data(resultsDF)
##' data(ann)
##' gsdb <- ct.GREATdb(ann, gsdb = sparrow::getMSigGeneSetDb(collection = 'h', species = 'human', id.type = 'entrez'))
##' show(sparrow::featureIds(gsdb))
##' @export
ct.GREATdb <- function(annotation, gsdb = sparrow::getMSigGeneSetDb(collection = c("h", "c2"), species = "human", id.type = "ensembl"), minsize = 10, ...) {
.Deprecated('convertIdentifiers', package = 'sparrow',
msg = 'Consider using sparrow for collapsing targets to genes (see sparrow::convertIdentifiers()).')
# input check
if (!is(gsdb, "GeneSetDb")) {
if (is(gsdb, "GeneSetCollection") || is(gsdb, "GeneSet")) {
stop("A GeneSetDb is required. GeneSetCollections can be converted to a ", "GeneSetDb via the GeneSetDb constructor, or a call to ", "`as(gene_set_collection, 'GeneSetDb')`. See ?GeneSetDb for more ",
"details.")
}
stop("GeneSetDb required. Please see `?GeneSetDb` for ways to turn your ", "gene sets into a GeneSetDb, or ")
}
stopifnot(is(gsdb, "GeneSetDb"), is(minsize, "numeric"), (length(minsize) == 1))
# check the annotation object
annotation <- ct.prepareAnnotation(annotation, ...)
# Check to make sure that the geneID column values are present in the provided gsdb
genes <- na.omit(unique(annotation$geneID))
message(length(intersect(sparrow::featureIds(gsdb), genes)), " valid genes found in the supplied gene database (of ", length(genes), ")")
# Tabulate targets per gene
targetsPerGene <- lapply(genes, function(x) {
return(as.character(unique(annotation$geneSymbol[annotation$geneID %in% x])))
})
names(targetsPerGene) <- genes
minsetsize <- floor(minsize/max(unlist(lapply(targetsPerGene, length))))
# Pull the lists from the gsdb
gsdb <- sparrow::conform(gsdb, target = sparrow::featureIds(gsdb), min.gs.size = minsetsize)
gsdb_list <- as.list(gsdb)
new_gs <- lapply(gsdb_list, function(x) {
return(as.character(unique(unlist(targetsPerGene[x]))))
})
message(length(new_gs), " adjusted genesets created.")
if (length(new_gs) == 0) {
stop("Exiting.")
}
collections <- vapply(names(new_gs), function(x) {
strsplit(x, split = ";;")[[1]][1]
}, character(1))
sets <- vapply(names(new_gs), function(x) {
strsplit(x, split = ";;")[[1]][2]
}, character(1))
names(new_gs) <- sets
out <- lapply(unique(collections), function(z) {
new_gs[collections %in% z]
})
out_gdb <- sparrow::GeneSetDb(out, ...)
out_gdb <- sparrow::conform(out_gdb, target = sparrow::featureIds(out_gdb), min.gs.size = minsize)
return(out_gdb)
}
##' @title Prepare one or more resultsDF objects for analysis via Sparrow.
##'
##' @description Take in a list of results objects and return an equivalently-named list of input `data.frames` appropriate for `sparrow::seas()`.
##' By construction, the relevant target unit is extracted from the `geneSymbol` column of the provided results objects, which may. Note that the
##' genewise `@logFC` slot in the returned object will contain the appropriately-signed Z transformation of the P-value
##' assigned to the target. In most applications this is arguably more interpretable than e.g., the median gRNA log2 fold change.
##'
##' @param dflist A list of gCrisprTools results `data.frames` to be formatted.
##' @param collapse.on Should targets be annotated as `geneSymbol`s or `geneID`s (default)?
##' @param cutoff Numeric maximum value of `statistic` to define significance.
##' @param statistic Should cutoffs be calculated based on FDR (`best.q`) or P-value (`best.p`)?
##' @param regularize Logical indicating whether to regularize the result objects in `dflist` (e.g., use intersection set of
##' all targets), or keep as-is.
##' @param gdb Optionally, a `GeneSetDb` object to enable proper registration of the output. If provided, the
##' collapsing features in the provided `simpleDF`s must be present in the `gsd@db$feature_id` slot. Note that a GREAT-style `GeneSetDb` that
##' has been conformed via `ct.GREATdb()` will use `geneID`s as the `feature_id`.
##' @param active Optionally, the name of a logical column present in the provided result that will be used to define significant signals.
##' This is set to `replicated` by default to If a valid column name is provided, this overrides the specification of `cutoff` and `statistic`.
##' @return A list of `data.frames` formatted for evaluation with `sparrow::seas()`.
##' @examples
##' data(resultsDF)
##' ct.seasPrep(list('longer' = resultsDF, 'shorter' = resultsDF[1:10000,]), collapse.on = 'geneSymbol')
##' @export
ct.seasPrep <- function(dflist, collapse.on = c("geneID", "geneSymbol"), cutoff = 0.1, statistic = c("best.q", "best.p"), regularize = FALSE, gdb = NULL, active = 'replicated') {
# Input check
collapse.on <- match.arg(collapse.on)
stopifnot(is(cutoff, "numeric"), cutoff <= 1, cutoff >= 0, is(regularize, "logical"), is.character(active))
if (regularize) {
dflist <- ct.regularizeContrasts(dflist, collapse = collapse.on)
} else {
dflist <- lapply(dflist, ct.simpleResult, collapse = collapse.on)
}
statistic <- match.arg(statistic)
if (!is.null(gdb)) {
dflist <- lapply(dflist, function(x) {
x[(row.names(x) %in% sparrow::featureIds(gdb)), ]
})
message("Removed genes absent from the provided GeneSetDb.")
}
if (any(unlist(lapply(dflist, nrow)) == 0)) {
stop("No valid `feature_id`s observed for one or more of the provided contrasts! check `collapse.on`?")
}
out <- lapply(dflist, function(x) {
z <- 10^-(ct.softLog(x$best.p))
z <- qnorm(z/2) * ifelse(x$direction == "enrich", -1, 1)
sig <- (x[, statistic] <= cutoff)
if(active %in% names(x)){
stopifnot(is(x[,active], 'logical'))
sig <- x[,active]
message("Using provided active set: ", active)
}
df <- data.frame(feature_id = row.names(x), logFC = z, significant = sig, direction = x$direction, rank_by = z, stringsAsFactors = FALSE)
return(df)
})
names(out) <- names(dflist)
return(out)
}
##' @title Geneset Enrichment within a CRISPR screen using `sparrow`
##'
##' @description This function is a wrapper for the `sparrow::seas()` function, which identifies differentially enriched/depleted ontological
##' categories within the hits identified by a pooled screening experiment, given a provided `GenseSetDb()` object and a list of
##' results objects created by `ct.generateResults()`. By default testing is performed using `fgsea` and a hypergeometric test
##' (`sparrow::ora()`), and results are returned as a `SparrowResult` object.
##'
##' This function will attempt to coerce them into inputs appropriate for the above analyses via `ct.seasPrep()`, after checking
##' the relevant parameters within the provided `GeneSetDb`. This is generally easier than going through the individual steps yourself,
##' especially when the user is minimally postprocessing the contrast results in question.
##'
##' Sometimes, it can be useful to directly indicate the set of targets to be included in an enrichment analysis (e.g., if you wish to expand
##' or contract the set of active targets based on signal validation or other secondary information about the experiment). To accomodate this
##' use case, users may include a logical column in the provided result object(s) indicating which elements should be included among the
##' positive signals exposed to the test.
##'
##' Note that many pooled libraries specifically target biased sets of genes, often focusing on genes involved
##' in a particular pathway or encoding proteins with a shared biological property. Consequently, the enrichment results
##' returned by this function represent the disproportionate enrichment or depletion of targets annotated to pathways *within the context
##' of the screen*, and may or may not be informative of the underlying biology in question. This means that
##' pathways not targeted by a library will obviously never be enriched a positive target set regardless of
##' their biological relevance, and pathways enriched within a focused library screen are similarly expected to partially
##' reflect the composition of the library and other confounding issues (e.g., number of targets within a pathway).
##' Analysts should therefore use this function with care. For example, it might be unsurprising to detect pathways related
##' to histone modification within a screen employing a crispr library primarily targeting epigenetic regulators.
##'
##' @param dflist A result object created by `ct.generateResults()`, or a named list containing many of them; will be passed as a list to
##' `ct.seasPrep()` with the associated `...` arguments.
##' @param gdb A `GenseSetDb` object containing annotations for the targets specified in `result`.
##' @param active Name of a column in the supplied result(s) that should be used to indicate active/selected targets.
##' @param ... Additional arguments to pass to `ct.seasPrep()` or `sparrow::seas()`.
##' @return A named list of `SparrowResults` objects.
##' @examples
##' data('resultsDF')
##' gdb <- sparrow::getMSigGeneSetDb(collection = 'h', species = 'human', id.type = 'entrez')
##' ct.seas(list('longer' = resultsDF, 'shorter' = resultsDF[1:10000,]), gdb)
##' @author Steve Lianoglou for seas; Russell Bainer for GeneSetDb processing and wrapping functions.
##' @export
ct.seas <- function(dflist, gdb, active = 'replicated', ...) {
# Check GSDB and determine feature set
stopifnot(is(gdb, "GeneSetDb"))
# listify results as needed
if (!is(dflist, "list")) {
if (ct.resultCheck(dflist)) {
dflist <- list(result = dflist)
}
}
# Infer whether Gsdb is ID or feature centric
gids <- sum(sparrow::featureIdMap(gdb)$feature_id %in% dflist[[1]]$geneID)
gsids <- sum(sparrow::featureIdMap(gdb)$feature_id %in% dflist[[1]]$geneSymbol)
if (all(c(gsids, gids) == 0)) {
stop("None of the features in the GeneSetDb are present in either the geneID or geneSymbol slots of the first provided result.")
}
identifier <- ifelse(gids > gsids, "geneID", "geneSymbol")
message("GeneSetDb feature_ids coded as ", identifier, "s.")
if (identifier %in% "geneID") {
message("Depending on the composition of your library, you might consider switching to a target-level analysis; see ?ct.GREATdb() for details.")
}
# Check that all provided objects are keyed to the proper values
ipts <- ct.seasPrep(dflist, collapse.on = identifier, gdb = gdb, active = active)
outs <- lapply(ipts, function(ipt) {
sparrow::seas(x = ipt, gsd = gdb, methods = c("ora", "fgsea"), rank_by = "rank_by", selected = "significant", groups = "direction", ...)
})
return(outs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.