Nothing
##' @title Run a (limited) Pathway Enrichment Analysis on the results of a Crispr experiment.
##' @description This function enables some limited geneset enrichment-type analysis of data derived from a pooled Crispr
##' screen using the PANTHER pathway database. Specifically, it identifies the set of targets significantly enriched or
##' depleted in a \code{summaryDF} object returned from \code{ct.generateResults} and compares that set to the remaining
##' targets in the screening library using a hypergeometric test.
##'
##' Note that many Crispr gRNA libraries specifically target biased sets of genes, often focusing on genes involved
##' in a particular pathway or encoding proteins with a shared biological property. Consequently, the enrichment results
##' returned by this function represent the pathways containing genes disproportionately targeted *within the context
##' of the screen*, and may or may not be informative of the underlying biology in question. This means that
##' pathways not targeted by a Crispr library will obviously never be enriched within the positive target set regardless of
##' their biological relevance, and pathways enriched within a focused library screen are similarly expected to partially
##' reflect the composition of the library and other confounding issues (e.g., number of targets within a pathway).
##' Analysts should therefore use this function with care. For example, it might be unsurprising to detect pathways related
##' to histone modification within a screen employing a crispr library targeting epigenetic regulators.
##'
##' @param summaryDF A dataframe summarizing the results of the screen, returned by the function \code{\link{ct.generateResults}}.
##' @param pvalue.cutoff A gene-level p-value cutoff defining targets of interest within the screen. Note that this is a
##' nominal p-value cutoff to preserve end-user flexibility.
##' @param enrich Logical indicating whether to consider guides that are enriched (default) or depleted within the screen.
##' @param organism The species of the cell line used in the screen; currently only 'human' or 'mouse' are supported.
##' @param db.cut Minimum number of genes annotated to a given to a pathway within the screen in order to consider it in the enrichment test.
##' @return A dataframe of enriched pathways.
##' @author Russell Bainer, Steve Lianoglou
##' @examples data('resultsDF')
##' ct.PantherPathwayEnrichment(resultsDF, organism = 'mouse')
##' @export
ct.PantherPathwayEnrichment <- function(summaryDF, pvalue.cutoff = 0.01, enrich = TRUE, organism = 'human', db.cut = 10){
if (!requireNamespace("PANTHER.db")) {
stop("The PANTHER.db bioconductor package is required")
}
if (!requireNamespace("AnnotationDbi")) {
stop("The AnnotationDbi bioconductor package is required")
}
#Check the input:
organism <- match.arg(organism, c('human', 'mouse'))
if(!(enrich %in% c(TRUE, FALSE))){
stop('enrich must be either TRUE or FALSE.')
}
if(!ct.resultCheck(summaryDF)){
stop("Execution halted.")
}
if(!(pvalue.cutoff <= 1 & pvalue.cutoff >= 0 & is.numeric(pvalue.cutoff))){
stop("pvalue.cutoff must be a numeric value between 0 and 1.")
}
if(!(is.numeric(db.cut))){
stop("db.cut must be a numeric value.")
}
message("WARNING: The interpretation of gene set enrichment analyses in Crispr screens is tricky. See the man page for further details.")
#Condense the summary frame to gene-level estimates and isolate the ones that we are testing
summaryDF <- summaryDF[!duplicated(summaryDF$geneID),]
summaryDF <- summaryDF[!is.na(summaryDF$geneID),]
summaryDF <- summaryDF[!grepl('^[A-Za-z]+$', summaryDF$geneID),]
universe <- summaryDF$geneID
#Pull out the significant hits
selected <- summaryDF$geneID[summaryDF[,"Target-level Enrichment P"] <= pvalue.cutoff]
if(enrich == FALSE){
selected <- summaryDF$geneID[summaryDF[,"Target-level Depletion P"] <= pvalue.cutoff]
}
#make the database and conform it to the provided resultDF:
pdb <- suppressMessages(ct.getPanther(organism))
pathways <- names(pdb)
pdb.cut <- lapply(pdb, function(x){x[x %in% universe]})
genesInCats <- vapply(pdb.cut, length, numeric(1))
pdb.cut <- pdb.cut[genesInCats > db.cut]
genesInCats <- genesInCats[genesInCats > db.cut]
message(paste('Omitting PANTHER pathways containing fewer than', db.cut, 'genes.'))
if(length(pdb.cut) == 0){
stop(paste('No acceptable gene sets found; consider reducing db.cut? \nExiting.'))
}
message(paste('Performing Hypergeometric tests with', length(pdb.cut), 'gene sets...'))
#Set up the output and run the tests
out <- data.frame('PATHWAY' = names(pdb.cut),
'nGenes' = genesInCats,
'sigGenes' = vapply(pdb.cut, function(x){sum(x %in% selected, na.rm = TRUE)}, numeric(1)),
stringsAsFactors = FALSE)
testResults <- t(mapply(.doHyperGInternal,
'numW' = out$nGenes,
'numB' = rep(length(universe), nrow(out)),
'numDrawn' = rep(length(selected), nrow(out)),
'numWdrawn' = out$sigGenes,
'over' = rep(TRUE, nrow(out)),
SIMPLIFY =TRUE))
out <- cbind(out, testResults[,3:1])
out[,'FDR'] <- p.adjust(out$p, 'BH')
out[,2:7] <- apply(out[,2:7], 2, as.numeric)
out <- out[order(out$p, decreasing = FALSE),]
row.names(out) <- 1:nrow(out)
return(out)
}
##' @title Extract a Named List of Entrez IDs Annotated to Each Pathway in \code{PANTHER.db}
##' @description This is a function that invokes the \link[PANTHER.db]{PANTHER.db} Bioconductor library to extract a list of pathway mappings
##' to be used in gene set enrichment tests. Specifically, the function returns a named list of pathways, where each element contains
##' Entrez IDs. Users should not generally call this function directly as it is invoked internally by the higher-level
##' \code{ct.PantherPathwayEnrichment()} function.
##' @param species The species of the cells used in the screen. Currently only 'human' or 'mouse' are supported.
##' @return A named list of pathways from \code{PANTHER.db}.
##' @author Russell Bainer, Steve Lianoglou.
##' @keywords internal
ct.getPanther <- function (species = c("human", "mouse")){
species <- match.arg(species, c("human", "mouse"))
if (!requireNamespace("PANTHER.db")) {
stop("The PANTHER.db bioconductor package is required")
}
if (!requireNamespace("AnnotationDbi")) {
stop("The AnnotationDbi bioconductor package is required")
}
if (species == "human") {
org.pkg <- "org.Hs.eg.db"
xorg <- "Homo_sapiens"
}
else {
org.pkg <- "org.Mm.eg.db"
xorg <- "Mus_musculus"
}
if (!requireNamespace(org.pkg)) {
stop(org.pkg, " bioconductor package required for this species query")
}
p.db <- PANTHER.db
#This is a prefiltering step that appears to be unnecessary now
#but potentially breaks if the user doesn't have proper database permissions.
#pthOrganisms(p.db) <- toupper(species)
org.db <- getFromNamespace(org.pkg, org.pkg)
p.all <- select(p.db, keys(p.db, keytype="PATHWAY_ID"),
columns=c("PATHWAY_ID", "PATHWAY_TERM", "UNIPROT"),
'PATHWAY_ID')
# Map uniprot to entrez
umap <- suppressMessages(select(org.db, p.all$UNIPROT, c('UNIPROT', 'ENTREZID'), 'UNIPROT'))
m <- merge(p.all, umap, by='UNIPROT')
m <- subset(m, !is.na(m$ENTREZID))
paths <- split(m[,3:4], as.factor(m$PATHWAY_TERM))
lapply(paths, function(x){unique(x$ENTREZID)})
}
##' @title Test Whether a Specified Target Set is Enriched Within a Pooled Screen
##' @description This function takes in a \code{resultsDF} and a vector of \code{targets} (contained in the \code{geneID} column of
##' \code{resultsDF}) and determines whether the specified targets are enriched within the set of all significantly altered targets.
##' It does this by iteratively testing whether \code{targets} are more likely to be among the set of enriched or depleted targets
##' at various significance thresholds using a hypergeometric test. Note that the returned Hypergeometric P-values are not corrected
##' for multiple testing.
##'
##' Returns a list detailing the \code{targets} used in the tests, and tables indicating the results of the hypergeometric test
##' at various significance thresholds.
##' @param summaryDF A dataframe summarizing the results of the screen, returned by the function \code{\link{ct.generateResults}}.
##' @param targets A character vector containing the names of the targets to be tested. Only targets contained in the \code{geneID}
##' column of the provided \code{summaryDF} are considered.
##' @param enrich Logical indicating whether to consider guides that are enriched (default) or depleted within the screen.
##' @param ignore Optionally, a character vector containing elements of the \code{geneID} column of the provided \code{summaryDF}
##' that should be ignored in the analysis (e.g., unassignable or nonfunctional targets, such as nontargeting controls). By default,
##' this function omits targets with \code{geneSymbol} 'NoTarget'.
##' @return A named list containing the tested target set and tables detailing the hypergeometric test results using various P-value and
##' Q-value thresholds.
##' @examples data(resultsDF)
##' tar <- sample(unique(resultsDF$geneID), 20)
##' res <- ct.targetSetEnrichment(resultsDF, tar)
##' @author Russell Bainer
##' @export
ct.targetSetEnrichment <- function(summaryDF, targets, enrich = TRUE, ignore = NULL){
#input checks
if(!ct.resultCheck(summaryDF)){
stop("Execution halted.")
}
if(!(enrich %in% c(TRUE, FALSE))){
stop('enrich must be either TRUE or FALSE.')
}
valid <- intersect(targets, summaryDF$geneID)
if(length(valid) == 0){
stop('None of the targets in the supplied vector are contained in the geneID column of the summary dataframe.')
}
if(length(setdiff(targets, valid)) != 0){
warning(paste('Not all of the supplied targets are present in the geneID column of the summary dataframe. Proceeding with',
length(valid), 'targets.'))
}
#Condense the summary frame to gene-level estimates and isolate the ones that we are testing
summaryDF <- summaryDF[!duplicated(summaryDF$geneID),]
summaryDF <- summaryDF[!is.na(summaryDF$geneID),]
summaryDF <- summaryDF[!grepl('NoTarget', summaryDF$geneSymbol),] #Remove NoTarget Genes rather than constraining to Entrez
if(!is.null(ignore)){
summaryDF <- summaryDF[!(summaryDF$geneID %in% ignore),]
}
#Pull out the P-values
if(enrich){
selected <- summaryDF[,c("Target-level Enrichment P", "Target-level Enrichment Q")]
} else {
selected <- summaryDF[,c("Target-level Depletion P", "Target-level Depletion Q")]
}
row.names(selected) <- summaryDF$geneID
#Set the cutoffs
cuts <- c(0,1/(10^(5:1)), 0.5, 1)
out <- list('targets' = valid)
out$P.values <- cbind(cuts,
vapply(cuts, function(x){sum(selected[valid,1] <= x)}, numeric(1)),
vapply(cuts, function(x){
.doHyperGInternal(length(valid),
nrow(selected),
sum(selected[,1] <= x),
sum(selected[valid,1] <= x),
TRUE)$p},
numeric(1))
)
out$Q.values <- cbind(cuts,
vapply(cuts, function(x){sum(selected[valid,2] <= x)}, numeric(1)),
vapply(cuts, function(x){
.doHyperGInternal(length(valid),
nrow(selected),
sum(selected[,2] <= x),
sum(selected[valid,2] <= x),
TRUE)$p},
numeric(1))
)
colnames(out$P.values) <- colnames(out$Q.values) <- c('Cutoff', 'Sig', 'Hypergeometric_P')
return(out)
}
## -----------------------------------------------------------------------------
## These were copied from the Bioconductor collection package
## We envision the test as follows:
##
## The urn contains genes from the gene universe. Genes annotated at a
## given collection term are white and the rest black.
##
## The number drawn is the size of the selected gene list. The
## number of white drawn is the size of the intersection of the
## selected list and the genes annotated at the collection.
## Here's a diagram based on using GO as the collection:
##
## inGO notGO
## White Black
## selected n11 n12
## not n21 n22
##
## numW: number of genes in GO category
## numB: size of universe
## numDrawn: number of differentially expressed genes
## numWdrawn: the number of genes differentially expressed in category
##' @keywords internal
.doHyperGInternal <- function(numW, numB, numDrawn, numWdrawn, over) {
n21 <- numW - numWdrawn
n12 <- numDrawn - numWdrawn
n22 <- numB - n12
odds_ratio <- (numWdrawn * n22) / (n12 * n21)
expected <- (numWdrawn + n12) * (numWdrawn + n21)
expected <- expected / (numWdrawn + n12 + n21 + n22)
if (over) {
## take the -1 because we want evidence for as extreme or more
pvals <- phyper(numWdrawn - 1L, numW, numB,
numDrawn, lower.tail=FALSE)
} else {
pvals <- phyper(numWdrawn, numW, numB,
numDrawn, lower.tail=TRUE)
}
list(p=pvals, odds=odds_ratio, expected=expected)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.