#' Run Gene Set Enrichment Analysis on cells
#'
#' Calculate cells gene specificty ranking and then perform geneset enrichment analysis on it.
#'
#' @param X Seurat or SingleCellExperiment object
#' @param pathways List of gene sets to check
#' @param reduction Which dimensionality reduction to use, must be based on MCA.
#' @param dims A vector of integers indicating which dimensions to use with reduction embeddings and loadings for distance calculation.
#' @param features Character vector of feature names to subset feature coordinates. If not specified will take all features available from specified reduction Loadings.
#' @param cells Character vector of cell names to subset cell coordinates. If not specified will take all features available from specified reduction Embeddings
#' @param nperm Number of permutations to do. Minimial possible nominal p-value is about 1/nperm
#' @param minSize Minimal size of a gene set to test. All pathways below the threshold are excluded.
#' @param maxSize Maximal size of a gene set to test. All pathways above the threshold are excluded.
#' @param gseaParam GSEA parameter value, all gene-level statis are raised to the power of 'gseaParam' before calculation of GSEA enrichment scores
#' @param n.core A single integer to specify the number of core for parallelisation.
#'
#' @return A data.table with geneset enrichment analysis statistics.
#' @export
#'
#' @examples
#' seuratPbmc <- RunMCA(seuratPbmc, nmcs = 5)
#' GSEAResults <- RunCellGSEA(seuratPbmc, Hallmark, dims = 1:5)
RunCellGSEA <- function(X, pathways, reduction, dims, features, cells, nperm, minSize, maxSize, gseaParam, n.core) {
UseMethod("RunCellGSEA", X)
}
#' @rdname RunCellGSEA
#' @export
RunCellGSEA.Seurat <-
function(X,
pathways,
reduction = "mca",
dims = seq(50),
features = NULL,
cells = NULL,
nperm = 1000,
minSize = 10,
maxSize = 500,
gseaParam = 0,
n.core = 1) {
# Check -------------------------------------------------------------------
check <-
checkCellIDArg(
X = X,
reduction = reduction,
dims = dims,
features = features,
cells = cells
)
features <- check$features
dims <- check$dims
cells <- check$cells
# Parallel ----------------------------------------------------------------
BPPARAM <- BiocParallel::bpparam()
BPPARAM$workers <- n.core
BPPARAM$tasks <- as.integer(n.core * 5)
BPPARAM$progressbar <- TRUE
# Ranking -----------------------------------------------------------------
CellGeneDist <-
GetCellGeneDistance(
X,
reduction = reduction,
dims = dims,
features = features,
cells = cells
)
CellGeneDist <- as.data.table(CellGeneDist, "features")
features <- CellGeneDist$features
message("starting GSEA")
gseaResults <- bplapply(CellGeneDist[,-1], fgseaCellIDPar,
pathways = pathways,
features = features,
nperm = nperm,
minSize = minSize,
maxSize = maxSize,
BPPARAM = BPPARAM
)
gseaResults <- rbindlist(gseaResults, idcol = "cell")
return(gseaResults)
}
#' @rdname RunCellGSEA
#' @export
RunCellGSEA.SingleCellExperiment <-
function(X,
pathways,
reduction = "mca",
dims = seq(50),
features = NULL,
cells = NULL,
nperm = 1000,
minSize = 10,
maxSize = 500,
gseaParam = 0,
n.core = 1) {
# Check -------------------------------------------------------------------
check <-
checkCellIDArg(
X = X,
reduction = reduction,
dims = dims,
features = features,
cells = cells
)
features <- check$features
dims <- check$dims
cells <- check$cells
# Parallel ----------------------------------------------------------------
BPPARAM <- BiocParallel::bpparam()
BPPARAM$workers <- n.core
BPPARAM$tasks <- as.integer(n.core * 5)
BPPARAM$progressbar <- TRUE
# Ranking -----------------------------------------------------------------
CellGeneDist <-
GetCellGeneDistance(
X,
reduction = reduction,
dims = dims,
features = features,
cells = cells
)
CellGeneDist <- as.data.table(CellGeneDist, "features")
features <- CellGeneDist$features
message("starting GSEA")
bplapply(CellGeneDist[,-1], fgseaCellIDPar,
pathways = pathways,
features = features,
nperm = nperm,
minSize = minSize,
maxSize = maxSize,
BPPARAM = BPPARAM
)
gseaResults <- rbindlist(gseaResults, idcol = "cell")
return(gseaResults)
}
#' Run GSEA on cluster/groups
#'
#' Calculate group gene specificty ranking and then perform geneset enrichment analysis on it.
#'
#' @param X pathways List of gene sets to check
#' @param pathways reduction Which dimensionality reduction to use, must be based on MCA.
#' @param group.by dims A vector of integers indicating which dimensions to use with reduction embeddings and loadings for distance calculation.
#' @param reduction features Character vector of feature names to subset feature coordinates. If not specified will take all features available from specified reduction Loadings.
#' @param dims cells Character vector of cell names to subset cell coordinates. If not specified will take all features available from specified reduction Embeddings
#' @param features cells Character vector of cell names to subset cell coordinates. If not specified will take all features available from specified reduction Embeddings
#' @param nperm nperm Number of permutations to do. Minimial possible nominal p-value is about 1/nperm
#' @param minSize minSize Minimal size of a gene set to test. All pathways below the threshold are excluded.
#' @param maxSize maxSize Maximal size of a gene set to test. All pathways above the threshold are excluded.
#' @param gseaParam gseaParam GSEA parameter value, all gene-level statis are raised to the power of 'gseaParam' before calculation of GSEA enrichment scores
#'
#' @return A data.table with geneset enrichment analysis statistics.
#' @export
#'
#' @examples
#' seuratPbmc <- RunMCA(seuratPbmc, nmcs = 5)
#' GSEAResults <- RunGroupGSEA(seuratPbmc, Hallmark, group.by = "seurat_clusters", dims = 1:5)
RunGroupGSEA <-
function(X, pathways, group.by, reduction, dims, features, nperm, minSize, maxSize, gseaParam) {
UseMethod("RunGroupGSEA", X)
}
#' @rdname RunGroupGSEA
#' @export
RunGroupGSEA.Seurat <-
function(X, pathways, group.by = NULL, reduction = "mca", dims = seq(50), features = NULL, nperm = 1000, minSize = 10, maxSize = 500, gseaParam = 0) {
check <-
checkCellIDArg(
X = X,
group.by = group.by,
reduction = reduction,
dims = dims,
features = features,
cells = NULL
)
features <- check$features
dims <- check$dims
group.by <- check$group.by
message("\nranking genes \n")
# Ranking -----------------------------------------------------------------
GroupGeneRanking <-
GetGroupGeneRanking(
X = X,
group.by = group.by,
reduction = reduction,
dims = dims,
features = features
)
GroupGeneRanking <-
lapply(GroupGeneRanking, function(i)
(i + 0.1)^-1)
gseaResultsAll <- lapply(GroupGeneRanking, function(x,
pathways,
nperm,
minSize,
maxSize) {
stats <- x
gseaResults <- CellID::fgseaCellID(
pathways = pathways,
stats = stats,
nperm = nperm,
minSize = minSize,
maxSize = maxSize
)}, pathways = pathways,
nperm = nperm,
minSize = minSize,
maxSize = maxSize)
gseaResultsAll <-
rbindlist(gseaResultsAll, idcol = "cell")
setnames(gseaResultsAll, "cell", "group")
return(gseaResultsAll)
}
#' @rdname RunGroupGSEA
#' @export
RunGroupGSEA.SingleCellExperiment <-
function(X, pathways, group.by, reduction = "MCA", dims = seq(50), features = NULL, nperm = 1000, minSize = 10, maxSize = 500, gseaParam = 0) {
check <-
checkCellIDArg(
X = X,
group.by = group.by,
reduction = reduction,
dims = dims,
features = features,
cells = NULL
)
features <- check$features
dims <- check$dims
group.by <- check$group.by
message("\nranking genes \n")
# Ranking -----------------------------------------------------------------
GroupGeneRanking <-
GetGroupGeneRanking(
X = X,
group.by = group.by,
reduction = reduction,
dims = dims,
features = features
)
GroupGeneRanking <-
lapply(GroupGeneRanking, function(i)
(i + 0.1)^-1)
gseaResultsAll <- lapply(GroupGeneRanking, function(x,
pathways,
features,
nperm,
minSize,
maxSize) {
stats <- x
gseaResults <- CellID::fgseaCellID(
pathways = pathways,
stats = stats,
nperm = nperm,
minSize = minSize,
maxSize = maxSize,
.progress = TRUE
)})
gseaResultsAll <-
rbindlist(gseaResultsAll, idcol = "cell")
setnames(gseaResultsAll, "cell", "group")
return(gseaResultsAll)
}
#' Get Matrix from Enrichment Results
#'
#' @param X an enrichment results obtained by RunGroupGSEA or RunCellGSEA
#' @param metric a character indicating which metric to use as value of matrix (ES, NES, padj, pval)
#'
#' @return A matrix of geneset enrichment metric with cell/group at columns and pathways/genesets at rows
#' @export
#'
#' @examples
#' seuratPbmc <- RunMCA(seuratPbmc, nmcs = 5)
#' GSEAResults <- RunGroupGSEA(seuratPbmc, Hallmark, group.by = "seurat_clusters", dims = 1:5)
#' GSEAMatrix <- GetGSEAMatrix(GSEAResults)
GetGSEAMatrix <- function(X, metric = "ES") {
names(X)[1] <- "cell"
dt <- dcast(X, pathway ~ cell, value.var = metric)
gsea_mat <- as.matrix(dt[, -1])
rownames(gsea_mat) <- dt$pathway
return(gsea_mat)
}
#' GSEA Results Integration in Seurat object
#'
#' @param X a Seurat object
#' @param gseaResults Results of GSEA obtained by RunCellGSEA or RunGroupGSEA
#' @param assay.name The name of the new Seurat assay slot to include GSEA results
#' @return a Seurat object with a new assay slot filled with GSE analysis resultsq
integrateGSEASeurat <- function(X, gseaResults, assay.name) {
message("\n\ngenerating seurat object\n")
ES_matrix <-
GetGSEAMatrix(gseaResults, metric = "ES")
coln <- sort(colnames(ES_matrix))
rown <- sort(rownames(ES_matrix))
ES_matrix <- ES_matrix[rown, coln]
NES_matrix <-
GetGSEAMatrix(gseaResults, metric = "NES")
Scaled_NES_matrix <- t(apply(NES_matrix, 1, scale))
colnames(Scaled_NES_matrix) <- colnames(NES_matrix)
message("\nreturning seurat object\n")
X[[assay.name]] <-
CreateAssayObject(counts = Matrix::Matrix(ES_matrix, sparse = TRUE))
X <-
SetAssayData(X, slot = "data", NES_matrix, assay = assay.name)
X <-
SetAssayData(X, slot = "scale.data", Scaled_NES_matrix, assay = assay.name)
return(X)
}
# fgsea -------------------------------------------------------------------
# Extracted from fgsea
calcGseaStatCumulativeBatch <-
getFromNamespace("calcGseaStatCumulativeBatch", "fgsea")
#' reworked fgsea for ram and speed efficiency in CellID
#'
#' @param pathways List of gene sets to check
#' @param stats Named vector of gene-level stats. Names should be the same as in 'pathways'
#' @param nperm Number of permutations to do. Minimial possible nominal p-value is about 1/nperm
#' @param minSize Minimal size of a gene set to test. All pathways below the threshold are excluded.
#' @param maxSize Maximal size of a gene set to test. All pathways above the threshold are excluded.
#' @param gseaParam GSEA parameter value, all gene-level statis are raised to the power of 'gseaParam' before calculation of GSEA enrichment scores
#'
#' @return A table with GSEA results. Each row corresponds to a tested pathway.
#' The columns are the following:
#' \itemize{
#' \item pathway -- name of the pathway as in `names(pathway)`;
#' \item pval -- an enrichment p-value;
#' \item padj -- a BH-adjusted p-value;
#' \item ES -- enrichment score, same as in Broad GSEA implementation;
#' \item NES -- enrichment score normalized to mean enrichment of random samples of the same size;
#' \item nMoreExtreme` -- a number of times a random gene set had a more
#' extreme enrichment score value;
#' \item size -- size of the pathway after removing genes not present in `names(stats)`.
#' \item leadingEdge -- vector with indexes of leading edge genes that drive the enrichment, see \url{http://software.broadinstitute.org/gsea/doc/GSEAUserGuideTEXT.htm#_Running_a_Leading}.
#' }
#' @export
fgseaCellID <-
function(pathways, stats, nperm = 1000, minSize = 10, maxSize = 500, gseaParam = 0) {
# remove note
. <- NULL
ties <- sum(duplicated(stats[stats != 0]))
if (any(duplicated(names(stats)))) {
warning("There are duplicate gene names, fgsea may produce unexpected results")
}
granularity <- 1000
seeds <- 1
minSize <- max(minSize, 1)
stats <- sort(stats, decreasing = TRUE)
stats <- abs(stats)^gseaParam
pathwaysFiltered <- lapply(pathways, function(p) {
as.vector(na.omit(fastmatch::fmatch(p, names(stats))))
})
pathwaysSizes <-
vapply(X = pathwaysFiltered, FUN = length, FUN.VALUE = as.integer(0)
)
toKeep <-
which(minSize <= pathwaysSizes & pathwaysSizes <= maxSize)
m <- length(toKeep)
if (m == 0) {
return(
data.table(
pathway = character(),
pval = numeric(),
padj = numeric(),
ES = numeric(),
NES = numeric()
)
)
}
pathwaysFiltered <- pathwaysFiltered[toKeep]
pathwaysSizes <- pathwaysSizes[toKeep]
K <- max(pathwaysSizes)
gseaStatRes <-
vapply(
X = pathwaysFiltered,
FUN = calcGseaStat,
FUN.VALUE = 0,
stats = stats
)
pathwayScores <- gseaStatRes
universe <- seq_along(stats)
leEs <- rep(0, m)
geEs <- rep(0, m)
leZero <- rep(0, m)
geZero <- rep(0, m)
leZeroSum <- rep(0, m)
geZeroSum <- rep(0, m)
if (m == 1) {
for (i in seq_len(nperm)) {
randSample <- sample.int(length(universe), K)
randEsP <- calcGseaStat(
stats = stats,
selectedStats = randSample,
gseaParam = gseaParam
)
leEs <- leEs + (randEsP <= pathwayScores)
geEs <- geEs + (randEsP >= pathwayScores)
leZero <- leZero + (randEsP <= 0)
geZero <- geZero + (randEsP >= 0)
leZeroSum <- leZeroSum + pmin(randEsP, 0)
geZeroSum <- geZeroSum + pmax(randEsP, 0)
}
} else {
aux <- calcGseaStatCumulativeBatch(
stats = stats,
gseaParam = gseaParam,
pathwayScores = pathwayScores,
pathwaysSizes = pathwaysSizes,
iterations = nperm,
seed = 1
)
leEs <- get("leEs", aux)
geEs <- get("geEs", aux)
leZero <- get("leZero", aux)
geZero <- get("geZero", aux)
leZeroSum <- get("leZeroSum", aux)
geZeroSum <- get("geZeroSum", aux)
}
counts <- data.table(
pathway = seq_len(m),
leEs = leEs,
geEs = geEs,
leZero = leZero,
geZero = geZero,
leZeroSum = leZeroSum,
geZeroSum = geZeroSum
)
leEs <- leZero <- geEs <- geZero <- leZeroSum <- geZeroSum <- NULL
pathway <-
padj <- pval <- ES <- NES <- geZeroMean <- leZeroMean <- NULL
nMoreExtreme <- nGeEs <- nLeEs <- size <- NULL
leadingEdge <- NULL
pvals <- counts[, list(
pval = min(
(1 + sum(leEs)) / (1 + sum(leZero)),
(1 + sum(geEs)) / (1 + sum(geZero))
),
leZeroMean = sum(leZeroSum) / sum(leZero),
geZeroMean = sum(geZeroSum) / sum(geZero),
nLeEs = sum(leEs),
nGeEs = sum(geEs)
), by = .(pathway)]
pvals[, `:=`(padj, p.adjust(pval, method = "BH"))]
pvals[, `:=`(ES, pathwayScores[pathway])]
pvals[, `:=`(NES, ES / ifelse(ES > 0, geZeroMean, abs(leZeroMean)))]
pvals[, `:=`(leZeroMean, NULL)]
pvals[, `:=`(geZeroMean, NULL)]
pvals[, `:=`(nLeEs, NULL)]
pvals[, `:=`(nGeEs, NULL)]
pvals[, `:=`(pathway, names(pathwaysFiltered)[pathway])]
pvals <- pvals[]
return(pvals)
}
fgseaCellIDPar <- function(x, pathways, features, nperm, minSize, maxSize) {
x <- (x+0.1)^-1
names(x) <- features
gseaResults <- CellID::fgseaCellID(
pathways = pathways,
stats = x,
nperm = nperm,
minSize = minSize,
maxSize = maxSize
)}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.