#' @importFrom data.table fread setDF
#' @importFrom fgsea fgsea
#' @importFrom clusterProfiler enricher
NULL
#' Read a GMT file
#'
#' @param fname GMT file name.
#'
#' @return A list containing genes and description of each pathway
#' @examples
#' # Read example gmt file
#' gmt_fname <- system.file("extdata", "pathways.gmt", package = "CEMiTool")
#' gmt_in <- read_gmt(gmt_fname)
#'
#' @export
read_gmt <- function(fname){
res <- list(genes=list(), desc=list())
gmt <- file(fname)
gmt_lines <- readLines(gmt)
close(gmt)
gmt_list <- lapply(gmt_lines, function(x) unlist(strsplit(x, split="\t")))
gmt_names <- sapply(gmt_list, '[', 1)
gmt_desc <- lapply(gmt_list, '[', 2)
gmt_genes <- lapply(gmt_list, function(x){x[3:length(x)]})
names(gmt_desc) <- names(gmt_genes) <- gmt_names
res <- do.call(rbind, lapply(names(gmt_genes),
function(n) cbind.data.frame(term=n, gene=gmt_genes[[n]], stringsAsFactors=FALSE)))
res$term <- as.factor(res$term)
return(res)
}
# Performs Over Representation Analysis for a list of genes and a GMT
#
# @keywords internal
#
# @param topgenes a vector of genes
# @param gmt.list a gmt from prepare.gmt function
# @param allgenes a vector containing all genes to be considered as universe
#
# @return a data.frame containing the results
#
#
ora <- function(mod_name, gmt_list, allgenes, mods){
if(missing(allgenes)) {
message("Using all genes in GMT file as universe.")
allgenes <- unique(gmt_list[, "gene"])
}
topgenes <- mods[[mod_name]]
enriched <- clusterProfiler::enricher(gene = topgenes,
pvalueCutoff = 1,
qvalueCutoff = 1,
universe = allgenes,
TERM2GENE = gmt_list)
# TERM2NAME = gmt_list[['term2name']])
if (!is.null(enriched) && !is.logical(enriched)) {
result <- enriched@result
} else {
if(mod_name != "Not.Correlated"){
warning("Enrichment for module ", mod_name, " is NULL")
}
result <- data.frame(Module=character(), ID=character(),
Description=character(),
GeneRatio=numeric(), BgRatio=numeric(),
pvalue=numeric(), p.adjust=numeric(),
qvalue=numeric(), geneID=character(),
Count=numeric(), stringsAsFactors=FALSE)
}
return(result)
}
#' Module Overrepresentation Analysis
#'
#' Performs overrepresentation analysis for each co-expression module found.
#'
#' @param cem Object of class \code{CEMiTool}.
#' @param gmt Object of class \code{data.frame} with 2 columns, one with
#' pathways and one with genes
#' @param verbose logical. Report analysis steps.
#' @param ... Optional parameters.
#'
#' @return Object of class \code{CEMiTool}
#'
#' @seealso \code{\link{ora_data}}
#'
#' @examples
#' # Get example CEMiTool object
#' data(cem)
#' # Read gmt file
#' gmt <- read_gmt(system.file('extdata', 'pathways.gmt',
#' package='CEMiTool'))
#' # Run module overrepresentation analysis
#' cem <- mod_ora(cem, gmt)
#' # Check results
#' head(ora_data(cem))
#'
#' @rdname mod_ora
#' @export
setGeneric('mod_ora', function(cem, ...) {
standardGeneric('mod_ora')
})
#' @rdname mod_ora
setMethod('mod_ora', signature(cem='CEMiTool'),
function(cem, gmt, verbose=FALSE) {
#cem <- get_args(cem, vars=mget(ls()))
if(!"gene" %in% names(gmt) | !"term" %in% names(gmt)){
stop("The gmt object must contain two columns named 'term' and 'gene'")
}
if (verbose) {
message('Running ORA')
message("Using all genes in GMT file as universe.")
}
allgenes <- unique(gmt[, "gene"])
if(is.null(module_genes(cem))){
warning("No modules in CEMiTool object! Did you run find_modules()?")
return(cem)
}
mods <- split(cem@module[, "genes"], cem@module[, "modules"])
res_list <- lapply(names(mods), ora, gmt, allgenes, mods)
if (all(lapply(res_list, nrow) == 0)){
warning("Enrichment is NULL. Either your gmt file is inadequate or your modules really aren't enriched for any of the pathways in the gmt file.")
return(cem)
}
names(res_list) <- names(mods)
res <- lapply(names(res_list), function(x){
if(nrow(res_list[[x]]) > 0){
as.data.frame(cbind(x, res_list[[x]]))
}
})
res <- do.call(rbind, res)
names(res)[names(res) == "x"] <- "Module"
rownames(res) <- NULL
cem@ora <- res
return(cem)
}
)
#' Retrieve over representation analysis (ORA) results
#'
#' @param cem Object of class \code{CEMiTool}
#'
#' @details This function returns the results of the \code{mod_ora} function on the
#' \code{CEMiTool} object. The ID column corresponds to pathways in the gmt file for which
#' genes in the modules were enriched. The Count column shows the number of genes in the
#' module that are enriched for each pathway. The GeneRatio column shows the proportion of
#' genes in the module enriched for a given pathway out of all the genes in the module
#' enriched for any given pathway. The BgRatio column shows the proportion of genes in a
#' given pathway out of all the genes in the gmt file. For more details, please refer to
#' the \code{clusterProfiler} package documentation.
#'
#' @return Object of class \code{data.frame} with ORA data
#'
#' @references Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler:
#' an R package for comparing biological themes among gene clusters. OMICS:
#' A Journal of Integrative Biology. 2012, 16(5):284-287.
#'
#' @examples
#' # Get example CEMiTool object
#' data(cem)
#' # Read gmt file
#' gmt <- read_gmt(system.file('extdata', 'pathways.gmt',
#' package='CEMiTool'))
#' # Run module overrepresentation analysis
#' cem <- mod_ora(cem, gmt)
#' # Check results
#' head(ora_data(cem))
#' @rdname ora_data
#' @export
setGeneric("ora_data", function(cem) {
standardGeneric("ora_data")
})
#' @rdname ora_data
setMethod("ora_data", signature("CEMiTool"),
function(cem){
return(cem@ora)
})
#' Module Gene Set Enrichment Analysis
#'
#' Perfoms Gene Set Enrichment Analysis (GSEA) for each co-expression module found.
#'
#' @param cem Object of class \code{CEMiTool}.
#' @param gsea_scale If TRUE, transform data using z-score transformation. Default: TRUE
#' @param rank_method Character string indicating how to rank genes. Either "mean"
#' (the default) or "median".
#' @param gsea_min_size Minimum gene set size (Default: 15).
#' @param gsea_max_size Maximum gene set size (Default: 1000).
#' @param verbose logical. Report analysis steps.
#' @param ... Optional parameters.
#'
#' @return GSEA results.
#'
#' @examples
#' # Get example CEMiTool object
#' data(cem)
#' # Look at example annotation file
#' sample_annotation(cem)
#' # Run GSEA on network modules
#' cem <- mod_gsea(cem)
#' # Check results
#' gsea_data(cem)
#'
#' @seealso \code{\link{plot_gsea}}
#'
#' @rdname mod_gsea
#' @export
setGeneric('mod_gsea', function(cem, ...) {
standardGeneric('mod_gsea')
})
#' @rdname mod_gsea
setMethod('mod_gsea', signature(cem='CEMiTool'),
function(cem, gsea_scale=TRUE, rank_method="mean",
gsea_min_size=15, gsea_max_size=1000, verbose=FALSE) {
if(!tolower(rank_method) %in% c("mean", "median")){
stop("Invalid rank_method type. Valid values are 'mean' and 'median'")
}
if(nrow(expr_data(cem, filter=FALSE, apply_vst=FALSE)) == 0){
warning("CEMiTool object has no expression file!")
return(cem)
}
if (nrow(cem@sample_annotation)==0) {
warning('Looks like your sample_annotation slot is empty. Cannot proceed with gene set enrichment analysis.')
return(cem)
}
if(is.null(module_genes(cem))){
warning("No modules in CEMiTool object! Did you run find_modules()?")
return(cem)
}
#cem <- get_args(cem, vars=mget(ls()))
if (verbose) {
message('Running GSEA')
}
# creates gene sets from modules
modules <- unique(cem@module[, 'modules'])
gene_sets <- lapply(modules, function(mod){
return(cem@module[cem@module[, 'modules']==mod, 'genes'])
})
names(gene_sets) <- modules
annot <- cem@sample_annotation
class_col <- cem@class_column
sample_col <- cem@sample_name_column
classes <- unique(annot[, class_col])
# Check if expression samples are all in sample_annotation
expr_samples <- names(expr_data(cem, filter=FALSE, apply_vst=FALSE))
annot_samples <- sample_annotation(cem)[, sample_col]
if(!all(expr_samples %in% annot_samples)){
stop("Sample annotation file does not contain all samples in expression file. Please input new sample annotation file using function sample_annotation()")
}
if(length(expr_samples) < length(annot_samples)){
warning("Expression file has less samples than annotation file. Cutting annotation file.")
annot <- annot[annot[,sample_col] %in% expr_samples,]
annot_samples <- annot[, sample_col]
}
# expression to z-score
if(gsea_scale){
z_expr <- data.frame(t(scale(t(expr_data(cem, filter=cem@parameters$filter,
apply_vst=cem@parameters$apply_vst,
filter_pval=cem@parameters$filter_pval)),
center=TRUE,
scale=TRUE)),
check.names = FALSE,
stringsAsFactors=FALSE)
}else{
z_expr <- expr_data(cem, filter=cem@parameters$filter,
apply_vst=cem@parameters$apply_vst,
filter_pval=cem@parameters$filter_pval)
}
# calculates enrichment for each module for each class in annot
gsea_list <- lapply(classes, function(class_group){
if (verbose) {
message('Calculating modules enrichment analysis for class ',
class_group)
}
# samples of class == class_group
class_samples <- annot[annot[, class_col]==class_group, sample_col]
# genes ranked by rank_method
genes_ranked <- apply(z_expr[, class_samples, drop=FALSE], 1, tolower(rank_method))
genes_ranked <- sort(genes_ranked, decreasing=TRUE)
# BiocParallel setting up
BiocParallel::register(BiocParallel::SerialParam())
gsea_results <- fgsea::fgsea(pathways=gene_sets,
stats=genes_ranked,
minSize=gsea_min_size,
maxSize=gsea_max_size,
eps=0,
#nperm=10000,
nproc=0)
data.table::setDF(gsea_results)
gsea_results[, 'leadingEdge'] <- unlist(lapply(gsea_results[, 'leadingEdge'],
function(ledges){
ledges <- paste(ledges, collapse=",")
}))
columns <- colnames(gsea_results)
colnames(gsea_results) <- c(columns[1], paste0(columns[-1], "_", class_group))
return(gsea_results)
})
# merging all classes gsea results into one data.frame
all_classes_df <- Reduce(function(x,y) {
merge(x,y, all=TRUE, by='pathway')
}, gsea_list)
# separating ES / NES / pval
patterns <- list('es'='^ES_','nes'='^NES_', 'padj'='^padj_')
out_gsea <- lapply(patterns, function(pattern) {
desired_stat <- all_classes_df[, c('pathway',
grep(pattern, colnames(all_classes_df),value=TRUE))]
colnames(desired_stat) <- gsub(pattern, '', colnames(desired_stat))
return(desired_stat)
})
names(out_gsea) <- names(patterns)
if(all(unlist(lapply(out_gsea, nrow))) == 0){
warning("Unable to enrich any module for any given class")
}
cem@enrichment <- out_gsea
return(cem)
})
#' Retrieve Gene Set Enrichment Analysis (GSEA) results
#'
#' @param cem Object of class \code{CEMiTool}
#'
#' @return Object of class \code{list} with GSEA data
#' @examples
#' # Get example CEMiTool object
#' data(cem)
#' # Look at example annotation file
#' sample_annotation(cem)
#' # Run GSEA on network modules
#' cem <- mod_gsea(cem)
#' # Check results
#' gsea_data(cem)
#' @rdname gsea_data
#' @export
setGeneric("gsea_data", function(cem) {
standardGeneric("gsea_data")
})
#' @rdname gsea_data
setMethod("gsea_data", signature("CEMiTool"),
function(cem){
return(cem@enrichment)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.