#' Gene Set Enrichment Analysis
#'
#' \code{test_gsea} tests for enriched gene sets
#' in the differentially enriched proteins.
#' This can be done independently for the different contrasts.
#'
#' @param dep SummarizedExperiment,
#' Data object for which differentially enriched proteins are annotated
#' (output from \code{\link{test_diff}()} and \code{\link{add_rejections}()}).
#' @param databases Character,
#' Databases to search for gene set enrichment.
#' See http://amp.pharm.mssm.edu/Enrichr/ for available databases.
#' @param contrasts Logical(1),
#' Whether or not to perform the gene set enrichment analysis
#' independently for the different contrasts.
#' @return A data.frame with enrichment terms (generated by \code{\link[enrichR]{enrichr}})
#' @examples
#' # Load example
#' data <- UbiLength
#' data <- data[data$Reverse != "+" & data$Potential.contaminant != "+",]
#' data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#'
#' # Make SummarizedExperiment
#' columns <- grep("LFQ.", colnames(data_unique))
#' exp_design <- UbiLength_ExpDesign
#' se <- make_se(data_unique, columns, exp_design)
#'
#' # Filter, normalize and impute missing values
#' filt <- filter_missval(se, thr = 0)
#' norm <- normalize_vsn(filt)
#' imputed <- impute(norm, fun = "MinProb", q = 0.01)
#'
#' # Test for differentially expressed proteins
#' diff <- diff <- test_diff(imputed, "control", "Ctrl")
#' dep <- add_rejections(diff, alpha = 0.05, lfc = 1)
#'
#' \dontrun{
#'
#' # Test enrichments
#' gsea_results_per_contrast <- test_gsea(dep)
#' gsea_results <- test_gsea(dep, contrasts = FALSE)
#'
#' gsea_kegg <- test_gsea(dep, databases = "KEGG_2016")
#'
#' }
#' @export
test_gsea <- function(dep,
databases = c("GO_Molecular_Function_2017b",
"GO_Cellular_Component_2017b",
"GO_Biological_Process_2017b"),
contrasts = TRUE) {
# Show error if inputs are not the required classes
assertthat::assert_that(inherits(dep, "SummarizedExperiment"),
is.character(databases),
is.logical(contrasts),
length(contrasts) == 1)
# Check whether 'enrichR' is installed
if(!"enrichR" %in% rownames(installed.packages())) {
stop("test_enrichR() requires the 'enrichR' package",
"\nTo install the package run: install.packages('enrichR')")
}
row_data <- rowData(dep, use.names = FALSE)
# Show error if inputs do not contain required columns
if(any(!c("name", "ID") %in% colnames(row_data))) {
stop("'name' and/or 'ID' columns are not present in '",
deparse(substitute(dep)),
"'\nRun make_unique() and make_se() to obtain the required columns",
call. = FALSE)
}
if(length(grep("_p.adj|_diff", colnames(row_data))) < 1) {
stop("'[contrast]_diff' and/or '[contrast]_p.adj' columns are not present in '",
deparse(substitute(dep)),
"'\nRun test_diff() to obtain the required columns",
call. = FALSE)
}
# Check for valid databases
libraries <- enrichR::listEnrichrDbs()$libraryName
if(all(!databases %in% libraries)) {
stop("Please run `test_gsea()` with valid databases as argument",
"\nSee http://amp.pharm.mssm.edu/Enrichr/ for available databases")
}
if(any(!databases %in% libraries)) {
databases <- databases[databases %in% libraries]
message("Not all databases found",
"\nSearching the following databases: '",
paste0(databases, collapse = "', '"), "'")
}
# Run background list
message("Background")
background <- gsub("[.].*", "", row_data$name)
background_enriched <- enrichR::enrichr(background, databases)
df_background <- NULL
for(database in databases) {
temp <- background_enriched[database][[1]] %>%
mutate(var = database)
df_background <- rbind(df_background, temp)
}
df_background$contrast <- "background"
df_background$n <- length(background)
OUT <- df_background %>%
mutate(bg_IN = as.numeric(gsub("/.*", "", Overlap)),
bg_OUT = n - bg_IN) %>%
select(Term, bg_IN, bg_OUT)
if(contrasts) {
# Get gene symbols
df <- row_data %>%
as.data.frame() %>%
select(name, ends_with("_significant")) %>%
mutate(name = gsub("[.].*", "", name))
# Run enrichR for every contrast
df_enrich <- NULL
for(contrast in colnames(df[2:ncol(df)])) {
message(gsub("_significant", "", contrast))
significant <- df[df[[contrast]],]
genes <- significant$name
enriched <- enrichR::enrichr(genes, databases)
# Tidy output
contrast_enrich <- NULL
for(database in databases) {
temp <- enriched[database][[1]] %>%
mutate(var = database)
contrast_enrich <- rbind(contrast_enrich, temp)
}
contrast_enrich$contrast <- contrast
contrast_enrich$n <- length(genes)
# Background correction
cat("Background correction... ")
contrast_enrich <- contrast_enrich %>%
mutate(IN = as.numeric(gsub("/.*", "", Overlap)),
OUT = n - IN) %>%
select(-n) %>%
left_join(OUT, by = "Term") %>%
mutate(log_odds = log2((IN * bg_OUT) / (OUT * bg_IN)))
cat("Done.")
df_enrich <- rbind(df_enrich, contrast_enrich) %>%
mutate(contrast = gsub("_significant", "", contrast))
}
} else {
# Get gene symbols
significant <- row_data %>%
as.data.frame() %>%
select(name, significant) %>%
filter(significant) %>%
mutate(name = gsub("[.].*", "", name))
# Run enrichR
genes <- significant$name
enriched <- enrichR::enrichr(genes, databases)
# Tidy output
df_enrich <- NULL
for(database in databases) {
temp <- enriched[database][[1]] %>%
mutate(var = database)
df_enrich <- rbind(df_enrich, temp)
}
df_enrich$contrast <- "significant"
df_enrich$n <- length(genes)
# Background correction
cat("Background correction... ")
df_enrich <- df_enrich %>%
mutate(IN = as.numeric(gsub("/.*", "", Overlap)),
OUT = n - IN) %>%
select(-n) %>%
left_join(OUT, by = "Term") %>%
mutate(log_odds = log2((IN * bg_OUT) / (OUT * bg_IN)))
cat("Done.")
}
return(df_enrich)
}
#' Plot enriched Gene Sets
#'
#' \code{plot_gsea} plots enriched gene sets
#' from Gene Set Enrichment Analysis.
#'
#' @param gsea_results Data.frame,
#' Gene Set Enrichment Analysis results object.
#' (output from \code{\link{test_gsea}()}).
#' @param number Numeric(1),
#' Sets the number of enriched terms per contrast to be plotted.
#' @param alpha Numeric(1),
#' Sets the threshold for the adjusted P value.
#' @param contrasts Character,
#' Specifies the contrast(s) to plot.
#' If 'NULL' all contrasts will be plotted.
#' @param databases Character,
#' Specifies the database(s) to plot.
#' If 'NULL' all databases will be plotted.
#' @param nrow Numeric(1),
#' Sets the number of rows for the plot.
#' @param term_size Numeric(1),
#' Sets the text size of the terms.
#' @return A barplot of the enriched terms
#' (generated by \code{\link[ggplot2]{ggplot}}).
#' @examples
#' # Load example
#' data <- UbiLength
#' data <- data[data$Reverse != "+" & data$Potential.contaminant != "+",]
#' data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#'
#' # Make SummarizedExperiment
#' columns <- grep("LFQ.", colnames(data_unique))
#' exp_design <- UbiLength_ExpDesign
#' se <- make_se(data_unique, columns, exp_design)
#'
#' # Filter, normalize and impute missing values
#' filt <- filter_missval(se, thr = 0)
#' norm <- normalize_vsn(filt)
#' imputed <- impute(norm, fun = "MinProb", q = 0.01)
#'
#' # Test for differentially expressed proteins
#' diff <- diff <- test_diff(imputed, "control", "Ctrl")
#' dep <- add_rejections(diff, alpha = 0.05, lfc = 1)
#'
#' \dontrun{
#'
#' # Test enrichments
#' gsea_results <- test_gsea(dep)
#' plot_gsea(gsea_results)
#'
#' }
#' @export
plot_gsea <- function(gsea_results, number = 10, alpha = 0.05,
contrasts = NULL, databases = NULL,
nrow = 1,term_size = 8) {
assertthat::assert_that(is.data.frame(gsea_results),
is.numeric(number),
length(number) == 1,
is.numeric(alpha),
length(alpha) == 1,
is.numeric(term_size),
length(term_size) == 1,
is.numeric(nrow),
length(nrow) == 1)
# Check gsea_results object
if(any(!c("Term", "var",
"contrast","Adjusted.P.value")
%in% colnames(gsea_results))) {
stop("'", deparse(substitute(gsea_results)),
"' does not contain the required columns",
"\nRun test_enrichment() to obtain the required columns",
call. = FALSE)
}
if(!is.null(contrasts)) {
assertthat::assert_that(is.character(contrasts))
valid_contrasts <- unique(gsea_results$contrast)
if(!all(contrasts %in% valid_contrasts)) {
valid_cntrsts_msg <- paste0("Valid contrasts are: '",
paste0(valid_contrasts, collapse = "', '"),
"'")
stop("Not a valid contrast, please run `plot_gsea()`",
"with a valid contrast as argument\n",
valid_cntrsts_msg,
call. = FALSE)
}
if(!any(contrasts %in% valid_contrasts)) {
contrasts <- contrasts[contrasts %in% valid_contrasts]
message("Not all contrasts found",
"\nPlotting the following contrasts: '",
paste0(contrasts, collapse = "', '"), "'")
}
gsea_results <- filter(gsea_results, contrast %in% contrasts)
}
if(!is.null(databases)) {
assertthat::assert_that(is.character(databases))
valid_databases <- unique(gsea_results$var)
if(all(!databases %in% valid_databases)) {
valid_cntrsts_msg <- paste0("Valid databases are: '",
paste0(valid_databases, collapse = "', '"),
"'")
stop("Not a valid database, please run `plot_gsea()`",
"with valid databases as argument\n",
valid_cntrsts_msg,
call. = FALSE)
}
if(any(!databases %in% valid_databases)) {
databases <- databases[databases %in% valid_databases]
message("Not all databases found",
"\nPlotting the following databases: '",
paste0(databases, collapse = "', '"), "'")
}
gsea_results <- filter(gsea_results, var %in% databases)
}
# Get top enriched gene sets
terms <- gsea_results %>%
group_by(contrast, var) %>%
filter(Adjusted.P.value <= alpha) %>%
arrange(Adjusted.P.value) %>%
dplyr::slice(seq_len(number)) %>%
.$Term
subset <- gsea_results %>%
filter(Term %in% terms) %>%
arrange(var, Adjusted.P.value)
subset$Term <- parse_factor(subset$Term, levels = unique(subset$Term))
subset$var <- parse_factor(subset$var, levels = unique(subset$var))
# Plot top enriched gene sets
ggplot(subset, aes(Term,
log_odds)) +
#geom_tile(aes(fill = var)) +
geom_col(aes(fill = -log10(Adjusted.P.value))) +
facet_wrap(~contrast, nrow = nrow) +
coord_flip() +
labs(y = "Log2 odds ratio (versus current background)",
fill = "-Log10 adjusted P value") +
theme_DEP1() +
theme(legend.position = "top",
axis.text.y = element_text(size = term_size),
legend.text = element_text(size = 9))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.