## nolint start suppressPackageStartupMessages({ ## CRAN packages. library(R.utils) library(ggnewscale) library(stringi) library(withr) ## Bioconductor packages. library(DESeq2) library(DOSE) library(clusterProfiler) library(enrichplot) library(pathview) ## Acid Genomics packages. library(goalie) library(AcidGenomes) library(basejump) library(DESeqAnalysis) library(AcidGSEA) library(bcbioRNASeq) }) prepareTemplate() source("_setup.R") ## May encounter warning from ggrepel package otherwise. options("ggrepel.max.overlaps" = Inf) ## Avoid logical coercion error in DOSE. Sys.setenv( "_R_CHECK_LENGTH_1_CONDITION_" = "false", "_R_CHECK_LENGTH_1_LOGIC2_" = "false" ) ## nolint end keggPlotsDir <- file.path(params[["results_dir"]], "kegg-plots") invisible(lapply( X = c( params[["data_dir"]], params[["results_dir"]], keggPlotsDir ), FUN = initDir )) assert(isSubset(params[["go_class"]], c("BP", "CC", "MF"))) pathview <- params[["pathview"]]
First, we need to load the DESeqAnalysis
object, which has been generated
from our bcbioRNASeq
object during the differential expression step.
## Ensure that our DESeqAnalysis object contains NCBI gene identifiers. deseq <- import(params[["deseq_file"]]) deseq <- updateObject(deseq) assert( is(deseq, "DESeqAnalysis"), validObject(deseq), isSubset(c("geneId", "ncbiGeneId"), colnames(rowData(deseq@data))) ) rowRanges(deseq@data) <- EnsemblGenes(rowRanges(deseq@data)) if (!isSubset("geneIdNoVersion", colnames(rowData(deseq@data)))) { rowData(deseq@data)[["geneIdNoVersion"]] <- rowData(deseq@data)[["geneId"]] rowData(deseq@transform)[["geneIdNoVersion"]] <- rowData(deseq@transform)[["geneId"]] } rownames(deseq@data) <- as.character(rowData(deseq@data)[["geneIdNoVersion"]]) rownames(deseq@transform) <- as.character(rowData(deseq@transform)[["geneIdNoVersion"]]) if (is.list(deseq@results)) { deseq@results <- lapply( X = deseq@results, rownames = rownames(deseq@data), FUN = function(res, rownames) { rownames(res) <- rownames res } ) } if (is.list(deseq@lfcShrink)) { deseq@lfcShrink <- lapply( X = deseq@lfcShrink, rownames = rownames(deseq@data), FUN = function(res, rownames) { rownames(res) <- rownames res } ) } assert(validObject(deseq)) print(deseq)
We need to extract the corresponding DESeqDataSet
and DESeqResults
objects.
dds <- as.DESeqDataSet(deseq) print(dds)
## > help(topic = "results", package = "DESeqAnalysis") ## > help(topic = "resultsNames", package = "DESeqAnalysis") res <- DESeqAnalysis::results(object = deseq, i = params[["results_name"]]) assert( is(res, "DESeqResults"), validObject(res) ) ## Not allowing the user to apply post-hoc LFC filtering in params. alphaThreshold <- alphaThreshold(res) lfcThreshold <- lfcThreshold(res) print(res)
We need to get the relevant Bioconductor OrgDb
package from the full Latin
organism name (e.g. "Homo sapiens"
to "org.Hs.eg.db"
).
organism <- organism(dds) match <- stri_match_first_regex( str = organism, pattern = "^([A-Z])[a-z]+\\s([a-z])[a-z]+$" ) orgDb <- paste( "org", paste(match[, 2L:3L], collapse = ""), "eg", "db", sep = "." ) assert(isInstalled(orgDb)) rm(match)
Now we can prepare the input required for enrichment analysis.
## Subset NA adjusted P values. res <- res[!is.na(res[["padj"]]), , drop = FALSE] res <- res[order(res[["padj"]]), , drop = FALSE] ## Match the DESeqDataSet. dds <- dds[rownames(res), , drop = FALSE] ## Set our background genes vector (aka universe). allEnsemblGenes <- rownames(res) ## > help(topic = "deg", package = "DESeqAnalysis") sigEnsemblGenes <- deg(object = res, direction = "both") sigEnsemblRes <- res[sigEnsemblGenes, , drop = FALSE] ## LFC ranked list vector. ensemblLfcVec <- sigEnsemblRes[["log2FoldChange"]] names(ensemblLfcVec) <- rownames(sigEnsemblRes) ## Sort from upregulated to downregulated. ensemblLfcVec <- sort(ensemblLfcVec, decreasing = TRUE) summary(ensemblLfcVec)
NCBI gene (Entrez) identifiers are required for [Kyoto Encyclopedia of Genes and Genomes (KEGG)][KEGG] analysis. Here we are defining 1:1 mappings of the Ensembl gene identifiers to NCBI gene identifiers. For genes that map to multiple NCBI identifiers, we are using the oldest identifier to define the 1:1 mapping.
map <- EnsemblToNcbi(dds) assert(isSubset(rownames(map), rownames(dds))) allNcbiGenes <- map[["ncbiGeneId"]] names(allNcbiGenes) <- rownames(map) str(allNcbiGenes)
Now we can obtain a vector of significant genes and LFCs that map to Entrez.
idx <- na.omit(match(x = sigEnsemblGenes, table = names(allNcbiGenes))) sigNcbiGenes <- allNcbiGenes[idx] ncbiLfcVec <- sigEnsemblRes[names(sigNcbiGenes), "log2FoldChange", drop = TRUE] names(ncbiLfcVec) <- unname(sigNcbiGenes) ## Sort from upregulated to downregulated. ncbiLfcVec <- sort(ncbiLfcVec, decreasing = TRUE) str(ncbiLfcVec)
A common approach in analyzing gene expression profiles was identifying differential expressed genes that are deemed interesting. The enrichment analysis we demonstrated previously were based on these differentially expressed genes. This approach will find genes where the difference is large, but it will not detect a situation where the difference is small, but evidenced in coordinated way in a set of related genes. [Gene Set Enrichment Analysis (GSEA)][GSEA] directly addresses this limitation. All genes can be used in [GSEA][]; [GSEA][] aggregates the per gene statistics across genes within a gene set, therefore making it possible to detect situations where all genes in a predefined set change in a small but coordinated way. Since it is likely that many relevant phenotypic differences are manifested by small but consistent changes in a set of genes.
## > help(topic = "RankedList", package = "AcidGSEA") ## Assuming the DESeq analysis input contains Ensembl reference genome. The ## gene identifiers should be defined as `"geneId"` in DESeqDataSet rowData. rankedListEnsembl <- RankedList( object = deseq, value = "stat", keyType = "ensemblGeneId" ) rankedListEnsembl <- rankedListEnsembl[[params[["results_name"]]]] ## NCBI gene identifiers must be defined in the `"ncbiGeneId"` column of ## DESeqDataSet rowData, otherwise this step will error. rankedListNcbi <- RankedList( object = deseq, value = "stat", keyType = "ncbiGeneId" ) rankedListNcbi <- rankedListNcbi[[params[["results_name"]]]] assert( is.numeric(rankedListEnsembl), is.numeric(rankedListNcbi) )
[Gene Ontology (GO)][GO] term enrichment is a technique for interpreting sets of genes making use of the [Gene Ontology][GO] system of classification, in which genes are assigned to a set of predefined bins depending on their functional characteristics.
## > help(topic = "enrichGO", package = "clusterProfiler") enrichGo <- enrichGO( gene = sigEnsemblGenes, OrgDb = orgDb, keyType = "ENSEMBL", ont = params[["go_class"]], universe = allEnsemblGenes, qvalueCutoff = 0.05, readable = TRUE ) enrichGoDf <- enrichGo |> slot("result") |> as.data.frame() |> camelCase() saveData( enrichGo, enrichGoDf, dir = params[["data_dir"]] ) export( object = enrichGoDf, con = file.path( params[["results_dir"]], paste0( paste( "go", tolower(params[["go_class"]]), "clusterprofiler", "padj", alphaThreshold, "lfc", lfcThreshold, sep = "_" ), ".csv.gz" ) ) )
## > help(topic = "dotplot", package = "enrichplot") try({ suppressWarnings({ dotplot(enrichGo, showCategory = 25L) }) })
## > help(topic = "emapplot", package = "enrichplot") try({ suppressWarnings({ emapplot( x = pairwise_termsim(enrichGo), showCategory = 25L ) }) })
In order to consider the potentially biological complexities in which a gene may belong to multiple annotation categories, and provide information of numeric changes if available.
Here we are plotting genes colored by LFC for top significant [GO][] terms.
## > help(topic = "cnetplot", package = "enrichplot") try({ suppressWarnings({ cnetplot(enrichGo, foldChange = ensemblLfcVec) }) })
Now we're ready to run GSEA using the GO terms.
## > help(topic = "gseGO", package = "clusterProfiler") gseaGo <- gseGO( geneList = rankedListEnsembl, ont = params[["go_class"]], OrgDb = get(orgDb, envir = asNamespace(orgDb)), keyType = "ENSEMBL", pvalueCutoff = 0.05 ) gseaGoDf <- gseaGo |> slot("result") |> as.data.frame() |> camelCase() saveData( gseaGo, gseaGoDf, dir = params[["data_dir"]] ) export( object = gseaGoDf, con = file.path( params[["results_dir"]], paste0( paste( "gsea", "clusterprofiler", "padj", alphaThreshold, "lfc", lfcThreshold, sep = "_" ), ".csv.gz" ) ) )
Match the KEGG organism code (e.g. "hsa"
for Homo sapiens).
## > help(topic = "search_kegg_organism", package = "clusterProfiler") keggCode <- search_kegg_organism(organism, "scientific_name") |> getElement("kegg_code") assert( isString(keggCode), isMatchingRegex(pattern = "^[a-z]{3}$", x = keggCode) )
## > help(topic = "enrichKEGG", package = "clusterProfiler") kegg <- enrichKEGG( gene = as.character(sigNcbiGenes), organism = keggCode, keyType = "ncbi-geneid", universe = as.character(allNcbiGenes), qvalueCutoff = 0.05 ) keggDf <- kegg |> slot("result") |> as.data.frame() |> camelCase() saveData( kegg, keggDf, dir = params[["data_dir"]] ) export( object = keggDf, con = file.path( params[["results_dir"]], paste0( paste( "kegg", "clusterprofiler", "padj", alphaThreshold, "lfc", lfcThreshold, sep = "_" ), ".csv.gz" ) ) )
[GSEA][] analysis is performed with the [clusterProfiler][] tool using KEGG gene sets and using the log2 fold changes as input. By using the log2 fold changes as the input, we are identifying pathways with genes that exhibit coordinated fold changes that are larger than might be expected by chance. The significant pathways can be visualized using the log2 fold changes with the Pathview tool.
Gene set enrichment analysis tools use ranked lists of genes (here ranked by log2FC) without using a threshold. This allows the tools to use more information to identify enriched biological processes. The introduction to gene set enrichment analysis goes into more detail about some of the advantages of this approach. By using the log2 fold changes as the input, we are identifying pathways with genes that exhibit coordinated fold changes that are larger than might be expected by chance. The significant pathways can be visualized using the log2 fold changes with the [pathview][] tool.
The significantly dysregulated pathways (q-value (FDR) less than 0.05
) are
displayed below in the pathway images, which show the degree of dysregulation
of the genes (the minus direction (green) is down-regulated, while the positive
direction (red) is up-regulated).
When performing [GSEA][] analysis it may be useful to adjust the minGSSize
and/or maxGSSize
parameter based on the pathways you would like to search for
significance. If you are interested in smaller pathways (e.g. a gene set size
of 24 genes), then you would want to adjust the minGSSize
to less than 24. If
you are only interested in larger pathways, then you would want to adjust the
GSSize
to a larger number. The fewer pathways tested, the less we need to
correct for multiple test correction, so by adjusting the minGSSize
and
maxGSSize
parameters you can test fewer pathways and limit testing to the
pathways of interest.
## > help(topic = "gseKEGG", package = "clusterProfiler") gseaKegg <- gseKEGG( geneList = ncbiLfcVec, organism = keggCode, keyType = "ncbi-geneid" ) gseaKeggDf <- gseaKegg |> slot("result") |> as.data.frame() |> camelCase() saveData( gseaKegg, gseaKeggDf, dir = params[["data_dir"]] ) export( object = gseaKeggDf, con = file.path( params[["results_dir"]], paste0( paste( "gsea", "kegg", "clusterprofiler", "padj", alphaThreshold, "lfc", lfcThreshold, sep = "_" ), ".csv.gz" ) ) )
## > help(topic = "pathview", package = "pathview") ## ## This requires Entrez identifiers to be defined in the DESeqAnalysis object ## defined in params. ## ## The `gene.data` vector must be numeric with Entrez identifiers as names. ## ## There is currently no way to set the output path of the pathview PNG files, ## so we're changing the working directory temporarily using the withr package. ## ## Also, We're using `tryCatch()` here to return to the user any pathways that ## didn't output graphics correctly. ## ## Using `withTimeout` here to harden against very large pathway sets ## (e.g.`"mmu01100"` for F1000 paper example) that can fail to plot. ## ## dplyr may need to be unloaded for pathview to work. pathways <- gseaKeggDf[["id"]] if (hasLength(pathways)) { ## > suppressWarnings({ ## > detach("package:dplyr", unload = TRUE, force = TRUE)) ## > }) with_dir( new = keggPlotsDir, code = { invisible(lapply( X = pathways, FUN = function(pathway) { tryCatch( expr = withTimeout( expr = { pathview( gene.data = ncbiLfcVec, pathway.id = pathway, gene.idtype = "entrez", gene.annotpkg = NULL, species = keggCode, ## Use bidirectional coloring. limit = list(gene = 2L) ) }, timeout = 60L ), error = function(e) { warning(sprintf("%s failed to plot.", pathway)) } ) } )) } ) figures <- list.files( path = keggPlotsDir, pattern = "pathview", full.names = TRUE ) invisible(lapply( X = figures, FUN = function(figure) { markdownHeader(basename(figure), level = 4L, asis = TRUE) cat(paste0("<img src=\"", figure, "\">\n")) } )) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.