In this vignette, we show you how to create and use your own custom pathway annotations or gene sets with pagoda.
library(knitr) opts_chunk$set( warning = FALSE, message = FALSE, fig.show = 'hold', fig.path = 'figures/genesets-', cache.path = 'cache/genesets-', cache = TRUE ) library(scde) data(pollen) cd <- pollen
# Use the org.Hs.eg.db package for GO annotations library(org.Hs.eg.db) # Translate gene names to ids ids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1])) # Reverse map rids <- names(ids) names(rids) <- ids # Convert ids per GO category to gene names go.env <- eapply(org.Hs.egGO2ALLEGS, function(x) as.character(na.omit(rids[x]))) go.env <- clean.gos(go.env) # Remove GOs with too few or too many genes go.env <- list2env(go.env) # Convert to an environment # Test class(go.env) head(ls(go.env)) # Look at gene set names head(get(ls(go.env)[1], go.env)) # Look at one gene set
Alternatively, we can use Ensembl's BioMart service to get the GO annotations.
library(biomaRt) library(GO.db) # Initialize the connection to the Ensembl BioMart Service # Available datasets can be listed with # listDatasets(useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org")) # Use mmusculus_gene_ensembl for mouse ensembl <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host="www.ensembl.org") # Constructs a dataframe with two columns: hgnc_symbol and go_id # If rownames are Ensembl IDs, use ensembl_gene_id as filter value go <- getBM(attributes = c("hgnc_symbol", "go_id"), filters = "hgnc_symbol", values = rownames(cd), mart = ensembl) # Use the GO.db library to add a column with the GO-term to the dataframe go$term <- Term(go$go_id) # Create a named list of character vectors out of the df s = split(go$hgnc_symbol, paste(go$go_id,go$term)) # Saves the list as a R environment go.env <- list2env(s) # Test class(go.env) head(ls(go.env)) # Look at gene set names head(get(ls(go.env)[1], go.env)) # Look at one gene set
The GMT file format is a tab delimited file format that describes gene sets. GMT files for Broad's MSigDB and other gene sets can be downloaded from the Broad Website.
## read in Broad gmt format library(GSA) filename <- 'https://raw.githubusercontent.com/JEFworks/genesets/master/msigdb.v5.0.symbols.gmt' gs <- GSA.read.gmt(filename) ## number of gene sets n <- length(gs$geneset.names) ## create environment env <- new.env(parent=globalenv()) invisible(lapply(1:n,function(i) { genes <- as.character(unlist(gs$genesets[i])) name <- as.character(gs$geneset.names[i]) assign(name, genes, envir = env) })) go.env <- env # Test class(go.env) head(ls(go.env)) # Look at gene set names head(get(ls(go.env)[1], go.env)) # Look at one gene set
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.