library(msigdbr)
# Descriptions of the MSigDB categories, as a character vector named by
# category code. The table-building code further down looks entries up by the
# "gs_cat" values returned from msigdbr.
cat.descriptions <- c(
  H = "Hallmark gene sets summarize and represent specific well-defined biological states or processes and display coherent expression. These gene sets were generated by a computational methodology based on identifying overlaps between gene sets in other MSigDB collections and retaining genes that display coordinate expression.",
  C1 = "Gene sets corresponding to each human chromosome and each cytogenetic band",
  C2 = "Gene sets in this collection are curated from various sources, including online pathway databases and the biomedical literature. Many sets are also contributed by individual domain experts.",
  C3 = "Gene sets representing potential targets of regulation by transcription factors or microRNAs. The sets consist of genes grouped by elements they share in their non-protein coding regions. The elements represent known or likely cis-regulatory elements in promoters and 3'-UTRs.",
  C4 = "Computational gene sets defined by mining large collections of cancer-oriented microarray data.",
  C5 = "Gene sets that contain genes annotated by the same GO term.",
  C6 = "Gene sets that represent signatures of cellular pathways which are often dis-regulated in cancer. The majority of signatures were generated directly from microarray data from NCBI GEO or from internal unpublished profiling experiments involving perturbation of known cancer genes.",
  C7 = "Gene sets that represent cell states and perturbations within the immune system. The signatures were generated by manual curation of published studies in human and mouse immunology."
)
# Descriptions of the MSigDB subcategories, as a character vector named by
# subcategory code. The "N/A" entry is used for categories without a
# subcategory: the table-building code further down relabels empty
# "gs_subcat" values to "N/A" before looking up the description.
subcat.descriptions <- c(
  "CGP" = "Chemical and genetic perturbations",
  "CP" = "Additional curated pathways",
  "CP:BIOCARTA" = "Canonical Pathways gene sets derived from the BioCarta pathway database.",
  "CP:KEGG" = "Canonical Pathways gene sets derived from the KEGG pathway database.",
  "CP:PID" = "Canonical Pathways gene sets derived from the Pathway Interaction Database (PID) pathway database.",
  "CP:REACTOME" = "Canonical Pathways gene sets derived from the Reactome pathway database.",
  "MIR" = "All miRNA target prediction gene sets. Combined superset of both miRDB prediction methods and legacy sets.",
  "TFT" = "All transcription factor target prediction gene sets. Combined superset of both GTRD prediction methods and legacy sets.",
  "CGN" = "Gene sets defined by expression neighborhoods centered on 380 cancer-associated genes. This collection is described in Subramanian, Tamayo et al. 2005",
  "CM" = "Gene sets defined by Segal et al. 2004. Briefly, the authors compiled gene sets ('modules') from a variety of resources such as KEGG, GO, and others. By mining a large compendium of cancer-related microarray data, they identified 456 such modules as significantly changed in a variety of cancer conditions.",
  "BP" = "Gene sets derived from the GO Biological Process Ontology.",
  "CC" = "Gene sets derived from the GO Cellular Component Ontology.",
  "MF" = "Gene sets derived from the GO Molecular Function Ontology.",
  "N/A" = "No subcategory available."
)
# Download gene sets and find unique categories/subcategories
m <- as.data.frame(msigdbr::msigdbr(species = "Homo sapiens"))
u <- unique(m[, c("gs_cat", "gs_subcat")])

# Create unique ids: the category alone when the subcategory is empty,
# otherwise "<category>-<subcategory>"
u.id <- u[, 1]
u.sub.present.ix <- u[, 2] != ""
u.id[u.sub.present.ix] <- paste0(
  u[u.sub.present.ix, 1], "-", u[u.sub.present.ix, 2]
)
# Relabel empty subcategories so they match the "N/A" entry of
# subcat.descriptions
u[!u.sub.present.ix, 2] <- "N/A"

# Create final table. Descriptions are looked up by name, so any category or
# subcategory that has no entry in the description vectors yields NA.
msigdb_table <- data.frame(
  ID = u.id,
  Category = u[, 1],
  Subcategory = u[, 2],
  Category_Description = cat.descriptions[u[, 1]],
  Subcategory_Description = subcat.descriptions[u[, 2]],
  stringsAsFactors = FALSE
)
# Sort the rows by ID (the first column)
msigdb_table <- msigdb_table[order(msigdb_table[, 1]), ]

# Save to file, assuming this script is being run in the "data-raw" folder
# within the singleCellTK package
save(msigdb_table, file = "../data/msigdb_table.rda")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.