# Global knitr chunk options used by every chunk in this document.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
Attach necessary libraries:
library(ASURATDB)
library(DOSE) # For using `data(DO2EG)`
ASURATDB function format_DO()
reformats a Disease Ontology database.
# Build the human Disease Ontology table from the DOSE data set `DO2EG`.
data(DO2EG)

# Flatten the DO-to-gene mapping into a single vector of gene IDs.
do_gene_ids <- unlist(DO2EG)

# The p/q-value cutoffs are set to 1 and the gene-set size bounds to
# [0, 1e+10] (presumably so that no term is filtered out -- confirm against
# the enrichDO() documentation).
dict_DO <- enrichDO(
  do_gene_ids,
  ont = "DO",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  minGSSize = 0,
  maxGSSize = 1e+10,
  qvalueCutoff = 1,
  readable = FALSE
)

# Reformat the enrichment result with ASURATDB.
human_DO <- format_DO(
  dict = dict_DO@result,
  all_geneIDs = dict_DO@gene,
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)

# Save data.
# save(human_DO, file = "genes2bioterm/20201213_human_DO.rda")
The data were stored in the following repositories:
ASURATDB functions collect_CO()
and format_CO()
load a Cell Ontology
database using ontoProc package and reformat the database, respectively.
Tips: As of December 2020, Cell Ontology database might not be complete enough for some biological contexts. For example, well-known marker genes for pancreatic beta cell, Ins1 and Ins2, were not registered for "type B pancreatic cell" with ID "CL:0000169".
# Human: collect the Cell Ontology database, then reformat it.
hs_orgdb <- org.Hs.eg.db::org.Hs.eg.db
dict_CO <- collect_CO(orgdb = hs_orgdb)
human_CO <- format_CO(dict = dict_CO, orgdb = hs_orgdb)

# Save data.
# save(human_CO, file = "genes2bioterm/20201213_human_CO.rda")

# Mouse: same two steps with the mouse annotation database.
# Note that `dict_CO` is overwritten here.
mm_orgdb <- org.Mm.eg.db::org.Mm.eg.db
dict_CO <- collect_CO(orgdb = mm_orgdb)
mouse_CO <- format_CO(dict = dict_CO, orgdb = mm_orgdb)

# Save data.
# save(mouse_CO, file = "genes2bioterm/20201211_mouse_CO.rda")
The data were stored in the following repositories:
ASURATDB functions collect_GO()
and format_GO()
load a Gene Ontology
database using clusterProfiler package and reformat the database, respectively.
Currently, only human and mouse data are acceptable.
# Human: collect the Gene Ontology database, then reformat it.
dict_GO <- collect_GO(orgdb = org.Hs.eg.db::org.Hs.eg.db)
human_GO <- format_GO(dict = dict_GO, orgdb = org.Hs.eg.db::org.Hs.eg.db)

# Human reduced: for each ontology, restrict the similarity matrix to the
# terms whose Count is at least 2. The per-ontology term tables are kept
# unchanged.
human_GO_red <- human_GO
onts <- c("MF", "BP", "CC")
for (ont in onts) {
  terms <- human_GO[[ont]]
  ids <- terms$ID[which(terms$Count >= 2)]
  # drop = FALSE keeps a matrix even when fewer than two terms remain;
  # without it the result would silently collapse to a vector.
  human_GO_red$similarity_matrix[[ont]] <-
    human_GO$similarity_matrix[[ont]][ids, ids, drop = FALSE]
}

# Save data.
# save(human_GO_red, file = "genes2bioterm/20201213_human_GO_red.rda")

# Mouse: same steps with the mouse annotation database.
# Note that `dict_GO` is overwritten here.
dict_GO <- collect_GO(orgdb = org.Mm.eg.db::org.Mm.eg.db)
mouse_GO <- format_GO(dict = dict_GO, orgdb = org.Mm.eg.db::org.Mm.eg.db)

# Mouse reduced: same filtering rule as for human.
mouse_GO_red <- mouse_GO
onts <- c("MF", "BP", "CC")
for (ont in onts) {
  terms <- mouse_GO[[ont]]
  ids <- terms$ID[which(terms$Count >= 2)]
  mouse_GO_red$similarity_matrix[[ont]] <-
    mouse_GO$similarity_matrix[[ont]][ids, ids, drop = FALSE]
}

# Save data.
# save(mouse_GO_red, file = "genes2bioterm/20201211_mouse_GO_red.rda")
The data were stored in the following repositories:
ASURATDB functions collect_KEGG()
and format_KEGG()
load a KEGG database
using KEGGREST package via the internet and reformat the database, respectively.
The arguments of collect_KEGG()
are organism
and categories
.
Here, organism
must obey the naming rule of
KEGG
(see KEGGREST
function listDatabases()
) and categories
must be one of
"pathway"
, "module"
, and "drug"
(only for human) in the current version.
# Human pathways.
# collect_KEGG() output is indexed by category and then by "success";
# only the "success" part is passed on to format_KEGG().
dict_KEGG <- collect_KEGG(organism = "hsa", categories = "pathway")
hsa_pathways <- dict_KEGG[["pathway"]][["success"]]
human_KEGG <- format_KEGG(
  dict = list(pathway = hsa_pathways),
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)

# Save data.
# save(human_KEGG, file = "genes2bioterm/20201213_human_KEGG.rda")

# Mouse pathways. Note that `dict_KEGG` is overwritten here.
dict_KEGG <- collect_KEGG(organism = "mmu", categories = "pathway")
mmu_pathways <- dict_KEGG[["pathway"]][["success"]]
mouse_KEGG <- format_KEGG(
  dict = list(pathway = mmu_pathways),
  orgdb = org.Mm.eg.db::org.Mm.eg.db
)

# Save data.
# save(mouse_KEGG, file = "genes2bioterm/20201211_mouse_KEGG.rda")

# Human (drug)
dict_KEGG_drug <- collect_KEGG(organism = "hsa", categories = "drug")
hsa_drugs <- dict_KEGG_drug[["drug"]][["success"]]
human_KEGG_drug <- format_KEGG(
  dict = list(drug = hsa_drugs),
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)

# Save data.
# save(human_KEGG_drug, file = "genes2bioterm/20221102_human_KEGG_drug.rda")
Note collect_KEGG()
uses KEGGREST
function keggGet()
,
which may produce both successful and unsuccessful results.
The data were stored in the following repositories:
Load databases, where category is "H" (hallmark gene sets) and species is human
(cf. msigdbr::msigdbr_species()
).
# Load the human gene sets of MSigDB category "H".
dbtable <- msigdbr::msigdbr(
  species = "Homo sapiens",
  category = "H"
)
Reformat the database.
# One row per distinct gene set (name and MSigDB ID).
dbtable_gsetID <- unique(dbtable[, c("gs_name", "gs_id")])

# Entrez IDs and gene symbols grouped by gene-set name.
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(length(dbtable_geneID), length(dbtable_symbol)))

# split() orders its groups by the sorted levels of gs_name, whereas
# dbtable_gsetID keeps the row order of dbtable. Align the two by name
# rather than by position so that each ID is paired with its own genes.
gset_names <- names(dbtable_geneID)
gset_rows <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(gset_rows))

# Build the whole table at once instead of growing it with rbind() in a loop.
# IC is left as NA for these gene sets.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[gset_rows],
  Description = gset_names,
  IC = rep(NA, length(gset_names)),
  Count = unname(lengths(dbtable_geneID)),
  Gene = unname(vapply(dbtable_symbol, paste, character(1), collapse = "/")),
  GeneID = unname(vapply(dbtable_geneID, paste, character(1), collapse = "/")),
  row.names = NULL
)
human_MSigDB_Hallmark <- list(hallmark = res)

# Save data.
# save(human_MSigDB_Hallmark, file = "genes2bioterm/20230127_human_MSigDB_Hallmark.rda")
The data were stored in the following repositories:
Load databases, where category is "C2" (curated gene sets) restricted to the "CP:BIOCARTA" subcategory, and
species is human (cf. msigdbr::msigdbr_species()
).
# Load the human gene sets of MSigDB category "C2" and keep only the rows
# whose subcategory is "CP:BIOCARTA".
dbtable <- msigdbr::msigdbr(
  species = "Homo sapiens",
  category = "C2"
)
is_biocarta <- dbtable$gs_subcat %in% "CP:BIOCARTA"
dbtable <- dbtable[is_biocarta, ]
Reformat the database.
# One row per distinct gene set (name and MSigDB ID).
dbtable_gsetID <- unique(dbtable[, c("gs_name", "gs_id")])

# Entrez IDs and gene symbols grouped by gene-set name.
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(length(dbtable_geneID), length(dbtable_symbol)))

# split() orders its groups by the sorted levels of gs_name, whereas
# dbtable_gsetID keeps the row order of dbtable. Align the two by name
# rather than by position so that each ID is paired with its own genes.
gset_names <- names(dbtable_geneID)
gset_rows <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(gset_rows))

# Build the whole table at once instead of growing it with rbind() in a loop.
# IC is left as NA for these gene sets.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[gset_rows],
  Description = gset_names,
  IC = rep(NA, length(gset_names)),
  Count = unname(lengths(dbtable_geneID)),
  Gene = unname(vapply(dbtable_symbol, paste, character(1), collapse = "/")),
  GeneID = unname(vapply(dbtable_geneID, paste, character(1), collapse = "/")),
  row.names = NULL
)
human_MSigDB_BIOCARTA <- list(BIOCARTA = res)

# Save data.
# save(human_MSigDB_BIOCARTA, file = "genes2bioterm/20230211_human_MSigDB_BIOCARTA.rda")
The data were stored in the following repositories:
Load databases, where category is "C3" (regulatory target gene sets) and
species is human (cf. msigdbr::msigdbr_species()
).
# Load the human gene sets of MSigDB category "C3" and keep only the rows
# whose subcategory is "TFT:GTRD".
dbtable <- msigdbr::msigdbr(
  species = "Homo sapiens",
  category = "C3"
)
is_gtrd <- dbtable$gs_subcat %in% "TFT:GTRD"
dbtable <- dbtable[is_gtrd, ]
Reformat the database.
# One row per distinct gene set (name and MSigDB ID).
dbtable_gsetID <- unique(dbtable[, c("gs_name", "gs_id")])

# Entrez IDs and gene symbols grouped by gene-set name.
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(length(dbtable_geneID), length(dbtable_symbol)))

# split() orders its groups by the sorted levels of gs_name, whereas
# dbtable_gsetID keeps the row order of dbtable. Align the two by name
# rather than by position so that each ID is paired with its own genes.
gset_names <- names(dbtable_geneID)
gset_rows <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(gset_rows))

# Build the whole table at once instead of growing it with rbind() in a loop.
# IC is left as NA for these gene sets.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[gset_rows],
  Description = gset_names,
  IC = rep(NA, length(gset_names)),
  Count = unname(lengths(dbtable_geneID)),
  Gene = unname(vapply(dbtable_symbol, paste, character(1), collapse = "/")),
  GeneID = unname(vapply(dbtable_geneID, paste, character(1), collapse = "/")),
  row.names = NULL
)
human_MSigDB_GTRD <- list(GTRD = res)

# Save data.
# save(human_MSigDB_GTRD, file = "genes2bioterm/20230211_human_MSigDB_GTRD.rda")
The data were stored in the following repositories:
Load databases.
# Retrieve the clustermole marker table and list the distinct values of its
# `db` column in sorted order.
dbtable <- clustermole::clustermole_markers()
sort(unique(dbtable[["db"]]))
[1] "ARCHS4" "CellMarker" "MSigDB" "PanglaoDB" "SaVanT" "TISSUES" [7] "xCell"
Select species and databases.
# Keep only the human markers that come from the "MSigDB" source, then add
# an empty geneID column to be filled in by the next step.
is_human <- dbtable$species %in% "Human"
is_msigdb <- dbtable$db %in% "MSigDB"
dbtable <- dbtable[is_human & is_msigdb, ]
dbtable$geneID <- NA
Change gene symbols into entrez IDs.
# Look up the Entrez ID of every gene symbol in the marker table.
dictionary <- AnnotationDbi::select(
  org.Hs.eg.db::org.Hs.eg.db,
  keys = dbtable$gene_original,
  columns = c("SYMBOL", "ENTREZID"),
  keytype = "SYMBOL"
)

# Keep the first mapping per symbol and discard rows without a symbol.
keep <- !duplicated(dictionary$SYMBOL) & !is.na(dictionary$SYMBOL)
dictionary <- dictionary[keep, ]

# Vectorized lookup: match() returns NA for a symbol that has no dictionary
# row, so such genes get geneID = NA instead of aborting with a
# "replacement has length zero" error as a per-row assignment would.
dbtable$geneID <- dictionary$ENTREZID[
  match(dbtable$gene_original, dictionary$SYMBOL)
]
Reformat the database. Here, the identifiers of the biological terms are named "MSigDBID."
# Entrez IDs and gene symbols grouped by cell type.
dbtable_geneID <- split(x = dbtable$geneID, f = dbtable$celltype)
dbtable_symbol <- split(x = dbtable$gene_original, f = dbtable$celltype)
stopifnot(identical(length(dbtable_geneID), length(dbtable_symbol)))

# Build the whole table at once instead of growing it with rbind() in a loop.
# IDs are running numbers in the order of the split() groups; IC is left as NA.
n_terms <- length(dbtable_geneID)
res <- data.frame(
  ID = sprintf("MSigDBID:%d", seq_len(n_terms)),
  Description = as.character(names(dbtable_geneID)),
  IC = rep(NA, n_terms),
  Count = unname(lengths(dbtable_geneID)),
  Gene = unname(vapply(dbtable_symbol, paste, character(1), collapse = "/")),
  GeneID = unname(vapply(dbtable_geneID, paste, character(1), collapse = "/")),
  row.names = NULL
)
human_MSigDB <- list(cell = res)

# Save data.
# save(human_MSigDB, file = "genes2bioterm/20220308_human_MSigDB.rda")
The data were stored in the following repositories:
Load databases.
# Retrieve the clustermole marker table and list the distinct values of its
# `db` column in sorted order.
dbtable <- clustermole::clustermole_markers()
sort(unique(dbtable[["db"]]))
[1] "ARCHS4" "CellMarker" "MSigDB" "PanglaoDB" "SaVanT" "TISSUES" [7] "xCell"
Select species and databases.
# Keep only the human markers that come from the "CellMarker" source, then
# add an empty geneID column to be filled in by the next step.
is_human <- dbtable$species %in% "Human"
is_cellmarker <- dbtable$db %in% "CellMarker"
dbtable <- dbtable[is_human & is_cellmarker, ]
dbtable$geneID <- NA
Change gene symbols into entrez IDs.
# Look up the Entrez ID of every gene symbol in the marker table.
dictionary <- AnnotationDbi::select(
  org.Hs.eg.db::org.Hs.eg.db,
  keys = dbtable$gene_original,
  columns = c("SYMBOL", "ENTREZID"),
  keytype = "SYMBOL"
)

# Keep the first mapping per symbol and discard rows without a symbol.
keep <- !duplicated(dictionary$SYMBOL) & !is.na(dictionary$SYMBOL)
dictionary <- dictionary[keep, ]

# Vectorized lookup: match() returns NA for a symbol that has no dictionary
# row, so such genes get geneID = NA instead of aborting with a
# "replacement has length zero" error as a per-row assignment would.
dbtable$geneID <- dictionary$ENTREZID[
  match(dbtable$gene_original, dictionary$SYMBOL)
]
Reformat the database. Here, the identifiers of the biological terms are named "CellMarkerID."
# Entrez IDs and gene symbols grouped by cell type.
dbtable_geneID <- split(x = dbtable$geneID, f = dbtable$celltype)
dbtable_symbol <- split(x = dbtable$gene_original, f = dbtable$celltype)
stopifnot(identical(length(dbtable_geneID), length(dbtable_symbol)))

# Build the whole table at once instead of growing it with rbind() in a loop.
# IDs are running numbers in the order of the split() groups; IC is left as NA.
n_terms <- length(dbtable_geneID)
res <- data.frame(
  ID = sprintf("CellMarkerID:%d", seq_len(n_terms)),
  Description = as.character(names(dbtable_geneID)),
  IC = rep(NA, n_terms),
  Count = unname(lengths(dbtable_geneID)),
  Gene = unname(vapply(dbtable_symbol, paste, character(1), collapse = "/")),
  GeneID = unname(vapply(dbtable_geneID, paste, character(1), collapse = "/")),
  row.names = NULL
)
human_CellMarker <- list(cell = res)

# Save data.
# save(human_CellMarker, file = "genes2bioterm/20220308_human_CellMarker.rda")
The data were stored in the following repositories:
Create a cell type-related database by combining the Cell Ontology and MSigDB databases for analyzing human single-cell transcriptome data.
# Download the stored human CO and MSigDB tables and stack their "cell"
# data frames into one combined table.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
rda_files <- c(
  "20201213_human_CO.rda?raw=true",
  "20220308_human_MSigDB.rda?raw=true"
)
for (rda in rda_files) {
  load(url(paste0(urlpath, rda)))
}
res <- do.call("rbind", list(human_CO[["cell"]], human_MSigDB[["cell"]]))
human_CB <- list(cell = res)
Create a cell type-related database by combining the Cell Ontology, MSigDB, and CellMarker databases for analyzing human single-cell transcriptome data.
# Download the stored human CO, MSigDB and CellMarker tables and stack their
# "cell" data frames into one combined table.
# NOTE(review): the CellMarker save step in this document names the file
# "20220308_human_CellMarker.rda", whereas "20220304_..." is loaded here --
# confirm which file exists in the repository.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
rda_files <- c(
  "20201213_human_CO.rda?raw=true",
  "20220308_human_MSigDB.rda?raw=true",
  "20220304_human_CellMarker.rda?raw=true"
)
for (rda in rda_files) {
  load(url(paste0(urlpath, rda)))
}
res <- rbind(
  human_CO[["cell"]],
  human_MSigDB[["cell"]],
  human_CellMarker[["cell"]]
)
human_CB <- list(cell = res)
Create a cell type-related database by combining the Disease Ontology, Cell Ontology, and MSigDB databases for analyzing complex human single-cell transcriptome data.
# Download the stored human DO, CO and MSigDB tables and stack the DO
# "disease" data frame with the CO and MSigDB "cell" data frames.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
rda_files <- c(
  "20201213_human_DO.rda?raw=true",
  "20201213_human_CO.rda?raw=true",
  "20220308_human_MSigDB.rda?raw=true"
)
for (rda in rda_files) {
  load(url(paste0(urlpath, rda)))
}
res <- rbind(
  human_DO[["disease"]],
  human_CO[["cell"]],
  human_MSigDB[["cell"]]
)
human_CB <- list(cell = res)
Create a cell type-related database by combining the Disease Ontology, Cell Ontology, MSigDB, and CellMarker databases for analyzing complex human single-cell transcriptome data.
# Download the stored human DO, CO, MSigDB and CellMarker tables and stack
# the DO "disease" data frame with the three "cell" data frames.
# NOTE(review): the CellMarker save step in this document names the file
# "20220308_human_CellMarker.rda", whereas "20220304_..." is loaded here --
# confirm which file exists in the repository.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
rda_files <- c(
  "20201213_human_DO.rda?raw=true",
  "20201213_human_CO.rda?raw=true",
  "20220308_human_MSigDB.rda?raw=true",
  "20220304_human_CellMarker.rda?raw=true"
)
for (rda in rda_files) {
  load(url(paste0(urlpath, rda)))
}
res <- rbind(
  human_DO[["disease"]],
  human_CO[["cell"]],
  human_MSigDB[["cell"]],
  human_CellMarker[["cell"]]
)
human_CB <- list(cell = res)
# Print the session information (R version and package versions) used to
# build this document.
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.