In keita-iida/ASURATDB: ASURAT database builder

# Global chunk options: merge source and output into one block and prefix
# printed output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

Installations

Attach necessary libraries:

library(ASURATDB)

Collect Disease Ontology database for human cells

library(DOSE) # For using `data(DO2EG)`

ASURATDB function format_DO() reformats a Disease Ontology database.

# Load the DO2EG dataset (made available by library(DOSE)).
data(DO2EG)

# Call enrichDO() on every gene ID found in DO2EG. All p/q-value cutoffs are
# set to 1 and the gene-set size bounds to [0, 1e+10] -- presumably so that no
# term is filtered out of the result table.
all_DO_genes <- unlist(DO2EG)
dict_DO <- enrichDO(
  all_DO_genes,
  ont = "DO",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  minGSSize = 0,
  maxGSSize = 1e+10,
  qvalueCutoff = 1,
  readable = FALSE
)

# Reformat the enrichment result table together with the input gene IDs.
human_DO <- format_DO(
  dict = dict_DO@result,
  all_geneIDs = dict_DO@gene,
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)
# Save data.
# save(human_DO, file = "genes2bioterm/20201213_human_DO.rda")

The data were stored in the following repositories:

Collect Cell Ontology database for human and mouse cells

The ASURATDB functions collect_CO() and format_CO() load a Cell Ontology database using the ontoProc package and reformat it, respectively.

Tips: As of December 2020, Cell Ontology database might not be complete enough for some biological contexts. For example, well-known marker genes for pancreatic beta cell, Ins1 and Ins2, were not registered for "type B pancreatic cell" with ID "CL:0000169".

# Human: collect the Cell Ontology dictionary, then reformat it.
hs_orgdb <- org.Hs.eg.db::org.Hs.eg.db
dict_CO <- collect_CO(orgdb = hs_orgdb)
human_CO <- format_CO(dict = dict_CO, orgdb = hs_orgdb)
# Save data.
# save(human_CO, file = "genes2bioterm/20201213_human_CO.rda")

# Mouse: same two steps with the mouse annotation package.
# Note that dict_CO is overwritten here.
mm_orgdb <- org.Mm.eg.db::org.Mm.eg.db
dict_CO <- collect_CO(orgdb = mm_orgdb)
mouse_CO <- format_CO(dict = dict_CO, orgdb = mm_orgdb)
# Save data.
# save(mouse_CO, file = "genes2bioterm/20201211_mouse_CO.rda")

The data were stored in the following repositories:

Collect Gene Ontology database for human and mouse cells

The ASURATDB functions collect_GO() and format_GO() load a Gene Ontology database using the clusterProfiler package and reformat it, respectively. Currently, only human and mouse data are supported.

# Restrict each ontology's similarity matrix to the terms whose Count is at
# least `min_count`.
#
# db:        list with one term table per ontology (columns `Count` and `ID`
#            are read) and a `similarity_matrix` list indexed by ontology.
# min_count: minimum value of `Count` for a term to be kept.
# onts:      ontology names to process.
#
# Returns `db` with only `similarity_matrix` modified; the term tables are
# left untouched.
reduce_GO_similarity <- function(db, min_count = 2,
                                 onts = c("MF", "BP", "CC")) {
  for (ont in onts) {
    terms <- db[[ont]]
    # which() silently discards terms whose Count is NA.
    ids <- terms[which(terms$Count >= min_count), ]$ID
    # drop = FALSE keeps a matrix even when a single term survives; without
    # it the sub-matrix would collapse to a bare scalar.
    db$similarity_matrix[[ont]] <-
      db$similarity_matrix[[ont]][ids, ids, drop = FALSE]
  }
  db
}

# Human
dict_GO <- collect_GO(orgdb = org.Hs.eg.db::org.Hs.eg.db)
human_GO <- format_GO(dict = dict_GO, orgdb = org.Hs.eg.db::org.Hs.eg.db)
# Human reduced
human_GO_red <- reduce_GO_similarity(human_GO)
# Save data.
# save(human_GO_red, file = "genes2bioterm/20201213_human_GO_red.rda")

# Mouse
dict_GO <- collect_GO(orgdb = org.Mm.eg.db::org.Mm.eg.db)
mouse_GO <- format_GO(dict = dict_GO, orgdb = org.Mm.eg.db::org.Mm.eg.db)
# Mouse reduced
mouse_GO_red <- reduce_GO_similarity(mouse_GO)
# Save data.
# save(mouse_GO_red, file = "genes2bioterm/20201211_mouse_GO_red.rda")

The data were stored in the following repositories:

Collect KEGG database for human and mouse cells

ASURATDB functions collect_KEGG() and format_KEGG() load a KEGG database using KEGGREST package via the internet and reformat the database, respectively.

The arguments of collect_KEGG() are organism and categories. Here, organism must obey the naming rule of KEGG (see KEGGREST function listDatabases()) and categories must be one of "pathway", "module", and "drug" (only for human) in the current version.

# Human pathways: keep only the "success" part of the collected pathway data
# before reformatting.
dict_KEGG <- collect_KEGG(organism = "hsa", categories = "pathway")
hsa_pathway_ok <- dict_KEGG[["pathway"]][["success"]]
human_KEGG <- format_KEGG(
  dict = list(pathway = hsa_pathway_ok),
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)
# Save data.
# save(human_KEGG, file = "genes2bioterm/20201213_human_KEGG.rda")

# Mouse pathways (dict_KEGG is overwritten).
dict_KEGG <- collect_KEGG(organism = "mmu", categories = "pathway")
mmu_pathway_ok <- dict_KEGG[["pathway"]][["success"]]
mouse_KEGG <- format_KEGG(
  dict = list(pathway = mmu_pathway_ok),
  orgdb = org.Mm.eg.db::org.Mm.eg.db
)
# Save data.
# save(mouse_KEGG, file = "genes2bioterm/20201211_mouse_KEGG.rda")

# Human (drug)
dict_KEGG_drug <- collect_KEGG(organism = "hsa", categories = "drug")
hsa_drug_ok <- dict_KEGG_drug[["drug"]][["success"]]
human_KEGG_drug <- format_KEGG(
  dict = list(drug = hsa_drug_ok),
  orgdb = org.Hs.eg.db::org.Hs.eg.db
)
# Save data.
# save(human_KEGG_drug, file = "genes2bioterm/20221102_human_KEGG_drug.rda")

Note that collect_KEGG() uses the KEGGREST function keggGet(), which may return both successful and unsuccessful results; only the successful ones are passed to format_KEGG() above. The data were stored in the following repositories:

Collect MSigDB for human cells

H: hallmark gene sets

Load databases, where category is "H" (hallmark gene sets) and species is human (cf. msigdbr::msigdbr_species()).

# Query msigdbr for human gene sets in category "H"; the columns gs_name,
# gs_id, human_entrez_gene and gene_symbol of the result are used below.
dbtable <- msigdbr::msigdbr(species = "Homo sapiens", category = "H")

Reformat the database.

# Collapse the long table (one row per gene) into one row per gene set.
dbtable_gsetID <- unique(dbtable[, c("gs_name", "gs_id")])
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))

# split() orders its groups by the sorted levels of gs_name, whereas unique()
# keeps rows in order of first appearance. Look the IDs up by name instead of
# by position so that an ID can never be attached to the wrong gene set.
gset_names <- as.character(names(dbtable_geneID))
gset_rows <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(gset_rows))

# Build the whole table at once (no rbind() inside a loop); this also yields
# a well-formed zero-row table when there are no gene sets. The column order
# matches what the previous row-by-row construction produced.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[gset_rows],
  Description = gset_names,
  IC = rep(NA, length(gset_names)),
  Count = lengths(dbtable_geneID, use.names = FALSE),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE)
)
human_MSigDB_Hallmark <- list(hallmark = res)
# Save data.
# save(human_MSigDB_Hallmark, file = "genes2bioterm/20230127_human_MSigDB_Hallmark.rda")

The data were stored in the following repositories:

Github ASURATDB

C2: curated gene sets (BioCarta subset of CP)

Load databases, where category is "C2" (curated gene sets), restricted to the "CP:BIOCARTA" subcategory, and species is human (cf. msigdbr::msigdbr_species()).

# Query category "C2" for human, then keep only the rows whose subcategory is
# exactly "CP:BIOCARTA" (rows with a missing subcategory are dropped).
dbtable <- msigdbr::msigdbr(species = "Homo sapiens", category = "C2")
is_biocarta <- dbtable$gs_subcat %in% "CP:BIOCARTA"
dbtable <- dbtable[is_biocarta, ]

Reformat the database.

# Collapse the long table (one row per gene) into one row per gene set.
dbtable_gsetID <- unique(dbtable[, c("gs_name", "gs_id")])
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))

# split() orders its groups by the sorted levels of gs_name, whereas unique()
# keeps rows in order of first appearance. Look the IDs up by name instead of
# by position so that an ID can never be attached to the wrong gene set.
gset_names <- as.character(names(dbtable_geneID))
gset_rows <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(gset_rows))

# Build the whole table at once (no rbind() inside a loop); this also yields
# a well-formed zero-row table when there are no gene sets. The column order
# matches what the previous row-by-row construction produced.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[gset_rows],
  Description = gset_names,
  IC = rep(NA, length(gset_names)),
  Count = lengths(dbtable_geneID, use.names = FALSE),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE)
)
human_MSigDB_BIOCARTA <- list(BIOCARTA = res)
# Save data.
# save(human_MSigDB_BIOCARTA, file = "genes2bioterm/20230211_human_MSigDB_BIOCARTA.rda")

The data were stored in the following repositories:

Github ASURATDB

C3: regulatory target gene sets (TFT:GTRD)

Load databases, where category is "C3" (regulatory target gene sets) and species is human (cf. msigdbr::msigdbr_species()).

# Query category "C3" for human, then keep only the rows whose subcategory is
# exactly "TFT:GTRD" (rows with a missing subcategory are dropped).
dbtable <- msigdbr::msigdbr(species = "Homo sapiens", category = "C3")
is_gtrd <- dbtable$gs_subcat %in% "TFT:GTRD"
dbtable <- dbtable[is_gtrd, ]

Reformat the database.

# Collapse the long table (one row per gene) into one row per gene set.
dbtable_gsetID <- unique(dbtable[, c("gs_name", "gs_id")])
dbtable_geneID <- split(x = dbtable$human_entrez_gene, f = dbtable$gs_name)
dbtable_symbol <- split(x = dbtable$gene_symbol, f = dbtable$gs_name)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))

# split() orders its groups by the sorted levels of gs_name, whereas unique()
# keeps rows in order of first appearance. Look the IDs up by name instead of
# by position so that an ID can never be attached to the wrong gene set.
gset_names <- as.character(names(dbtable_geneID))
gset_rows <- match(gset_names, dbtable_gsetID$gs_name)
stopifnot(!anyNA(gset_rows))

# Build the whole table at once (no rbind() inside a loop); this also yields
# a well-formed zero-row table when there are no gene sets. The column order
# matches what the previous row-by-row construction produced.
res <- data.frame(
  ID = dbtable_gsetID$gs_id[gset_rows],
  Description = gset_names,
  IC = rep(NA, length(gset_names)),
  Count = lengths(dbtable_geneID, use.names = FALSE),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE)
)
human_MSigDB_GTRD <- list(GTRD = res)
# Save data.
# save(human_MSigDB_GTRD, file = "genes2bioterm/20230211_human_MSigDB_GTRD.rda")

The data were stored in the following repositories:

Github ASURATDB

Cell types in MSigDB

Load databases.

# Load the clustermole marker table.
dbtable <- clustermole::clustermole_markers()

# Show, in sorted order, the distinct values of the `db` column.
source_dbs <- unique(dbtable$db)
sort(source_dbs)

[1] "ARCHS4"     "CellMarker" "MSigDB"     "PanglaoDB"  "SaVanT"     "TISSUES"   
[7] "xCell"

Select species and databases.

# Keep the rows that are both human and sourced from MSigDB; %in% treats
# missing values as non-matches, so such rows are dropped.
is_human <- dbtable$species %in% "Human"
is_msigdb <- dbtable$db %in% "MSigDB"
dbtable <- dbtable[is_human & is_msigdb, ]
# Placeholder column for the Entrez IDs.
dbtable$geneID <- NA

Change gene symbols into entrez IDs.

# Build a SYMBOL -> ENTREZID lookup table for every symbol in the marker table.
# (`keys` is spelled out in full rather than relying on partial matching.)
dictionary <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
                                    keys = dbtable$gene_original,
                                    columns = c("SYMBOL", "ENTREZID"),
                                    keytype = "SYMBOL")
# Keep the first row returned for each symbol and drop rows without a symbol.
dictionary <- dictionary[!is.na(dictionary$SYMBOL) &
                           !duplicated(dictionary$SYMBOL), ]

# Vectorised lookup. A symbol that has no row in the dictionary (e.g. a missing
# symbol) now yields NA instead of aborting with "replacement has length zero".
dbtable$geneID <- dictionary$ENTREZID[match(dbtable$gene_original,
                                            dictionary$SYMBOL)]

Reformat the database. Here, the identifiers of the biological terms are prefixed with "MSigDBID:".

# Group Entrez IDs and gene symbols by cell type.
dbtable_geneID <- split(x = dbtable$geneID, f = dbtable$celltype)
dbtable_symbol <- split(x = dbtable$gene_original, f = dbtable$celltype)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))

# Build the whole table at once (no rbind() inside a loop); sprintf() and
# seq_len() keep this well-formed even when there are no cell types. The
# column order matches what the previous row-by-row construction produced.
# NOTE(review): a gene whose geneID is NA is pasted as the literal text "NA"
# in GeneID and is included in Count -- confirm that this is intended.
n_terms <- length(dbtable_geneID)
res <- data.frame(
  ID = sprintf("MSigDBID:%d", seq_len(n_terms)),
  Description = as.character(names(dbtable_geneID)),
  IC = rep(NA, n_terms),
  Count = lengths(dbtable_geneID, use.names = FALSE),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE)
)
human_MSigDB <- list(cell = res)
# Save data.
# save(human_MSigDB, file = "genes2bioterm/20220308_human_MSigDB.rda")

The data were stored in the following repositories:

Collect CellMarker for human cells

Load databases.

# Load the clustermole marker table.
dbtable <- clustermole::clustermole_markers()

# Show, in sorted order, the distinct values of the `db` column.
source_dbs <- unique(dbtable$db)
sort(source_dbs)

[1] "ARCHS4"     "CellMarker" "MSigDB"     "PanglaoDB"  "SaVanT"     "TISSUES"   
[7] "xCell"

Select species and databases.

# Keep the rows that are both human and sourced from CellMarker; %in% treats
# missing values as non-matches, so such rows are dropped.
is_human <- dbtable$species %in% "Human"
is_cellmarker <- dbtable$db %in% "CellMarker"
dbtable <- dbtable[is_human & is_cellmarker, ]
# Placeholder column for the Entrez IDs.
dbtable$geneID <- NA

Change gene symbols into entrez IDs.

# Build a SYMBOL -> ENTREZID lookup table for every symbol in the marker table.
# (`keys` is spelled out in full rather than relying on partial matching.)
dictionary <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
                                    keys = dbtable$gene_original,
                                    columns = c("SYMBOL", "ENTREZID"),
                                    keytype = "SYMBOL")
# Keep the first row returned for each symbol and drop rows without a symbol.
dictionary <- dictionary[!is.na(dictionary$SYMBOL) &
                           !duplicated(dictionary$SYMBOL), ]

# Vectorised lookup. A symbol that has no row in the dictionary (e.g. a missing
# symbol) now yields NA instead of aborting with "replacement has length zero".
dbtable$geneID <- dictionary$ENTREZID[match(dbtable$gene_original,
                                            dictionary$SYMBOL)]

Reformat the database. Here, the identifiers of the biological terms are prefixed with "CellMarkerID:".

# Group Entrez IDs and gene symbols by cell type.
dbtable_geneID <- split(x = dbtable$geneID, f = dbtable$celltype)
dbtable_symbol <- split(x = dbtable$gene_original, f = dbtable$celltype)
stopifnot(identical(names(dbtable_geneID), names(dbtable_symbol)))

# Build the whole table at once (no rbind() inside a loop); sprintf() and
# seq_len() keep this well-formed even when there are no cell types. The
# column order matches what the previous row-by-row construction produced.
# NOTE(review): a gene whose geneID is NA is pasted as the literal text "NA"
# in GeneID and is included in Count -- confirm that this is intended.
n_terms <- length(dbtable_geneID)
res <- data.frame(
  ID = sprintf("CellMarkerID:%d", seq_len(n_terms)),
  Description = as.character(names(dbtable_geneID)),
  IC = rep(NA, n_terms),
  Count = lengths(dbtable_geneID, use.names = FALSE),
  Gene = vapply(dbtable_symbol, paste, character(1), collapse = "/",
                USE.NAMES = FALSE),
  GeneID = vapply(dbtable_geneID, paste, character(1), collapse = "/",
                  USE.NAMES = FALSE)
)
human_CellMarker <- list(cell = res)
# Save data.
# save(human_CellMarker, file = "genes2bioterm/20220308_human_CellMarker.rda")

The data were stored in the following repositories:

Create a custom-built database

Combine CO and MSigDB for human cells

Create a cell type-related database by combining Cell ontology and MSigDB databases for analyzing human single-cell transcriptome data.

# Download the stored .rda files; each load() creates the object it contains
# (human_CO and human_MSigDB are used below).
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
for (rda in c("20201213_human_CO.rda?raw=true",
              "20220308_human_MSigDB.rda?raw=true")) {
  load(url(paste0(urlpath, rda)))
}

# Stack the two "cell" tables into a single custom-built table.
res <- do.call("rbind", list(human_CO[["cell"]], human_MSigDB[["cell"]]))
human_CB <- list(cell = res)

Combine CO, MSigDB, and CellMarker for human cells

Create a cell type-related database by combining Cell ontology, MSigDB, and CellMarker databases for analyzing human single-cell transcriptome data.

# Download the stored .rda files; each load() creates the object it contains
# (human_CO, human_MSigDB and human_CellMarker are used below).
# NOTE(review): the CellMarker file is dated 20220304 here, whereas the
# commented save() call in the CellMarker section writes a file dated
# 20220308 -- confirm which file exists in the repository.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
for (rda in c("20201213_human_CO.rda?raw=true",
              "20220308_human_MSigDB.rda?raw=true",
              "20220304_human_CellMarker.rda?raw=true")) {
  load(url(paste0(urlpath, rda)))
}

# Stack the three "cell" tables into a single custom-built table.
res <- rbind(human_CO[["cell"]], human_MSigDB[["cell"]],
             human_CellMarker[["cell"]])
human_CB <- list(cell = res)

Combine DO, CO, and MSigDB for human cells

Create a cell type-related database by combining Disease Ontology, Cell ontology and MSigDB databases for analyzing complex human single-cell transcriptome data.

# Download the stored .rda files; each load() creates the object it contains
# (human_DO, human_CO and human_MSigDB are used below).
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
for (rda in c("20201213_human_DO.rda?raw=true",
              "20201213_human_CO.rda?raw=true",
              "20220308_human_MSigDB.rda?raw=true")) {
  load(url(paste0(urlpath, rda)))
}

# Stack the disease table and the two cell tables; the combined table is
# stored under the name "cell".
res <- rbind(human_DO[["disease"]], human_CO[["cell"]],
             human_MSigDB[["cell"]])
human_CB <- list(cell = res)

Combine DO, CO, MSigDB, and CellMarker for human cells

Create a cell type-related database by combining Disease Ontology, Cell ontology, MSigDB, and CellMarker databases for analyzing complex human single-cell transcriptome data.

# Download the stored .rda files; each load() creates the object it contains
# (human_DO, human_CO, human_MSigDB and human_CellMarker are used below).
# NOTE(review): the CellMarker file is dated 20220304 here, whereas the
# commented save() call in the CellMarker section writes a file dated
# 20220308 -- confirm which file exists in the repository.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
for (rda in c("20201213_human_DO.rda?raw=true",
              "20201213_human_CO.rda?raw=true",
              "20220308_human_MSigDB.rda?raw=true",
              "20220304_human_CellMarker.rda?raw=true")) {
  load(url(paste0(urlpath, rda)))
}

# Stack the disease table and the three cell tables; the combined table is
# stored under the name "cell".
res <- rbind(human_DO[["disease"]], human_CO[["cell"]],
             human_MSigDB[["cell"]], human_CellMarker[["cell"]])
human_CB <- list(cell = res)

Session information

# Print the R version and the attached/loaded packages for reproducibility.
sessionInfo()

keita-iida/ASURATDB documentation built on Feb. 21, 2023, 6:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

keita-iida/ASURATDB
ASURAT database builder

In keita-iida/ASURATDB: ASURAT database builder

Installations

Collect Disease Ontology database for human cells

Collect Cell Ontology database for human and mouse cells

Collect Gene Ontology database for human and mouse cells

Collect KEGG database for human and mouse cells

Collect MSigDB for human cells

H: hallmark gene sets

C2: curated gene sets (BioCarta subset of CP)

C3: regulatory target gene sets (TFT:GTRD)

Cell types in MSigDB

Collect CellMarker for human cells

Create a custom-built database

Combine CO and MSigDB for human cells

Combine CO, MSigDB, and CellMarker for human cells

Combine DO, CO, and MSigDB for human cells

Combine DO, CO, MSigDB, and CellMarker for human cells

Session information

R Package Documentation

Browse R Packages

We want your feedback!

keita-iida/ASURATDB ASURAT database builder

In keita-iida/ASURATDB: ASURAT database builder

Installations

Collect Disease Ontology database for human cells

Collect Cell Ontology database for human and mouse cells

Collect Gene Ontology database for human and mouse cells

Collect KEGG database for human and mouse cells

Collect MSigDB for human cells

H: hallmark gene sets

C2: curated gene sets (BioCarta subset of CP)

C3: regulatory target gene sets (TFT:GTRD)

Cell types in MSigDB

Collect CellMarker for human cells

Create a custom-built database

Combine CO and MSigDB for human cells

Combine CO, MSigDB, and CellMarker for human cells

Combine DO, CO, and MSigDB for human cells

Combine DO, CO, MSigDB, and CellMarker for human cells

Session information

R Package Documentation

Browse R Packages

We want your feedback!

keita-iida/ASURATDB
ASURAT database builder