#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Reformat a Disease Ontology database.
#'
#' This function reformats a Disease Ontology database.
#'
#' @param dict A res slot of enrichDO().
#' @param all_geneIDs All genes in the form of ENTREZ ID.
#' @param orgdb A genome annotation package.
#'
#' @return A formatted database.
#' @export
#'
#' @examples
#' \dontrun{
#' # Below may be time-consuming runs.
#' library(DOSE)
#' data(DO2EG)
#' dict_DO <- enrichDO(unlist(DO2EG), ont = "DO", pvalueCutoff = 1,
#' pAdjustMethod = "BH", minGSSize = 0, maxGSSize = 1e+10,
#' qvalueCutoff = 1, readable = FALSE)
#' human_DO <- format_DO(dict = dict_DO@result, all_geneIDs = dict_DO@gene,
#' orgdb = org.Hs.eg.db::org.Hs.eg.db)
#' }
#'
format_DO <- function(dict = NULL, all_geneIDs = NULL, orgdb = NULL){
#--------------------------------------------------
# Compute information contents.
# See computeIC() in DOSE package.
#--------------------------------------------------
doids <- BiocGenerics::toTable(DO.db::DOTERM)
doterms <- doids$do_id
docount <- table(doterms)
doids <- names(docount) #unique(doterms)
xx <- as.list(DO.db::DOOFFSPRING)
cnt <- vapply(doids, function(x){
n <- docount[xx[[x]]]
docount[x] + sum(n[!is.na(n)])
}, integer(1))
names(cnt) <- doids
p <- cnt / sum(docount)
IC <- -log(p)
dict$IC <- NA
for(i in seq_len(nrow(dict))){
if(is.element(dict$ID[i], names(IC))){
dict$IC[i] <- IC[which(names(IC) == dict$ID[i])]
}else{
dict$IC[i] <- 99
}
}
df <- data.frame(ID = dict$ID, Description = dict$Description, IC = dict$IC,
Count = dict$Count, Gene = NA, GeneID = dict$geneID)
res <- list(disease = unique(df))
#--------------------------------------------------
# Fix the slots of gene symbols and ENTREZ Gene IDs.
#--------------------------------------------------
dictionary <- AnnotationDbi::select(orgdb, key = unique(all_geneIDs),
columns = "SYMBOL",
keytype = "ENTREZID")
for(k in seq_len(length(res))){
for(i in seq_len(nrow(res[[k]]))){
genes <- c()
geneIDs <- c()
g <- unlist(strsplit(res[[k]]$GeneID[i], "/"))
if(length(g) == 0){
next
}
for(j in seq_len(length(g))){
ind <- which(dictionary$ENTREZID == g[j])
if(!is.na(dictionary[ind, ]$SYMBOL[1])){
genes <- c(genes, dictionary[ind, ]$SYMBOL[1])
geneIDs <- c(geneIDs, g[j])
}
}
res[[k]]$Gene[i] <- paste(genes, collapse = "/")
res[[k]]$GeneID[i] <- paste(geneIDs, collapse = "/")
res[[k]]$Count[i] <- as.integer(length(geneIDs))
}
}
tidy <- list()
categories <- names(res)
for(k in seq_len(length(categories))){
tidy[[categories[k]]] <- res[[k]]
}
#--------------------------------------------------
# Compute a similarity matrix.
#--------------------------------------------------
categories <- names(res)
for(k in seq_len(length(categories))){
df <- res[[k]]
simmat <- DOSE::doSim(df$ID, df$ID, measure = "Jiang")
tidy[["similarity_matrix"]][[categories[k]]] <- simmat
}
return(tidy)
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Do cellTypeToGenes().
#'
#' This function performs cellTypeToGenes().
#'
#' @param data Description.
#' @param orgdb A genome annotation package.
#'
#' @return Results of cellTypeToGenes().
#'
do_cellTypeToGenes <- function(data = NULL, orgdb = NULL){
res <- suppressMessages(
ontoProc::cellTypeToGenes(data, orgDb = orgdb,
gotab = ontoProc::allGOterms))
return(res)
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Collect a Cell Ontology database.
#'
#' This function performs cellTypeToGenes() for collecting a Cell Ontology
#' database.
#'
#' @param orgdb A genome annotation package.
#'
#' @return Results from getCellOnto().
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @export
#'
#' @examples
#' \dontrun{
#' # Below may be a time-consuming run.
#' dict_CO <- collect_CO(orgdb = org.Hs.eg.db::org.Hs.eg.db)
#' }
#'
collect_CO <- function(orgdb = NULL){
#--------------------------------------------------
# Definition
#--------------------------------------------------
co <- ontoProc::getCellOnto()
co <- data.frame(ID = co[["id"]], Description = co[["name"]])
co <- co[which(!is.na(co$Description)), ]
co <- co[(grepl("CL:", co$ID)), ]
#--------------------------------------------------
# Collect CO terms.
#--------------------------------------------------
res <- c("ID", "Description", "Symbol", "GO", "Evidence")
res <- data.frame(matrix(ncol = 5, nrow = 0, dimnames = list(NULL, res)))
# Initializes the progress bar
pb <- txtProgressBar(min = 0, max = nrow(co), style = 3, width = 50,
char = "=")
for(i in seq_len(nrow(co))){
setTxtProgressBar(pb, i)
tmp <- try(do_cellTypeToGenes(co$Description[i], orgdb = orgdb),
silent = TRUE)
if(class(tmp) == "try-error"){
next
}else if(nrow(tmp) != 0){
for(j in seq_len(nrow(tmp))){
res <- rbind(res, data.frame(
ID = co$ID[i],
Description = co$Description[i],
Symbol = tmp$SYMBOL[j],
GO = tmp$GO[j],
Evidence = tmp$EVIDENCE[j]
))
}
}
}
close(pb)
return(res)
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Find all the descendants for parent terms.
#'
#' This function finds all the descendants for parent terms.
#'
#' @param id ID.
#' @param co A result of getCellOnto().
#' @param map A map data.
#'
#' @return A map data.
#'
find_descendants <- function(id = NULL, co = NULL, map = NULL){
children <- co[["children"]][[id]]
if(length(children) == 0){
return(NA)
}else{
for(i in seq_len(length(children))){
map <- c(map, c(children[i], find_descendants(children[i], co, map)))
}
return(setdiff(map, NA))
}
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Make a parent-child relation table for CO terms.
#'
#' This function makes a parent-child relation table for CO terms.
#'
#' @param tidy A list of result of format_CO().
#'
#' @return Parent-child relation table.
#'
make_treeTable_CO <- function(tidy = NULL){
categories_woALL <- setdiff(names(tidy), "ALL")
co <- ontoProc::getCellOnto()
res <- list()
for(k in seq_len(length(categories_woALL))){
df <- tidy[[categories_woALL[k]]]
map <- c() ; tmp <- c()
for(i in seq_len(nrow(df))){
dg <- data.frame(child = find_descendants(df$ID[i], co, map),
parent = df$ID[i])
tmp <- rbind(tmp, dg)
}
res[[categories_woALL[k]]] <- tmp
}
return(res)
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Compute information contents.
#'
#' This function computes information contents for Cell Ontology terms.
#'
#' @param dict A result of format_CO().
#' @param tidy A list of result of format_CO().
#' @param treeTable A result of make_treeTable_CO().
#'
#' @return A list of result of format_CO().
#' @seealso Mistry and Pavlidis, BMC Bioinformatics, 2008.
#'
compute_IC_CO <- function(dict = NULL, tidy = NULL, treeTable = NULL){
categories_woALL <- setdiff(names(tidy), "ALL")
for(k in seq_len(length(categories_woALL))){
#------------------------------
# Definition
#------------------------------
all_genes <- unique(dict$Symbol)
res <- tidy[[categories_woALL[k]]]
tmp <- data.frame(num_annot_child = NA,
child = treeTable[[categories_woALL[k]]]$child,
parent = treeTable[[categories_woALL[k]]]$parent)
#--------------------------------------------------
# Count the number of times that a gene is annotated with children.
#--------------------------------------------------
for(i in seq_len(nrow(tmp))){
cnt <- res[which(res$ID == tmp$child[i]), ]$Count
if(length(cnt) != 0){
tmp$num_annot_child[i] <- cnt
}else{
tmp$num_annot_child[i] <- 0
}
}
#--------------------------------------------------
# Compute the sum of `tmp$num_annot_child` for each parent.
#--------------------------------------------------
ids <- unique(tmp$parent)
tbl <- data.frame(parent = ids, num_annot_parent = NA, sum_annot_child = NA)
for(i in seq_len(length(ids))){
cnt <- res[which(res$ID == ids[i]), ]$Count
if(length(cnt) != 0){
tbl$num_annot_parent[i] <- cnt
}else{
tbl$num_annot_parent[i] <- 0
}
tbl$sum_annot_child[i] <-
sum(tmp[which(tmp$parent == ids[i]), ]$num_annot_child)
}
#--------------------------------------------------
# Compute IC for each parent.
#--------------------------------------------------
tbl$Freq_parent <- tbl$num_annot_parent + tbl$sum_annot_child
if(categories_woALL[k] == "cell"){
ind <- which(tbl$parent == "CL:0000000")
if(length(ind) == 0){
stop("tidy_CO must include root ontology term.")
}
tbl$Freq_root <- tbl[ind, ]$Freq_parent
}
tbl$Prob <- tbl$Freq_parent / tbl$Freq_root
tbl$IC <- -log(tbl$Prob)
if(identical(tbl$parent, res$ID)){
res$IC <- tbl$IC
}else{
stop("IDs are inconsistent. Check the code of compute_IC_CO().")
}
tidy[[categories_woALL[k]]] <- res
}
return(tidy)
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Reformat the result of collect_CO().
#'
#' This function reformats the result of collect_DO().
#'
#' @param dict A result of collect_CO().
#' @param orgdb A genome annotation package.
#'
#' @return A formatted database.
#' @export
#'
#' @examples
#' \dontrun{
#' # Below may be time-consuming runs.
#' dict_CO <- collect_CO(orgdb = org.Hs.eg.db::org.Hs.eg.db)
#' human_CO <- format_CO(dict = dict_CO, orgdb = org.Hs.eg.db::org.Hs.eg.db)
#' }
#'
format_CO <- function(dict = NULL, orgdb = NULL){
#--------------------------------------------------
# Reformat dict.
#--------------------------------------------------
df <- data.frame(ID = dict$ID, Description = dict$Description, IC = NA,
Count = NA, Gene = NA, GeneID = NA)
df <- unique(df)
for(i in seq_len(nrow(df))){
genes <- unique(dict[which(dict$ID == df$ID[i]), ]$Symbol)
df$Gene[i] <- paste(genes, collapse = "/")
df$Count[i] <- length(genes)
}
rownames(df) <- seq_len(nrow(df))
res <- list(cell = df)
#--------------------------------------------------
# Compute information contents, defined in
# Mistry and Pavlidis, BMC Bioinformatics, 2008.
#--------------------------------------------------
treeTable <- make_treeTable_CO(tidy = res)
res <- compute_IC_CO(dict = dict, tidy = res, treeTable = treeTable)
#--------------------------------------------------
# Fix the slots of gene symbols and ENTREZ Gene IDs.
#--------------------------------------------------
dictionary <- AnnotationDbi::select(orgdb, key = unique(dict$Symbol),
columns = "ENTREZID", keytype = "SYMBOL")
categories <- names(res)
for(k in seq_len(length(categories))){
for(i in seq_len(nrow(res[[categories[k]]]))){
genes <- c()
geneIDs <- c()
g <- unlist(strsplit(res[[categories[k]]]$Gene[i], "/"))
if(length(g) == 0){
next
}
for(j in seq_len(length(g))){
ind <- which(dictionary$SYMBOL == g[j])
if(!is.na(dictionary[ind, ]$ENTREZID[1])){
geneIDs <- c(geneIDs, dictionary[ind, ]$ENTREZID[1])
genes <- c(genes, g[j])
}
}
res[[categories[k]]]$Gene[i] <- paste(genes, collapse = "/")
res[[categories[k]]]$GeneID[i] <- paste(geneIDs, collapse = "/")
res[[categories[k]]]$Count[i] <- as.integer(length(geneIDs))
}
}
tidy <- list()
categories <- names(res)
for(k in seq_len(length(categories))){
tidy[[categories[k]]] <- res[[categories[k]]]
}
#--------------------------------------------------
# Compute a similarity matrix.
#--------------------------------------------------
categories <- names(res)
for(k in seq_len(length(categories))){
df <- res[[categories[k]]]
simmat <- matrix(0, nrow = nrow(df), ncol = nrow(df))
tree <- treeTable[[categories[k]]]
for(i in seq_len(nrow(df) - 1)){
for(j in seq(i + 1, nrow(df))){
#------------------------------
# IC of most informative common ancestor (MICA)
#------------------------------
ancestors_i <- tree[which(tree$child == df$ID[i]), ]$parent
ancestors_j <- tree[which(tree$child == df$ID[j]), ]$parent
common_ancestors <- intersect(ancestors_i, ancestors_j)
if(length(common_ancestors) == 0){
simmat[i, j] <- 0
}else{
common_ancestors <- data.frame(ID = common_ancestors, IC = NA)
for(n in seq_len(nrow(common_ancestors))){
common_ancestors$IC[n] <-
df[which(df$ID == common_ancestors$ID[n]), ]$IC
}
inds <- order(common_ancestors$IC, decreasing = TRUE)
IC_MICA <- common_ancestors[inds, ]$IC[1]
#------------------------------
# Jiang's method
#------------------------------
simmat[i, j] <- 2 * IC_MICA / (df$IC[i] + df$IC[j])
}
}
}
simmat[lower.tri(simmat)] <- simmat[upper.tri(simmat)]
diag(simmat) <- 1
rownames(simmat) <- df$ID
colnames(simmat) <- df$ID
tidy[["similarity_matrix"]][[categories[k]]] <- simmat
}
return(tidy)
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Do groupGO().
#'
#' This function performs groupGO().
#'
#' @param genes A gene set.
#' @param orgdb A genome annotation package.
#' @param ont Category of Gene Ontology.
#' @param level Level of Gene Ontology.
#'
#' @return Results of groupGO().
#'
do_groupGO <- function(genes = NULL, orgdb = NULL, ont = NULL, level = NULL){
res <- clusterProfiler::groupGO(
gene = genes,
OrgDb = orgdb,
keyType = "ENTREZID",
ont = ont,
level = level,
readable = FALSE
)
return(res)
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Collect Gene Ontology database.
#'
#' This function collects Gene Ontology database.
#'
#' @param orgdb A genome annotation package.
#'
#' @return Results from groupGO().
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @export
#'
#' @examples
#' \dontrun{
#' # Below may be a time-consuming run.
#' dict_GO <- collect_GO(orgdb = org.Hs.eg.db::org.Hs.eg.db)
#' }
#'
collect_GO <- function(orgdb = NULL){
#--------------------------------------------------
# Preparation
#--------------------------------------------------
if(identical(orgdb, org.Hs.eg.db::org.Hs.eg.db)){
all_geneIDs <- as.list(org.Hs.eg.db::org.Hs.egGO)
}else if(identical(orgdb, org.Mm.eg.db::org.Mm.eg.db)){
all_geneIDs <- as.list(org.Mm.eg.db::org.Mm.egGO)
}else{
stop("Currently, only org.Hs.eg.db and org.Mm.eg.db are acceptable.")
}
all_geneIDs <- names(all_geneIDs[!is.na(all_geneIDs)])
categories <- c("MF", "BP", "CC")
#--------------------------------------------------
# groupGO
#--------------------------------------------------
res <- list()
# Initializes the progress bar
pb <- txtProgressBar(min = 0, max = length(categories), style = 3,
width = 50, char = "=")
for(k in seq_len(length(categories))){
setTxtProgressBar(pb, k)
tmp <- c()
level <- 1
while(1){
ggo <- try(
do_groupGO(genes = all_geneIDs, orgdb = orgdb, ont = categories[k],
level = level),
silent = TRUE
)
if(class(ggo) == "try-error"){
break
}else{
tmp <- c(tmp, list(ggo))
level <- level + 1
}
}
res[[categories[k]]] <- tmp
}
close(pb)
return(res)
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Reformat the result of collect_GO().
#'
#' This function reformats the result of collect_GO().
#'
#' @param dict A result of collect_GO().
#' @param orgdb A genome annotation package.
#'
#' @return A formatted database.
#' @export
#'
#' @examples
#' \dontrun{
#' # Below may be time-consuming runs.
#' dict_GO <- collect_GO(orgdb = org.Hs.eg.db::org.Hs.eg.db)
#' human_GO <- format_GO(dict = dict_GO, orgdb = org.Hs.eg.db::org.Hs.eg.db)
#' }
#'
format_GO <- function(dict = NULL, orgdb = NULL){
#--------------------------------------------------
# Reformat dict.
#--------------------------------------------------
categories <- names(dict)
res <- list()
for(k in seq_len(length(categories))){
tmp <- c()
levels <- length(dict[[categories[k]]])
for(lv in seq_len(levels)){
tmp <- rbind(tmp, as.data.frame(dict[[categories[k]]][[lv]]@result))
}
tmp <- unique(tmp) # Notice: the right hand side must be data.frame
df <- data.frame(ID = tmp$ID, Description = tmp$Description, IC = NA,
Count = tmp$Count, Gene = NA, GeneID = tmp$geneID)
df$Count <- as.integer(df$Count)
res[[categories[k]]] <- df
}
#--------------------------------------------------
# Compute information contents.
#--------------------------------------------------
res_godata <- list()
for(k in seq_len(length(categories))){
res_godata[[categories[k]]] <- GOSemSim::godata(OrgDb = orgdb,
ont = categories[k],
computeIC = TRUE)
}
for(k in seq_len(length(categories))){
simdata <- res_godata[[categories[k]]]
df <- res[[categories[k]]]
for(i in seq_len(nrow(df))){
if(is.element(df$ID[i], names(simdata@IC))){
df$IC[i] <- simdata@IC[which(names(simdata@IC) == df$ID[i])]
df$IC[i] <- ifelse(is.infinite(df$IC[i]), 99, df$IC[i])
}else{
df$IC[i] <- 99
}
}
res[[categories[k]]] <- df
}
#--------------------------------------------------
# Fix the slots of gene symbols and ENTREZ Gene IDs.
#--------------------------------------------------
geneIDs_MF <- res[["MF"]][which(res[["MF"]]$ID == "GO:0003674"), ]$GeneID
geneIDs_MF <- unlist(strsplit(geneIDs_MF, "/"))
geneIDs_BP <- res[["BP"]][which(res[["BP"]]$ID == "GO:0008150"), ]$GeneID
geneIDs_BP <- unlist(strsplit(geneIDs_BP, "/"))
geneIDs_CC <- res[["CC"]][which(res[["CC"]]$ID == "GO:0005575"), ]$GeneID
geneIDs_CC <- unlist(strsplit(geneIDs_CC, "/"))
geneIDs <- unique(c(geneIDs_MF, geneIDs_BP, geneIDs_CC))
dictionary <- AnnotationDbi::select(orgdb, key = geneIDs, columns = "SYMBOL",
keytype = "ENTREZID")
categories <- names(res)
for(k in seq_len(length(categories))){
for(i in seq_len(nrow(res[[categories[k]]]))){
genes <- c()
geneIDs <- c()
g <- unlist(strsplit(res[[categories[k]]]$GeneID[i], "/"))
if(length(g) == 0){
next
}
for(j in seq_len(length(g))){
ind <- which(dictionary$ENTREZID == g[j])
if(!is.na(dictionary[ind, ]$SYMBOL[1])){
genes <- c(genes, dictionary[ind, ]$SYMBOL[1])
geneIDs <- c(geneIDs, g[j])
}
}
res[[categories[k]]]$Gene[i] <- paste(genes, collapse = "/")
res[[categories[k]]]$GeneID[i] <- paste(geneIDs, collapse = "/")
res[[categories[k]]]$Count[i] <- as.integer(length(geneIDs))
}
}
tidy <- list()
for(k in seq_len(length(categories))){
tidy[[categories[k]]] <- res[[categories[k]]]
}
rm(res)
gc()
#--------------------------------------------------
# Compute a similarity matrix.
#--------------------------------------------------
for(k in seq_len(length(categories))){
df <- tidy[[categories[k]]]
simmat <- GOSemSim::mgoSim(df$ID, df$ID,
semData = res_godata[[categories[k]]],
measure = "Jiang", combine = NULL)
tidy[["similarity_matrix"]][[categories[k]]] <- simmat
}
return(tidy)
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Do keggGet().
#'
#' This function performs keggGet().
#'
#' @param data KEGG identifiers.
#'
#' @return Results of keggGet().
#'
do_keggGet <- function(data = NULL){
return(KEGGREST::keggGet(data))
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Collect KEGG database.
#'
#' This function collects KEGG database.
#'
#' @param organism An identifier of organism.
#' @param categories Category name.
#' @param timelag Time lag for accessing KEGG database.
#' Default value is set as 0.1.
#'
#' @return Results from keggGet().
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom stringr str_extract
#' @export
#'
#' @examples
#' \dontrun{
#' # Below may be a time-consuming run.
#' dict_KEGG <- collect_KEGG(organism = "hsa", categories = c("pathway"),
#' timelag = 0.1)
#' }
#'
collect_KEGG <- function(organism = NULL, categories = NULL, timelag = 0.1){
#--------------------------------------------------
# Collect KEGG terms.
#--------------------------------------------------
res <- list()
for(k in seq_len(length(categories))){
ids <- KEGGREST::keggLink(categories[k], organism) #names(ids) are gene IDs
map <- data.frame(
ID = ids,
Description = NA,
KEGG_geneID = names(ids)
)
map$NCBI_geneID <- NA
flags <- c()
I <- length(map$ID)
# Initializes the progress bar
pb <- txtProgressBar(min = 0, max = I, style = 3, width = 50, char = "=")
for(i in seq_len(I)){
Sys.sleep(timelag)
setTxtProgressBar(pb, i)
#--------------------------------------------------
# (i) Description
#--------------------------------------------------
category_id <- map$ID[i]
if(categories[k] == "module"){
category_id <- str_extract(category_id, "(?<=_)(.*)")
}
tmp_description <- try(do_keggGet(category_id), silent = TRUE)
if(class(tmp_description) == "try-error"){
flags <- c(flags, i)
next
}
#--------------------------------------------------
# (ii) NCBI_geneID
#--------------------------------------------------
tmp_gene <- try(do_keggGet(map$KEGG_geneID[i]), silent = TRUE)
if(class(tmp_gene) == "try-error"){
flags <- c(flags, i)
next
}
#------------------------------
# (i)
#------------------------------
name <- tmp_description[[1]][["NAME"]]
map$Description[i] <- ifelse(is.null(name), "No_record", name)
#------------------------------
# (ii)
#------------------------------
dblinks <- tmp_gene[[1]][["DBLINKS"]]
id <- dblinks[grep("NCBI-GeneID", dblinks)]
id <- gsub("NCBI-GeneID: ", "", id)
map$NCBI_geneID[i] <- id
}
close(pb)
res[[categories[k]]][["success"]] <- map
#--------------------------------------------------
# Rescue the failures.
#--------------------------------------------------
if(length(flags) > 0){
pb <- txtProgressBar(min = 0, max = I, style = 3, width = 50, char = "=")
failure <- c()
for(i in flags){
setTxtProgressBar(pb, i)
#------------------------------
# (i) Description
#------------------------------
category_id <- map$ID[i]
if(categories[k] == "module"){
category_id <- str_extract(category_id, "(?<=_)(.*)")
}
tmp_description <- try(do_keggGet(category_id), silent = TRUE)
cnt = 0
while(class(tmp_description) == "try-error"){
cnt <- cnt + 1
if(cnt >= 10){
break
}
Sys.sleep(1)
tmp_description <- try(do_keggGet(category_id), silent = TRUE)
}
if(cnt >= 10){
failure <- rbind(failure, c("ID", map$ID[i]))
next
}
#------------------------------
# (ii) NCBI_geneID
#------------------------------
tmp_gene <- try(do_keggGet(map$KEGG_geneID[i]), silent = TRUE)
cnt = 0
while(class(tmp_gene) == "try-error"){
cnt <- cnt + 1
if(cnt >= 10){
break
}
Sys.sleep(1)
tmp_gene <- try(do_keggGet(map$KEGG_geneID[i]), silent = TRUE)
}
if(cnt >= 10){
failure <- rbind(failure, c("KEGG_geneID", map$KEGG_geneID[i]))
next
}
#----------
# (i)
#----------
name <- tmp_description[[1]][["NAME"]]
map$Description[i] <- ifelse(is.null(name), "No_record", name)
#----------
# (ii)
#----------
dblinks <- tmp_gene[[1]][["DBLINKS"]]
id <- dblinks[grep("NCBI-GeneID", dblinks)]
id <- gsub("NCBI-GeneID: ", "", id)
map$NCBI_geneID[i] <- id
}
close(pb)
res[[categories[k]]][["success"]] <- map
tmp <- data.frame(
matrix(ncol = 2, nrow = 0, dimnames = list(NULL, c(
"Input_category_for_keggGet", "Input_for_keggGet"
))))
tmp <- rbind(tmp, data.frame(
Input_category_for_keggGet = unique(failure)[, 1],
Input_for_keggGet = unique(failure)[, 2]
))
res[[categories[k]]][["failure"]] <- tmp
}
}
return(res)
}
#-----------------------------------------------------------------------------80
#
#-----------------------------------------------------------------------------80
#' Reformat collect_KEGG().
#'
#' This function reformats collect_KEGG().
#'
#' @param dict A result of collect_KEGG().
#' @param orgdb A genome annotation package.
#'
#' @return A formatted database.
#' @export
#'
#' @examples
#' \dontrun{
#' # Below may be time-consuming runs.
#' dict_KEGG <- collect_KEGG(organism = "hsa", categories = c("pathway"),
#' timelag = 0.1)
#' human_KEGG <- format_KEGG(
#' dict = list(pathway = dict_KEGG[["pathway"]][["success"]]),
#' orgdb = org.Hs.eg.db::org.Hs.eg.db)
#' }
#'
format_KEGG <- function(dict = NULL, orgdb = NULL){
#--------------------------------------------------
# Definition
#--------------------------------------------------
categories <- names(dict)
#--------------------------------------------------
# Reformat
#--------------------------------------------------
res <- list()
for(k in seq_len(length(categories))){
tmp <- dict[[categories[k]]]
map <- unique(data.frame(
ID = as.character(tmp[["ID"]]),
Description = as.character(tmp[["Description"]]),
IC = NA,
Count = NA,
Gene = NA,
GeneID = NA
))
for(i in seq_len(nrow(map))){
#------------------------------
# Gene and Count
#------------------------------
genes <- unique(tmp[which(tmp[["ID"]] == map[["ID"]][i]), ]$NCBI_geneID)
map$GeneID[i] <- paste(genes, collapse = "/")
map$Count[i] <- length(genes)
}
rownames(map) <- seq_len(nrow(map))
res[[categories[k]]] <- map
}
#--------------------------------------------------
# Fix the slots of gene symbols and ENTREZ Gene IDs.
#--------------------------------------------------
for(k in seq_len(length(categories))){
geneIDs <- unique(dict[[categories[k]]][["NCBI_geneID"]])
dictionary <- AnnotationDbi::select(orgdb, key = geneIDs,
columns = "SYMBOL",
keytype = "ENTREZID")
for(i in seq_len(nrow(res[[categories[k]]]))){
genes <- c()
geneIDs <- c()
g <- unlist(strsplit(res[[categories[k]]]$GeneID[i], "/"))
if(length(g) == 0){
next
}
for(j in seq_len(length(g))){
ind <- which(dictionary$ENTREZID == g[j])
if(!is.na(dictionary[ind, ]$SYMBOL[1])){
genes <- c(genes, dictionary[ind, ]$SYMBOL[1])
geneIDs <- c(geneIDs, g[j])
}
}
res[[categories[k]]]$Gene[i] <- paste(genes, collapse = "/")
res[[categories[k]]]$GeneID[i] <- paste(geneIDs, collapse = "/")
res[[categories[k]]]$Count[i] <- as.integer(length(geneIDs))
}
}
tidy <- list()
categories <- names(res)
for(k in seq_len(length(categories))){
tidy[[categories[k]]] <- res[[categories[k]]]
}
return(tidy)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.