R/steps.R

Defines functions plot_wordcloud plot_biotextgraph assign_community process_network_manual return_gene_path_graph process_network_gene process_network_microbe graph_cluster make_graph tag_words make_TDM make_corpus perform_ora set_filter_words obtain_pubmed obtain_refseq obtain_enrich obtain_enzyme obtain_bugsigdb obtain_alliance obtain_manual

Documented in graph_cluster make_corpus make_graph make_TDM obtain_alliance obtain_bugsigdb obtain_enrich obtain_enzyme obtain_manual obtain_pubmed obtain_refseq perform_ora plot_biotextgraph plot_wordcloud process_network_gene process_network_manual process_network_microbe set_filter_words tag_words

#' obtain_manual
#' 
#' @description Obtain biotext class object from manually specified data
#' @details The function obtains the biotext object from data.frame.
#' @seealso generalf
#' @param df data frame
#' @export
#' @examples obtain_manual(data.frame(text=c("test")))
#' @return biotext class object
#' 
obtain_manual <- function(df) {
    if (!is.data.frame(df)) {
      if (is.vector(df)) {
        df <- data.frame(df) |> `colnames<-`(c("text"))
      }
    }
    ret <- new("biotext")
    ret@type <- paste0("manual")
    ret@rawText <- df
    ret
}



#' obtain_alliance
#' 
#' @description Obtain biotext class object from The Alliance of Genome Resources
#' @details Obtain gene description from 
#' The Alliance of Genome Resources
#' (https://www.alliancegenome.org/downloads)
#' 
#' @param geneList gene ID list
#' @param file filename, default to "GENE-DESCRIPTION-TSV_HUMAN.tsv"
#' @param keyType key type of gene list
#' @param org_db used to convert ID
#' @export
#' @importFrom data.table fread
#' @examples \dontrun{obtain_alliance(c("PNKP"))}
#' @return biotext class object
obtain_alliance <- function(geneList, file="GENE-DESCRIPTION-TSV_HUMAN.tsv",
    keyType="SYMBOL", org_db=org.Hs.eg.db) {
    ret <- new("biotext")
    ret@query <- geneList
    ret@type <- "alliance_genome_resources"

    qqcat("Input genes: @{length(geneList)}\n")
    if (keyType!="SYMBOL"){
        geneList <- AnnotationDbi::select(org_db,
            keys = geneList, columns = c("SYMBOL"),
            keytype = keyType)$SYMBOL
        geneList <- geneList[!is.na(geneList)] |> unique()
        qqcat("  Converted input genes: @{length(geneList)}\n")
    }

    df <- fread(file, header=FALSE)
    df <- df |> dplyr::filter(.data$V2 %in% geneList)
    colnames(df) <- c("HGNC", "Gene_ID", "text")
    ret@rawText <- data.frame(df)
    ret
}


#' obtain_bugsigdb
#' 
#' @description Obtain microbe-related information from bugsigdb
#' @details Obtain microbe description from BugSigDB using bugsigdbr.
#' @seealso generalf
#' @param mb_list microbe list
#' @param target title or abstract
#' @param api_key API key for PubMed
#' @param curate curated articles (TRUE) or search for PubMed (FALSE)
#' if FALSE, same for fetching from PubMed by specified query
#' @export
#' @examples 
#' \dontrun{
#'     obtain_bugsigdb("Veillonella dispar")
#' }
#' @return biotext class object
obtain_bugsigdb <- function(mb_list,
	target="title",
	curate=TRUE,
	api_key=NULL) {

    ret <- new("biotext")
    ret@query <- mb_list
    ret@curate <- curate
    ret@type <- paste0("microbe_",target)

    qqcat("Input microbes: @{length(mb_list)}\n")
    tb <- importBugSigDB()
    subTb <- NULL
    for (m in mb_list) {
        tmp <- tb[grepl(m, tb$`MetaPhlAn taxon names`,
                 fixed = TRUE),]
        if (dim(tmp)[1]>0) {
            qqcat("  Found @{length(unique(tmp$PMID))} entries for @{m}\n")
            tmp$query <- m
            subTb <- rbind(subTb, tmp)
        }
    }
    qqcat("Including @{dim(subTb)[1]} entries\n")

    titles <- unique(subTb$Title)
    titles <- titles[!is.na(titles)]
    fil <- NULL
    for (title in titles){
        tmp <- subset(subTb, subTb$Title==title)
        if (dim(tmp)[1]>1){
            ## Duplicated entry is deleted based on query and paper title
            tmp$title2 <- paste0(tmp$Title,"_",tmp$query)
            tmp2 <- tmp[!duplicated(tmp$title2),]
            fil <- rbind(fil, c(paste(tmp2$query, collapse=","),
                unique(tmp2$Title), unique(tmp2$PMID)))
        } else {
            fil <- rbind(fil, c(tmp$query, tmp$Title, tmp$PMID))
        }
    }
    fil <- data.frame(fil)
    colnames(fil) <- c("query","text","ID")
    fil <- fil[!is.na(fil$ID),]
    ret@pmids <- fil$ID
    ret@rawTextBSDB <- subTb

	if (curate) {
        if (target=="abstract"){
            qqcat("Target is abstract\n")
            pmids <- unique(fil$ID)
            qqcat("  Querying PubMed for @{length(pmids)} pmids\n")
            queryUrl <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/",
                               "efetch.fcgi?db=pubmed&retmode=XML&id=",
                                   paste(pmids, collapse=","))
            if (!is.null(api_key)){
                queryUrl <- paste0(queryUrl, "&api_key=", api_key)
            } else {
                qqcat("  Querying without API key\n")
            }
            onequery <- url(queryUrl, open = "rb", encoding = "UTF8")
            xmlString <- readLines(onequery, encoding="UTF8")
            parsedXML <- xmlParse(xmlString)
            obtainText <- function(pmid) {xpathSApply(parsedXML,
                paste('//PubmedArticle/MedlineCitation[PMID=',pmid,
                    ']/Article/Abstract/AbstractText'),
                xmlValue)}
            PMIDs <- as.numeric(xpathSApply(parsedXML,
                "//PubmedArticle/MedlineCitation/PMID",
                xmlValue))
            abstDf <- NULL
            for (pmid in PMIDs) {
                if (length(obtainText(pmid))!=0) {
                    for (text in obtainText(pmid)) {
                        tax <- unique(subset(fil, fil$ID==pmid)$query)
                        abstDf <- rbind(abstDf, c(pmid, text, tax))
                    }
                }
            }
            abstDf <- data.frame(abstDf) |> `colnames<-`(c("ID","text","query"))
            ret@rawText <- abstDf
        } else {
        	ret@rawText <- fil
        }
    } else {
        abstArg[["queries"]] <- mb_list
        abstArg[["quote"]] <- TRUE
        abstArg[["target"]] <- target
        abstDf <- do.call(pubmed, abstArg)
        ret@rawText <- abstDf
    }
    ret
}

#' obtain_enzyme
#' 
#' @description Obtain EC-related text data from PubMed
#' @details Query the Enzyme Comission number and obtain description,
#' and search pubmed for these enzymes and make word cloud and
#' correlation network. Need to specify the path to "enzyme.dat"
#' downloaded from from ExPASy (https://enzyme.expasy.org/).
#' @seealso generalf
#' 
#' @param file file downloaded from expasy
#' @param ec_num candidate ecnum, like those obtained from eggNOG-mapper
#' @param only_term only return quoted queries to pubmed
#' @param only_df only return ec description data.frame
#' if onlyTerm and onlyDf are both specified, onlyTerm have priority
#' @param tax_ec link taxonomy to EC using UniProt Taxonomy ID file
#' If this is TRUE, data.frame is returned
#' @param tax_file UniProt organism ID file path
#' @param cand_tax when taxec=TRUE, search only for these species.
#' @param arg_list passed to obtain_pubmed()
#' @param target abstract or title
#' @param api_key api key for PubMed
#' @export
#' @return biotext class object
obtain_enzyme <- function(file, ec_num,
    only_term=FALSE, only_df=FALSE, target="abstract",
    tax_ec=FALSE, tax_file=NULL, cand_tax=NULL, arg_list=list(),
    api_key=NULL) {
  flg <- FALSE
  candecs <- NULL
  allFlag <- FALSE
  if (length(ec_num)==1) {
    if (ec_num=="all") {
      allFlag <- TRUE
    }
  }
  qqcat("Processing EC file\n")
  con = file(file, "r")
  while ( TRUE ) {
    line = readLines(con, n = 1)
    if ( length(line) == 0 ) {
      break
    }
    if (startsWith(line,"ID")) {
      ccs <- NULL
      ecs <- NULL
      drs <- NULL
      ec <- gsub("ID   ","",line)
      if (allFlag) {
        flg <- TRUE
      } else {
        if (ec %in% ec_num) {
          flg <- TRUE
        } else {
          flg <- FALSE
        }
      }
    }
    if (flg) {
      if (startsWith(line, "DE")) {
        de <- gsub("\\.","",gsub("DE   ","",line))
      }
      if (startsWith(line, "CC")) {
        cc <- gsub("\\.","",gsub("CC   ","",line))
        cc <- gsub("-!- ", "", cc)
        ccs <- c(ccs, cc)
      }
      if (startsWith(line, "DR")) {
        stLine <- gsub(" ","",gsub("DR", "", line))
        drs <- c(drs, unlist(strsplit(stLine,";")))
      }
      if (startsWith(line, "//")) {
        flg <- FALSE
        ecs <- c(ec, de, paste0(ccs, collapse=" "),
                 paste0(drs, collapse=";"))
        candecs <- rbind(candecs, ecs)
        }
    }
  }
  close(con)
  if (is.null(candecs)) {return(NULL)}
  candecs <- data.frame(candecs) |>
    `colnames<-`(c("number","desc","comment","DRs"))
  if (tax_ec) {
    qqcat("  Linking taxonomy to EC\n")
    retTaxEC <- NULL
    if (is.null(tax_file)) {stop("Please provide UniProt Taxonomy file")}
    if (!is.null(cand_tax)) {
      taxo <- getUPtax(tax_file, candUP="all", candTax=cand_tax)
      for (num in candecs$number) {
        desc <- subset(candecs, candecs$number==num)$desc
        allCharIDs <- as.character(unlist(vapply(unlist(strsplit(subset(candecs, candecs$number==num)$DRs,";")),
                                                 function(x) unlist(strsplit(x, "_"))[2], "character")))
        if (length(intersect(allCharIDs, taxo$UPID))>=1) {
          for (ta in intersect(allCharIDs, taxo$UPID)) {
            for (cta in subset(taxo, taxo$UPID==ta)$Taxonomy) {
              retTaxEC <- rbind(retTaxEC, c(num, desc, ta, cta))
            }
          }
        }
      }
    } else {
      stop("Please specify cand_tax when tax_ec=TRUE")
    }
    if (is.null(retTaxEC)) {stop("No EC could be found for query")}
    retTaxEC <- data.frame(retTaxEC) |> 
      `colnames<-`(c("number","desc","taxonomy","scientificName"))
    queryCheck <- NULL
    for (ct in cand_tax) {
      queryCheck <- cbind(queryCheck,
                          grepl(ct, retTaxEC$scientificName))
    }
    retTaxEC$query <- apply(queryCheck, 1, function(x) {
      if (length(cand_tax[x])==1) {
        cand_tax[x]
      } else {
        paste0(cand_tax[x],",")
      }})
    return(retTaxEC)
  }
  
  quoted <- dQuote(candecs$desc,options(useFancyQuotes = FALSE))
  if (only_term) {return(quoted)}
  if (only_df) {return(candecs)}
  arg_list[["target"]] <- "abstract"
  arg_list[["queries"]] <- quoted
  arg_list[["api_key"]] <- api_key
  abst <- do.call("obtain_pubmed", arg_list)
  abst@ec <- candecs
  abst@type <- "EC"
  return(abst)
}


#' obtain_enrich
#' 
#' @description Obtain enrichment analysis description
#' @details The function performs enrichment analysis on
#' queried vector, and returns the biotext object using the
#' textual information from biological pathways.
#' 
#' @param geneList gene list
#' @param keyType key type of gene list
#' @param enrich `kegg` or `reactome`
#' @param org_db used to convert ID
#' @param top_path the number of pathways to be obtained
#' sorted by p-values
#' @export
#' @examples
#' if (requireNamespace("clusterProfiler")) {
#'     testgenes <- c("IRF3","PNKP","DDX41","ERCC1","ERCC2","XRCC1")
#'     obtain_enrich(testgenes)
#' }
#' @return biotext class object
obtain_enrich <- function(geneList, keyType="SYMBOL", enrich="kegg",
    org_db=org.Hs.eg.db, top_path=30) {
    ret <- new("biotext")
    ret@query <- geneList
    ret@type <- "enrich"

    qqcat("Input genes: @{length(geneList)}\n")
    if (keyType!="ENTREZID"){
        geneList <- AnnotationDbi::select(org_db,
            keys = geneList, columns = c("ENTREZID"),
            keytype = keyType)$ENTREZID
        geneList <- geneList[!is.na(geneList)] |> unique()
        qqcat("  Converted input genes: @{length(geneList)}\n")
    }

    qqcat("Performing enrichment analysis\n")
    if (enrich=="reactome"){
        if (requireNamespace("ReactomePA")) {
            pathRes <- ReactomePA::enrichPathway(geneList)
        } else {
            stop("Please install ReactomePA")
        }
        pathRes@result$Description <- gsub("Homo sapiens\r: ",
                        "",
                        pathRes@result$Description)                
    } else if (enrich=="kegg"){
        pathRes <- clusterProfiler::enrichKEGG(geneList)
    } else {
        ## Currently only supports some pathways
        stop("Please specify 'reactome' or 'kegg'")
    }
    ret@enrichResults <- pathRes@result
    ret@rawText <- pathRes@result[1:top_path,"Description"] |> 
    data.frame() |> `colnames<-`(c("text"))
    ret
}


#' obtain_refseq
#' 
#' @description Obtain RefSeq textual data
#' @details Obtain refseq textual data and makes a biotext object.
#' @seealso generalf
#' @param geneList gene list
#' @param keyType key type of gene list
#' @param organism organism
#' @param org_db used to convert ID
#' @export
#' @examples obtain_refseq(c("PNKP"))
#' @return biotext class object
obtain_refseq <- function(geneList, keyType="SYMBOL", organism=9606, org_db=org.Hs.eg.db) {
    ret <- new("biotext")
    ret@query <- geneList
    ret@type <- "refseq"

    qqcat("Input genes: @{length(geneList)}\n")
    if (keyType!="ENTREZID"){
        geneList <- AnnotationDbi::select(org_db,
            keys = geneList, columns = c("ENTREZID"),
            keytype = keyType)$ENTREZID
        geneList <- geneList[!is.na(geneList)] |> unique()
        qqcat("  Converted input genes: @{length(geneList)}\n")
    }

    tb <- loadGeneSummary(organism = organism)
    fil <- tb %>% filter(tb$Gene_ID %in% geneList)
    fil <- fil[!duplicated(fil$Gene_ID),]
    ret@rawText <- fil
    ret
}

#' obtain_pubmed
#' 
#' @description Obtain PubMed text data
#' @details Obtain pubmed textual data and makes a biotext object.
#' @seealso generalf
#' @param queries query list
#' @param delim delimiter for query, default to OR
#' @param api_key API key for PubMed
#' @param target title or abstract
#' @param ret_max max articles to get
#' @param sort_order sort by relevance or pubdate
#' @param limit query number limit
#' @param quote whether to quote each query
#' @export
#' @return biotext class object
obtain_pubmed <- function(queries, target="title",
	delim="OR", api_key=NULL, ret_max=10, limit=10,
	sort_order="relevance", quote=FALSE) {

    ret <- new("biotext")
    ret@type <- paste0("pubmed_",target)
    if (length(queries)>limit){
      stop("Number of queries exceeded specified limit number")}

    ret@delim <- delim
    if (quote) {
      query <- paste(dQuote(queries,
      	options(useFancyQuotes = FALSE)),
      collapse=paste0(" ",delim," "))
    } else {
      query <- paste(queries, collapse=paste0(" ",delim," "))
    }
    ret@query <- query
    clearQuery <- gsub('\"', '', queries)
    ret <- getPubMed(ret, query, clearQuery, type=target, apiKey=api_key,
                           retMax=ret_max, sortOrder=sort_order)
    ret@retMax <- ret_max
    ret@sortOrder <- sort_order
    ret
}

#' set_filter_words
#' 
#' @description Set filtering words to class.
#' @details Set filtering words to class.
#' If filtered words are already present, add the specified words to them.
#' 
#' @param ret biotext object
#' @param exclude GS for using GeneSummary, 
#' @param exclude_by `frequency` or `tfidf`
#' @param exclude_type ">" or "<"
#' @param exclude_number threshold to exclude
#' @param filterMax filter based on pre-computed maximum values 
#' for each word
#' @param additional_remove your customized words to remove
#' @param pre remove pre-defined words
#' @param pre_microbe remove pre-defined words for microbes
#' @export
#' @examples obtain_refseq(c("PNKP")) |> set_filter_words()
#' @return biotext class object
set_filter_words <- function(ret, exclude_by="frequency",
	exclude_type=">", exclude="GS", exclude_number=2000, filterMax=FALSE,
	additional_remove=NULL, pre=TRUE, pre_microbe=FALSE) {

	filterWords <- ret@filtered

    if (exclude_by=="frequency") {
        pref = paste0(exclude,"_Freq")
    } else {
        pref = paste0(exclude,"_TfIdf")
    }
    if (filterMax) {
        pref <- paste0(pref, "_Max")
    }
    filterWords <- c(filterWords, retFiltWords(useFil=pref,
    	filType=exclude_type, filNum=exclude_number))
    if (!is.null(additional_remove)) {
    	filterWords <- c(filterWords, additional_remove)
    }
    if (pre) {
    	## [TODO] include easyPubMed PubMed_stopwords
	    filterWords <- c(filterWords,"genes","gene","patients","hub",
                      "analysis","cells","cell","expression","doi",
                      "deg","degs","author","authors","elsevier",
                      "oxford","wiley","pmids", "geneid", "pmid", "geneids")
	}
	if (pre_microbe) {
		filterWords <- c(filterWords, "microbiota",
        "microbiome","relative","abundance","abundances",
        "including","samples","sample","otu","otus","investigated",
        "taxa","taxon")
	}
    qqcat("Filtered @{length(unique(filterWords))} words (frequency and/or tfidf)\n")
    ret@filtered <- unique(filterWords)
    ret
}


#' perform_ora
#' @description Performs ORA on obtained text data.
#' @details Performs over representation analysis on the obtained text data.
#' Used only with biotext object with type `refseq`.
#' @param ret biotext class
#' @param threshold ORA threshold to filter (bonferroni-corrected p-value)
#' @export
#' @examples
#' testgenes <- c("IRF3","PNKP","DDX41","ERCC1","ERCC2","XRCC1")
#' obtain_refseq(testgenes) |> perform_ora()
#' @return biotext class object
perform_ora <- function(ret, threshold=0.05) {
	if (ret@type!="refseq") {stop("ORA for type other than refseq is not supported")}
    qqcat("Performing ORA\n")
    sig <- textORA(ret@rawText$Gene_ID |> unique())
    sigFilter <- names(sig)[p.adjust(sig, "bonferroni")>threshold]
    qqcat("Filtered @{length(sigFilter)} words (ORA)\n")
    ret@filtered <- c(ret@filtered, sigFilter)
    ret@ora <- sig
    ret
}


#' make_corpus
#' @description Make corpus based on biotext class
#' @details Make corpus based on biotext class using tm.
#' Ngram, stemming parameters can be specified by this function.
#' @param ret biotext class object
#' @param collapse collapse all the sentences to one sentence
#' @param num_only delete number only (not deleting XXX123)
#' @param stem stem the words or not (default to FALSE)
#' @param preserve preserve the original cases
#' @param ngram n-gram
#' @export
#' @examples obtain_refseq("PNKP") |> make_corpus()
#' @return biotext class object
make_corpus <- function(ret, collapse=FALSE,
	num_only=TRUE, stem=FALSE, preserve=TRUE, ngram=1) {
	if (ret@type=="refseq") {
		texts <- ret@rawText$Gene_summary
	# } else if (startsWith(a@type,"pubmed")) {
	} else {
		texts <- ret@rawText$text
	}
    if (collapse) {
        docs <- VCorpus(VectorSource(paste(texts, collapse=" ")))
    } else {
        docs <- VCorpus(VectorSource(texts))
    }
    ret@numOnly <- num_only
    ret@stem <- stem
    ret@ngram <- ngram

    if (preserve) {pdic <- preserveDict(docs, ngram,
    	num_only, stem)
            ret@dic <- pdic}

    docs <- makeCorpus(docs, ret@filtered, NULL, num_only, stem)
    ret@corpus <- docs
    ret@numOnly <- num_only
    ret@stem <- stem
    ret
}

#' make_TDM
#' 
#' @description Make text document matrix (TDM) from biotext object.
#' @details Make TDM based on biotext class using tm.
#' @param ret biotext class
#' @param tfidf use TF-IDF
#' @param normalize normalize the values to document number
#' @param takeMean take the mean value for words to rank
#' @param takeMax take the max value for words to rank
#' @param docsum count words per doc
#' @return biotext class object
#' @examples obtain_refseq("PNKP") |> make_corpus() |> make_TDM()
#' @export
make_TDM <- function(ret, tfidf=FALSE,
	normalize=FALSE, takeMean=FALSE, takeMax=FALSE, docsum=FALSE) {
	ngram <- ret@ngram
    docs <- ret@corpus
    if (ngram!=1){
        NgramTokenizer <- function(x)
            unlist(lapply(ngrams(words(x), ngram),
                paste, collapse = " "),
                use.names = FALSE)
        if (tfidf) {
            docs <- TermDocumentMatrix(docs,
                control = list(tokenize = NgramTokenizer,
                    weighting = weightTfIdf))
        } else {
            docs <- TermDocumentMatrix(docs,
                control = list(tokenize = NgramTokenizer))
        }
    } else {
        if (tfidf) {
            docs <- TermDocumentMatrix(docs,
                control = list(weighting = weightTfIdf))
        } else {
            docs <- TermDocumentMatrix(docs)
        }
    }

    mat <- as.matrix(docs)
    if (docsum) {
        mat <- apply(mat, 2, function(x) ifelse(x>0, 1, 0))
    }
    if (normalize) {
        mat <- sweep(mat, 2, colSums(mat), `/`)
    }

    if (takeMax & takeMean) {stop("Should specify either of takeMax or takeMean")}
    if (takeMax) {
        perterm <- apply(mat, 1, max, na.rm=TRUE)
    } else {
        if (takeMean) {
            perterm <- apply(mat,1,mean)
        } else {
            perterm <- rowSums(mat)
        }
    }

    matSorted <- sort(perterm, decreasing=TRUE)

    ret@wholeFreq <- matSorted
    ret@TDM <- docs
    ret
}

#' tag_words
#' 
#' @description Tag the words based on pvclust and word matrix.
#' @details Identify important word clusters with statistical significance
#' using `pvclust`.
#' 
#' @param ret biotext class object
#' @param cl cluster for parallel computing
#' @param pvclAlpha alpha value for `pvpick`
#' @param whole perform clustering on whole matrix (take time)
#' @param num_words if set, subset to this number of words
#' @param method method.dist argument in pvclust
#' @importFrom stats as.dist
#' @return biotext class object
#' @examples obtain_refseq(c("IRF3","PNKP")) |> 
#' make_corpus() |> make_TDM() |> tag_words()
#' @export
tag_words <- function(ret, cl=FALSE, pvclAlpha=0.95, whole=FALSE,
	num_words=30, method="correlation") {
	
	# if (corMat) {
	# 	if (is.null(mat)) {stop("Please provide matrix")}
	# 	pvc <- pvclust(as.matrix(as.dist(1-mat,diag=TRUE,upper=TRUE)))
	#     pvcl <- pvpick(pvc, alpha=pvclAlpha)
	#     ret@pvclust <- pvc
    # 	ret@pvpick <- pvcl
    # 	return(ret)
	# }
	
	DTM <- t(as.matrix(ret@TDM))
	freqWords <- names(ret@wholeFreq[1:num_words])
    if (whole){
        pvc <- pvclust(as.matrix(dist(t(DTM))), method.dist=method, parallel=cl)
    } else {
    pvc <- pvclust(as.matrix(dist(
        t(
            DTM[, colnames(DTM) %in% freqWords]
            )
        )), method.dist=method, parallel=cl)
    }
    pvcl <- pvpick(pvc, alpha=pvclAlpha)
    ret@pvclust <- pvc
    ret@pvpick <- pvcl
    ret
}

#' make_graph
#' 
#' @description Make correlation or cooccurrence network
#' @details Make correlation or cooccurrence network based on biotext object.
#' The resulting raw network is stored in `igraphRaw` slot in biotext object.
#' The networks can be customized in the subsequent functions like `process_network_gene`.
#' 
#' @param ret biotext class object
#' @param num_words number of words to include
#' @param cor_threshold correlation threshold
#' @param cooccurrence use cooccurrence
#' @param bn perform Bayesian network inference
#' @param R parameter for boot.strength function
#' @param whole return correlation or cooccurrence for whole words
#' @return biotext class object
#' @examples
#' testgenes <- c("IRF3","PNKP","DDX41","ERCC1","ERCC2","XRCC1")
#' obtain_refseq(testgenes) |> 
#' make_corpus() |> make_TDM() |> make_graph()
#' @export
make_graph <- function(ret, num_words=30, cor_threshold=0.2,
	cooccurrence=FALSE, whole=FALSE, bn=FALSE, R=20) {
    matrixs <- obtainMatrix(ret, bn=bn, R=R, DTM=t(as.matrix(ret@TDM)),
    	freqWords=names(ret@wholeFreq[1:num_words]),
        corThresh=cor_threshold, cooccurrence=cooccurrence,
        onWholeDTM=whole)
    coGraph <- matrixs$coGraph
    ret <- matrixs$ret
    ret@numWords <- num_words
    ret@igraphRaw <- coGraph
    returnDf <- data.frame(word = names(ret@wholeFreq)[1:num_words],
    	freq=ret@wholeFreq[1:num_words])
    ret@freqDf <- returnDf[!is.na(returnDf$word),]
    ret
}

#' graph_cluster
#' @description Performs graph-based clustering.
#' @details Performs graph-based clustering using igraph and stores the information
#' in the biotext object. Default to use `igraph::cluster_leiden`.
#' @param ret biotext class object
#' @param func community detection algorithm in igraph
#' @param factorize convert to factor upon assigning
#' @return biotext class object
#' @examples
#' testgenes <- c("IRF3","PNKP","DDX41","ERCC1","ERCC2","XRCC1")
#' refseq(testgenes) |> graph_cluster()
#' @export
graph_cluster <- function(ret, func=igraph::cluster_leiden, factorize=TRUE) {
	ret@communities <- do.call(func, list(graph=ret@igraphRaw))
    if (factorize) {
        V(ret@igraphRaw)$community <- factor(ret@communities$membership)
    } else {
        V(ret@igraphRaw)$community <- ret@communities$membership
    }
	ret
}


#' process_network_microbe
#' 
#' @description Process the network from BugSigDB.
#' @details Perform tasks before plotting
#' for the data from BugSigDB, like obtaining associated diseases to plot
#' 
#' @param ret biotext class object
#' @param delete_zero_degree exclude zero degree nodes from plotting
#' @param make_upper make these words to uppercase
#' @param disease_plot disease plotting
#' @param ec_plot enzyme plotting
#' @param metab metabolite data.frame
#' @param metab_thresh threshold of association in metabolites - taxa relationship
#' (e.g., correlation coefficient)
#' @param metab_col metabolite data frame column name in the order of
#' "candidate taxon", "metabolite", "quantitative values for thresholding"
#' @param mb_plot microbe plotting
#' @param ec_file enzyme database file
#' @param up_tax_file UniProt taxonomy file
#' @export
#' @examples 
#' \dontrun{
#'     bugsigdb("Veillonella dispar") |> process_network_microbe()
#' }
#' @return biotext class object
#' 
process_network_microbe <- function(ret, delete_zero_degree=TRUE,
    make_upper=NULL, disease_plot=FALSE, ec_plot=FALSE,
    metab=NULL, metab_col=NULL, metab_thresh=0.2,
    mb_plot=FALSE, 
    ec_file=NULL, up_tax_file=NULL) {
    mb_list <- ret@query
    addNet <- list()
    subTb <- ret@rawTextBSDB                


    if (disease_plot) {## This does not need to be deduplicated
        mb_plot<-TRUE
        dis <- NULL
        for (i in seq_len(nrow(subTb))){
            dis <- rbind(dis,
            c(subTb[i, "Condition"], subTb[i, "query"]))
        }
        dis <- dis[!is.na(dis[,1]),]
        dis <- dis[!is.na(dis[,2]),]
        vtx <- data.frame(cbind(c(dis[,1], dis[,2]), c(rep("Diseases",length(dis[,1])),
            rep("Microbes",length(dis[,2]))))) |> `colnames<-`(c("name","type"))
        vtx <- vtx[!duplicated(vtx),]
        dmg <- tbl_graph(nodes=vtx, edges=data.frame(dis), directed=FALSE)
        addNet[["Diseases"]] <- dmg
    }

    if (ec_plot) {
        mb_plot <- TRUE
        if (is.null(ec_file)) {stop("Please provide EC file")}
        if (is.null(up_tax_file)) {stop("Please provide UniProt taxonomy file")}
        ecDf <- enzyme(file=ec_file, ecnum="all", taxec=TRUE,
            taxFile=up_tax_file, candTax=ret@query)
        if (!is.null(ecDf)) {
            ecDf <- ecDf[,c("desc","query")]
            vtx <- data.frame(
                cbind(c(ecDf[,1], ecDf[,2]),
                      c(rep("Enzymes",length(ecDf[,1])),
                        rep("Microbes",length(ecDf[,2]))))) |> 
            `colnames<-`(c("name","type"))
            vtx <- vtx[!duplicated(vtx),]
            ecGraph <- tbl_graph(nodes=vtx,
                edges=data.frame(ecDf),
                directed=FALSE)
            addNet[["Enzymes"]] <- ecGraph
        }
    }

    if (!is.null(metab)) {
        mb_plot<-TRUE
        if (is.null(metab_col)) {
            stop("No column names specified")
        }
        qqcat("Checking metabolites\n")
        metabGraph <- NULL
        for (sp in mb_list) {
            tmp <- metab[grepl(sp,metab[[metab_col[1]]]),]
            tmp <- tmp[abs(tmp[[metab_col[3]]])>metab_thresh,]
            if (dim(tmp)[1]!=0) {
                for (met in tmp[[metab_col[2]]]) {
                    metabGraph <- rbind(metabGraph, c(sp, met))
                }
            } else {
                qqcat("  Found no metabolites for @{sp}\n")
            }
        }
        if (!is.null(metabGraph)) {
            vtx <- data.frame(
                cbind(c(metabGraph[,1], metabGraph[,2]),
                      c(rep("Microbes",length(metabGraph[,1])),
                        rep("Metabolites",length(metabGraph[,2]))))) |> `colnames<-`(c("name","type"))
            vtx <- vtx[!duplicated(vtx),]
            metabGraph <- tbl_graph(nodes=vtx, edges=data.frame(metabGraph), directed=FALSE)
            addNet[["Metabolites"]] <- metabGraph
        }
    }

    DTM <- t(as.matrix(ret@TDM))
    if (mb_plot) {
        row.names(DTM) <- ret@rawText$query
    }
    matSorted <- ret@wholeFreq

    coGraph <- ret@igraphRaw
    freqWords <- names(matSorted[1:ret@numWords])

    coGraph <- induced.subgraph(coGraph,
    names(V(coGraph)) %in% freqWords)
    V(coGraph)$Freq <- matSorted[V(coGraph)$name]

    if (delete_zero_degree){
        coGraph <- induced.subgraph(coGraph,
            degree(coGraph) > 0)
    }

    nodeName <- V(coGraph)$name
    dtmCol <- colnames(DTM)
    for (i in make_upper) {
        dtmCol[dtmCol == i] <- toupper(i)
        nodeName[nodeName == i] <- toupper(i)
    }
    V(coGraph)$name <- nodeName
    colnames(DTM) <- dtmCol

    nodeN <- NULL
    if (mb_plot) {
        mbmap <- c()
        for (rn in nodeName){
            tmp <- DTM[ ,rn]
            for (nm in names(tmp[tmp!=0])){
                if (grepl(",", nm, fixed = TRUE)) {
                    for (microbe in unlist(strsplit(nm, ","))){
                        mbmap <- rbind(mbmap, c(rn, microbe))
                    }
                } else {
                    mbmap <- rbind(mbmap, c(rn, nm))
                }
            }
        }
        mbmap <- mbmap[mbmap[,2]!="",]
        gcnt <- table(mbmap[,2])
        if (length(mb_list)!=1) {
            gcnt <- gcnt[order(gcnt, decreasing=TRUE)]
        }
        if (is.table(gcnt)) {
            ret@geneCount <- gcnt
        }
        
        incGene <- names(gcnt)[1:length(mb_list)]
        mbmap <- mbmap[mbmap[,2] %in% incGene,]
        ret@geneMap <- mbmap

        vtx <- data.frame(cbind(c(mbmap[,1], mbmap[,2]), c(rep("Words",length(mbmap[,1])),
            rep("Microbes",length(mbmap[,2]))))) |> `colnames<-`(c("name","type"))
        vtx <- vtx[!duplicated(vtx),]
        
        ####
        ## It is possible not to distinguish between query microbe name 
        ## and words as in `pubmed()`, as the words are lowercases here, 
        ## and altered to titlecase after
        ####

        mbmap <- tbl_graph(nodes=vtx,
            edges=data.frame(mbmap), directed=FALSE)
        V(coGraph)$type <- "Words"

        coGraph <- graph_join(as_tbl_graph(coGraph),
            as_tbl_graph(mbmap))
        coGraph <- coGraph |> activate("nodes") |>
            mutate(type=ifelse(is.na(.data$Freq),"Microbes","Words"))

        ## If present, add additional graphs
        if (length(addNet)!=0) {
            for (netName in names(addNet)) {
                tmpAdd <- addNet[[netName]]
                coGraph <- graph_join(as_tbl_graph(coGraph),
                    as_tbl_graph(tmpAdd))
                coGraph <- coGraph |> activate("nodes") |>
                    mutate(type=ifelse(is.na(.data$type),netName,.data$type))
            }
        }
        ## Set edge weight
        ## Probably set to NA would be better.


        E(coGraph)$edgeColor <- E(coGraph)$weight
        tmpW <- E(coGraph)$weight
        tmpW[is.na(tmpW)] <- ret@corThresh
        E(coGraph)$weight <- tmpW

    } else {
        V(coGraph)$type <- "Words"
        E(coGraph)$edgeColor <- E(coGraph)$weight
        coGraph <- as_tbl_graph(coGraph)
    }

    ## Node attributes
    if (length(ret@pvpick)!=0) { ## If tag
        netCol <- tolower(names(V(coGraph)))
        for (i in seq_along(ret@pvpick$clusters)){
            for (j in ret@pvpick$clusters[[i]])
                netCol[netCol==j] <- paste0("cluster",i)
        }
        netCol[!startsWith(netCol, "cluster")] <- "not_assigned"
        V(coGraph)$tag <- netCol
        V(coGraph)$tag_raw <- netCol

        ## Add disease and other labs
        if (!is.null(nodeN)) {
            addC <- V(coGraph)$tag
            for (nn in seq_along(names(V(coGraph)))) {
                if (V(coGraph)$tag[nn]!="not_assigned"){next}
                if (names(V(coGraph))[nn] %in% names(nodeN)) {
                    addC[nn] <- nodeN[names(V(coGraph))[nn]]
                } else {
                    next
                }
            }
            V(coGraph)$tag <- addC
        }
    }

    nodeN <- (coGraph |> activate("nodes") |> data.frame())$type
    V(coGraph)$nodeCat <- nodeN

    if (!identical(ret@dic, logical(0))) {
        pdic <- ret@dic
        nodeDf <- coGraph |> activate("nodes") |> data.frame()
        V(coGraph)$name <- apply(nodeDf,
              1,
              function(x) {ifelse(x["type"]=="Words", pdic[x["name"]],
                x["name"])})
    }
    coGraph <- assign_community(ret, coGraph)  
      if (!is.tbl_graph(coGraph)) {
          ret@igraph <- coGraph
      } else {
          ret@igraph <- as.igraph(coGraph)
      }
    ret
}

#' process_network_gene

#' @description Process the network from refseq, pubmed or alliance.
#' @details Perform tasks before plotting the networks
#' like obtaining associated genes to plot.
#' 
#' @param ret biotext class object
#' @param delete_zero_degree exclude zero degree nodes from plotting
#' @param make_upper make these words to uppercase
#' @param make_upper_gene make gene symbol to uppercase
#' @param gene_plot plot and return associated genes
#' @param gene_plot_num how many genes are to be plotted
#' @param gene_path_plot plot associated pathways related to genes
#' @param gene_path_plot_threshold threshold for pathway enrichment
#' @param distinguish_query distinguish query with words in obtained text
#' @param org_db organism database to convert IDs
#' @return biotext class object
#' @examples
#' testgenes <- c("IRF3","PNKP","DDX41","ERCC1","ERCC2","XRCC1")
#' refseq(testgenes) |>
#' process_network_gene()
#' @export
process_network_gene <- function(ret, delete_zero_degree=TRUE,
	make_upper=NULL, make_upper_gene=TRUE, org_db=org.Hs.eg.db,
	gene_plot=FALSE, gene_plot_num=5, gene_path_plot=NULL,
	gene_path_plot_threshold=0.05, distinguish_query=FALSE) {


	DTM <- t(as.matrix(ret@TDM))
	matSorted <- ret@wholeFreq

    if (!is.null(gene_path_plot)) {gene_plot <- TRUE}
    if (gene_plot) {

		if (startsWith(ret@type,"pubmed")) {
			row.names(DTM) <- ret@rawText$query
		} else {
            if (ret@type=="refseq") {
                revID <- AnnotationDbi::select(org_db,
                    keys = as.character(ret@rawText$Gene_ID), 
                    columns = c("SYMBOL"),
                    keytype = "ENTREZID")$SYMBOL                
            } else {
                revID <- ret@rawText$Gene_ID
            }
	        row.names(DTM) <- revID
	    }
    }

	if (make_upper_gene) {
        make_upper <- c(make_upper,
        	tolower(keys(org_db, "SYMBOL")))
	}

	coGraph <- ret@igraphRaw
	freqWords <- names(matSorted[1:ret@numWords])

    coGraph <- induced.subgraph(coGraph,
    names(V(coGraph)) %in% freqWords)
    V(coGraph)$Freq <- matSorted[V(coGraph)$name]

    if (delete_zero_degree){
        coGraph <- induced.subgraph(coGraph,
            degree(coGraph) > 0)
    }

    nodeName <- V(coGraph)$name
    dtmCol <- colnames(DTM)
    for (i in make_upper) {
        dtmCol[dtmCol == i] <- toupper(i)
        nodeName[nodeName == i] <- toupper(i)
    }
    V(coGraph)$name <- nodeName
    colnames(DTM) <- dtmCol

    if (gene_plot) {
        genemap <- c()
        for (rn in nodeName){
	        tmp <- DTM[ ,rn]
	        for (nm in names(tmp[tmp!=0])){
	          if (nm!=""){
	            if (grepl(",",nm,fixed=TRUE)){
	              for (nm2 in unlist(strsplit(nm, ","))){
	                if (distinguish_query) {
	                  genemap <- rbind(genemap, c(rn, paste(nm2, "(Q)")))
	                } else {
	                  genemap <- rbind(genemap, c(rn, nm2))
	                }
	              }
	            } else {
	              if (distinguish_query) {
	                genemap <- rbind(genemap, c(rn, paste(nm, "(Q)")))
	              } else {
	                genemap <- rbind(genemap, c(rn, nm))
	              }
	            }
	          }
	        }
	    }
        gcnt <- table(genemap[,2])
        gcnt <- gcnt[order(gcnt, decreasing=TRUE)]

        if (is.table(gcnt)) {
	        ret@geneCount <- gcnt
	    }
        
        incGene <- names(gcnt)[1:gene_plot_num]
        genemap <- genemap[genemap[,2] %in% incGene,]
        ret@geneMap <- genemap
        genemap <- simplify(igraph::graph_from_edgelist(genemap,
            directed = FALSE))
        coGraph <- tidygraph::graph_join(as_tbl_graph(coGraph),
            as_tbl_graph(genemap))
        coGraph <- coGraph |> activate("nodes") |>
            mutate(type=ifelse(is.na(.data$Freq),"Genes","Words"))
        E(coGraph)$edgeColor <- E(coGraph)$weight

        tmpW <- E(coGraph)$weight
        tmpW[is.na(tmpW)] <- ret@corThresh
        E(coGraph)$weight <- tmpW
    } else {
        E(coGraph)$edgeColor <- E(coGraph)$weight
        V(coGraph)$type <- "Words"
    }

    if (!is.null(gene_path_plot)) {
    	pathGraph <- return_gene_path_graph(ret, gene_path_plot, org_db,
    		threshold=gene_path_plot_threshold)
        pathGraph <- pathGraph |> data.frame()
        withinCoGraph <- intersect(pathGraph[,2], V(coGraph)$name)
        withinCoGraphPathGraph <- pathGraph[
        pathGraph[,2] %in% withinCoGraph,]
        grp <- c()
        for (i in V(coGraph)$name) {
            if (i %in% withinCoGraphPathGraph[,2]){
                tmpMap <- withinCoGraphPathGraph[
                withinCoGraphPathGraph[,2] %in% i,]
                if (is.vector(tmpMap)) {
                    grp <- c(grp, tmpMap[1])
                } else {
                    tmpMap <- tmpMap[order(tmpMap[,1]),]
                    grp <- c(grp, paste(tmpMap[,1], collapse="\n"))
                }
            } else {
                grp <- c(grp, NA)
            }
        }
        V(coGraph)$grp <- grp
    }
    ## Assign node category
    nodeN <- (as_tbl_graph(coGraph) |> activate("nodes") |> data.frame())$type
    V(coGraph)$nodeCat <- nodeN
    names(nodeN) <- V(coGraph)$name

    ## Node attributes
    if (!identical(ret@dic, logical(0))) {
    	pdic <- ret@dic
        nodeDf <- as_tbl_graph(coGraph) |> activate("nodes") |> data.frame()
        V(coGraph)$name <- apply(nodeDf,
              1,
              function(x) {ifelse(x["type"]=="Words", pdic[x["name"]],
                x["name"])})
    }

    if (length(ret@pvpick)!=0) { ## If tag
        netCol <- tolower(names(V(coGraph)))
        for (i in seq_along(ret@pvpick$clusters)){
          for (j in ret@pvpick$clusters[[i]])
            netCol[netCol==j] <- paste0("cluster",i)
        }
        netCol[!startsWith(netCol, "cluster")] <- "not_assigned"
        V(coGraph)$tag <- netCol
        V(coGraph)$tag_raw <- netCol

        
        if (!is.null(nodeN)) {
          addC <- V(coGraph)$tag
          for (nn in seq_along(names(V(coGraph)))) {
            if (V(coGraph)$tag[nn]!="not_assigned"){next}
            if (names(V(coGraph))[nn] %in% names(nodeN)) {
              addC[nn] <- nodeN[names(V(coGraph))[nn]]
            } else {
              next
            }
          }
          V(coGraph)$tag <- addC
        }
    }
    coGraph <- assign_community(ret, coGraph)
      if (!is.tbl_graph(coGraph)) {
          ret@igraph <- coGraph
      } else {
          ret@igraph <- as.igraph(coGraph)
      }
    ret
}

#' return_gene_path_graph
#' @noRd
return_gene_path_graph <- function(ret, gene_path_plot="kegg",
	org_db=org.Hs.eg.db,
	threshold=0.05) {

    if (ret@type=="alliance_genome_resources") {
        gene_list <- AnnotationDbi::select(org_db,
            keys = ret@rawText$Gene_ID, columns = c("ENTREZID"),
            keytype = "SYMBOL")$ENTREZID
        gene_list <- gene_list[!is.na(gene_list)] |> unique()
    } else {
        gene_list <- ret@rawText$Gene_ID |> unique()
    }


    if (gene_path_plot == "reactome") {
        if (requireNamespace("ReactomePA")) {
            pathRes <- ReactomePA::enrichPathway(gene_list)
        } else {
            stop("Please install ReactomePA")
        }
        pathRes@result$Description <- gsub("Homo sapiens\r: ",
                        "",
                        pathRes@result$Description)
    }
    else if (gene_path_plot == "kegg") {
        pathRes <- clusterProfiler::enrichKEGG(gene_list)
    }
    else {
        stop("Please specify 'reactome' or 'kegg'")
    }
    
    sigPath <- subset(pathRes@result, p.adjust<threshold)
    if (gene_path_plot=="kegg"){pathRes@keytype <- "ENTREZID"}
    ret@enrichResults <- pathRes@result
    sigPath <- subset(clusterProfiler::setReadable(pathRes,
        org_db)@result, p.adjust<threshold)

    if (dim(sigPath)[1]==0) {
        stop("No enriched term found.")
    } else {
        qqcat("Found @{dim(sigPath)[1]} enriched term\n")
    }

    pathGraph <- NULL
    for (i in 1:nrow(sigPath)){
        pa <- sigPath[i, "Description"]
        for (j in unlist(strsplit(sigPath[i, "geneID"], "/"))){
            pathGraph <- rbind(pathGraph, c(pa, j))
        }
    }
    pathGraph
}


#' process_network_manual
#' 
#' @description Process graph made from user-specified data.frame
#' @details Process the graph obtained from user data, like 
#' associating words to the other variables.
#' 
#' @param ret biotext class object
#' @param delete_zero_degree delete zero degree nodes
#' @param make_upper make these words uppercase
#' @param query_plot plot the column other than text
#' @param drop_ID drop `ID` column
#' @export
#' @return biotext class object
#' @examples 
#' obtain_manual(data.frame(text=c("test4 test4",
#'                                 "test3 test3 testi",
#'                                 "test2 test2"))) |>
#' make_corpus(num_only=TRUE) |>
#' make_TDM() |>
#' make_graph() |>
#' process_network_manual()
#' 

process_network_manual <- function(ret, delete_zero_degree=TRUE,
                       make_upper=NULL, query_plot=FALSE, drop_ID=TRUE) {
  DTM <- t(as.matrix(ret@TDM))
  
  if (query_plot) {
    if (!"query" %in% colnames(ret@rawText)) {stop("No query words specified")}
    row.names(DTM) <- ret@rawText$query
    
  }
  if (drop_ID) {
    ret@rawText$ID <- NULL
  }

  matSorted <- ret@wholeFreq
  coGraph <- as_tbl_graph(ret@igraphRaw)
  freqWords <- names(matSorted[1:ret@numWords])
  
  coGraph <- induced.subgraph(coGraph,
                              names(V(coGraph)) %in% freqWords)
  V(coGraph)$Freq <- matSorted[V(coGraph)$name]
  
  if (delete_zero_degree){
    coGraph <- induced.subgraph(coGraph,
                                degree(coGraph) > 0)
  }
  
  nodeName <- V(coGraph)$name
  dtmCol <- colnames(DTM)
  for (i in make_upper) {
    dtmCol[dtmCol == i] <- toupper(i)
    nodeName[nodeName == i] <- toupper(i)
  }
  V(coGraph)$name <- nodeName
  colnames(DTM) <- dtmCol
  
  df <- ret@rawText
  incCols <- colnames(df)
  incCols <- incCols[!incCols %in% c("query","text")]
  V(coGraph)$type <- "Words"
  if (length(incCols)!=0) {
    qqcat("Including columns @{paste(incCols, collapse=' and ')} to link with query\n")
    for (ic in incCols) {
          querymap <- df[,c(ic, "query")]
          vtx <- data.frame(cbind(c(querymap[,2], querymap[,1]),
            c(rep("Queries",length(querymap[,2])),
              rep(ic,length(querymap[,1]))))) |> 
          `colnames<-`(c("name","type"))
          vtx <- vtx[!duplicated(vtx),]
          vtx <- vtx |> `rownames<-`(1:nrow(vtx))
          eds <- data.frame(querymap)
          words <- vtx |> subset(.data$type==ic)
          queriesDf <- vtx |> subset(.data$type=="Queries")
          row.names(words)[which(words$name %in% eds[,1])]
          row.names(queriesDf)[which(queriesDf$name %in% eds[,2])]
          eds[,1] <- sapply(eds[,1], function(x) {
            as.integer(row.names(words)[which(words$name %in% x)])
          })
          eds[,2] <- sapply(eds[,2], function(x) {
            as.integer(row.names(queriesDf)[which(queriesDf$name %in% x)])
          })
          qmap <- tbl_graph(nodes=vtx,edges=eds,directed=FALSE)
          coGraph <- graph_join(as_tbl_graph(coGraph),
                                qmap)
    }
  }
  
  nodeN <- NULL
  for (coln in c(incCols, "query")) {
    tmpn <- df[[coln]]
    tmpnn <- rep(coln, length(tmpn))
    names(tmpnn) <- tmpn
    nodeN <- c(nodeN, tmpnn)
  }

  if (query_plot) {
    genemap <- c()
    for (rn in nodeName){
      tmp <- DTM[ ,rn]
      for (nm in names(tmp[tmp!=0])){
        if (nm!=""){
          if (grepl(",",nm,fixed=TRUE)){
            for (nm2 in unlist(strsplit(nm, ","))){
              genemap <- rbind(genemap, c(rn, paste(nm2)))
            }
          } else {
            genemap <- rbind(genemap, c(rn, paste(nm)))
          }
        }
      }
    }
    vtx <- data.frame(cbind(c(genemap[,2], genemap[,1]),
      c(rep("Queries",length(genemap[,2])),
        rep("Words",length(genemap[,1]))))) |> 
    `colnames<-`(c("name","type"))
    vtx <- vtx[!duplicated(vtx),]
    vtx <- vtx |> `rownames<-`(1:nrow(vtx))
    eds <- data.frame(genemap)
    words <- vtx |> subset(.data$type=="Words")
    queriesDf <- vtx |> subset(.data$type=="Queries")
    row.names(words)[which(words$name %in% eds[,1])]
    row.names(queriesDf)[which(queriesDf$name %in% eds[,2])]
    eds[,1] <- sapply(eds[,1], function(x) {
      as.integer(row.names(words)[which(words$name %in% x)])
    })
    eds[,2] <- sapply(eds[,2], function(x) {
      as.integer(row.names(queriesDf)[which(queriesDf$name %in% x)])
    })
    qmap <- tbl_graph(nodes=vtx,edges=eds,directed=FALSE)
    coGraph <- graph_join(as_tbl_graph(coGraph),
                          qmap)
  }
  
  if (length(E(coGraph))==0) {stop("No edge present, stopping.")}
  E(coGraph)$edgeColor <- E(coGraph)$weight
  tmpW <- E(coGraph)$weight
  tmpW[is.na(tmpW)] <- ret@corThresh
  E(coGraph)$weight <- tmpW
  
  ## Node attributes
  if (length(ret@pvpick)!=0) { ## If tag
    netCol <- tolower(names(V(coGraph)))
    for (i in seq_along(ret@pvpick$clusters)){
      for (j in ret@pvpick$clusters[[i]])
        netCol[netCol==j] <- paste0("cluster",i)
    }
    netCol[!startsWith(netCol, "cluster")] <- "not_assigned"
    V(coGraph)$tag <- netCol
    V(coGraph)$tag_raw <- netCol

    
    if (!is.null(nodeN)) {
      addC <- V(coGraph)$tag
      for (nn in seq_along(names(V(coGraph)))) {
        if (V(coGraph)$tag[nn]!="not_assigned"){next}
        if (names(V(coGraph))[nn] %in% names(nodeN)) {
          addC[nn] <- nodeN[names(V(coGraph))[nn]]
        } else {
          next
        }
      }
      V(coGraph)$tag <- addC
    }
  }
  
  ## Assign node category, not tag
  if (!is.null(nodeN)) {
    nodeCat <- NULL
    for (nn in seq_along(names(V(coGraph)))) {
      if (names(V(coGraph))[nn] %in% names(nodeN)) {
        nodeCat[nn] <- nodeN[names(V(coGraph))[nn]]
      } else {
        nodeCat[nn] <- "Words"
      }
    }
  } else {
    nodeCat <- rep("Words",length(V(coGraph)))
  }    
  V(coGraph)$nodeCat <- nodeCat
  
  coGraph <- assign_community(ret, coGraph)  

  if (!identical(ret@dic, logical(0))) {
    pdic <- ret@dic
    nodeDf <- as_tbl_graph(coGraph) |> activate("nodes") |> data.frame()
    V(coGraph)$name <- apply(nodeDf,
          1,
          function(x) {ifelse(x["type"]=="Words", pdic[x["name"]],
            x["name"])})
  }
  if (!is.tbl_graph(coGraph)) {
      ret@igraph <- coGraph
  } else {
      ret@igraph <- as.igraph(coGraph)
  }
  ret
}

#' assign_community
#' assign community and assign remaining queries the other category
#' If all words, just use like V(g)$community <- community$membership
#' @noRd
assign_community <- function(ret, coGraph) {
    if (!is.null(attributes(ret@communities)$names)) {
        memb <- ret@communities$membership
        uniq_memb <- memb |> unique()
        netCol <- tolower(names(V(coGraph)))
        for (i in seq_along(uniq_memb)){
            for (j in ret@communities$names[memb==uniq_memb[i]]) {
                netCol[netCol==j] <- paste0(uniq_memb[i])
            }
        }
        # netCol[!startsWith(netCol, "cluster")] <- "not_assigned"
        for (nn in seq_along(V(coGraph)$nodeCat)) {
            if (V(coGraph)$nodeCat[nn]!="Words") {
                netCol[nn] <- V(coGraph)$nodeCat[nn]
            }
        }
        V(coGraph)$community <- netCol
    }
    coGraph
}

#' plot_biotextgraph
#' 
#' @description Plot biotextgraph after processing graph
#' @details Plot the network stored in biotextgraph object after processing the graph.
#' @seealso plotwc plotnet
#' @param ret biotext object
#' @param edge_link whether to use edge link or edge diagonal
#' @param edge_label whether to show edge label
#' @param show_legend whether to show legend
#' @param cat_colors named vector specifying color for each category of node
#' @param query_color color for queried nodes
#' @param font_family font family
#' @param colorize colorize the word frequency and query separately
#' if FALSE, no node is plotted for query nodes. it overrides color_by_tag.
#' @param color_by_tag color by tagging information (color scale will be discrete)
#' @param color_by_community color by community information in graph
#' @param pal palette for node color specified as (low, high)
#' @param tag_colors palette for node color in discrete mode
#' @param color_text whether to color the text
#' @param layout graph layout
#' @param na_edge_color edge color for NA value
#' @param use_seed seed value for ggrepel
#' @param scale_range scale range for node and text size
#' @param discrete_color_word colorize words by "Words" category, not frequency.
#' @param add_pseudo_freq add pseudo value for nodes other than words
#' @export
#' @examples
#' testgenes <- c("IRF3","PNKP","DDX41","ERCC1","ERCC2","XRCC1")
#' refseq(testgenes) |> plot_biotextgraph()
#' @return biotext class object
plot_biotextgraph <- function(ret,
	edge_link=TRUE,
	edge_label=FALSE,
	show_legend=FALSE,
    cat_colors=NULL,
	query_color="grey",
	font_family="sans",
	colorize=FALSE,
	color_by_tag=FALSE,
    color_by_community=FALSE,
	tag_colors=NULL,
	color_text=TRUE,
	scale_range=c(5,10),
	pal=c("blue","red"),
	layout="nicely",
	na_edge_color="grey",
    add_pseudo_freq=FALSE,
    discrete_color_word=FALSE,
	use_seed=42) {

    if (color_by_community & color_by_tag) {
        stop("Cannot specify both of community or tag for coloring")
    }

	coGraph <- ret@igraph

    if (colorize | add_pseudo_freq) {
        fre <- V(coGraph)$Freq
        fre[is.na(fre)] <- min(fre, na.rm=TRUE)
        V(coGraph)$Freq <- fre
    }

    E(coGraph)$weightLabel <- round(E(coGraph)$weight, 3)

    if (color_by_community) {V(coGraph)$tag <- V(coGraph)$community}

    ## Make ggraph from here ##

    netPlot <- ggraph(coGraph, layout=layout)

    netPlot <- appendEdges(netPlot,
    	bn=is.directed(coGraph),
    	edgeLink=edge_link,
        edgeLabel=edge_label,
        showLegend=show_legend,
        fontFamily=font_family)

    cols <- V(coGraph)$nodeCat |> unique()
    if (is.null(cat_colors)) {
        cat_colors <- RColorBrewer::brewer.pal(length(cols), "Dark2")
        names(cat_colors) <- cols
        cat_colors["Genes"] <- query_color
        cat_colors["Microbes"] <- query_color
        cat_colors["query"] <- query_color
    }

    if (!is.null(V(coGraph)$tag) | color_by_community) {
        cols <- V(coGraph)$tag |> unique()
        if (is.null(tag_colors)) {
            tag_colors <- RColorBrewer::brewer.pal(length(cols), "Dark2")
            if (length(tag_colors) < length(cols)) {
                tag_colors <- colorRampPalette(RColorBrewer::brewer.pal(length(cols), "Dark2"))(length(cols))
            }
            names(tag_colors) <- cols
            tag_colors["Genes"] <- query_color
            tag_colors["query"] <- query_color
            tag_colors["Microbes"] <- query_color
        }
    }
    tagflag <- "none"
    if (color_by_tag) {
    	tagflag <- TRUE
    }
    if (color_by_community) {
    	tagflag <- TRUE
    }

    netPlot <- appendNodesAndTexts(netPlot,
        tag=tagflag,
        colorize=colorize,
        nodePal=NULL,
        showLegend=show_legend,
        catColors=cat_colors,
        pal=pal,
        fontFamily=font_family,
        colorText=color_text,
        scaleRange=scale_range,
        useSeed=use_seed,
        ret=ret,
        tagColors=tag_colors,
        discreteColorWord=discrete_color_word)

    netPlot <- netPlot +
        scale_edge_width(range=c(1,3), name = "Correlation")+
        scale_edge_color_gradient(low=pal[1],high=pal[2],
            name = "Correlation", na.value=na_edge_color)+
        theme_graph()

    if (!is.null(V(coGraph)$grp)) {
        netPlot <- netPlot + ggforce::geom_mark_hull(
            aes(netPlot$data$x,
                netPlot$data$y,
                group = .data$grp,
                label=.data$grp, fill=.data$grp,
                filter = !is.na(.data$grp)),
            concavity = 4,
            expand = unit(2, "mm"),
            alpha = 0.25,
            na.rm = TRUE,
            show.legend = FALSE
        )
    }
    ret@net <- netPlot
    ret
}
#' plot_wordcloud
#' @description Plot wordclouds
#' @details Plot the wordclouds stored in biotextgraph object.
#' @seealso plotwc plotnet
#' @param ret biotext object
#' @param num_words number of words to include
#' @param pal palette for colorizing words
#' @param make_upper make uppercase for these words
#' @param scale_freq scale the frequency for visualization
#' @param color_by_tag colorize based on tagging information
#' @param font_family font family
#' @param use_ggwordcloud use ggwordcloud or wordcloud function
#' @param arg_list arguments to pass to wordcloud functions
#' @param wc_scale scale factor for ggwordcloud
#' @export
#' @examples
#' refseq("DDX41", plotType="wc") |>
#' plot_wordcloud()
#' @return biotext class object
plot_wordcloud <- function(ret, num_words=30, pal=palette(),
	make_upper=NULL, scale_freq=NULL, color_by_tag=FALSE,
	font_family="sans", use_ggwordcloud=TRUE, arg_list=list(),
	wc_scale=10) {
	matSorted <- ret@wholeFreq
	if (num_words > length(matSorted)) {num_words <- length(matSorted)}
    returnDf <- data.frame(word = names(matSorted)[1:num_words],freq=matSorted[1:num_words])

    if (length(ret@pvpick)!=0) {
        wcCol <- returnDf$word
        for (i in seq_along(ret@pvpick$clusters)){
            for (j in ret@pvpick$clusters[[i]])
                wcCol[wcCol==j] <- pal[i]
        }
        wcCol[!wcCol %in% pal] <- "grey"
    }

    for (i in make_upper) {
        returnDf[returnDf$word == i,"word"] <- toupper(i)
    }
    if (!identical(ret@dic, logical(0))) {
    	pdic <- ret@dic
        for (nm in unique(returnDf$word)) {
            if (nm %in% names(pdic)) {
                returnDf[returnDf$word == nm, "word"] <- pdic[nm]
            }
        }
    }

    if (!is.null(scale_freq)) {
        showFreq <- returnDf$freq*scale_freq
    } else {
        showFreq <- returnDf$freq
    }

    if (color_by_tag){
        arg_list[["words"]] <- returnDf$word
        arg_list[["freq"]] <- showFreq
        arg_list[["family"]] <- font_family
        arg_list[["colors"]] <- wcCol
        arg_list[["random.order"]] <- FALSE
        arg_list[["ordered.colors"]] <- TRUE
        if ("bg.color" %in% names(arg_list)) {
            arg_list[["bg.colour"]] <- arg_list[["bg.color"]]
        }
        if (use_ggwordcloud) {
            wc <- do.call(ggwordcloud::ggwordcloud, arg_list)+
            scale_size_area(max_size = wc_scale)+
            theme(plot.background = element_rect(fill="transparent",
                colour = NA))
        } else {
            wc <- as.ggplot(as_grob(~do.call("wordcloud", arg_list)))
        }
    } else {
        arg_list[["words"]] <- returnDf$word
        arg_list[["freq"]] <- showFreq
        arg_list[["family"]] <- font_family
        if ("bg.color" %in% names(arg_list)) {
            arg_list[["bg.colour"]] <- arg_list[["bg.color"]]
        }
        if (use_ggwordcloud) {
            wc <- do.call(ggwordcloud::ggwordcloud, arg_list)+
            scale_size_area(max_size = wc_scale)+
            theme(plot.background = element_rect(fill = "transparent",
                colour = NA))
        } else {
            wc <- as.ggplot(as_grob(~do.call("wordcloud", arg_list)))
        }
    }
    ret@freqDf <- returnDf[!is.na(returnDf$word),]
    ret@wc <- wc
    return(ret)
}
noriakis/wcGeneSummary documentation built on May 31, 2024, 4:42 p.m.