#' @title Merge enriched GO terms.
#' @description combine results from GO enrichment tests obtained with \pkg{topGO} package,
#' for a given ontology (MF, BP, or CC).
#' @importFrom data.table data.table rbindlist := .SD
#' @importFrom biomaRt useDataset getBM useEnsembl
#' @importFrom topGO termStat scoresInTerm
#' @importFrom AnnotationDbi select keys
#' @importFrom GO.db GO.db
#' @family GO_terms
#' @param Input a list containing named elements. Each element must contain the name of \code{\link[topGO]{topGOdata-class}}
#' object created by \code{\link{create_topGOdata}} method and the associated \code{\link[topGO]{topGOresult-class}} object(s).
#' @details This method extracts for each result of GO enrichment test (\code{\link[topGO]{topGOresult-class}} object) and
#' corresponding GO annotations (\code{\link[topGO]{topGOdata-class}} object):
#' informations about GO term (identifiant, name, and description),
#' gene frequency (number of significant genes / Annotated genes), pvalue, -log10(pvalue), significant genes
#' identifiants (GeneID, or Ensembl ID, or uniprot accession), and gene symbols.
#' At the last, this method builds a merged data.table of enriched GO terms (p<0.01)
#' at least once and provides all mentionned columns.
#' @return an \code{\link{enrich_GO_terms-class}} object.
#' @references
#' Alexa A and Rahnenfuhrer J (2016). topGO: Enrichment Analysis for Gene Ontology. R package version 2.28.0.
#'
#' Matt Dowle and Arun Srinivasan (2017). data.table: Extension of data.frame. R package version 1.10.4. https://CRAN.R-project.org/package=data.table
#'
#' Herve Pages, Marc Carlson, Seth Falcon and Nianhua Li (2017). AnnotationDbi: Annotation Database Interface. R package version 1.38.0.
#' @include enrich_GO_terms.R
#' @examples
#' ###################
#' # load genes identifiants (GeneID,ENS...) universe/background (Expressed genes)
#' background_L<-scan(
#' system.file(
#' "extdata/data/input",
#' "background_L.txt",
#' package = "ViSEAGO"
#' ),
#' quiet=TRUE,
#' what=""
#' )
#'
#' ###################
#' # load Differentialy Expressed (DE) gene identifiants from files
#' L_pregnantvslactateDE<-scan(
#' system.file(
#' "extdata/data/input",
#' "L_pregnantvslactateDE.txt",
#' package = "ViSEAGO"
#' ),
#' quiet=TRUE,
#' what=""
#' )
#'
#' L_virginvslactateDE<-scan(
#' system.file(
#' "extdata/data/input",
#' "L_virginvslactateDE.txt",
#' package = "ViSEAGO"
#' ),
#' quiet=TRUE,
#' what=""
#' )
#'
#' L_virginvspregnantDE<-scan(
#' system.file(
#' "extdata/data/input",
#' "L_virginvspregnantDE.txt",
#' package="ViSEAGO"
#' ),
#' quiet=TRUE,
#' what=""
#' )
#' \dontrun{
#' ###################
#' # create topGOdata for BP for each list of DE genes
#' BP_L_pregnantvslactate<-ViSEAGO::create_topGOdata(
#' geneSel=L_pregnantvslactateDE,
#' allGenes=background_L,
#' gene2GO=myGENE2GO,
#' ont="BP",
#' nodeSize=5
#' )
#'
#' BP_L_virginvslactate<-ViSEAGO::create_topGOdata(
#' geneSel=L_virginvslactateDE,
#' allGenes=background_L,
#' gene2GO=myGENE2GO,
#' ont="BP",
#' nodeSize=5
#' )
#'
#' BP_L_virginvspregnant<-ViSEAGO::create_topGOdata(
#' geneSel=L_virginvspregnantDE,
#' allGenes=background_L,
#' gene2GO=myGENE2GO,
#' ont="BP",
#' nodeSize=5
#' )
#'
#' ###################
#' # perform TopGO tests
#' elim_BP_L_pregnantvslactate<-topGO::runTest(
#' BP_L_pregnantvslactate,
#' algorithm ="elim",
#' statistic = "fisher"
#' )
#'
#' elim_BP_L_virginvslactate<-topGO::runTest(
#' BP_L_virginvslactate,
#' algorithm ="elim",
#' statistic = "fisher"
#' )
#'
#' elim_BP_L_virginvspregnant<-topGO::runTest(
#' BP_L_virginvspregnant,
#' algorithm ="elim",
#' statistic = "fisher"
#' )
#'
#' ###################
#' # merge topGO results
#' BP_sResults<-ViSEAGO::merge_enrich_terms(
#' Input=list(
#' L_pregnantvslactate=c("BP_L_pregnantvslactate","elim_BP_L_pregnantvslactate"),
#' L_virginvslactate=c("BP_L_virginvslactate","elim_BP_L_virginvslactate"),
#' L_virginvspregnant=c("BP_L_virginvspregnant","elim_BP_L_virginvspregnant")
#' )
#' )
#' }
#' @exportMethod merge_enrich_terms
#' @name merge_enrich_terms
#' @rdname merge_enrich_terms-methods
#' @exportMethod merge_enrich_terms
setGeneric(
name="merge_enrich_terms",
def=function(Input){
standardGeneric("merge_enrich_terms")
}
)
#' @rdname merge_enrich_terms-methods
#' @aliases merge_enrich_terms
setMethod(
"merge_enrich_terms"
,signature="list",
definition=function(Input){
## check ontology
# same ontology ?
check.onto=unique(
unlist(
lapply(seq_along(Input),function(x){
# extract quering objects names
x=Input[[x]]
# check existance
values<-ls(envir=.GlobalEnv)
# check if available
available<-x%in%values
# stop if not
if(!all(available)){
stop(paste("objects not found:",paste(x[!available],collapse=", "),sep="\n"))
}
# get objects
x=mget(x,envir=.GlobalEnv)
# objects type
obj.type=vapply(x,class,"")
# extract ontoloy type
vapply(seq_along(x),function(y){
# extract ontology slot
if(obj.type[y]=="topGOdata"){
# for topGOdata
slot(x[[y]],"ontology")
}else{
# for topGOresult
sub("^.+\nOntology: ","",slot(x[[y]],"description"))
}
},"")
})
)
)
# check ontology
if(length(check.onto)>1){
# stop if more than one godata by list
stop("Only one ontology supported")
}
## topGO summary informations
# build topGO summary
topGO_summary=lapply(seq_along(Input),function(x){
# extract quering objects names
x=Input[[x]]
# keep names
x_names=x
# get objects
x=mget(x,envir=.GlobalEnv)
# objects type
obj.type=vapply(x,class,"")
# extract topGO objects summary
topGO<-lapply(seq_along(x),function(y){
# extract ontology slot
if(obj.type[y]=="topGOdata"){
# for topGOdata
list(
# description
description=slot(x[[y]],"description"),
# availables genes
available_genes=length(slot(x[[y]],"allGenes")),
# availables genes significant
available_genes_significant=table(slot(x[[y]],"allScores"))[2],
# feasibles genes
feasible_genes=table(slot(x[[y]],"feasible"))[2],
# feasibles genes significant
feasible_genes_significant=table(slot(x[[y]],"allScores")==1 & slot(x[[y]],"feasible")==TRUE)[2],
# nodes with at least x genes
genes_nodeSize=slot(x[[y]],"nodeSize"),
# number of nodes
nodes_number=length(slot(slot(x[[y]],"graph"),"nodes")),
# number of edges
edges_number=length(slot(slot(slot(x[[y]],"graph"),"edgeData"),"data"))
)
}else{
# for topGO result
list(
# description
description=slot(x[[y]],"description"),
# test name
test_name=sub(": ","p<",slot(x[[y]],"testName")),
# algorithm name
algorithm_name=slot(x[[y]],"algorithm"),
# scored GOs
GO_scored=length(slot(x[[y]],"score")),
# significant GOs
GO_significant=table(slot(x[[y]],"score")<0.01)[2],
# feasibles genes
feasible_genes=slot(x[[y]],"geneData")[1],
# feasibles genes significant
feasible_genes_significant=slot(x[[y]],"geneData")[2],
# nodes with at least x genes
genes_nodeSize=slot(x[[y]],"geneData")[3],
# nodes with at least x genes
Nontrivial_nodes=slot(x[[y]],"geneData")[4]
)
}
})
# extract topGO objects summary
names(topGO)<-x_names
# return topGO
topGO
})
# add names to topGO summary
names(topGO_summary)<-names(Input)
# find enrich GOs in a least one comparison
GOs<-lapply(seq_along(Input),function(x){
# objects type
Data=mget(Input[[x]],envir=.GlobalEnv)
## checking step
# objects type
obj.type=vapply(Data,class,"")
# objects type
if(sum(obj.type%in%"topGOdata")>1){
# stop if more than one godata by list
stop("Only one topGOdata object is supported by list")
}
# objects type
if(sum(obj.type%in%"topGOresult")<1){
# stop if more than one godata by list
stop("At least one topGOresult object needed by list")
}
## find and extract pvalues
# load GOdata
pos=which(obj.type=="topGOresult")
# tested algorithm
algorithms<-Data[pos]
# extract significvant pvalues results
unlist(
lapply(algorithms,function(y){
# extract scores
pvalues<-topGO::score(y)
# extract names of enrich terms
as.vector(names(pvalues[pvalues<0.01]))
})
)
})
# remove redondancy and convert to vector
GOs<-as.vector(unique(unlist(GOs)))
## check genes background
# extract genes background
allgenes<-lapply(seq_along(Input),function(x){
# objects type
Data=mget(Input[[x]],envir=.GlobalEnv)
# objects type
obj.type=vapply(Data,class,"")
# load GOdata
pos=which(obj.type=="topGOdata")
slot(Data[[pos]],"allGenes")
})
# check if same gene background
if(length(Input)>1){
same_genes_background=all(
vapply(2:length(allgenes),function(x){
identical(allgenes[[1]],allgenes[[x]])
},TRUE)
)
}else{
same_genes_background=TRUE
}
# stop if no enrich GO terms
if(length(GOs)==0){
stop("No enrich GO terms available in at least one condition")
}
# initialyse input
input=list()
# combine results
allResults<-lapply(seq_along(Input),function(x){
## extract Data
# objects type
Data=mget(Input[[x]],envir=.GlobalEnv)
# objects type
obj.type=vapply(Data,class,"")
# load GOdata
pos=which(obj.type=="topGOdata")
# load GOdata
GOdata=Data[[pos]]
# tested algorithm
algorithms<-Data[-pos]
## extract some statistics from initial GOdata object (before enrichment test)
# get the GOdatatermStat(GOdata)
Stats<-termStat(
GOdata,
whichGO = GOs
)
# convert to data.table
Stats<-data.table(
GO.ID=row.names(Stats),
genes_frequency=paste(
round(Stats$Significant/Stats$Annotated*100,digits=2),
"% (",Stats$Significant,"/",Stats$Annotated,")",
sep=""
)
)
## extract genes identifiants
# extract counts genes by term according GeneList
genes<-scoresInTerm(
GOdata,GOs,
use.names = TRUE
)
# extract genes Ids according GeneList
genes=lapply(names(genes),function(x){
# extract significant terms
val=attr(genes[[x]][genes[[x]]==2],"names")
# build a table
data.table(
GO.ID=x,
Significant_genes=if(length(val)>0){val}else{NA}
)
})
# convert to data.table
genes<-rbindlist(genes)
## add Genes symbols
# get db
db=strsplit(slot(GOdata,"description")," ")[[1]]
# if db match to bioconductor
if(db[1]=="Bioconductor"){
# load GeneID and symbols
annot<-data.table(
select(
get(db[2]),
keys=keys(get(db[2])),
columns =c("ENTREZID","SYMBOL")
)
)
# if found symbols
if(nrow(annot[!is.na(annot$ENTREZID)])>0){
# load GeneID and symbols
genes<-merge(
genes,
annot,
by.x="Significant_genes",
by.y="ENTREZID",
all.x=TRUE
)
}else{
# add empty name columns
genes[,Name:=NA]
}
}
# if db match to enstrezGene
if(db[1]=="EntrezGene"){
# function for generate data packets defined by the "by" argument
pos=function(Data,by=""){
if(by>length(Data)){
by=length(Data)
}
if(length(Data)<=by){
data.table(
start=1,
end=length(Data)
)
}else{
data.table(
start=seq(1,length(Data),by=by),
end=unique(c(seq(by,length(Data),by=by),length(Data)))
)
}
}
# pattern.extract
pattern.extract=function(query,m){
query=lapply(seq_along(query),function(i){
if(length(na.omit(m[[i]][1]))>0){
a=attr(m[[i]],"capture.start")
t(substring(query[i],a,a+attr(m[[i]],"capture.length")-1))
}else{
NA
}
})
query=do.call("rbind",query)
query[query%in%""]<-NA
query[query%in%"\t"]<-NA
query
}
# esummary
esummary<-function(...){
# submitted Data
Data<-na.omit(unique(unique(...)))
# batch size
wpos=pos(Data,by=500)
# core address
core="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?version=2.0&db="
# Data submission an retrieve
query=lapply(seq_len(nrow(wpos)),function(x){
# build block of id
Dat<-paste(
Data[wpos[x,start]:wpos[x,end]],
collapse=","
)
# submit block
query <- paste(core,"gene","&id=",Dat,sep = "")
# read and return reults
readLines(query)
})
query=paste(
unlist(query),
collapse=""
)
# parse results
query<-substring(
query,
unlist(gregexpr("<DocumentSummary ",query)),
unlist(gregexpr("</DocumentSummary>",query))
)
# extraction pattern
pattern="<DocumentSummary uid=\"(?<Id>[[:digit:]]*)\">\t<Name>(?<Name>.*)</Name>"
# locate elements
m1=gregexpr(
paste(pattern,collapse=""),
query,
perl=TRUE
)
# extract elements
res=data.table(
pattern.extract(query,m1)
)
# add an header
names(res)=attr(m1[[1]],"capture.name")
# return
return(res)
}
# get esummary from NCBI
annot<-esummary(genes$Significant_genes)
# if found symbols
if(nrow(annot[!is.na(annot$Id)])>0){
# load GeneID and symbols
genes<-merge(
genes,
annot,
by.x="Significant_genes",
by.y="Id",
all.x=TRUE
)
}else{
# add empty name columns
genes[,Name:=NA]
}
}
# if db match to Ensembl
if(db[1]=="Ensembl"){
# connect to Ensembl
mart<-useEnsembl(
"ensembl",
host=db[2],
version=db[6]
)
# connect to ensembl specified dataset
myspecies<-useDataset(
db[2],
mart
)
# load Ensembl genes, with GO annotations
annot<-data.table(
getBM(
attributes =c("ensembl_gene_id","external_gene_name"),
value=TRUE,
mart =myspecies
)
)
# if found symbols
if(nrow(annot[!"",on="external_gene_name"])>0){
# load GeneID and symbols
genes<-merge(
genes,
annot,
by.x="Significant_genes",
by.y="ensembl_gene_id",
all.x=TRUE
)
}else{
# add empty name columns
genes[,Name:=NA]
}
}
# load GeneID and symbols
names(genes)[3]<-"Significant_genes_symbol"
# reorder columns
genes<-genes[,c("GO.ID","Significant_genes","Significant_genes_symbol"),with=FALSE]
# collapse results
genes<-genes[,lapply(.SD,function(x){paste(x,collapse=";")}),.SDcols=2:3,by="GO.ID"]
# replace all blank cells by NA
genes[genes==""]<-NA
## extract pvalue according the algorithm results
# extract pvalues results
pvalues<-lapply(seq_along(algorithms),function(y){
# extract all pvalues from topGOresult
pvalue<-topGO::score(algorithms[[y]])
# select pvalues from topGOresult
pvalue<-pvalue[GOs]
# extract pvalue in data.table
pvalue<-data.table(
GO.ID=names(pvalue),
pvalue=as.numeric(format(pvalue,scientific = T)),
round(-log10(pvalue),digits=2)
)
# ordering pvalue by go term
names(pvalue)[ncol(pvalue)]="-log10_pvalue"
if(y>1){
# remove GO.ID
pvalue[,"GO.ID":=NULL]
}
# return
pvalue
})
# convert to data.table
pvalues=data.table(do.call("cbind",pvalues))
# algoritms
algorithms=vapply(algorithms,function(x){slot(x,"algorithm")},"")
# if use of different algorithms
if(length(algorithms)>1){
# add names
names(pvalues)[-1]<-paste(rep(algorithms,each=2),names(pvalues)[-1],sep=".")
}
# return input params
assign(
"input",
c(input,list(algorithms)),
inherits=TRUE
)
## combine results#
# all results in list
Results<-list(
data.table(GO.ID=GOs),
Stats,
pvalues,
genes
)
# merge all
Results<-Reduce(
function(...){
merge(
...,
by ="GO.ID",
sort=FALSE,
all=TRUE
)
},
Results
)
# remove NA in GO.Id column
Results<-Results[!is.na(Results$GO.ID)]
# Remove gene ID and symbol if GO term not significant
Results[Results$pvalue>=0.01,`:=`(Significant_genes=NA,Significant_genes_symbol=NA)]
if(!is.null(names(Input))){
# add GOdata name
if(x==1){
# add GOdata name in the header
names(Results)[-1]<-paste(names(Input)[x],
names(Results)[-1],sep=".")
}else{
# add GOdata name in the header
names(Results)[-1]<-paste(names(Input)[x],
names(Results)[-1],sep=".")
}
}
# return Results
Results
})
# number of Godata
nb=length(allResults)
# merge results if more than GOdata input
if(nb>1){
# merge all results
allResults=Reduce(
function(...){
merge(...,
by ="GO.ID",
sort=FALSE,
all=TRUE
)
},
allResults
)
}else{
# if only one GOdata unlist table
allResults<-allResults[[1]]
}
# add Input names
names(input)<-names(Input)
# extract GO terms
GO<-data.table(
select(
GO.db,
keys=allResults$GO.ID,
columns = c("GOID","TERM","DEFINITION")
)
)
# add GO term description and definition to sResults
allResults<-data.table(
GO,
allResults[,"GO.ID":=NULL]
)
# rename the fisrt 3 columns
names(allResults)[seq_len(3)]<-c("GO.ID","term","definition")
# significant results in at least one condition
new(
"enrich_GO_terms",
same_genes_background=same_genes_background,
input=input,
ont=check.onto,
topGO=topGO_summary,
data=allResults
)
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.