## This file is for code that calls NCBI and then retrieves data to populate a
## local DB (eventually on the fly).
## I need functions that will make tables in a DB, but I also need separate
## function(s) that get the "stuff" from NCBI. Let's start with the retrieval
## side.
## Let's start by just making this thing get me some PMIDs for a vector of EGs
## getNucleotideStuff <- function(x, ret="acc"){
##     baseUrl <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
##     xsep <- paste(x, collapse=",")
##     url <- paste(baseUrl, "db=nucleotide&id=", xsep, "&rettype=", ret, sep="")
##     readLines(url)
## }
## egs = c("10","4557270")
## getNucleotideStuff(egs)
## getNucleotideStuff(egs, "gb")
## getNucleotideStuff(egs, "gp")
## getNucleotideStuff(egs, "seqid")
## getNucleotideStuff(egs, "ft")
## Another way to approach this is something like this:
## str(sapply(egs, function(elt) unlist(xpathApply(xmlDoc(elt), "//PubMedId", xmlValue))))
## where egs <- getNodeSet(EGSet, "//Entrezgene") and EGSet is the whole doc, i.e.
## EGSet = xmlParse(url)
## for (i in 1:n) print(xpathApply(EGSet, sprintf("count(//Entrezgene[%d]//PubMedId)", i)))
## for (i in 1:n) print(xpathApply(EGSet, sprintf("//Entrezgene[%d]//PubMedId/text()", i), xmlValue))
## or
## make queries
## xpq = sprintf("//Entrezgene[%d]//PubMedId", 1:n)
## lapply(xpq, xpathApply, doc=EGSet, fun=xmlValue)
## Other notes from a discussion of XPath with Martin on how to use XPath to
## retrieve more from a node without allocating a new node...
## library(XML)
## url <-
##     sprintf("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=%s&retmode=xml",
##             "1,10,100")
## eg <- xmlParse(url)
## sets <- getNodeSet(eg, "//Entrezgene")
## res <- xpathApply(sets[[1]],
##     "//Other-source[//Dbtag_db/text()='GO']//Object-id_id/text()", xmlValue)
## xpathApply(eg, "count(//Dbtag_db[text()='GO'])")
## xpathApply(eg, "//Dbtag_db[text()='GO']/text()")
#####################################################################
## helper functions for retrieving data from nodes
## These helpers lean on the XML and RCurl packages (plus AnnotationDbi,
## GO.db, and RSQLite further down); run as a standalone script this file
## would need:
## library(XML); library(RCurl); library(AnnotationDbi)
## library(GO.db); library(RSQLite)
## The following is a generic solution for getting stuff from sub-nodes
## Generic way to get src Docs
.getOtherSrcDocs <- function(doc, otherSourcePath){
    ## for each doc, get the Other-source nodes and make mini-docs
    srcDocs <- lapply(getNodeSet(doc, otherSourcePath), xmlDoc)
    srcDocs
}
## retrieve Db tag from such an other source doc.
.getDBTag <- function(srcDoc, dbTagPath){
    dbTag <- unlist(xpathApply(srcDoc, dbTagPath, xmlValue))
    if(is.null(dbTag)) dbTag <- "Bummer, the dbTag is NULL."
    dbTag
}
## check the node and return the value requested
.checkRetrieveNode <- function(doc, dbTagPath, resIDPath, type){
    results <- vector("list", length(dbTagPath))
    for(i in seq_len(length(dbTagPath))){
        ## when type is NULL (e.g. for aliases) there is no dbTag to check,
        ## so just require that the tag path and result path are the same node
        hit <- if(is.null(type)){
            dbTagPath[i] == resIDPath[i]
        }else{
            .getDBTag(doc, dbTagPath[i]) == type
        }
        if(hit){
            results[[i]] <- unlist(xpathApply(doc, resIDPath[i], xmlValue))
        }
    }
    results
}
## default values just get you KEGG gene IDs (for now)
.getSubNodeInfo <- function(doc,
                            type = "KEGG",
                            otherSourcePath = "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_source/Other-source",
                            dbTagPath = "/Other-source/Other-source_src/Dbtag/Dbtag_db",
                            resIDPath = "/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_str"){
    ## Get sub-nodes for looped parsing
    srcDocs <- .getOtherSrcDocs(doc, otherSourcePath)
    ## check to make sure we don't have unequal arguments
    if(length(dbTagPath) != length(resIDPath))
        stop("result and dbTag (checking) paths must have same number of elements")
    if(length(dbTagPath) == 1){
        results <- unlist(lapply(srcDocs, .checkRetrieveNode,
                                 dbTagPath, resIDPath, type))
    }else{ ## a compound result is expected, so don't simplify it
        results <- lapply(srcDocs, .checkRetrieveNode,
                          dbTagPath, resIDPath, type)
    }
    results
}
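## A minimal usage sketch, assuming `doc` is the xmlDoc() of a single
## //Entrezgene node (as built from the miniDocs in getGeneStuff() below);
## with the defaults this should pull the KEGG gene IDs for that record:
## keggGeneIds <- .getSubNodeInfo(doc)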
## GO attempt #2 (still fails, since there are apparently some GO IDs for
## which no GO evidence code is available, so I need to fully vectorize the
## generic code).
## A convenience function to pad GO Ids
.padGOIds <- function(GOIds){
    pads <- 7 - nchar(GOIds)
    res <- character(length(GOIds))
    for(i in seq_len(length(GOIds))){
        res[i] <- paste(paste(rep("0", pads[i]), collapse=""), GOIds[i], sep="")
    }
    paste("GO:", res, sep="")
}
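## A quick sanity check of the padding (pure string manipulation, so safe to
## run anywhere):
## .padGOIds(c("7154", "16020"))
## ## [1] "GO:0007154" "GO:0016020"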
.modifyGOs <- function(elem){
    elem[[1]] <- .padGOIds(elem[[1]])
    elem[[2]] <- gsub("evidence: ", "", elem[[2]])
    elem
}
## for GO retrieval I have partially vectorized the helper functions.
getGOInfo <- function(doc){
    GOIds <- .getSubNodeInfo(doc, type = "GO",
        otherSourcePath = "/Entrezgene/Entrezgene_properties/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_source/Other-source",
        dbTagPath = rep("/Other-source/Other-source_src/Dbtag/Dbtag_db", 2),
        resIDPath = c("/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_id",
                      "/Other-source/Other-source_post-text"))
    GOIds <- lapply(GOIds, .modifyGOs)
    GOIds
}
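## A minimal usage sketch, assuming EGSet is a parsed efetch result as in
## getGeneStuff() below; each mini-doc yields its GO ID / evidence-code pairs:
## miniDocs <- lapply(getNodeSet(EGSet, "//Entrezgene"), xmlDoc)
## GOIds <- lapply(miniDocs, getGOInfo)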
getGeneStuff <- function(entrezGenes, dir = "files"){
    baseUrl <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    xsep <- paste(entrezGenes, collapse=",")
    url <- paste(baseUrl, "db=gene&id=", xsep, "&retmode=xml", sep="")
    ## NOW we have to parse the available XML
    EGSet <- xmlParse(getURL(url, followlocation=TRUE))
    ## Here we will save the files out to a dir
    miniDocs <- lapply(getNodeSet(EGSet, "//Entrezgene"), xmlDoc)
    wd <- getwd()
    path <- paste(wd, dir, sep="/")
    if(!file.exists(path)){ dir.create(path) }
    setwd(path)
    paths <- paste(paste(path, entrezGenes, sep="/"), ".xml", sep="")
    for(i in seq_len(length(miniDocs))){
        saveXML(miniDocs[[i]], file = paths[[i]])
    }
    setwd(wd)
    ## attempt to remove miniDocs each time?
    ## (something tells me this is a token gesture only)
    rm(miniDocs)
    ## even just doing this (saving the files) takes nearly 30 gigs of RAM
    ## before too long (but at least it completed).
## ## Some tags can only occur once per gene
## entrezGeneID <- unlist(xpathApply(EGSet, "//Gene-track_geneid", xmlValue))
## speciesName <- unlist(xpathApply(EGSet, "//Org-ref_taxname", xmlValue))
## ## But most information is either more complex than that or is stored in a
## ## more complex way by the XML
## ## miniDocs are the individual entrez Gene records
## ## miniDocs <- lapply(getNodeSet(EGSet, "//Entrezgene"), xmlDoc)
## ## ## ## Sometimes we are lucky and the information we want has an obviously
## ## ## ## unique tag.
## ## ## ## Like pubmed Ids
## ## pmIdPath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_refs/Pub/Pub_pmid/PubMedId"
## ## pmIds <- lapply(miniDocs, function(x)
## ## unlist(xpathApply(x, pmIdPath, xmlValue)))
## ###########################################################################
## ## Lets try to get rid of the minidocs
## ## miniDocs <- lapply(getNodeSet(EGSet, "//Entrezgene"), xmlDoc)
## ##pmIdPath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_refs/Pub/Pub_pmid/PubMedId"
## ## pmIds <- lapply(miniDocs, function(x)
## ## unlist(xpathApply(x, pmIdPath, xmlValue)))
## ## Can try this way: - no improvement?
## ## egs <- getNodeSet(EGSet, "//Entrezgene")
## ## ###pmIds <- xpathApply(xmlDoc(egs[[1]]), "//PubMedId", xmlValue)
## ## pmIds <- sapply(egs, function(x) unlist(xpathApply(xmlDoc(x), "//PubMedId", xmlValue)))
## ## Can also try this way: for 800 genes in a chunk it at least tops out
## ## at 2 GB. Smaller chunks are better on both RAM usage AND speed, but
## ## they mean we bug some server more often...
## ## It also takes MUCH longer though! (probably because of the larger
## ## parse space)
## n <- length(entrezGenes)
## paths <- sprintf("//Entrezgene[%d]//PubMedId", 1:n)
## pmids <- lapply(paths, xpathApply, doc=EGSet, fun=xmlValue)
## pmIds <- lapply(pmids, unlist)
## ## This is frustrating. The total process looks to take about 6-12 gigs
## ## of RAM (and be pretty slow for parsing only the PMIDs: ~1 hour). The
## ## RAM usage is WAY down when I stop calling getNodeSet() (or maybe NOT,
## ## since it seems to climb more as things progress), but I am seriously
## ## tempted to just save out individual XML files to a tempDir in an
## ## initial step, and then parse them one at a time. The worst part is
## ## that it seems to eat MORE RAM the further along it goes...
## ## I should really do this. At least if I am working from files in a dir, I
## ## can have better control over how the parsing is done and where the time
## ## is going.
## ## ## For GO I should be able to do something like this:
## ## res <- xpathApply(egs[[1]],
## ## "//Other-source[//Dbtag_db/text()='GO']//Object-id_id/text()", xmlValue)
## ## ## checks
## ## res1 <- xpathApply(egs[[1]], "count(//Dbtag_db[text()='GO'])")
## ## res2 <- xpathApply(egs[[1]], "//Dbtag_db[text()='GO']/text()")
## ## ## But sometimes we have to do some more checking to make sure that what we
## ## ## are retrieving is in the context of certain kinds of tags, or of tags
## ## ## that contain certain information.
## ## ## custom helper to get and process the GO terms
## ## GOIds <- lapply(miniDocs, getGOInfo)
## ## ## KEGG Gene IDs are NOT the path IDs.
## ## otherSourcePath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_source/Other-source"
## ## dbTagPath <- "/Other-source/Other-source_src/Dbtag/Dbtag_db"
## ## resIDPath_str <- "/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_str"
## ## KEGGGeneIds <- lapply(miniDocs, .getSubNodeInfo, type = "KEGG",
## ## otherSourcePath = otherSourcePath,
## ## dbTagPath = dbTagPath,
## ## resIDPath = resIDPath_str)
## ## KEGGPathIds <- lapply(miniDocs, .getSubNodeInfo, type = "KEGG pathway",
## ## otherSourcePath = otherSourcePath,
## ## dbTagPath = dbTagPath,
## ## resIDPath = resIDPath_str)
## ## unigeneIds <- lapply(miniDocs, .getSubNodeInfo, type = "UniGene",
## ## otherSourcePath = otherSourcePath,
## ## dbTagPath = dbTagPath,
## ## resIDPath = resIDPath_str)
## ## resIDPath_id <- "/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_id"
## ## ## May only want to look for MIM if we are tax ID 9606 (or can be empty here)
## ## MIMIds <- lapply(miniDocs, .getSubNodeInfo, type = "MIM",
## ## otherSourcePath = otherSourcePath,
## ## dbTagPath = dbTagPath,
## ## resIDPath = resIDPath_id)
## ## ## Refseqs are in another couple of places (we can merge these later, or I
## ## ## can write a helper like for GO and merge them as we go.)
## ## RSOtherSourcePath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_comment/Gene-commentary"
## ## RSdbTagPath <- "/Gene-commentary/Gene-commentary_heading"
## ## RSResIdTagPathRNA <- "/Gene-commentary/Gene-commentary_products/Gene-commentary/Gene-commentary_accession"
## ## RSRNAIds <- lapply(miniDocs, .getSubNodeInfo,
## ## type = "RefSeqs maintained independently of Annotated Genomes",
## ## otherSourcePath = RSOtherSourcePath,
## ## dbTagPath = RSdbTagPath,
## ## resIDPath = RSResIdTagPathRNA)
## ## RSResIdTagPathProt <- "/Gene-commentary/Gene-commentary_products/Gene-commentary/Gene-commentary_products/Gene-commentary/Gene-commentary_accession"
## ## RSProtIds <- lapply(miniDocs, .getSubNodeInfo,
## ## type = "RefSeqs maintained independently of Annotated Genomes",
## ## otherSourcePath = RSOtherSourcePath,
## ## dbTagPath = RSdbTagPath,
## ## resIDPath = RSResIdTagPathProt)
## ## ## Get official Symbol
## ## symbolSourcePath <- "/Entrezgene/Entrezgene_properties/Gene-commentary/Gene-commentary_properties/Gene-commentary"
## ## symbolDbTag <- "/Gene-commentary/Gene-commentary_label"
## ## symbolresIDPath <- "/Gene-commentary/Gene-commentary_text"
## ## symbolIds <- lapply(miniDocs, .getSubNodeInfo,
## ## type = "Official Symbol",
## ## otherSourcePath = symbolSourcePath,
## ## dbTagPath = symbolDbTag,
## ## resIDPath = symbolresIDPath)
## ## ## Get official Name
## ## fullNames <- lapply(miniDocs, .getSubNodeInfo,
## ## type = "Official Full Name",
## ## otherSourcePath = symbolSourcePath, ##same path as symbols
## ## dbTagPath = symbolDbTag,
## ## resIDPath = symbolresIDPath)
## ## ## Get alternate symbols (merge with official ones when making table)
## ## ## TODO: would be SAFER to do some of the 1:1 stuff like this too
## ## ## Therefore redo how I get EG, PMID, and species name.
## ## aliasSourcePath <- "/Entrezgene/Entrezgene_gene/Gene-ref/Gene-ref_syn"
## ## aliasDbTag <- "/Gene-ref_syn/Gene-ref_syn_E"
## ## aliasresIDPath <- "/Gene-ref_syn/Gene-ref_syn_E"
## ## aliasIds <- lapply(miniDocs, .getSubNodeInfo,
## ## type = NULL,
## ## otherSourcePath = aliasSourcePath,
## ## dbTagPath = aliasDbTag,
## ## resIDPath = aliasresIDPath)
## ## Data sanity checks:
## ## All genes should be from the same critter:
## ## TODO: move the checks on EG uniqueness to outside of this function
## if(length(unique(entrezGeneID)) != length(entrezGeneID))
##     stop("Some of the entrez gene IDs have been repeated.")
## if(length(unique(speciesName)) > 1)
##     stop("The IDs being processed need to all be from the same species.")
## ## The following checks can stay at this level though
## if(any(!(entrezGeneID %in% entrezGenes))) ## if any don't match
##     stop("The entrez Genes discovered don't match the IDs being looked up!")
## if(length(entrezGenes) > length(entrezGeneID))
##     warning("Some of the entrez Genes being sought were not found.")
## if(length(entrezGeneID) > length(entrezGenes))
##     stop("There are more EGs being found than we expected.")
## ## return a list of things
## list(entrez = entrezGeneID,
## species = speciesName,
## pmIds = pmIds##,
## ## GOIds = GOIds##,
## ## KEGGGeneIds = KEGGGeneIds,
## ## KEGGPathIds = KEGGPathIds,
## ## aliasIds = aliasIds,
## ## symbolIds = symbolIds,
## ## fullNames = fullNames,
## ## RSProtIds = RSProtIds,
## ## RSRNAIds = RSRNAIds,
## ## MIMIds = MIMIds,
## ## unigeneIds = unigeneIds
## )
}
## NCBI got back to me
## An example approach is given below:
## 1) esearch with taxid in the following format:
## http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term=txid9606%5Borgn%5D+AND+alive%5Bprop%5D&usehistory=y
## 2) parse out the WebEnv and QueryKey value:
## http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&WebEnv=NCID_1_39931064_130.14.18.47_9001_1290210718_1522935737&query_key=1&rettype=uilist&retmode=text
## Regards,
## Tao Tao, PhD
## NCBI User Services
## get EGs from an NCBI tax ID
getEntrezGenesFromTaxId <- function(taxId){
    ## 1st retrieve the WebEnv and QueryKey values
    url1 <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term=txid",
                  taxId, "%5Borgn%5D+AND+alive%5Bprop%5D&usehistory=y", sep="")
    ## NOW we have to parse the available XML
    XML <- xmlParse(getURL(url1, followlocation=TRUE))
    ## WebEnv and QueryKey each occur only once in an esearch result
    webEnv <- unlist(xpathApply(XML, "//WebEnv", xmlValue))
    queryKey <- unlist(xpathApply(XML, "//QueryKey", xmlValue))
    ## Then assemble the final URL
    url2 <- paste(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&WebEnv=",
        webEnv, "&query_key=", queryKey, "&rettype=uilist&retmode=text", sep="")
    readLines(url2)
}
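## A usage sketch (this hits the live eutils service, so the result will vary
## over time), e.g. for all live human gene IDs:
## humanEGs <- getEntrezGenesFromTaxId("9606")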
## Then we need to combine the sub-lists found within our "super"-list
.mergeLists <- function(listOfLists){
    ## walk all of the per-chunk lists and merge them element-wise by
    ## concatenating their contents.
    list1 <- listOfLists[[1]]
    result <- vector("list", length(list1))
    for(i in seq_len(length(list1))){
        for(j in seq_len(length(listOfLists))){
            result[[i]] <- c(result[[i]], listOfLists[[j]][[i]])
        }
    }
    ## keep the names
    names(result) <- names(list1)
    result
}
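## A small worked example with hypothetical values, merging two chunk results
## element-wise:
## .mergeLists(list(list(entrez=c("1","10"), pmIds=list("a")),
##                  list(entrez="100", pmIds=list("b"))))
## ## -> list(entrez=c("1","10","100"), pmIds=list("a","b"))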
## General helper for merging two simple lists
.combineTwoLists <- function(list1, list2){
    if(length(list1) != length(list2)){
        stop("lists must be equal length to be merged")}
    result <- vector("list", length(list1))
    for(i in seq_len(length(list1))){
        if(is.null(list1[[i]]) && is.null(list2[[i]])){
            result[[i]] <- NA
        }else{
            result[[i]] <- c(list1[[i]], list2[[i]])
        }
    }
    result
}
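## For example (hypothetical values):
## .combineTwoLists(list("A", NULL, "B"), list("a", NULL, NULL))
## ## -> list(c("A","a"), NA, "B")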
## conversion Utility:
.convertNullToNA <- function(x){
    ind <- unlist(lapply(x, is.null))
    x[ind] <- NA
    x
}
## this just makes a nice named data.frame from your data lists
.makeSimpleDF <- function(entrez, fieldVals, name){
    if(length(entrez) != length(fieldVals))
        stop("To make a data.frame from list + vector, both must be of equal length.")
    names(fieldVals) <- entrez
    fieldVals <- .convertNullToNA(fieldVals)
    expandedList <- unlist2(fieldVals)
    result <- data.frame(names(expandedList), expandedList,
                         stringsAsFactors=FALSE)
    colnames(result) <- c("gene_id", name)
    ## now remove any rows where the value column is NA
    result <- result[!is.na(result[,2]),]
    row.names(result) <- NULL
    result
}
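## For example (hypothetical values; unlist2() is AnnotationDbi's
## name-preserving unlist, so repeated gene_ids survive):
## .makeSimpleDF(entrez = c("1","10"),
##               fieldVals = list(c("111","222"), NULL),
##               name = "pubmed_id")
## ## -> a 2-row data.frame: gene_id = c("1","1"), pubmed_id = c("111","222")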
## used to collapse GO lists to a data frame
.unwindGOs <- function(GOIds, entrez, type){
    res <- vector("list", length(GOIds))
    for(i in seq_len(length(GOIds))){
        for(j in seq_len(length(GOIds[[i]]))){
            ## only append non-NULL values; genes with nothing at all are
            ## converted to NA by .convertNullToNA() below
            vals <- unlist2(GOIds[[i]][[j]][[type]])
            if(!is.null(vals)){
                res[[i]] <- c(res[[i]], vals)
            }
        }
    }
    names(res) <- entrez
    res <- .convertNullToNA(res)
    res
}
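## A small worked example (hypothetical values); each gene's entry is a list
## of (GO ID vector, evidence vector) pairs, and `type` picks which of the
## two to unwind:
## .unwindGOs(list(list(list("GO:0007154", "IEA"))), entrez="10", type=1)
## ## -> list(`10` = "GO:0007154")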
## used to make the 6 custom GO tables
.makeUnWoundGOTables <- function(entrez, GOIds, con){
    ## GOIds is a list of equal length to the entrez IDs
    if(length(entrez) != length(GOIds)){
        stop("There must be one GOIds entry per entrez gene ID")}
    uw_gos <- .unwindGOs(GOIds, entrez, type=1)
    if(length(uw_gos[is.na(uw_gos)]) != length(uw_gos)){
        go_id <- unlist2(uw_gos)
        evidence <- unlist2(.unwindGOs(GOIds, entrez, type=2))
        ontology <- Ontology(go_id) ## This step REQUIRES that there be GO IDs
        baseFrame <- cbind(gene_id = names(go_id), go_id = go_id,
                           evidence = evidence, ontology = ontology)
        ## split() treats the matrix as a (column-major) vector, and the
        ## ontology factor recycles down each column, so splitting by
        ## ontology and re-wrapping with matrix() pulls out the rows for
        ## each ontology
        bp <- data.frame(matrix(split(baseFrame, ontology)$BP,
                                byrow=FALSE,
                                nrow=length(grep("BP", ontology))))[,1:3]
        mf <- data.frame(matrix(split(baseFrame, ontology)$MF,
                                byrow=FALSE,
                                nrow=length(grep("MF", ontology))))[,1:3]
        cc <- data.frame(matrix(split(baseFrame, ontology)$CC,
                                byrow=FALSE,
                                nrow=length(grep("CC", ontology))))[,1:3]
        headerNames <- c("gene_id","go_id","evidence")
        names(bp) <- headerNames
        names(mf) <- headerNames
        names(cc) <- headerNames
        .makeSimpleTable(bp, table = "go_bp", con, fieldNameLens=c(10,3),
                         indFields = c("_id", "go_id"))
        .makeSimpleTable(mf, table = "go_mf", con, fieldNameLens=c(10,3),
                         indFields = c("_id", "go_id"))
        .makeSimpleTable(cc, table = "go_cc", con, fieldNameLens=c(10,3),
                         indFields = c("_id", "go_id"))
        ## Now expand the three data.frames to include all ancestor terms
        bp_all <- .expandGOFrame(bp, GO.db::GOBPANCESTOR)
        mf_all <- .expandGOFrame(mf, GO.db::GOMFANCESTOR)
        cc_all <- .expandGOFrame(cc, GO.db::GOCCANCESTOR)
        .makeSimpleTable(bp_all, table = "go_bp_all", con, fieldNameLens=c(10,3),
                         indFields = c("_id", "go_id"))
        .makeSimpleTable(mf_all, table = "go_mf_all", con, fieldNameLens=c(10,3),
                         indFields = c("_id", "go_id"))
        .makeSimpleTable(cc_all, table = "go_cc_all", con, fieldNameLens=c(10,3),
                         indFields = c("_id", "go_id"))
    }else{
        ## as with the other tables, if we have nothing to populate, then we
        ## don't want to even make a table!
        warning("no values found for any GO tables in this data chunk.")
        return()
    }
}
## TODO: flesh out the following:
#.makeMetaTables <- function(){}
.makeOrgDB <- function(sList, con){
    .makeCentralTable(sList$entrez, con)
    ## The gene_info table is special:
    ## I need to make the data.frame and do some minor filtering.
    ## gene_infoData <- data.frame(
    ##     gene_id = sList$entrez,
    ##     gene_name = unlist(.convertNullToNA(as.list(sList$fullNames))),
    ##     symbol = unlist(.convertNullToNA(as.list(sList$symbolIds))))
    ## gene_infoData <- ## still have to remove lines with no data!
    ##     gene_infoData[!is.na(gene_infoData[,2]) & !is.na(gene_infoData[,3]),]
    ## .makeSimpleTable(gene_infoData,
    ##                  table = "gene_info", con, fieldNameLens=c(255,80),
    ##                  indFields = character())
    .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
                                   fieldVals = sList$pmIds, "pubmed_id"),
                     table = "pubmed", con)
    ## .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
    ##                                fieldVals = .combineTwoLists(sList$alias,
    ##                                    sList$symbol), "alias_symbol"),
    ##                  table = "alias", con)
    ## .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
    ##                                fieldVals = sList$KEGGPathIds, "path_id"),
    ##                  table = "kegg", con)
    ## .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
    ##                                fieldVals = .combineTwoLists(sList$RSProtIds,
    ##                                    sList$RSRNAIds), "accession"),
    ##                  table = "refseq", con)
    ## .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
    ##                                fieldVals = sList$MIMIds, "omim_id"),
    ##                  table = "omim", con)
    ## .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
    ##                                fieldVals = sList$unigeneIds, "unigene_id"),
    ##                  table = "unigene", con)
    ## ## GO tables are special
    ## .makeUnWoundGOTables(entrez = sList$entrez, GOIds = sList$GOIds, con)
}
getDataAndAddToDb <- function(EGChunk, con){
    sList <- getGeneStuff(EGChunk)
    ## .makeOrgDB(sList, con)
    print(gc())
}
## Wrap the functionality like so:
buildEntrezGeneDbFromWebServices <- function(entrezGenes, file="test.sqlite"){
    EGs <- entrezGenes
    chunkSize <- 400 ## 800 is the max here but is probably NOT optimal
    ## TODO: check if there is a file and if so just remove it.
    ## file.remove(file) ##remove the old file when they re-run the code?
    con <- dbConnect(SQLite(), file)
    ## Then break it into chunks
    if(length(EGs) < chunkSize){
        sList <- getGeneStuff(EGs)
        .makeOrgDB(sList, con)
    }else{
        numChunks <- length(EGs) %/% chunkSize
        remChunks <- length(EGs) %% chunkSize
        splitFactor <- rep(seq_len(numChunks), each=chunkSize)
        splitFactor <- c(splitFactor, rep(numChunks+1, times=remChunks))
        EGChunks <- split(EGs, splitFactor)
        ## Then we just need to apply through and make a super-list
        ## superListOfLists <- lapply(EGChunks, getGeneStuff)
        ## sList <- .mergeLists(superListOfLists)
        ## .makeOrgDB(sList, con)
        ## lapply(EGChunks, getDataAndAddToDb, con)
        for(chunk in EGChunks){
            getDataAndAddToDb(chunk, con)
        }
    }
}
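## A usage sketch, chaining the tax-ID lookup into the builder (this hits the
## live NCBI services, and the DB-writing steps assume the .makeCentralTable()
## and .makeSimpleTable() helpers defined elsewhere in this package):
## yeastEGs <- getEntrezGenesFromTaxId("4932")
## buildEntrezGeneDbFromWebServices(yeastEGs, file="yeast.sqlite")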
##############################################################################
## More TODO:
##
## 1) Wrap this so that we can get all the EGs, then retrieve their results
## serially into one big super-list for import into a DB.
##
## 2) Some checking will have to be done as we add contents that are matched
## (thinking of the GO terms here) to the DB. Specifically, each GO ID needs
## to have an evidence code, and I know from making this work that some of
## them will NOT have one. Therefore, I have to check as they are formatted
## and put into the DB.