## 1st the legacy stuff:
### Mandatory fields: objName, Class and L2Rchain
reactome_AnnDbBimap_seeds <- list(
list(
objName="PATHID2NAME",
Class="AnnDbBimap",
L2Rchain=list(
list(
tablename="pathway2name",
Lcolname="DB_ID",
Rcolname="path_name"
)
)
),
list(
objName="PATHID2EXTID",
Class="AnnDbBimap",
L2Rchain=list(
list(
tablename="pathway2gene",
Lcolname="DB_ID",
Rcolname="gene_id"
)
)
),
list(
objName="REACTOMEID2GO",
Class="AnnDbBimap",
L2Rchain=list(
list(
tablename="reactome2go",
Lcolname="DB_ID",
Rcolname="go_id"
)
)
)
)
createAnnObjs.reactome <- function(prefix, objTarget, dbconn, datacache)
{
##checkDBSCHEMA(dbconn, "reactome")
## AnnDbBimap objects
seed0 <- list(
objTarget=objTarget,
datacache=datacache
)
ann_objs <- createAnnDbBimaps(reactome_AnnDbBimap_seeds, seed0)
## Reverse maps
ann_objs$PATHNAME2ID <- revmap(ann_objs$PATHID2NAME, objName="PATHNAME2ID")
ann_objs$EXTID2PATHID <- revmap(ann_objs$PATHID2EXTID, objName="EXTID2PATHID")
ann_objs$GO2REACTOMEID <- revmap(ann_objs$REACTOMEID2GO, objName="GO2REACTOMEID")
## 1 special map that is not an AnnDbBimap object (just a named integer vector)
ann_objs$MAPCOUNTS <- createMAPCOUNTS(dbconn, prefix)
prefixAnnObjNames(ann_objs, prefix)
}
## newer select methods:
setMethod("keytypes", "ReactomeDb",
function(x) c("ENTREZID","GO","PATHNAME","PATHID","REACTOMEID")
)
setMethod("columns", "ReactomeDb",
function(x) c("ENTREZID","GO","PATHNAME","PATHID","REACTOMEID")
)
.keysReact <- function(x, keytype){
## argument checking
testForValidKeytype(x, keytype)
sql <- switch(keytype,
"ENTREZID" = "SELECT gene_id FROM pathway2gene",
"GO" = "SELECT go_id FROM reactome2go",
"PATHNAME" = "SELECT path_name FROM pathway2name",
"PATHID" = paste0("SELECT DB_ID FROM pathway2name UNION ",
"SELECT DB_ID FROM pathway2gene"),
## need proper reactome tables for this one:
"REACTOMEID" = "SELECT DB_ID FROM DatabaseObject",
stop("No keytype specified for .keys"))
as.character(unique(dbQuery(dbconn(x), sql, 1L)))
}
setMethod("keys", "ReactomeDb",
function(x, keytype, ...){
if(missing(keytype)) keytype <- "ENTREZID"
#.keysReact(x, keytype)
smartKeys(x=x, keytype=keytype, ..., FUN=.keysReact)
}
)
## I think the way to go is to generate the SQL based on table associations.
## pathways_ids are DB_IDs (a subset)
## DatabaseObject and all standard reactome tables also have DB_IDs as keys...
## So to get stuff, I just need to make an outer join across all affected tables, joining on DB_IDs etc as appropriate. Then at the end, I need to go to uppercase, and drop any unwanted columns. While we are in here, I should probably clean up the schema a bit (change reactome_id over to DB_ID on the supplemental tables - DONE)
## This could all get really complicated if we start to expand this a lot...
## match a col value to appropriate table
## Can ALSO match to a subquery (when appropriate)
## every thing that this function returns must contain the central ID.
## TODO: this needs to actually match up with TWO things (more complex):
## 1) the table/query (done).
## 2) the colnames
.getTable <- function(col, retVal="table"){
res <- switch(col,
"ENTREZID" = c("pathway2gene","gene_id"),
"GO" = c("reactome2go","go_id"),
"PATHNAME" = c("pathway2name","path_name"),
"PATHID" = c("pathway2name","DB_ID"),
## need proper reactome tables
"REACTOMEID" = c("DatabaseObject","DB_ID"),
stop("No col specified for .keys"))
## Then test and return appropriate records.
if(retVal=="table"){
res <- res[[1]]
}else if(retVal=="colname"){
res <- res[[2]]
}
res
}
## match all cols
.getTables <- function(cols, retVal="table"){
res <- character(length(cols))
for(i in seq_len(length(cols))){
res[i] <- .getTable(cols[i], retVal=retVal)
}
names(res) <- cols
unique(res)
}
## ## FUTURE TODO: if we have more than one table per type (and we will), then we
## ## need to have a way to get all those together and do that join as a
## ## subquery, but then treat it as a single table for the rest of the way...
## ## match a pair of things to a join
## .makeJoinWhereClause <- function(table1, table2){
## joins <- character()
## if(!missing(table1) && !missing(table2)){
## ## Two tables that need to be joined.
## joins <- paste(c(paste0(table1,".DB_ID"),
## paste0(table2,".DB_ID")),
## collapse="=")
## }else{
## ## Then there is only one table involved (no join needed)
## }
## joins
## }
## ## .makeJoinWhereClause("pathway2gene","pathway2name")
## ## match a pair of things to a join
## .makeJoinRelationship <- function(table1, table2){
## joins <- character()
## if(!missing(table1) && !missing(table2)){
## ## Two tables that need to be joined.
## joins <- paste(table1,"LEFT OUTER JOIN",table2, "ON")
## }else{
## ## Then there is only one table involved (no join needed)
## }
## joins
## }
## ## .makeJoinRelationship("pathway2gene","pathway2name")
## ## match all pairs for a given vector.
## ## Initially I thought to make use of this to solve the problem.
## ## combn(LETTERS[1:4], 2L)
## ## But no, because that it way too many joins. All I really need is one join
## ## for each link in the chain. So if I have a, b, c. I just need a-b and
## ## b-c. I don't care about a-c etc. Number of links will always be n-1
## ## (where n is the number of types. what I really need to use here is
## ## split... n <- length(types) - 1 a b c a b b c So remove 1st and last, Then
## ## rep(middleVals, each=2) then append 1st and last back on. Then split by 2s
## ## needed to properly rep the tables vector.
## .stutter <- function(tables){
## if(length(tables>2)){
## start <- tables[1]
## n <- length(tables)
## end <- tables[n]
## tablesTrunc <- tables[2:(n-1)]
## tablesReps <- rep(tablesTrunc, each=2)
## res <- c(start, tablesReps, end)
## }else{
## stop("To stutter you need at least 3 tables")
## }
## split(repTables, rep(1:(n-1), each=2))
## }
## ## Needed to process all the joins
## .makeJoins <- function(tables){
## n <- length(tables)
## res <- character()
## if(length(tables)==2){
## res[1] <- paste(.makeJoinRelationship(tables[1],tables[2]),
## .makeJoinWhereClause(tables[1],tables[2]), "UNION",
## .makeJoinRelationship(tables[2],tables[1]),
## .makeJoinWhereClause(tables[2],tables[1]))
## }else{
## repChunks <- .stutter(tables)
## ## Then
## for(i in seq_len(length(repChunks))){
## res[i] <-paste(.makeJoinRelationship(repChunks[[i]][1],repChunks[[i]][2]),
## .makeJoinWhereClause(repChunks[[i]][1],repChunks[[i]][2]),
## "UNION",
## .makeJoinRelationship(repChunks[[i]][2],repChunks[[i]][1]),
## .makeJoinWhereClause(repChunks[[i]][2],repChunks[[i]][1]))
## }
## }
## unique(res)
## }
## types = c("ENTREZID", "PATHNAME")
## types = c("ENTREZID", "PATHNAME", "GO")
## tables = .getTables(types)
## .makeJoins(tables)
## This is for making simple queries
.extractWithSimpleQuery <- function(x, table, colType, keys){
## generate a simple query for each table
sql <- paste("SELECT * FROM", table, "WHERE", colType,"IN",
paste0('("',paste(keys, collapse='","'),'")') )
## then extract it
dbQuery(dbconn(x), sql)
}
## this calls .extractWithSimpleQuery for each table and merges results
.collateQueryResults <- function(x, tables, colType, keys, mergeID="DB_ID"){
res <- data.frame()
mergeKeys <- character()
for(i in seq_len(length(tables))){
if(i==1){
res <- .extractWithSimpleQuery(x, tables[i], colType, keys)
}else{
if(i==2){ mergeKeys <- res[[mergeID]] } ## cond. for speed
res <- merge(res,
.extractWithSimpleQuery(x, tables[i],
colType=mergeID,
keys=mergeKeys),
by.x=mergeID, by.y=mergeID,
all.x=TRUE, all.y=TRUE)
}
}
res
}
## function for making select happen
.selectReact <- function(x, keys, cols, keytype){
## Some argument checking
testSelectArgs(x, keys=keys, cols=cols, keytype=keytype)
## filter out keys that are not legit (just from the DB query)
ktKeys = keys(x, keytype=keytype)
qkeys <- keys[keys %in% ktKeys]
## collate possible types (type must ALWAYS be in front)
types <-unique(c(keytype,cols)) ## this was not enough
## translate to relevant "tables" (some virtual) that I need stuff from
tables <- .getTables(types)
## need to know the colType that goes with the INITIAL keytype..
colType <- .getTable(keytype, retVal="colname")
## now I need to go through each table, and for each I want to extract the
## missing piece with a SIMPLE query (basically get ONE thing), and then
## append it onto the results
res <- .collateQueryResults(x, tables, colType, qkeys, mergeID="DB_ID")
## resort_base just needs:
## the results,
## ALL of the keys,
## the colname that matches the keys (jointype or colType) and
## the reqCols (which means the column names expected to be in the results
## at this point,
## primary object (i.e., TxDb, OrgDb etc.))
reqCols <- .getTables(types, retVal="colname")
## now drop any unrequested cols
res <- res[,reqCols,drop=FALSE]
## And then resort/tidy etc.
res <- resort_base(res, keys, jointype=colType, reqCols=reqCols)
## Capture relationship between uc and lc names
names(reqCols) <- types
## Then match to the colnames
colnames(res) <- names(reqCols)[match(colnames(res),reqCols)]
## OLDE (delete later):
## joins <- .makeJoins(tables)
## create a join for these tables.
## TODO: add keys to this query.
## sql <- paste("SELECT * FROM",joins, collapse=" UNION ")
## res <- dbQuery(dbconn(x), sql, 1L)
## TODO: sort the results
res
}
## select method
setMethod("select", "ReactomeDb",
function(x, keys, columns, keytype, ...){
## if (missing(keytype)) keytype <- "ENTREZID"
kt <- "ENTREZID"
.selectWarnReact(x, keys, columns, keytype, kt=kt, ...)
## .selectReact(x, keys, columns, keytype)
}
)
## usage:
## head(keys(reactome.db))
## head(keys(reactome.db, keytype="PATHNAME"))
## head(keys(reactome.db, keytype="PATHNAME", pattern="Homo.sapiens"))
## k = head(keys(reactome.db, keytype="ENTREZID", pattern="Homo.sapiens", column="PATHNAME"))
### Works, but this shows how sometimes we can only enrich by doing this... and this can be tricky!
## select(reactome.db, k, cols=c("ENTREZID","PATHNAME"), keytype="ENTREZID")
### The following works (but is probably too slow for a unit test)
## length(keys(reactome.db, keytype="ENTREZID", column="GO"))
## length(keys(reactome.db, keytype="ENTREZID"))
## TODO:
## Change this to be either full outer joins, or devise a new strategy???
## Remove repeated cols? (maybe done for me by resort_base?)
## resort_base()
## translate names (needs some new functions.
## Add a WHERE IN clause to the end for filtering out the keys...
## Full outer joins are just not going to work. I need a different approach.
## I need instead to just write a nibbler. Something that will just nibble at
## the SQL DB as needed and get just what is needed one slice at a time (just
## not practical to generate big ass queries all the time).
## The code will know
## 1) what tables have the data for each cols value
## 2) for each nibble (bite) it can call a simpler select function that just
## "knows" how to make VERY simple SQL queries based on the .getTables
## function idea (including the original keys). For more hopeless queries,
## subqueries can be placed in this function, but the point is that this
## function will only know how to get one kind of thing at a time.
## 3) then the results of each nibble can be merged into place in R
## This above part is mostly "done" (except I have to pass keys in!)
## Next comes the part where I remap col names and re-order/resort the results.
## 4) Use our method (resort_base) to clean up
## 5) rename the columns (this should be mapped up near where we specify what tables/queries are needed to extract based on COLs)
## WHY DO IT THIS WAY? Because if I write/generate a SQL query I end up
## pulling WAY more stuff back than I will ever need. This is slow and
## painful and the queries are all going to have to start as full outer joins,
## and I don't even know a priori what order they should be put in... In
## actual fact, I only want the records that go with my keys. So I will look
## up matches one tiny query at a time, and then merge the results together
## for a hybrid approach that I hope will be more performant.
## So to pull this off, I will need to deal with the fact that sometimes the
## keys I need will change mid-job. So I may start with GO IDs, so fine, I
## look up the reactome IDs (easy), but then I also need Entrez IDs, my code
## needs to be smart enough to use reactome IDs next after it fails to be able
## to nibble out the Entrez IDs using just the GO IDs. And it needs to have a
## logical set of fall back keys to use...
## TODO:
## 1) make this work when the initial keys don't all match. - filtering these in .reactSelect before calling to the DB is probably sufficient.... - DONE
## 2) find an example where the dim(res) is not the same number every time. IOW, verify that merge nibbling is not the equivalent of an inner join... - hint: just test with other IDs like DB_IDs - DONE
## 3) implement filtering and column renaming. - almost done, still have to do renaming and vectorize the colnames identifing (this latter step should take care of #4 below) - DONE
## 4) remove columns that are not requested from the results - DONE
## Bugs:
## 6) Do I have a resort_base bug with keys becoming duplicated??? - fixed?
## 7) Add documentation aliases
## 5) Finish up unit tests. (all here - but they get their own page)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.