R/createOrganismPackage.R

Defines functions makeHubGTFIntoTxDb available.GTFsForTxDbs .getMaxEns makeOrganismDbFromBiomart makeOrganismDbFromUCSC makeOrganismDbFromTxDb .gentlyExtractDbFiles makeOrganismPackage .extractDbFiles .biocAnnPackages .testKeys .testGraphData .mungeGraphData .getMetaDataValue .taxIdToOrgDbName .taxIdToOrgDb .packageTaxIds

Documented in makeOrganismDbFromBiomart makeOrganismDbFromTxDb makeOrganismDbFromUCSC makeOrganismPackage

## New helper to lookup which org object or package should be used
## based on the taxonomy ID.  It takes a tax ID and returns an appropriate OrgDb object.
.packageTaxIds <- function(){
    c('180454'='org.Ag.eg.db',
      '3702'='org.At.tair.db',
      '9913'='org.Bt.eg.db',
      '9615'='org.Cf.eg.db',
      '9031'='org.Gg.eg.db',
      '9598'='org.Pt.eg.db',
      '511145'='org.EcK12.eg.db',
      '386585'='org.EcSakai.eg.db',
      '7227'='org.Dm.eg.db',
      '9606'='org.Hs.eg.db',
      '10090'='org.Mm.eg.db',
      '9823'='org.Ss.eg.db',
      '10116'='org.Rn.eg.db',
      '9544'='org.Mmu.eg.db',
      '6239'='org.Ce.eg.db',
      '8355'='org.Xl.eg.db',
      '559292'='org.Sc.sgd.db',
      '7955'='org.Dr.eg.db')
}

.taxIdToOrgDb <- function(taxid) {
    ## These are the packaged TaxIds
    packageTaxIds <- .packageTaxIds()
    if (taxid %in% names(packageTaxIds)) {
        pkg <- packageTaxIds[names(packageTaxIds) %in% taxid]
        nmspc <- loadNamespace(pkg)
        res <- get(pkg, nmspc)
    } else {
        ## If we don't have a package, then lets get the taxIds and AHIds
        ## for the hub objects
        loadNamespace("AnnotationHub")
        ah <- AnnotationHub::AnnotationHub()
        ah <- subset(ah, ah$rdataclass=='OrgDb')
        mc <- mcols(ah)[,'taxonomyid', drop=FALSE]
        ## Then just get the object
        AHID <- rownames(mc[mc$taxonomyid==taxid,,drop=FALSE])
        if (!length(AHID))
            stop("no OrgDb package found for taxid ", taxid)
        else res <- ah[[AHID]]
    }
    res
}
## examples:
## .taxIdToOrgDb(9606)
## .taxIdToOrgDb(9986)


## Need another helper to get us from taxID to the OrgDbName...
.taxIdToOrgDbName <- function(taxid) {
    packageTaxIds <- .packageTaxIds()
    if (taxid %in% names(packageTaxIds)) {
        pkg <- packageTaxIds[names(packageTaxIds) %in% taxid]
        nmspc <- loadNamespace(pkg)
        path <- dbfile(get(pkg, nmspc))
        res <- sub("sqlite$", "db", basename(path))
    } else {
        ## If we don't have a package, then lets get the taxIds and AHIds
        ## for the hub objects
        loadNamespace("AnnotationHub")
        ah <- AnnotationHub::AnnotationHub()
        ah <- subset(ah, ah$rdataclass=='OrgDb')
        mc <- mcols(ah)[,c('taxonomyid','title'), drop=FALSE]
        ## Then just get the object
        data <- mc[mc$taxonomyid==taxid,,drop=FALSE]
        res <- sub("sqlite","db", data$title)
    }
    res
}
## examples
## .taxIdToOrgDbName(9606)
## .taxIdToOrgDbName(9986)


#############################################################################

## simplify DB retrievals from metadata table
.getMetaDataValue <- function(db, name){
  con <- AnnotationDbi::dbconn(db)
  res <- dbGetQuery(con,
    paste0("SELECT value FROM metadata WHERE name='", name,"'"))[[1]]
  if(!is.character(res))
      stop("missing metadata table value for: ", name)
  res
}

## function to allow us to convert a list into the inernally preferred form...
.mungeGraphData <- function(graphData){
  ##pkgs <- sapply(graphData, names) ## This is no good.   :(
  pkgs <- matrix(unlist(lapply(graphData, names)), ncol=2, byrow=TRUE)
  keys <- matrix(unlist(graphData), ncol=2, byrow=TRUE)
  graphData <- cbind(pkgs, keys)
  colnames(graphData) <- c("xDbs","yDbs","xKeys","yKeys")
  graphData
}

## early sanity checks for graphData
.testGraphData <- function(graphData){
  if(ncol(graphData) !=4L)
      stop("'graphData' must contain exactly 4 columns.")
}

## test keys for graphData BEFORE we make any objects (which just
## means that we are not going to use data from the @resources slot
## for this function)
.testKeys <- function(fkeys){
  pkgs <- unlist(lapply(names(fkeys), get))
  res <- logical(length(pkgs))
  for(i in seq_len(length(pkgs))){
    res[i] <- fkeys[i] %in% columns(pkgs[[i]])
  }
  if(!all(res))
    stop("some foreign keys are not present in their associated databases")
}

## helper to list bioc Annot packages (for filling in things like
## suggests fields)
.biocAnnPackages <- function(){
    availAnns <- as.data.frame(available.packages(
                               contrib.url(repositories()[["BioCann"]],
                               "source")))
    as.character(availAnns[["Package"]])
}


## Helper to set up to just load packages that need loading.
.extractDbFiles <- function(gd, deps){
    pkgs <- unique(names(.extractPkgsAndCols(gd)))
    ## Before we can proceed, we may need to call library on the deps...
    lapply(deps, library, character.only = TRUE)
    files <- unlist(lapply(pkgs, function(x){dbfile(get(x))}))
    setNames(files, pkgs)
}


## We want makeOrganismPackage to be self contained (have all it needs)
## IOW we want to store .sqlite files in a local inst/extdata when
## they are not known packages.
makeOrganismPackage <- function(pkgname,
                                graphData,
                                organism,
                                version,
                                maintainer,
                                author,
                                destDir,
                                license="Artistic-2.0"){
   ## there should only be one template
   template_path <- system.file("OrgPkg-template",package="OrganismDbi")
   ## We need to get a list of dependencies:
   ## 1st convert graphData into a data.frame
   gd <- .mungeGraphData(graphData)
   allDeps <- unique(as.vector(gd[,1:2]))
   ## Filter dependencies to make sure they are really package names
   biocPkgNames <- .biocAnnPackages()
   deps <- allDeps[allDeps %in% biocPkgNames]
   depsStr <- paste(deps,collapse=", ")
   ## We need to define some symbols in order to have the
   ## template filled out correctly.
   symvals <- list(
    PKGTITLE=paste("Annotation package for the",pkgname,"object"),
    PKGDESCRIPTION=paste("Contains the",pkgname,"object",
      "to access data from several related annotation packages."),
    PKGVERSION=version,
    AUTHOR=author,
    MAINTAINER=maintainer,
    LIC=license,
    ORGANISM=organism,
    ORGANISMBIOCVIEW=gsub(" ","_",organism),
    PKGNAME=pkgname,
    DEPENDENCIES=depsStr
   )
   ## Check the graphData object and rename if needed
   .testGraphData(gd)
## Try to call require on all the supporting packages.
## pkgs <- unique(names(.extractPkgsAndCols(gd)))
## for (pkg in pkgs)
##     .initPkg(pkg)

   ## #########################################################################
   ## Extract the dbFile information from each object and store that
   ## into resources

   ## EXCEPT Don't: (I can't really do this for 'packages' as the data
   ## is system specific)

   ## TODO: change this so that it isn't getting a bunch of dbFiles
   ## and then throwing them aways (or so that it's optional with a
   ## parameter or whatever seems appropriate for this function)

   resources <- .extractDbFiles(gd, deps)
   resources <- unlist(lapply(resources, function(x){return("")}))
   ## #########################################################################

   ## Also check that the fkeys are really columns for the graphData
   fkeys <- .extractPkgsAndCols(gd)
   .testKeys(fkeys)

   ## Should never have duplicates
   if (any(duplicated(names(symvals))))
       stop("'symvals' contains duplicated symbols")
   ## All symvals should by single strings (non-NA)
   is_OK <- sapply(symvals, isSingleString)
   if (!all(is_OK)) {
       bad_syms <- paste(names(is_OK)[!is_OK], collapse="', '")
       stop("values for symbols '", bad_syms, "' are not single strings")
   }
   createPackage(pkgname=pkgname,
                 destinationDir=destDir,
                 originDir=template_path,
                 symbolValues=symvals)

   ## Now just do work to save the data.frame (pass that in instead of the
   ## other stuff) in /data as a serialized R file.
   ## There will already be a /data dir in the template
   ## So just save to it:
   graphData <- gd
   graphInfo <- list(graphData=graphData, resources=resources)
   ## create data dir (because R CMD build removes empty dirs)
   ## And then save the data there.
   dir.create(file.path(destDir,pkgname,"data"))
   save(graphInfo, file=file.path(destDir,pkgname,"data","graphInfo.rda"))

   ## Get and other things that need to be saved and stash them into
   ## /inst/extdata
   otherDeps <- allDeps[!allDeps %in% biocPkgNames]
   .saveFromStr <- function(x, file){saveDb(x=get(x), file=file)}
   if(length(otherDeps)>0){
       ## Then we have to save stuff locally
       extPath <- file.path(destDir,pkgname,"inst","extdata")
       dir.create(extPath, recursive=TRUE)
       mapply(.saveFromStr, x=otherDeps, file=file.path(extPath,
                                           paste0(otherDeps,".sqlite")))
   }
}






################################################################################
## Now for some create functions that are more specialized:
## To create OrgansimDb objects from UCSC or from biomaRt
## the initial versions of these will just create the object (and not
## a package)

## Also: there are some pre-agreed upon expectations for this
## function.  It will make a specific kind of OrganismDb object.  One
## that contains certain specific elements (GO, OrgDb and TxDb - to
## start).  And the expectation is that we will always have those
## things when this function finishes.  So the contract is less
## general than before.


########
## Helper to set up to just load packages that need loading.  BUT this
## version of this function is less agressive and doesn't try to load
## a file if it already exists.  This is a different behavior than we
## want for packaging where things should be more strict.
.gentlyExtractDbFiles <- function(gd, deps){
    pkgs <- unique(names(.extractPkgsAndCols(gd)))
    ## Before we can proceed, we may need to call library on the deps...
    .library <- function(dep){
        if(!exists(dep)){
            library(dep, character.only = TRUE)
        }
    }
    lapply(deps, .library)
    files <- unlist(lapply(pkgs, function(x){dbfile(get(x))}))
    setNames(files, pkgs)
}


## from TxDb
## There are some issues with this as it's currently implemented:
## Because its not using the txdb that is passed in but is instead
## making one up and then assigning it to the global namespace.  This
## is what we want for makeOrgansimDbFromXXX when XXX is UCSC or
## BiomaRt, but its *not* what we want for a simple exsting txdb.
## It's straightforward to do something different when the txdb
## exists().  But: how to I look up it's original name in the
## .GlobalEnv ???  I need to know it's name for the graphData list
## below.
## The other big problem is that if my TxDb objects don't have a
## dbfile() then they can't be saved and re-loaded later.  But this
## problem is not "new" for this object.
## Still TODO: 1) either put the assigned functions in a special env that is accesible to these objects OR else make a subclass that can hold those objects.
## 2) make a save method that complains if any of the objects has not had saveDb called on it before calling a constructor.  And also add a warning to the constructor when somethings dbfile() is an empty string.

### Private environment for storing TxDb objects as needed (failed strategy)
## To work I would have to make the env public.  - It was never a
## great idea fo r this use case.
## TxDbObjs <- new.env(hash=TRUE, parent=emptyenv())
## I may still want to go with the other option of stashing this data
## into a subclass...  For one thing, that option can't have name
## clashes...
## Also: the name this local TxDb gets assigned to cannot be the same as is used by a package.  Otherwise a shortened 'custom' TxDb can be overwritten by a name clash with a package name...  This could end up being true even if I store the TxDb locally inside of a named sub-class.
## Also also: the name should not be made 'special' in the case where makeOrganismDbFromTxDb is called as a helper function from within makeOrganismDbFromUCSC or makeOrganismDbFromBiomart.

makeOrganismDbFromTxDb <- function(txdb, keytype=NA, orgdb=NA){
    if (!is(txdb, "TxDb"))
        stop("'txdb' must be a TxDb object")
    if(!is(orgdb, "OrgDb") && !is.na(orgdb))
        stop("'orgdb' must be an OrgDb object or NA")
    if (!isSingleStringOrNA(keytype))
        stop("'keytype' must be a single string or NA")

    ## Then assign that object value to the appropriate name:
    txdbName <- makePackageName(txdb)
    ## We temp assign to global scope
    ## (b/c you need it there if you 'generated' it)
    ## After we can remove it? (will be stored in the object)
    assign(txdbName, txdb, .GlobalEnv)
    ## Then get the tax ID:
    taxId <- taxonomyId(txdb)

    ## Then get the name and valued for the OrgDb object
    if (!is(orgdb, "OrgDb") && is.na(orgdb)){
        orgdbName <- .taxIdToOrgDbName(taxId)
        orgdb <- .taxIdToOrgDb(taxId)
        assign(orgdbName, orgdb, .GlobalEnv)
    }else{
        org <- metadata(orgdb)[metadata(orgdb)$name=='ORGANISM',2]
        org <- sub(" ", "_", org)
        orgdbName <- paste0('org.',org,'.db')
        orgdb <- orgdb
        assign(orgdbName, orgdb, .GlobalEnv)
    }
    ## get the primary key for the OrgDb object:
    if(is.na(keytype)){
        geneKeyType <- chooseCentralOrgPkgSymbol(orgdb)
    }else{
        geneKeyType <- keytype
    }

    graphData <- list(join1 = setNames(object=c('GOID', 'GO'),
                                       nm=c('GO.db', orgdbName)),
                      join2 = setNames(object=c(geneKeyType, 'GENEID'),
                                       nm=c(orgdbName, txdbName)))

    ## get the organism
    organism <- organism(txdb)

    #############################################################
    ## Process and then test the graph Data
    gd <- .mungeGraphData(graphData)
    .testGraphData(gd)
    allDeps <- unique(as.vector(gd[,1:2]))
    biocPkgNames <- .biocAnnPackages()
    deps <- allDeps[allDeps %in% biocPkgNames]
    resources <- .gentlyExtractDbFiles(gd, deps)
    ## Check that the fkeys are really columns for the graphData
    fkeys <- .extractPkgsAndCols(gd)
    .testKeys(fkeys)
    ## Then make the object:
    graphInfo <- list(graphData=gd, resources=resources)
    OrganismDb(graphInfo=graphInfo)
}


## from UCSC
makeOrganismDbFromUCSC <- function(genome="hg19",
                                   tablename="knownGene",
                                   transcript_ids=NULL,
                                   circ_seqs=NULL,
                                   url="http://genome.ucsc.edu/cgi-bin/",
                               goldenPath.url=getOption("UCSC.goldenPath.url"),
                                   miRBaseBuild=NA){

    if (!missing(url))
        .Deprecated(msg="'url' argument is deprecated and was ignored")

    ## So call the function to make that TxDb
    txdb <- makeTxDbFromUCSC(genome=genome,
                             tablename=tablename,
                             transcript_ids=transcript_ids,
                             circ_seqs=circ_seqs,
                             goldenPath.url=goldenPath.url,
                             miRBaseBuild=miRBaseBuild)
    makeOrganismDbFromTxDb(txdb)
}

makeOrganismDbFromBiomart <- function(biomart="ENSEMBL_MART_ENSEMBL",
                                      dataset="hsapiens_gene_ensembl",
                                      transcript_ids=NULL,
                                      circ_seqs=NULL,
                                      filter="",
                                      id_prefix="ensembl_",
                                      host="https://www.ensembl.org",
                                      port,
                                      miRBaseBuild=NA,
                                      keytype="ENSEMBL",
                                      orgdb = NA){

    if (!missing(port))
        warning("The 'port' argument is deprecated and will be ignored.")

    ## So call the function to make that TxDb
    txdb <- makeTxDbFromBiomart(biomart=biomart,
                                dataset=dataset,
                                transcript_ids=transcript_ids,
                                circ_seqs=circ_seqs,
                                filter=filter,
                                id_prefix=id_prefix,
                                host=host,
                                miRBaseBuild=miRBaseBuild)
    makeOrganismDbFromTxDb(txdb, keytype=keytype, orgdb=orgdb)
}

## PROBLEM: .extractDbFiles(gd, deps) requires (strictly) that all objects be available as files somewhere (no exceptions allowed)
## This means that when I get to this stage, with biomaRt, it fails because there is not a TxDb on disc...






















################################################################################
################################################################################


.getMaxEns <- function(srcUrl){
    release <- sub("^ftp://ftp.ensembl.org/pub/release-","",srcUrl)
    release <- unique(sub("/gtf/.*gtf.gz$","", release))
    max(release)
}

## I need a function that will list the GTFs that end users can use to
## make into TxDbs (will probably not overlap perfectly with available
## OrgDbs)
available.GTFsForTxDbs <- function() {
    loadNamespace("AnnotationHub")
    ah <-  AnnotationHub::AnnotationHub()
    ## get OrgDb species
    aho <- subset(ah, ah$rdataclass=='OrgDb')
    oTaxids <-unique(aho$taxonomyid)

    ## get GTF species from ensembl
    ahg <- subset(ah, grepl('gtf.gz$',ah$sourceurl))
    ahg <- subset(ahg, ahg$dataprovider=='Ensembl')
    max <- .getMaxEns(ahg$sourceurl)
    maxStr <- paste0("ftp://ftp.ensembl.org/pub/release-",max)
    ahg <- subset(ahg, grepl(maxStr,ahg$sourceurl))
    gTaxids <-unique(ahg$taxonomyid)

    ## intersect of taxIds
    taxInt <- intersect(oTaxids, gTaxids)
    ## subset down to just the ahgs that can work...
    ahg <- subset(ahg, ahg$taxonomyid %in% taxInt)
    ## for now return the subsetted annotationHubObject
    ahg
}


## And I need a function that will make the transformation for them
makeHubGTFIntoTxDb <- function(ahg){
    if(length(ahg) > 1){
        stop('This function expects only one hub object at a time.')}
    ## get the available GTFs.
    ahgs <- available.GTFsForTxDbs()
    ## Is this one of those? If so, then make it happen
    if(names(ahg) %in%  names(ahgs)){
        txMeta <- data.frame(name='Data source', value='Ensembl GTF')
        txdb <- makeTxDbFromGRanges(ahg[[1]],
                                    metadata= txMeta,
                                    taxonomyId=ahg$taxonomyid)
        require(OrganismDbi)
        ## requires using the 'ENSEMBL' keytype (for these TxDbs)
        ## odb <- makeOrganismDbFromTxDb(txdb, keytype='ENSEMBL')
        odb <- makeOrganismDbFromTxDb(txdb)
    }else{
        stop('No OrgDb information for ', ahg$species)
    }
    odb
}


## testing:
## ahgs <- OrganismDbi:::available.GTFsForTxDbs()
## ahg <-  ahgs[1]
## odb <- OrganismDbi:::makeHubGTFIntoTxDb(ahg)

## debug(OrganismDbi:::makeHubGTFIntoTxDb)
## debug(OrganismDbi:::makeOrganismDbFromTxDb)
## debug(OrganismDbi:::.gentlyExtractDbFiles)


## 1st bad problem is that it basically can't find this:
## org.Ailuropoda_melanoleuca.eg.db.  This is bad since it is one more
## thing that I have to put into the global namespace (temp hack).
## This will need to be done a different way!  Should I use a special
## package environment?  Will it help if I put a slot in that can hold
## an OrdDb?  Or should I find a way to make the resources vector more
## generic by having it only apply to hub items (IOW combine a generic
## character vector with hubcache information). - It is clear that the
## resources vector cannot work for all kinds of paths and situations
## though.  Since people can put objects ANYWHERE.

## 2nd bad problem (even worse if you can believe it), is that these
## orgDbs don't have ensembl IDs in them.  This is bad because that
## basically means that none of these organisms can be supported for
## these TxDbs (which will use ensembl based gene identifiers.  I can
## update the OrgDbs, it's just a crucial missing piece that has to be
## retrieved before I can proceed.

## 3rd problem this uncovered is that I had to use AH cars for the
## object just to get easy access to the metadata.  It seems like we
## could benefit from a universal application of that metadata to the
## contents that come out of these cars...  Then it would be
## straightforward to write a method like I wanted to here.
Bioconductor/OrganismDbi documentation built on Nov. 2, 2024, 7:20 a.m.