### =========================================================================
### makeNCBIToOrgDbs ('non-standard' OrgDbs)
### -------------------------------------------------------------------------
## This recipe makes 'non-standard' OrgDb sqlite files from data
## at ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/. These OrgDbs are less
## comprehensive than the 'standard' packages available in the
## Bioconductor repo. This code generates 1000 sqlite files.
## This recipe should be run right before a new release. The
## biocversion should be the current devel version, soon to roll over
## to the new release.
## The 'standard' OrgDbs are generated with makeStandardOrgDbsToSqlite.R.
## Assemble the per-organism metadata vectors (title, species, taxonomyId,
## genome, sourceUrl, sourceVersion, description, rDataPath) for the
## 'non-standard' NCBI OrgDb sqlite resources and collect them in `lst`.
##
## Arguments:
##   baseUrl         - NCBI source URL; becomes the first entry of every
##                     resource's source URLs.
##   justRunUnitTest - when TRUE only head(ids) taxonomy ids are processed.
##   biocVersion     - must have length 1; used to build the rDataPath
##                     "ncbi/uniprot/<biocVersion>/<title>".
##   currentMetadata - not referenced in the lines visible here.
##
## Result: `lst` is the last value assigned in the visible lines, so it is
## presumably what the function returns -- TODO confirm.
##
## NOTE(review): this listing appears to have lost lines (closing braces,
## `else` branches, the `tryCatch(` opener and some call continuations).
## Confirm the control flow against the package source before editing.
.NCBIMetadataFromUrl <- function(baseUrl, justRunUnitTest, biocVersion, currentMetadata) {
## The .rda file is expected to define `results` (it is read on the next line).
load(system.file('extdata','viableIDs.rda', package='AnnotationForge'))
ids <- results
if (justRunUnitTest) ids <- head(ids)
## FIXME: need different solution; this subset produces NAs
if (length(biocVersion) > 1) {
stop(paste("'biocVersion' must be a single value. Make sure new",
"'OrgDbs' go into the CORRECT Bioconductor version!"))
## Marc's note:
## need to find an alternative to this... old school table of tax Ids
## Load specData only when no object of that name is already visible.
if (!exists("specData")) {
load(system.file("data", "specData.rda", package = "GenomeInfoDbData"))
## Keep only rows whose third column is not NA.
sd <- specData[!is.na(specData[[3]]),]
## need to find offenders
## Helper: log the taxonomy id being resolved, then look up its organism
## record through GenomeInfoDb's internal (`:::`) lookup function.
lookup <- function(id){
message(paste0("looking up value for: ", id))
GenomeInfoDb:::lookup_organism_by_tax_id(id, all=TRUE)
## Some taxonomy IDs cannot be looked up at all - so discard
ids <- as.numeric(ids[ids %in% sd$tax_id])
res <- lapply(ids,lookup)
taxonomyId <-
as.integer(as.character(unlist(lapply(res, function(x){x$tax_id}))))
genus <- unlist(lapply(res, function(x){x$genus}))
species <- unlist(lapply(res, function(x){x$species}))
## Sanitise names: spaces become "_" and "/" becomes "|".
genus <- gsub(" ", "_", genus)
genus <- gsub("/", "|", genus)
species <- gsub(" ", "_", species)
species <- gsub("/", "|", species)
## oriSpecies is "<genus> <species>"; fullSpecies is the same with "_".
oriSpecies <- paste(genus, species)
fullSpecies <- gsub(" ", "_", oriSpecies)
## File name pattern: org.<Genus_species>.eg.sqlite
title <- paste0("org.", fullSpecies, ".eg", ".sqlite")
rDataPath <- paste0("ncbi/uniprot/",biocVersion,"/",title)
genome <- setNames(rep("NCBI genomes", length(fullSpecies)), title)
## The source version string embeds the date/time at which this code runs.
dateMessage <- paste0('NCBI gene annotations as of ', as.character(date()))
sourceVersion <- rep(dateMessage, length(fullSpecies))
description <- paste("NCBI gene ID based annotations about", oriSpecies)
sourceUrls <- c(baseUrl,"ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping_selected.tab.gz")
## Every resource gets the same pair of source URLs.
sourceUrl <- rep(list(sourceUrls), length(fullSpecies))
## Determine which sqlite files already exist in Azure storage; an empty
## `azurefiles` means "nothing known to exist" (see the messages below).
if (!requireNamespace("AzureStor", quietly = TRUE)){
message("AzureStor not installed.\n Regenerating all files")
azurefiles <- character(0)
## NOTE(review): an `else` and a check on `sas` (e.g. for NA) presumably
## belong around the next lines but are not visible here.
sas = Sys.getenv("AZURE_SAS_TOKEN", NA_character_)
message("AZURE_SAS_TOKEN environment variable is not set.\n Regenerating all files.")
azurefiles <- character(0)
## NOTE(review): the continuation of this call is missing from the listing.
ep <- AzureStor::storage_endpoint(endpoint="https://bioconductorhubs.blob.core.windows.net",
## assumes upload to staginghub
container <- AzureStor::storage_container(ep, "staginghub")
## NOTE(review): the continuation of this call is missing from the listing.
azurefiles <- AzureStor::list_storage_files(container,
## Reduce the listed paths to bare file names.
azurefiles <- unlist(lapply(azurefiles, FUN=basename))
## NOTE(review): if this is a tryCatch handler (opener not visible), the
## assignment below is local to the handler function and does not change
## `azurefiles` in the enclosing function.
}, error=function(e){
azurefiles <- character(0)
}, finally={
if (!exists("azurefiles")) azurefiles <- character(0)
## When existing files are known, keep only the entries whose title is not
## already present in `azurefiles`.
if (length(azurefiles) != 0){
subset <- !(title %in% azurefiles)
lst <- lapply(list(title=title, species = oriSpecies,
taxonomyId = taxonomyId, genome = genome, sourceUrl=sourceUrl,
sourceVersion = sourceVersion,
description=description, rDataPath=rDataPath), "[", subset)
## NOTE(review): the two identical unfiltered assignments below are
## presumably the bodies of `else` branches that are not visible here.
lst <- list(title=title, species = oriSpecies,
taxonomyId = taxonomyId, genome = genome, sourceUrl=sourceUrl,
sourceVersion = sourceVersion,
description=description, rDataPath=rDataPath)
lst <- list(title=title, species = oriSpecies,
taxonomyId = taxonomyId, genome = genome, sourceUrl=sourceUrl,
sourceVersion = sourceVersion,
description=description, rDataPath=rDataPath)
## Decide whether the non-standard OrgDb recipe needs to be run again by
## comparing the expected sqlite file names with the files listed in the
## Azure "staginghub" container under `resourceDir`.
##
## Arguments:
##   biocVersion     - must have length 1 (default BiocManager::version()).
##   baseUrl         - not referenced in the lines visible here.
##   resourceDir     - directory inside the container whose files are listed.
##   justRunUnitTest - when TRUE only head(ids) taxonomy ids are considered.
##
## Result: `res` is set to any(subset), i.e. TRUE when at least one expected
## title is absent from the listing; it is also set to TRUE unconditionally
## on a line that presumably belongs to an `else` branch. `res` is the last
## value assigned, so it is presumably the return value -- TODO confirm.
##
## NOTE(review): this listing appears to have lost lines (closing braces,
## `else` branches, the `tryCatch(` opener and a call continuation).
needToRerunNonStandardOrgDb <- function(biocVersion = BiocManager::version(),
baseUrl = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/",
resourceDir=".", justRunUnitTest=FALSE){
## The .rda file is expected to define `results` (it is read on the next line).
load(system.file('extdata','viableIDs.rda', package='AnnotationForge'))
ids <- results
if (justRunUnitTest) ids <- head(ids)
## FIXME: need different solution; this subset produces NAs
if (length(biocVersion) > 1) {
stop(paste("'biocVersion' must be a single value. Make sure new",
"'OrgDbs' go into the CORRECT Bioconductor version!"))
## Marc's note:
## need to find an alternative to this... old school table of tax Ids
## Load specData only when no object of that name is already visible.
if (!exists("specData")) {
load(system.file("data", "specData.rda", package = "GenomeInfoDbData"))
## Keep only rows whose third column is not NA.
sd <- specData[!is.na(specData[[3]]),]
## need to find offenders
## Helper: log the taxonomy id being resolved, then look up its organism
## record through GenomeInfoDb's internal (`:::`) lookup function.
lookup <- function(id){
message(paste0("looking up value for: ", id))
GenomeInfoDb:::lookup_organism_by_tax_id(id, all=TRUE)
## Some taxonomy IDs cannot be looked up at all - so discard
ids <- as.numeric(ids[ids %in% sd$tax_id])
## Note: `res` holds the lookup results here and is reused further down
## for the logical answer.
res <- lapply(ids,lookup)
taxonomyId <-
as.integer(as.character(unlist(lapply(res, function(x){x$tax_id}))))
genus <- unlist(lapply(res, function(x){x$genus}))
species <- unlist(lapply(res, function(x){x$species}))
## Sanitise names: spaces become "_" and "/" becomes "|".
genus <- gsub(" ", "_", genus)
genus <- gsub("/", "|", genus)
species <- gsub(" ", "_", species)
species <- gsub("/", "|", species)
oriSpecies <- paste(genus, species)
fullSpecies <- gsub(" ", "_", oriSpecies)
## Expected file name pattern: org.<Genus_species>.eg.sqlite
title <- paste0("org.", fullSpecies, ".eg", ".sqlite")
## Without AzureStor or a SAS token the existing files cannot be listed;
## `azurefiles` is then left empty.
if (!requireNamespace("AzureStor", quietly = TRUE)){
message("AzureStor not installed.\n Cannot determine.")
azurefiles <- character(0)
## NOTE(review): an `else` and a check on `sas` (e.g. for NA) presumably
## belong around the next lines but are not visible here.
sas = Sys.getenv("AZURE_SAS_TOKEN", NA_character_)
message("AZURE_SAS_TOKEN environment variable is not set.\n Cannot determine.")
azurefiles <- character(0)
## NOTE(review): the continuation of this call is missing from the listing.
ep <- AzureStor::storage_endpoint(endpoint="https://bioconductorhubs.blob.core.windows.net",
## assumes upload to staginghub
container <- AzureStor::storage_container(ep, "staginghub")
## Take the "name" column of the listing for `resourceDir`.
azurefiles <- AzureStor::list_storage_files(container, resourceDir)[,"name"]
## Reduce the listed paths to bare file names.
azurefiles <- unlist(lapply(azurefiles, FUN=basename))
## NOTE(review): if this is a tryCatch handler (opener not visible), the
## assignment below is local to the handler function and does not change
## `azurefiles` in the enclosing function.
}, error=function(e){
azurefiles <- character(0)
}, finally={
if (!exists("azurefiles")) azurefiles <- character(0)
## With a non-empty listing: a rerun is needed when any expected title is
## missing from it.
if (length(azurefiles)){
subset <- !(title %in% azurefiles)
res <- any(subset)
## NOTE(review): presumably the `else` branch (nothing listed => rerun).
res <- TRUE
## AWS S3 variant of the rerun check (the name marks it as the old one):
## compares the expected sqlite file names with the output of
## `aws s3 ls s3://annotationhub/ncbi/uniprot/<biocVersion> --recursive`.
##
## Result: `res` is set to any(subset), i.e. TRUE when at least one expected
## title is absent from the S3 listing; it is also set to TRUE on a line
## that presumably belongs to an `else` branch -- TODO confirm.
##
## NOTE(review): the signature is truncated in this listing (the default of
## `baseUrl` and any further parameters are not visible), and closing
## braces, `else` branches and the `tryCatch(` opener are missing too.
oldAWSS3_needToRerunNonStandardOrgDb <- function(biocVersion = BiocManager::version(),
baseUrl =
## The .rda file is expected to define `results` (it is read on the next line).
load(system.file('extdata','viableIDs.rda', package='AnnotationForge'))
ids <- results
## FIXME: need different solution; this subset produces NAs
if (length(biocVersion) > 1) {
stop(paste("'biocVersion' must be a single value. Make sure new",
"'OrgDbs' go into the CORRECT Bioconductor version!"))
## Marc's note:
## need to find an alternative to this... old school table of tax Ids
## Load specData only when no object of that name is already visible.
if (!exists("specData")) {
load(system.file("data", "specData.rda", package = "GenomeInfoDbData"))
## Keep only rows whose third column is not NA.
sd <- specData[!is.na(specData[[3]]),]
## need to find offenders
## Helper: log the taxonomy id being resolved, then look up its organism
## record through GenomeInfoDb's internal (`:::`) lookup function.
lookup <- function(id){
message(paste0("looking up value for: ", id))
GenomeInfoDb:::lookup_organism_by_tax_id(id, all=TRUE)
## Some taxonomy IDs cannot be looked up at all - so discard
ids <- as.numeric(ids[ids %in% sd$tax_id])
## Note: `res` holds the lookup results here and is reused further down
## for the logical answer.
res <- lapply(ids,lookup)
taxonomyId <-
as.integer(as.character(unlist(lapply(res, function(x){x$tax_id}))))
genus <- unlist(lapply(res, function(x){x$genus}))
species <- unlist(lapply(res, function(x){x$species}))
## Sanitise names: spaces become "_" and "/" becomes "|".
genus <- gsub(" ", "_", genus)
genus <- gsub("/", "|", genus)
species <- gsub(" ", "_", species)
species <- gsub("/", "|", species)
oriSpecies <- paste(genus, species)
fullSpecies <- gsub(" ", "_", oriSpecies)
## Expected file name pattern: org.<Genus_species>.eg.sqlite
title <- paste0("org.", fullSpecies, ".eg", ".sqlite")
## Run the AWS command-line client and capture stdout and stderr as a
## character vector.
aws <- system2("aws",
args=paste0("s3 ls s3://annotationhub/ncbi/uniprot/",
biocVersion," --recursive"), stdout=TRUE, stderr=TRUE)
## The handler raises an error via stop(), so its assignment to `aws` has
## no lasting effect.
}, error=function(e){
aws <- character(0)
stop("Cannot access AWS. Unable to determine")
}, finally={
if (!exists("aws")) aws <- character(0)
## Trim each line and collapse runs of whitespace to a single space.
aws <- gsub("\\s+", " ", stringr::str_trim(aws))
## Drop the first element of the listing.
## NOTE(review): presumably the entry for the directory itself -- confirm.
aws <- aws[-1]
if (length(aws)){
## From each line take the 4th space-separated field, then the 4th
## "/"-separated component of that field, as the file name.
s3titles <- sapply(strsplit(sapply(strsplit(aws, " "),"[[", 4), "/"),"[[",4)
subset <- !(title %in% s3titles)
res <- any(subset)
## NOTE(review): presumably the `else` branch (nothing listed => rerun).
res <- TRUE
## STEP 1: make function to process metadata into AHMs
##
## Gathers the per-organism metadata through .NCBIMetadataFromUrl(), reports
## how many files will be processed, and supplies the constant fields shared
## by every record through `MoreArgs`.
##
## Arguments:
##   currentMetadata - combined with the constant fields in `MoreArgs`.
##   justRunUnitTest - forwarded to .NCBIMetadataFromUrl().
##   BiocVersion     - default BiocManager::version(); its use is not
##                     visible in the lines shown here.
##   baseUrl         - forwarded to .NCBIMetadataFromUrl() and used as
##                     DataProvider.
##
## NOTE(review): this listing is incomplete. The .NCBIMetadataFromUrl() call
## and the Maintainer value are cut off, and the call that receives
## `MoreArgs` is not visible, so what the function returns cannot be
## confirmed from here.
makeNCBIToOrgDbsToAHM <-
function(currentMetadata, justRunUnitTest = FALSE,
BiocVersion = BiocManager::version(),
baseUrl = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/") {
meta <- .NCBIMetadataFromUrl(baseUrl, justRunUnitTest,
## The number of files is taken as the length of the first metadata element.
message("Processing ", length(meta[[1]]), " files.")
## Constant fields applied to every record, appended to currentMetadata.
MoreArgs=c(currentMetadata, list(
Coordinate_1_based = TRUE,
DataProvider = baseUrl,
Maintainer = paste("Bioconductor Package Maintainer",
RDataClass = "OrgDb",
DispatchClass = "SQLiteFile",
RDataDateAdded = Sys.time(),
Recipe = "AnnotationHubData:::NCBIToOrgDbs",
Tags = c("NCBI", "Gene", "Annotation"))))
## STEP 2: Make a recipe function that takes an AnnotationHubRecipe object.
##
## Reads the species name from `ahm`, builds the organism database with
## makeOrgPackageFromNCBI() and moves the resulting file to
## file.path(ahm@HubRoot, ahm@RDataPath).
##
## Argument:
##   ahm - object with slots Species, HubRoot and RDataPath (all three are
##         read below).
##
## Result: the destination path is the last visible expression, so it is
## presumably the return value -- TODO confirm.
##
## NOTE(review): the makeOrgPackageFromNCBI() call is cut off in this
## listing; only its `version` argument is visible.
NCBIToOrgDbs <- function(ahm){
fullSpecies <- ahm@Species
## Genus is the first space-separated token of the species name and
## species is the second.
genus <- unlist(strsplit(fullSpecies,split=" "))[1]
species <- unlist(strsplit(fullSpecies,split=" "))[2]
dbname <- makeOrgPackageFromNCBI(version="1.0.0",
## Move the generated file from the parent directory of HubRoot to its
## final location under HubRoot.
file.rename(from=file.path(dirname(ahm@HubRoot), dbname), to=file.path(ahm@HubRoot, ahm@RDataPath))
file.path(ahm@HubRoot, ahm@RDataPath)
## STEP 3: Call the helper to set up the newResources() method
## (Web-page text, not part of the recipe:)
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.