.ucscBase <- "http://hgdownload.cse.ucsc.edu/"
.getchainFiles <- function(url, fileName=NA_character_, verbose=TRUE) {
result <- .httrRead(url, extension=fileName, getmd5sum=TRUE)
if(length(result)) {
files <- paste0(url, "/", result$files)
df <- .httrFileInfo(files, verbose=TRUE)
if(identical(names(result), c("files","md5sum")))
cbind(df, md5sum=result$md5sum, stringsAsFactors=FALSE)
} else
data.frame(fileurl=NA_character_, date=NA,
size=NA, md5sum=NA_character_, stringsAsFactors=FALSE)
}
## FIXME: eutils file (and interface?) has moved to
## "https://www.ncbi.nlm.nih.gov/books/NBK25501/"
.organismToTaxid <- function(organism=character()) {
## query NCBI for taxonomy ID
.eutils <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils"
## 1. ids
uorganism <- unique(organism[!is.na(organism)])
query <- paste(uorganism, collapse=" OR ")
url <- sprintf("%s/esearch.fcgi?db=taxonomy&term=%s&retmax=%d",
.eutils, query, length(uorganism))
xml <- XML::xmlParse(url)
## 2. records
id <- as.character(sapply(xml["//Id/text()"], xmlValue))
scin <- taxid <- character()
if (length(id)) {
query2 <- paste(id, collapse=",")
url <- sprintf("%s/efetch.fcgi?db=taxonomy&id=%s&retmax=%d",
.eutils, query2, length(uorganism))
xml <- XML::xmlParse(url)
scin <- sapply(xml["/TaxaSet/Taxon/ScientificName"], xmlValue)
taxid <- sapply(xml["/TaxaSet/Taxon/TaxId/text()"], xmlValue)
}
scin[which(scin %in% "Pongo abelii")] <- "Pongo pygmaeus abelii"
scin[which(scin %in% "Xenopus (Silurana) tropicalis")]="Xenopus tropicalis"
scin[which(scin %in% "Ictidomys tridecemlineatus")]="Spermophilus tridecemlineatus"
## there are 3 special cases: WE provide query, ncbi returns scin
#a) query ="Pongo pygmaeus abelii", scin="Pongo abelii", taxid="9601"
#b) query ="Xenopus tropicalis", scin="Xenopus (Silurana) tropicalis", taxid="8364"
#c) query ="Spermophilus tridecemlineatus", scin="Ictidomys tridecemlineatus", taxid="43179"
## 3. Results
as.integer(taxid[match(organism, scin)])
}
.getUCSCResources <-
function(fileType, dirName, fileName, verbose=FALSE, justRunUnitTest=FALSE)
{
## get resource from UCSC
.fileBase <- sprintf("%sgoldenPath", .ucscBase)
genome_tbl <- rtracklayer::ucscGenomes(organism=TRUE)
genomes <- genome_tbl$db
## remove faulty genome.
rm <- c("cb1", "eboVir3", "dp2", "strPur1", "ci1", "calMil1","monDom1",
"balAcu1" ,"musFur1")
genomes <- setdiff(genomes, rm)
urls <- sprintf("%s/%s/%s", .fileBase, genomes, dirName)
if(justRunUnitTest)
urls <- tail(urls, n=2)
rsrc <- do.call(rbind, lapply(urls, .getchainFiles,
fileName=fileName, verbose=verbose))
rsrc <- rsrc[complete.cases(rsrc),]
title <- basename(rsrc$fileurl)
## parse the filename for each file type.
switch(fileType, chain={
rsrc$from <- sub("^([[:alnum:]]+)To[A-Z].*", "\\1", title)
rsrc$to <- sub(".*To([A-Z])([[:alnum:]]+).*", "\\L\\1\\E\\2",
title, perl=TRUE)
}, "2bit"={
rsrc$from <- sub(".2bit","", title)
}, {
stop("unknown fileType ", sQuote(fileType))
})
## add the organism
idx <- match(rsrc$from, genome_tbl$db)
rsrc$organism <- rep(NA_character_, length(idx))
rsrc$organism[!is.na(idx)] <- genome_tbl[idx[!is.na(idx)], "organism"]
## add the taxonmy Id.
rsrc$taxid <- rep(NA_character_, length(idx))
rsrc$taxid[!is.na(idx)] <- .organismToTaxid(rsrc$organism[!is.na(idx)])
rsrc
}
makeUCSCChain <- function(currentMetadata, justRunUnitTest=FALSE,
BiocVersion=BiocManager::version()) {
rsrc <- .getUCSCResources(fileType="chain", dirName="liftOver",
fileName="chain.gz", verbose=TRUE, justRunUnitTest)
## input_sources table
sourceSize <- as.numeric(rsrc$size)
sourceUrls <- rsrc$fileurl
sourceVersion <- gsub(" ", "_", rsrc$date)
sourceLastModifiedDate <- rsrc$date
## resources table
species <- rsrc$organism
genome <- rsrc$from
taxonomyId <- as.integer(rsrc$taxid)
title <- basename(rsrc$fileurl)
description <- sprintf("UCSC liftOver chain file from %s to %s",
rsrc$from, rsrc$to)
rdatapaths <-gsub(.ucscBase, "",sourceUrls)
md5sum <- rsrc$md5sum
Map(AnnotationHubMetadata,
SourceSize=sourceSize,
SourceUrl=sourceUrls,
SourceVersion=sourceVersion,
SourceLastModifiedDate = sourceLastModifiedDate,
SourceMd5 =md5sum,
Description=description,
Title=title,
Genome=genome,
Species=species,
TaxonomyId=taxonomyId,
RDataPath= rdatapaths,
MoreArgs=list(
BiocVersion=BiocVersion,
# input sources
SourceType= "Chain",
# resources
DataProvider = "UCSC",
Maintainer = "Bioconductor Maintainer <maintainer@bioconductor.org>",
Coordinate_1_based = FALSE,
Location_Prefix = .ucscBase,
RDataDateAdded = Sys.time(),
#rdata table
DispatchClass= "ChainFile" ,
RDataClass = "GRanges",
Recipe = NA_character_,
Tags = c("liftOver", "chain", "UCSC", "genome", "homology")))
}
makeAnnotationHubResource("UCSCChainPreparer", makeUCSCChain)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.