## location of Ensembl specific functions
.getEnsemblSSL <- function() {
cache <- .biomartCacheLocation()
bfc <- BiocFileCache::BiocFileCache(cache, ask = FALSE)
if(.checkInCache(bfc, hash = "ensembl-ssl-settings-httr2")) {
ensembl_config <- .readFromCache(bfc, "ensembl-ssl-settings-httr2")
} else {
ensembl_config <- .checkEnsemblSSL()
.addToCache(bfc, ensembl_config, hash = "ensembl-ssl-settings-httr2")
}
return(ensembl_config)
}
.checkArchiveList <- function(https = TRUE, http_config = list()) {
## determine if a cached version exists and if it's less than one week old
cache <- .biomartCacheLocation()
bfc <- BiocFileCache::BiocFileCache(cache, ask = FALSE)
cache_entry <- "ensembl-archive-html"
use_cached_version <- .useCache(bfc = bfc,
cacheEntry = cache_entry,
numDays = 7L)
if(use_cached_version) {
archive_html <- .readFromCache(bfc, cache_entry)
} else {
archive_html <- .getArchiveList(https = https, http_config = http_config)
.addToCache(bfc, archive_html, hash = cache_entry)
}
return(archive_html)
}
.getArchiveList <- function(https = TRUE, http_config = list()) {
url_worked <- FALSE
mirrors <- c("www", "asia", "useast")
protocol <- ifelse(https, "https://", "http://" )
while(!url_worked) {
if(length(mirrors) == 0) {
stop("Unable to contact any Ensembl mirror")
}
mirror_option <- mirrors[1]
mirrors <- mirrors[-1]
url <- paste0(protocol, mirror_option, ".ensembl.org/info/website/archives/index.html?redirect=no")
html_request <- request(url) |>
req_timeout(10) |>
req_options(!!!http_config) |>
req_retry(max_tries = 3) |>
req_error(is_error = \(resp) FALSE)
html <- req_perform(html_request)
## this is TRUE if there's an HTTP error or we get the Ensembl error page
if(identical(resp_status(html), 200L) &&
!grepl("The Ensembl web service you requested is temporarily unavailable", resp_body_string(html))) {
return( resp_body_string(html) )
}
}
}
.currentEnsemblVersion <- function() {
archives <- listEnsemblArchives()
current <- archives[archives$current_release == "*", ]
return(current)
}
## scrapes the ensembl website for the list of current archives and returns
## a data frame containing the versions and their URL
listEnsemblArchives <- function(https) {
if(!missing(https)) {
warning("Ensembl will soon enforce the use of https.\n",
"As such the 'https' argument will be deprecated in the next release.")
}
https <- TRUE
.listEnsemblArchives(https = https, http_config = list())
}
.listEnsemblArchives <- function(https = TRUE, http_config) {
html <- .checkArchiveList(https, http_config)
html <- xml2::read_html(html)
archive_box <- as.character(
xml2::xml_find_first(x = html, xpath = "//div[contains(@class,'archive-box')]")
)
archives <- strsplit(archive_box, split = "<li>")[[1]][-1]
extracted <- str_extract_all(string = archives,
pattern = "Ensembl [A-Za-z0-9 ]{2,6}|http[s]?://.*ensembl\\.org|[A-Z][a-z]{2} [0-9]{4}")
## split the version number into a separate column
extracted <- lapply(extracted, FUN = function(x) {
version <- str_match(x[2], pattern = ".+ ([a-zA-Z0-9]+)$")[2]
return( c(x, version) )
})
current <- ifelse(stringr::str_detect(archives, "- this site"), "*", "")
tab <- do.call("rbind", extracted)
tab <- cbind(tab, current)
dframe <- data.frame("name" = as.character(tab[,2]),
"date" = as.character(tab[,3]),
"url" = stringr::str_replace(tolower(as.character(tab[,1])),
"http://",
"https://"),
"version" = as.character(tab[,4]),
"current_release" = as.character(tab[,5]),
stringsAsFactors = FALSE)
return(dframe)
}
.listEnsembl <- function(mart = NULL, version = NULL, GRCh = NULL,
mirror = NULL, verbose = FALSE) {
if(is.null(version)) {
version_num <- .currentEnsemblVersion()["version"]
} else {
version_num <- version
}
## determine if a cached version exists and if it's less than one week old
cache <- .biomartCacheLocation()
bfc <- BiocFileCache::BiocFileCache(cache, ask = FALSE)
use_cached_version <- .useCache(bfc = bfc,
cacheEntry = paste0("ensembl-marts-", version_num),
numDays = 7L)
if(use_cached_version) {
marts <- .readFromCache(bfc, paste0("ensembl-marts-", version_num))
} else {
host <- .constructEnsemblURL(mirror = mirror, version = version, GRCh = GRCh)
port <- ifelse(grepl("https", host)[1], yes = 443, no = 80)
ensemblRedirect <- is.null(mirror)
http_config <- .getEnsemblSSL()
marts <- .listMarts(mart = mart, host = host, verbose = verbose, http_config = http_config,
port = port, ensemblRedirect = ensemblRedirect)
.addToCache(bfc, marts, hash = paste0("ensembl-marts-", version_num))
}
return(marts)
}
listEnsembl <- function(mart = NULL, version = NULL, GRCh = NULL,
mirror = NULL, verbose = FALSE) {
marts <- .listEnsembl(mart = mart, version = version, GRCh = GRCh,
mirror = mirror, verbose = verbose)
sel = which(marts$biomart == "ENSEMBL_MART_ENSEMBL")
if(length(sel) > 0){
marts$biomart[sel] = "genes"
}
sel = which(marts$biomart == "ENSEMBL_MART_SNP")
if(length(sel) > 0){
marts$biomart[sel] = "snps"
}
sel = which(marts$biomart == "ENSEMBL_MART_FUNCGEN")
if(length(sel) > 0){
marts$biomart[sel] = "regulation"
}
sel = which(marts$biomart == "ENSEMBL_MART_VEGA")
if(length(sel) > 0){
marts$biomart[sel] = "vega"
}
sel = which(marts$biomart == "ENSEMBL_MART_MOUSE")
if(length(sel) > 0){
marts$biomart[sel] = "mouse_strains"
}
return(marts)
}
#' creates an Ensembl URL based on the arguments provided to useEnsembl.
#' If there are conflicting options, order of precedence is:
#' GRCh, version, mirror
#' Default return value is https://www.ensembl.org
.constructEnsemblURL <- function(mirror = NULL, version = NULL, GRCh = NULL) {
host <- NULL
if(!is.null(mirror) && (!is.null(version) || !is.null(GRCh))){
warning("version or GRCh arguments cannot be used together with the mirror argument.\n",
"We will ignore the mirror argument and connect to the main Ensembl site.",
call. = FALSE)
mirror <- NULL
}
if(!is.null(version) && !is.null(GRCh)) {
stop("version or GRCh arguments cannot be used together.\n",
"Please specify only the 'version' or 'GRCh' argument.",
call. = FALSE)
}
if(!is.null(version)) {
archives <- .listEnsemblArchives(https = TRUE, http_config = list())
idx <- match(version, archives[,'version'], nomatch = NA)
if(is.na(idx)) {
stop("Specified Ensembl version is not available.\n",
"Use listEnsemblArchives() to view available versions.",
call. = FALSE)
}
host <- archives[idx, 'url']
}
if(!is.null(GRCh)){
if(GRCh == 37){
host <- paste0("https://grch", GRCh, ".ensembl.org")
} else {
warning("Only 37 can be specified for GRCh version. Using the current version.",
call. = FALSE)
}
}
if(!is.null(mirror)){
if(!(mirror %in% c("www", "useast", "asia"))) {
warning("Invalid mirror. Select a mirror from [www, useast, asia].\n",
"Default when no mirror is specified is to use ",
"www.ensembl.org which may be automatically redirected." )
host <- "https://www.ensembl.org"
} else {
host <- paste0("https://", mirror, ".ensembl.org")
}
}
if(is.null(host)) {
host <- "https://www.ensembl.org"
}
return(host)
}
useEnsembl <- function(biomart, dataset, host,
version = NULL, GRCh = NULL, mirror = NULL, verbose = FALSE){
if(missing(biomart)) {
stop("You must provide the argument 'biomart'\n",
"Available Ensembl Marts can be viewed with ",
"the function listEnsembl()")
}
biomart <- switch (tolower(biomart),
"ensembl" = "ENSEMBL_MART_ENSEMBL",
"genes" = "ENSEMBL_MART_ENSEMBL",
"snp" = "ENSEMBL_MART_SNP",
"snps" = "ENSEMBL_MART_SNP",
"regulation" = "ENSEMBL_MART_FUNCGEN",
"mouse_strains" = "ENSEMBL_MART_MOUSE",
"vega" = "ENSEMBL_MART_VEGA",
biomart
)
## test https connection and store required settings
http_config <- .getEnsemblSSL()
## a crude check to ensure the sub-domain is included. Otherwise queries will fail
if(!missing(host)) {
no_subdomain <- grepl(x = host, pattern = "http[s]?://ensembl", fixed = FALSE)
} else {
no_subdomain <- FALSE
}
## create the host URL & turn off redirection if a mirror is specified
if(missing(host) || no_subdomain ) {
if(no_subdomain) {
warning("You cannot use the host 'ensembl.org'.\n",
"Please provide a subdomain e.g. www.ensembl.org or use one of the 'mirror', 'version', 'GRCh' arguments")
}
if(is.null(version) && is.null(GRCh)) {
mirror <- .chooseEnsemblMirror(mirror = mirror, http_config = http_config)
}
host <- .constructEnsemblURL(version = version, GRCh = GRCh, mirror = mirror)
ensemblRedirect <- is.null(mirror)
} else {
ensemblRedirect <- FALSE
}
## choose the port based on whether we use https or not
port <- ifelse(grepl(pattern = "https://", x = host),
yes = 443, no = 80)
if(grepl(x = host, pattern = "www|useast|asia")) {
marts <- .listEnsembl(version = version, GRCh = GRCh, mirror = mirror)
} else {
marts <- .listMarts(host = host, port = port, http_config = http_config, ensemblRedirect = FALSE)
}
mindex = NA
if(!missing(biomart)){
mindex=match(biomart,marts$biomart)
}
if(is.na(mindex))
stop("Incorrect BioMart name, use the listMarts function to see which BioMart databases are available")
## adding option to force use of specified host with ensembl
redirect <- ifelse(!ensemblRedirect && grepl(x = host, pattern = "ensembl.org"),
"?redirect=no",
"")
mart <- Mart(
biomart = biomart,
vschema = "default",
host = paste0(host, ":",
port,
"/biomart/martservice",
redirect),
http_config = http_config
)
if(grepl("archive", martHost(mart))) {
## hack to work around redirection of most recent mirror URL
archives <- .listEnsemblArchives(https = TRUE, http_config = http_config)
current_release <- archives[archives$current_release == "*", 'url']
if(grepl(martHost(mart), pattern = current_release)) {
martHost(mart) <- stringr::str_replace(martHost(mart), pattern = current_release, "https://www.ensembl.org")
martHost(mart) <- stringr::str_replace(martHost(mart), pattern = ":80/", ":443/")
}
}
if(!missing(dataset)){
mart = useDataset(mart = mart, dataset=dataset, verbose = verbose)
}
return(mart)
}
##############################################
listEnsemblGenomes <- function(includeHosts = FALSE, host = NULL){
## use the default websites unless an alternative is provided
if(is.null(host)) {
hosts <- c("https://protists.ensembl.org/",
"https://fungi.ensembl.org/",
"https://metazoa.ensembl.org/",
"https://plants.ensembl.org/")
} else {
hosts <- host
}
http_config <- .getEnsemblSSL()
marts <- lapply(hosts, FUN = function(x) {
as.data.frame(
.listMarts(host = x, mart = NULL, http_config = http_config,
verbose = FALSE, ensemblRedirect = FALSE,
port = 443, includeHosts = includeHosts)
) }
)
marts <- do.call("rbind", marts)
return(marts)
}
useEnsemblGenomes <- function(biomart, dataset, host = NULL) {
if(missing(biomart)) {
stop("You must provide the argument 'biomart'\n",
"Available Ensembl Genomes Marts can be viewed with ",
"the function listEnsemblGenomes()")
}
marts <- listEnsemblGenomes(includeHosts = TRUE, host = host)
if(!biomart %in% marts$biomart) {
stop(biomart, " is not in the list of available Marts'\n",
"Available Ensembl Genomes Marts can be viewed with ",
"the function listEnsemblGenomes()")
} else {
martDetails <- marts[which(marts$biomart == biomart), ]
}
host <- paste0("https://", martDetails$host)
http_config <- .getEnsemblSSL()
ens <- .useMart(biomart = biomart,
dataset = dataset,
host = host,
verbose = FALSE,
port = 443,
ensemblRedirect = FALSE,
http_config = http_config)
return(ens)
}
#' This function submits a small test query to identify a working Ensembl mirror.
#' If no mirror argument is provided it will use "www" as its first choice.
#' If the selected mirror returns a success (http 200) response it will be used
#' Otherwise another mirror is selected at random and used instead.
#' If all mirrors fail it will return an error
.chooseEnsemblMirror <- function(mirror, http_config) {
mirrors <- c("www", "asia", "useast")
if(missing(http_config)) {
http_config <- do.call(c, .getEnsemblSSL())
}
example_query <- '<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
<Filter name = "ensembl_gene_id" value = "ENSG00000000003"/>
<Attribute name = "ensembl_gene_id" />
</Dataset>
</Query>'
## create Ensembl URL and stop any redirection to a mirror
host <- .constructEnsemblURL(mirror = mirror)
host <- paste0(host, "/biomart/martservice?redirect=no")
mirror <- str_match(host, pattern = "://([a-z]{3,6})\\.")[1,2]
req <- httr2::request(host) |>
req_body_form(query = example_query) |>
req_timeout(10) |>
req_options(!!!http_config)
result <- tryCatch(httr2::req_perform(req),
error = function(c) { "timeout" } )
tryAgain <- any(result == "timeout") || httr2::resp_status(result) == 500
if(tryAgain) { ## try an alternative mirror if ensembl returns 500
remaining_mirrors <- setdiff(mirrors, mirror)
while((length(remaining_mirrors) > 0) && (tryAgain)) {
mirror <- sample(remaining_mirrors, size = 1)
message("Ensembl site unresponsive, trying ", mirror, " mirror")
host <- str_replace(host,
pattern = "://([a-z]{3,6})\\.",
replacement = paste0("://", mirror, "."))
req <- httr2::request(host) |>
req_body_form(query = example_query) |>
req_timeout(10) |>
req_options(!!!http_config)
result <- tryCatch(httr2::req_perform(req),
error = function(c) { "timeout" } )
tryAgain <- any(result == "timeout") || httr2::resp_status(result) == 500
if(tryAgain) {
remaining_mirrors <- setdiff(remaining_mirrors, mirror)
}
}
}
if(tryAgain) {
stop("Unable to query any Ensembl site")
}
return(mirror)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.