#' Assign extra gene metadata columns (mcols) from Ensembl into GRanges
#'
#' @note Updated 2023-12-05.
#' @noRd
#'
#' @param object `GRanges`.
#'
#' @return `GRanges`.
.addEnsemblFtpMcols <-
function(object, ignoreVersion) {
assert(
is(object, "GRanges"),
isFlag(ignoreVersion)
)
provider <- metadata(object)[["provider"]]
assert(isSubset(provider, c("Ensembl", "GENCODE")))
if (
isFALSE(ignoreVersion) &&
isSubset("geneIdNoVersion", colnames(mcols(object)))
) {
geneIdCol <- "geneIdNoVersion"
} else {
geneIdCol <- "geneId"
}
if (isSubset(
x = c(geneIdCol, "description", "geneSynonyms", "ncbiGeneId"),
y = colnames(mcols(object))
)) {
return(object)
}
organism <- metadata(object)[["organism"]]
genomeBuild <- metadata(object)[["genomeBuild"]]
if (is.null(genomeBuild)) {
genomeBuild <- currentEnsemblGenomeBuild(organism)
}
genomeBuild2 <- sub(
pattern = "\\.p[0-9]+$",
replacement = "",
x = genomeBuild
)
if (isSubset(genomeBuild2, "GRCh37")) {
return(object)
}
release <- metadata(object)[["release"]]
if (is.null(release)) {
release <- currentEnsemblVersion()
}
if (identical(provider, "GENCODE")) {
release <- mapGencodeToEnsembl(release)
}
alert("Downloading extra gene-level metadata from Ensembl.")
extraMcols <- .ensemblFtpGeneMetadata(
organism = organism,
genomeBuild = genomeBuild,
release = release
)
if (is.null(extraMcols)) {
return(object)
}
if (
isFALSE(ignoreVersion) &&
identical(geneIdCol, "geneIdNoVersion")
) {
colnames(extraMcols)[
colnames(extraMcols) == "geneId"
] <- "geneIdNoVersion"
}
if (isSubset("description", colnames(mcols(object)))) {
extraMcols[["description"]] <- NULL
}
if (isSubset("geneSynonyms", colnames(mcols(object)))) {
extraMcols[["geneSynonyms"]] <- NULL
}
if (isSubset("ncbiGeneId", colnames(mcols(object)))) {
extraMcols[["ncbiGeneId"]] <- NULL
}
mcols <- leftJoin(x = mcols(object), y = extraMcols, by = geneIdCol)
mcols(object) <- mcols
object
}
#' Get a data frame of extra gene-level metadata from Ensembl FTP server
#'
#' @note Updated 2023-10-12.
#' @noRd
#'
#' @return `DFrame`.
#'
#' @examples
#' df <- .ensemblFtpGeneMetadata(
#' organism = "Homo sapiens",
#' genomeBuild = "GRCh38",
#' release = 109L
#' )
#' df <- .ensemblFtpGeneMetadata(
#' organism = "Mus musculus",
#' genomeBuild = "GRCm39",
#' release = 109L
#' )
.ensemblFtpGeneMetadata <-
function(organism, genomeBuild, release) {
assert(
isOrganism(organism),
isString(genomeBuild),
isInt(release)
)
genomeBuild <- sub(
pattern = "\\.p[0-9]+$",
replacement = "",
x = genomeBuild
)
## Ensembl is prone to timeouts over FTP, so using HTTPS here instead.
ftpBaseUrl <- pasteUrl(
"ftp.ensembl.org",
"pub",
paste0("release-", release),
protocol = "https"
)
mysqlSubdir <- getUrlDirList(
url = pasteUrl(ftpBaseUrl, "mysql"),
pattern = snakeCase(paste(organism, "core", release))
)
assert(isString(mysqlSubdir))
url <- pasteUrl(ftpBaseUrl, "mysql", mysqlSubdir, "gene.txt.gz")
gene <- tryCatch(
expr = {
import(
con = .cacheIt(url),
format = "tsv",
colnames = FALSE,
quote = ""
)
},
error = function(e) {
alertWarning(sprintf("Failed to import {.url %s}.", url))
NULL
}
)
if (is.null(gene)) {
return(NULL)
}
switch(
EXPR = as.character(ncol(gene)),
"16" = {
geneColMap <- c(
"mysqlId" = 8L,
"geneId" = 13L,
"description" = 10L
)
},
"17" = {
## e.g. GRCh38 and GRCm38 release 87.
geneColMap <- c(
"mysqlId" = 8L,
"geneId" = 14L,
"description" = 11L
)
},
abort(sprintf("Unsupported file: {.file %s}.", url))
)
url <- pasteUrl(
ftpBaseUrl, "mysql", mysqlSubdir,
"external_synonym.txt.gz"
)
synonym <- tryCatch(
expr = {
import(
con = .cacheIt(url),
format = "tsv",
colnames = FALSE,
quote = ""
)
},
error = function(e) {
alertWarning(sprintf("Failed to import {.url %s}.", url))
NULL
}
)
if (is.null(synonym)) {
return(NULL)
}
url <- pasteUrl(
ftpBaseUrl, "tsv", snakeCase(organism),
paste(
gsub(
pattern = " ",
replacement = "_",
x = organism
),
genomeBuild,
release,
"entrez",
"tsv.gz",
sep = "."
)
)
entrez <- tryCatch(
expr = {
import(
con = .cacheIt(url),
format = "tsv",
colnames = TRUE,
quote = ""
)
},
error = function(e) {
alertWarning(sprintf("Failed to import {.url %s}.", url))
NULL
}
)
if (is.null(entrez)) {
return(NULL)
}
df1 <- gene
df1 <- df1[, geneColMap]
colnames(df1) <- names(geneColMap)
assert(
is.integer(df1[["mysqlId"]]),
is.character(df1[["geneId"]]),
is.character(df1[["description"]])
)
df1 <- df1[complete.cases(df1), , drop = FALSE]
df1 <- as(df1, "DFrame")
df2 <- synonym
df2 <- df2[complete.cases(df2), , drop = FALSE]
df2 <- unique(df2)
df2 <- split(x = df2, f = df2[[1L]])
df2 <- lapply(X = df2, FUN = `[[`, 2L)
df2 <- as.DataFrame(list(
"mysqlId" = as.integer(names(df2)),
"geneSynonyms" = unname(df2)
))
df3 <- entrez
df3 <- df3[, c("gene_stable_id", "xref"), drop = FALSE]
df3 <- df3[complete.cases(df3), , drop = FALSE]
keep <- grepl(pattern = "^[0-9]+$", x = df3[["xref"]])
df3 <- df3[keep, , drop = FALSE]
df3[["xref"]] <- as.integer(df3[["xref"]])
df3 <- unique(df3)
df3 <- split(x = df3, f = df3[[1L]])
df3 <- lapply(X = df3, FUN = `[[`, 2L)
df3 <- as.DataFrame(list(
"geneId" = names(df3),
"ncbiGeneId" = unname(df3)
))
out <- leftJoin(x = df1, y = df2, by = "mysqlId")
out <- leftJoin(x = out, y = df3, by = "geneId")
out[["mysqlId"]] <- NULL
out <- out[, sort(colnames(out))]
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.