#' Get Seqinfo
#'
#' @note Updated 2023-11-29.
#' @noRd
#'
#' @param x GFF file or `.getGffMetadata()` return list.
#'
#' @return `Seqinfo` or `NULL` (on failure).
#'
#' @examples
#' ## Ensembl ====
#' file <- pasteUrl(
#' "ftp.ensembl.org",
#' "pub",
#' "release-102",
#' "gtf",
#' "homo_sapiens",
#' "Homo_sapiens.GRCh38.102.gtf.gz",
#' protocol = "https"
#' )
#' seq <- .getSeqinfo(file)
#' print(seq)
#'
#' ## GENCODE ====
#' file <- pasteUrl(
#' "ftp.ebi.ac.uk",
#' "pub",
#' "databases",
#' "gencode",
#' "Gencode_human",
#' "release_36",
#' "gencode.v36.annotation.gtf.gz",
#' protocol = "https"
#' )
#' seq <- .getSeqinfo(file)
#' print(seq)
#'
#' ## RefSeq ====
#' file <- pasteUrl(
#' "ftp.ncbi.nlm.nih.gov",
#' "genomes",
#' "refseq",
#' "vertebrate_mammalian",
#' "Homo_sapiens",
#' "all_assembly_versions",
#' "GCF_000001405.38_GRCh38.p12",
#' "GCF_000001405.38_GRCh38.p12_genomic.gff.gz",
#' protocol = "https"
#' )
#' seq <- .getSeqinfo(file)
#' print(seq)
#'
#' ## UCSC ====
#' file <- pasteUrl(
#' "hgdownload.soe.ucsc.edu",
#' "goldenPath",
#' "hg38",
#' "bigZips",
#' "genes",
#' "hg38.knownGene.gtf.gz",
#' protocol = "ftp"
#' )
#' seq <- .getSeqinfo(file)
#' print(seq)
.getSeqinfo <- function(x) {
if (!is.list(x)) {
x <- .getGffMetadata(x)
}
if (
!isSubset(
x = c("genomeBuild", "organism", "provider"),
y = names(x)
) ||
anyNA(c(
x[["genomeBuild"]],
x[["organism"]],
x[["provider"]]
))
) {
return(NULL)
}
assert(
isString(x[["file"]]),
isString(x[["genomeBuild"]]),
isString(x[["organism"]]),
isString(x[["provider"]]),
isScalar(x[["release"]]) || is.null(x[["release"]]),
msg = "GFF file does not contain sufficient metadata."
)
seq <- switch(
EXPR = x[["provider"]],
"Ensembl" = {
.getEnsemblSeqinfo(
organism = x[["organism"]],
genomeBuild = x[["genomeBuild"]],
release = x[["release"]]
)
},
"GENCODE" = {
.getGencodeSeqinfo(
organism = x[["organism"]],
genomeBuild = x[["genomeBuild"]],
release = x[["release"]]
)
},
"RefSeq" = {
.getRefSeqSeqinfo(
file = ifelse(
test = isAUrl(x[["url"]]),
yes = x[["url"]],
no = x[["file"]]
)
)
},
"UCSC" = {
.getUcscSeqinfo(genomeBuild = x[["genomeBuild"]])
},
NULL
)
if (is(seq, "Seqinfo")) {
assert(
identical(
x = unique(unname(genome(seq))),
y = x[["genomeBuild"]]
),
msg = sprintf(
"Seqinfo does not contain expected genome build: '%s'.",
x[["genomeBuild"]]
)
)
seq <- seq[sort(seqnames(seq))]
}
seq
}
## NOTE We may want to make our own variant of `getChromInfoFromEnsembl` to
## avoid parsing issues currently seen with GenomeInfoDb.
#' Get Ensembl genome assembly seqinfo
#'
#' @note Updated 2023-12-06.
#' @noRd
#'
#' @seealso
#' Some legacy releases aren't supported:
#' - https://github.com/Bioconductor/GenomeInfoDb/issues/97
#' - https://github.com/Bioconductor/GenomeInfoDb/issues/98
.getEnsemblSeqinfo <- function(organism, genomeBuild, release) {
assert(
isString(organism),
isString(genomeBuild),
isInt(release)
)
args <- list("species" = organism, "release" = release, "as.Seqinfo" = TRUE)
if (grepl(pattern = "GRCh37", x = genomeBuild, fixed = TRUE)) {
## Specifying release 87 currently errors.
## https://github.com/Bioconductor/GenomeInfoDb/issues/97
args[["release"]] <- NA
args[["use.grch37"]] <- TRUE
}
seq <- tryCatch(
expr = {
do.call(what = getChromInfoFromEnsembl, args = args)
},
warning = function(w) {
message(w)
NULL
},
error = function(e) {
message(e)
NULL
}
)
if (is.null(seq)) {
alertWarning("Failed to download Seqinfo from Ensembl.")
return(seq)
}
assert(is(seq, "Seqinfo"))
if (grepl(pattern = "GRCh37.", x = genomeBuild, fixed = TRUE)) {
seqGenome <- genome(seq)
seqGenome <- sub(
pattern = "^GRCh37",
replacement = genomeBuild,
x = seqGenome
)
genome(seq) <- seqGenome
}
seq
}
#' Get GENCODE genome assembly seqinfo
#'
#' @note Updated 2023-11-22.
#' @noRd
.getGencodeSeqinfo <- function(organism, genomeBuild, release) {
seq <- .getEnsemblSeqinfo(
organism = organism,
genomeBuild = genomeBuild,
release = mapGencodeToEnsembl(release)
)
assert(is(seq, "Seqinfo"))
genome(seq) <- sub(
pattern = "\\.p[0-9]+$",
replacement = "",
x = genome(seq)
)
keep <- intersect(
x = seqnames(seq),
y = c(seq(from = 1L, to = 23L), "MT", "X", "Y")
)
assert(hasLength(keep))
seq <- seq[keep]
seqnames(seq)[seqnames(seq) == "MT"] <- "M"
seqnames(seq) <- paste0("chr", seqnames(seq))
seq
}
#' Get UCSC genome assembly seqinfo
#'
#' @note Updated 2023-01-30.
#' @noRd
.getUcscSeqinfo <- function(genomeBuild) {
tryCatch(
expr = {
quietly({
Seqinfo(genome = genomeBuild)
})
},
error = function(e) {
NULL
}
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.