#' Make GRanges from rtracklayer
#'
#' @note Updated 2023-10-12.
#' @noRd
#'
#' @details
#' The `"meta"` argument is generated by `makeGRangesFromGff`.
.makeGRangesFromRtracklayer <-
function(file,
level,
ignoreVersion,
extraMcols,
meta) {
assert(
isString(file),
isFlag(ignoreVersion),
isFlag(extraMcols),
is.list(meta),
isString(meta[["format"]]),
isString(meta[["provider"]])
)
level <- match.arg(
arg = level,
choices = c("genes", "transcripts")
)
meta[["level"]] <- level
gr <- import(con = .cacheIt(file))
assert(is(gr, "GRanges"))
format <- ifelse(
test = grepl(pattern = "GTF", x = meta[["format"]]),
yes = "GTF",
no = "GFF"
)
provider <- match.arg(
arg = meta[["provider"]],
choices = c(
"Ensembl",
"FlyBase",
"GENCODE",
"RefSeq",
"UCSC",
"WormBase"
)
)
funName <- paste0(
".",
camelCase(
object = tolower(paste("rtracklayer", provider, level, format)),
strict = TRUE
)
)
tryCatch(
expr = {
what <- .getFun(funName)
},
error = function(e) {
abort(sprintf("Unsupported GFF: {.file %s}.", basename(file)))
}
)
gr <- do.call(what = what, args = list("object" = gr))
metadata(gr) <- meta
seqinfo <- .getSeqinfo(meta)
if (is(seqinfo, "Seqinfo")) {
seqinfo(gr) <- seqinfo[seqlevels(gr)]
}
.makeGRanges(
object = gr,
ignoreVersion = ignoreVersion,
extraMcols = extraMcols
)
}
## Ensembl =====================================================================
## Updated 2022-05-04.
.rtracklayerEnsemblGenesGff <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c("Name", "biotype", "gene_id", "version"),
y = names(mcols(object))
),
areDisjointSets(
x = c("gene_biotype", "gene_id_version", "gene_name"),
y = names(mcols(object))
)
)
keep <- !is.na(mcols(object)[["gene_id"]])
assert(
any(keep),
msg = "Failed to extract any genes."
)
object <- object[keep]
assert(hasNoDuplicates(mcols(object)[["gene_id"]]))
names(mcols(object))[
names(mcols(object)) == "Name"
] <- "gene_name"
names(mcols(object))[
names(mcols(object)) == "biotype"
] <- "gene_biotype"
mcols(object)[["gene_id_version"]] <-
paste(
mcols(object)[["gene_id"]],
mcols(object)[["version"]],
sep = "."
)
mcols(object)[["Alias"]] <- NULL
mcols(object)[["ID"]] <- NULL
mcols(object)[["Parent"]] <- NULL
mcols(object)[["version"]] <- NULL
object
}
## Updated 2022-05-04.
.rtracklayerEnsemblGenesGtf <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c(
"gene_id",
"gene_version",
"type"
),
y = names(mcols(object))
),
areDisjointSets(
x = "gene_id_version",
y = names(mcols(object))
)
)
keep <- mcols(object)[["type"]] == "gene"
assert(
any(keep),
msg = "Failed to extract any genes."
)
object <- object[keep]
assert(hasNoDuplicates(mcols(object)[["gene_id"]]))
mcols(object)[["gene_id_version"]] <-
paste(
mcols(object)[["gene_id"]],
mcols(object)[["gene_version"]],
sep = "."
)
mcols(object)[["gene_version"]] <- NULL
object
}
## Updated 2022-05-04.
.rtracklayerEnsemblTranscriptsGff <-
function(object) {
genes <- .rtracklayerEnsemblGenesGff(object)
mcols(genes) <- removeNa(mcols(genes))
assert(
is(object, "GRanges"),
isSubset(
x = c("Name", "Parent", "biotype"),
y = names(mcols(object))
),
areDisjointSets(
x = c("transcript_biotype", "transcript_name"),
y = names(mcols(object))
)
)
keep <- !is.na(mcols(object)[["transcript_id"]])
assert(
any(keep),
msg = "Failed to extract any transcripts."
)
object <- object[keep]
assert(
hasNoDuplicates(mcols(object)[["transcript_id"]]),
allAreMatchingRegex(
x = as.character(mcols(object)[["Parent"]]),
pattern = "^gene:"
)
)
mcols(object) <- removeNa(mcols(object))
names(mcols(object))[
names(mcols(object)) == "biotype"
] <- "transcript_biotype"
names(mcols(object))[
names(mcols(object)) == "Name"
] <- "transcript_name"
mcols(object)[["gene_id"]] <- gsub(
pattern = "^gene:",
replacement = "",
x = as.character(mcols(object)[["Parent"]])
)
mcols(object)[["transcript_id_version"]] <-
paste(
mcols(object)[["transcript_id"]],
mcols(object)[["version"]],
sep = "."
)
mcols(object)[["Alias"]] <- NULL
mcols(object)[["ID"]] <- NULL
mcols(object)[["Parent"]] <- NULL
mcols(object)[["version"]] <- NULL
geneCols <- c(
"gene_id",
setdiff(
x = names(mcols(genes)),
y = names(mcols(object))
)
)
mcols(object) <- leftJoin(
x = mcols(object),
y = mcols(genes)[geneCols],
by = "gene_id"
)
object
}
## Updated 2022-05-04.
.rtracklayerEnsemblTranscriptsGtf <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c(
"gene_id",
"gene_version",
"transcript_id",
"transcript_version",
"type"
),
y = names(mcols(object))
),
areDisjointSets(
x = c(
"gene_id_version",
"transcript_id_version"
),
y = names(mcols(object))
)
)
keep <- mcols(object)[["type"]] == "transcript"
assert(
any(keep),
msg = "Failed to extract any transcripts."
)
object <- object[keep]
mcols(object)[["gene_id_version"]] <-
paste(
mcols(object)[["gene_id"]],
mcols(object)[["gene_version"]],
sep = "."
)
mcols(object)[["transcript_id_version"]] <-
paste(
mcols(object)[["transcript_id"]],
mcols(object)[["transcript_version"]],
sep = "."
)
mcols(object)[["gene_version"]] <- NULL
mcols(object)[["transcript_version"]] <- NULL
object
}
## FlyBase =====================================================================
## Updated 2021-01-27.
.rtracklayerFlybaseGenesGtf <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c("gene_id", "type"),
y = names(mcols(object))
)
)
keep <- mcols(object)[["type"]] == "gene"
assert(
any(keep),
msg = "Failed to extract any genes."
)
object <- object[keep]
assert(hasNoDuplicates(mcols(object)[["gene_id"]]))
object
}
## Updated 2021-01-27.
.rtracklayerFlybaseTranscriptsGtf <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c("transcript_id", "type"),
y = names(mcols(object))
)
)
keep <- grepl(
pattern = paste(c("^pseudogene$", "RNA$"), collapse = "|"),
x = mcols(object)[["type"]],
ignore.case = TRUE
)
assert(
any(keep),
msg = "Failed to extract any transcripts."
)
object <- object[keep]
assert(hasNoDuplicates(mcols(object)[["transcript_id"]]))
object
}
## GENCODE =====================================================================
## Uses `gene_type` instead of `gene_biotype`.
## Note that `gene_id` and `gene_name` are nicely defined, so don't use `Name`.
## Consider removing gene and transcript versions automatically.
## Updated 2023-11-29.
.rtracklayerGencodeGenesGff <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c("ID", "gene_id", "type"),
y = names(mcols(object))
),
areDisjointSets(
x = c(
"gene_id_no_version",
"gene_version"
),
y = names(mcols(object))
)
)
keep <- mcols(object)[["type"]] == "gene"
assert(
any(keep),
msg = "Failed to extract any genes."
)
object <- object[keep]
mcols(object)[["gene_id_version"]] <- mcols(object)[["ID"]]
assert(hasNoDuplicates(mcols(object)[["gene_id_version"]]))
mcols(object)[["gene_id"]] <-
stripGeneVersions(mcols(object)[["gene_id_version"]])
mcols(object)[["ID"]] <- NULL
mcols(object)[["Parent"]] <- NULL
mcols(object)[["ont"]] <- NULL
object
}
## Updated 2022-05-04.
.rtracklayerGencodeGenesGtf <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c("gene_id", "type"),
y = names(mcols(object))
),
areDisjointSets(
x = c("gene_id_version", "gene_version"),
y = names(mcols(object))
)
)
keep <- mcols(object)[["type"]] == "gene"
assert(
any(keep),
msg = "Failed to extract any genes."
)
object <- object[keep]
mcols(object)[["gene_id_version"]] <- mcols(object)[["gene_id"]]
assert(hasNoDuplicates(mcols(object)[["gene_id_version"]]))
mcols(object)[["gene_id"]] <-
stripGeneVersions(mcols(object)[["gene_id_version"]])
mcols(object)[["ont"]] <- NULL
object
}
## Updated 2023-11-29.
.rtracklayerGencodeTranscriptsGff <-
function(object) {
assert(
isSubset(
x = c(
"ID",
"gene_id",
"transcript_id",
"type"
),
y = names(mcols(object))
),
areDisjointSets(
x = c(
"transcript_id_no_version",
"transcript_version"
),
y = names(mcols(object))
)
)
keep <- mcols(object)[["type"]] == "transcript"
assert(
any(keep),
msg = "Failed to extract any transcripts."
)
object <- object[keep]
mcols(object)[["transcript_id_version"]] <- mcols(object)[["ID"]]
assert(hasNoDuplicates(mcols(object)[["transcript_id_version"]]))
mcols(object)[["transcript_id"]] <-
stripTranscriptVersions(mcols(object)[["transcript_id_version"]])
mcols(object)[["gene_id_version"]] <-
as.character(mcols(object)[["Parent"]])
mcols(object)[["gene_id"]] <-
stripGeneVersions(mcols(object)[["gene_id_version"]])
mcols(object)[["ID"]] <- NULL
mcols(object)[["Parent"]] <- NULL
mcols(object)[["ont"]] <- NULL
object
}
## GENCODE GRCh37 contains duplicate unversioned transcript identifiers.
##
## Updated 2023-11-29.
.rtracklayerGencodeTranscriptsGtf <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c(
"gene_id",
"transcript_id",
"type"
),
y = names(mcols(object))
),
areDisjointSets(
x = c(
"gene_id_version",
"gene_version",
"transcript_id_version",
"transcript_version"
),
y = names(mcols(object))
)
)
keep <- mcols(object)[["type"]] == "transcript"
assert(
any(keep),
msg = "Failed to extract any transcripts."
)
object <- object[keep]
mcols(object)[["transcript_id_version"]] <-
mcols(object)[["transcript_id"]]
assert(hasNoDuplicates(mcols(object)[["transcript_id_version"]]))
mcols(object)[["transcript_id"]] <-
stripTranscriptVersions(mcols(object)[["transcript_id_version"]])
mcols(object)[["gene_id_version"]] <- mcols(object)[["gene_id"]]
mcols(object)[["gene_id"]] <-
stripGeneVersions(mcols(object)[["gene_id_version"]])
mcols(object)[["ont"]] <- NULL
object
}
## RefSeq ======================================================================
#' Extract NCBI gene identifiers from RefSeq GFF/GTF file.
#'
#' @section GCF_000001405.40_GRCh38.p14 genes with multiple NCBI identifiers:
#'
#' - TRNAA-AGC: 124901561, 124901562, 124901563, 124901564, 124901565
#' - TRNAE-UUC: 107987368, 124905580, 124905583, 124905584, 124905586
#' - TRNAG-CCC: 124905578, 124905581, 124905588
#' - TRNAN-GUU: 124905579, 124905582, 124905585, 124905587
#' - TRNAV-CAC: 107985614, 107985615
#'
#' Note that these are RefSeq status: MODEL.
#'
#' @note Updated 2023-03-01.
#' @noRd
.getNcbiGeneIdsFromRefSeqGff <- function(object) {
mcols <- mcols(object)[, c("ID", "Dbxref")]
geneIds <- mcols[["ID"]]
dbxref <- mcols[["Dbxref"]]
lgl <- grepl(pattern = "GeneID:", x = dbxref, fixed = TRUE)
assert(
is(dbxref, "CharacterList"),
is(lgl, "LogicalList")
)
ncbiGeneIds <- unlist(dbxref[lgl], recursive = FALSE)
assert(identical(length(ncbiGeneIds), length(geneIds)))
ncbiGeneIds <- do.call(
what = rbind,
args = strsplit(x = ncbiGeneIds, split = ":", fixed = TRUE)
)[, 2L]
ncbiGeneIds <- as.integer(ncbiGeneIds)
df <- DataFrame("ID" = geneIds, "ncbi_gene_id" = ncbiGeneIds)
df <- unique(df[complete.cases(df), ])
assert(hasNoDuplicates(df[["ID"]]))
df
}
#' Extract NCBI gene identifiers from RefSeq GTF file.
#'
#' @note Updated 2023-03-01.
#' @noRd
.getNcbiGeneIdsFromRefSeqGtf <- function(object) {
mcols <- mcols(object)[, c("gene_id", "db_xref")]
keep <- grepl(pattern = "GeneID:", x = mcols[["db_xref"]], fixed = TRUE)
mcols <- unique(mcols[keep, ])
assert(hasNoDuplicates(mcols[["gene_id"]]))
geneIds <- mcols[["gene_id"]]
ncbiGeneIds <- strMatch(
x = mcols[["db_xref"]],
pattern = "GeneID:([[:digit:]]+)"
)[, 2L]
ncbiGeneIds <- as.integer(ncbiGeneIds)
df <- DataFrame("gene_id" = geneIds, "ncbi_gene_id" = ncbiGeneIds)
df <- df[complete.cases(df), ]
df
}
## Updated 2022-05-04.
.rtracklayerRefseqGenesGff <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c("ID", "Parent", "description", "gene"),
y = names(mcols(object))
),
areDisjointSets(
x = "gene_id",
y = names(mcols(object))
)
)
keep <- !is.na(mcols(object)[["gbkey"]]) &
mcols(object)[["gbkey"]] == "Gene"
assert(
any(keep),
msg = "Failed to extract any genes."
)
object <- object[keep]
ncbiGeneIds <- .getNcbiGeneIdsFromRefSeqGff(object)
mcols(object) <- leftJoin(x = mcols(object), y = ncbiGeneIds, by = "ID")
names(mcols(object))[names(mcols(object)) == "ID"] <- "parent_gene_id"
assert(hasNoDuplicates(mcols(object)[["parent_gene_id"]]))
names(mcols(object))[names(mcols(object)) == "gene"] <- "gene_id"
assert(all(grepl(
pattern = "^gene-",
x = mcols(object)[["parent_gene_id"]]
)))
mcols(object)[["parent_gene_id"]] <- gsub(
pattern = "^gene-",
replacement = "",
x = mcols(object)[["parent_gene_id"]]
)
mcols(object)[["Name"]] <- NULL
mcols(object)[["Note"]] <- NULL
mcols(object)[["Parent"]] <- NULL
mcols(object)[["end_range"]] <- NULL
mcols(object)[["experiment"]] <- NULL
mcols(object)[["gbkey"]] <- NULL
mcols(object)[["start_range"]] <- NULL
mcols(object)[["transl_except"]] <- NULL
object
}
## Updated 2022-05-04.
.rtracklayerRefseqGenesGtf <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c("gene", "gene_id", "type"),
y = names(mcols(object))
)
)
## Need to run this before gene filtering, because only exons in GTF
## file contain the NCBI gene identifiers.
ncbiGeneIds <- .getNcbiGeneIdsFromRefSeqGtf(object)
keep <- mcols(object)[["type"]] == "gene"
assert(
any(keep),
msg = "Failed to extract any genes."
)
object <- object[keep]
mcols(object) <- leftJoin(
x = mcols(object),
y = ncbiGeneIds,
by = "gene_id"
)
names(mcols(object))[
names(mcols(object)) == "gene_id"
] <- "parent_gene_id"
names(mcols(object))[
names(mcols(object)) == "gene"
] <- "gene_id"
assert(hasNoDuplicates(mcols(object)[["parent_gene_id"]]))
mcols(object)[["gbkey"]] <- NULL
mcols(object)[["note"]] <- NULL
object
}
## NOTE Consider including "ensemblTxId" from "dbxref" here.
## Updated 2023-03-01.
.rtracklayerRefseqTranscriptsGff <-
function(object) {
genes <- .rtracklayerRefseqGenesGff(object)
genesMcols <- mcols(genes)[
,
c(
"description",
"gene_biotype",
"ncbi_gene_id",
"parent_gene_id"
),
drop = FALSE
]
assert(
hasNoDuplicates(genesMcols[["parent_gene_id"]]),
is(object, "GRanges"),
isSubset(
x = c("Parent", "gene", "transcript_id"),
y = names(mcols(object))
),
areDisjointSets(
x = "gene_id",
y = names(mcols(object))
)
)
keep <- !is.na(mcols(object)[["transcript_id"]])
assert(
any(keep),
msg = "Failed to extract any transcripts."
)
object <- object[keep]
## e.g. "NM_000014.6".
keep <- grepl(
pattern = "^[A-Z]{2}_[0-9]+\\.[0-9]+$",
x = mcols(object)[["transcript_id"]]
)
object <- object[keep]
## Only keep transcript annotations that map to a parent gene.
## This is somewhat slow and may be optimizable.
keep <- bapply(
X = mcols(object)[["Parent"]],
FUN = function(x) {
any(grepl(pattern = "^gene-", x = x))
}
)
assert(
any(keep),
msg = "Failed to match transcripts against parent genes."
)
object <- object[keep]
## Ensure that matching transcripts contain a unique gene parent.
assert(
all(bapply(
X = mcols(object)[["Parent"]],
FUN = isScalar
)),
msg = "Elements do not contain a unique gene parent."
)
mcols(object)[["parent_gene_id"]] <- vapply(
X = mcols(object)[["Parent"]],
FUN = function(x) {
sub(pattern = "^gene-", replacement = "", x = x[[1L]])
},
FUN.VALUE = character(1L),
USE.NAMES = FALSE
)
names(mcols(object))[names(mcols(object)) == "gene"] <- "gene_id"
cols <- c(
setdiff(
x = colnames(mcols(object)),
y = colnames(genesMcols)
),
"parent_gene_id"
)
mcols(object) <- mcols(object)[, cols]
mcols(object) <- leftJoin(
x = mcols(object),
y = genesMcols,
by = "parent_gene_id"
)
mcols(object)[["ID"]] <- NULL
mcols(object)[["Name"]] <- NULL
mcols(object)[["Note"]] <- NULL
mcols(object)[["Parent"]] <- NULL
mcols(object)[["end_range"]] <- NULL
mcols(object)[["experiment"]] <- NULL
mcols(object)[["gbkey"]] <- NULL
mcols(object)[["start_range"]] <- NULL
mcols(object)[["transl_except"]] <- NULL
object
}
## NOTE Consider including "ensemblTxId" from "dbxref" here.
## Updated 2023-03-01.
.rtracklayerRefseqTranscriptsGtf <-
function(object) {
genes <- .rtracklayerRefseqGenesGtf(object)
genesMcols <- mcols(genes)[
,
c(
"description",
"gene_biotype",
"ncbi_gene_id",
"parent_gene_id"
),
drop = FALSE
]
assert(
hasNoDuplicates(genesMcols[["parent_gene_id"]]),
is(object, "GRanges"),
isSubset(
x = c(
"transcript_id",
"type"
),
y = names(mcols(object))
)
)
keep <- mcols(object)[["type"]] == "transcript"
assert(
any(keep),
msg = "Failed to extract any transcripts."
)
object <- object[keep]
## e.g. "NM_000014.6".
keep <- grepl(
pattern = "^[A-Z]{2}_[0-9]+\\.[0-9]+$",
x = mcols(object)[["transcript_id"]]
)
object <- object[keep]
assert(hasNoDuplicates(mcols(object)[["transcript_id"]]))
names(mcols(object))[
names(mcols(object)) == "gene_id"
] <- "parent_gene_id"
names(mcols(object))[
names(mcols(object)) == "gene"
] <- "gene_id"
cols <- c(
setdiff(
x = colnames(mcols(object)),
y = colnames(genesMcols)
),
"parent_gene_id"
)
mcols(object) <- mcols(object)[, cols]
mcols(object) <- leftJoin(
x = mcols(object),
y = genesMcols,
by = "parent_gene_id"
)
mcols(object)[["experiment"]] <- NULL
mcols(object)[["gbkey"]] <- NULL
mcols(object)[["note"]] <- NULL
object
}
## WormBase ====================================================================
## Updated 2021-08-05.
.rtracklayerWormbaseGenesGtf <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c("gene_id", "type"),
y = names(mcols(object))
)
)
keep <- mcols(object)[["type"]] == "gene"
assert(
any(keep),
msg = "Failed to extract any transcripts."
)
object <- object[keep]
assert(
hasNoDuplicates(mcols(object)[["gene_id"]]),
allAreMatchingRegex(
x = mcols(object)[["gene_id"]],
pattern = "^WBGene[[:digit:]]{8}$"
)
)
object
}
## Updated 2021-08-05.
.rtracklayerWormbaseTranscriptsGtf <-
function(object) {
assert(
is(object, "GRanges"),
isSubset(
x = c("gene_id", "transcript_id", "type"),
y = names(mcols(object))
)
)
## Keep track of gene-level metadata, that we'll join below.
genes <- .rtracklayerWormbaseGenesGtf(object)
geneCols <- grep(
pattern = "^gene_",
x = colnames(mcols(genes)),
value = TRUE
)
geneMcols <- mcols(genes, use.names = FALSE)[, geneCols]
keep <- mcols(object)[["type"]] == "transcript"
assert(
any(keep),
msg = "Failed to extract any transcripts."
)
object <- object[keep]
assert(
hasNoDuplicates(mcols(object)[["transcript_id"]]),
allAreMatchingRegex(
x = mcols(object)[["gene_id"]],
pattern = "^WBGene[[:digit:]]{8}$"
)
)
mcols <- mcols(object)
cols <- c(
setdiff(
x = colnames(mcols),
y = colnames(geneMcols)
),
"gene_id"
)
mcols <- mcols[, cols]
mcols <- leftJoin(x = mcols, y = geneMcols, by = "gene_id")
mcols(object) <- mcols
object
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.