### =========================================================================
### SplicingGraphs objects
### -------------------------------------------------------------------------
### With the 2-class design we choose below, the SplicingGraphs class does
### NOT extend the CompressedList class, but has the slot ('genes') that
### extends the CompressedList class. This avoids having the SplicingGraphs
### class inherit a very rich API (Vector + List), which would be a problem
### because many operations (like c() or relist()) would be broken, unless
### we re-implement them for SplicingGraphs objects. But: (a) that's a lot
### of work (the API is huge), and (b) we don't need those operations in the
### first place. All we need are: length(), names(), [, [[, elementNROWS(),
### and unlist().
### The GeneModel class is an internal class that is not intended to be
### exposed to the user. It's a list-like class where the elements are
### GRangesList objects, each of them representing a gene. It's currently
### defined exactly how the GRangesListList class would be if we decided to
### have one in the GenomicRanges package.
setClass("GeneModel",
contains="CompressedList",
representation(
unlistData="GRangesList",
elementMetadata="DataFrame"
),
prototype(
elementType="GRangesList",
elementMetadata=new("DFrame")
)
)
setClass("SplicingGraphs",
representation(
genes="GeneModel",
in_by_tx="GRangesList",
.bubbles_cache="environment"
)
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### GeneModel validity.
###
.valid.GeneModel.names <- function(x)
{
x_names <- names(x)
if (is.null(x_names)) {
if (length(x) == 1L)
return(NULL)
return("'x' must have names")
}
if (anyDuplicated(x_names))
return("'x' has duplicated names")
NULL
}
.valid.GeneModel.ex_by_tx <- function(x)
{
ex_by_tx <- x@unlistData
if (!is.null(names(ex_by_tx)))
return("'x@unlistData' must be unnamed")
ex_mcols <- mcols(ex_by_tx@unlistData)
valid_exon_mcolnames(colnames(ex_mcols))
}
.valid.GeneModel <- function(x)
{
c(.valid.GeneModel.names(x),
.valid.GeneModel.ex_by_tx(x))
}
setValidity2("GeneModel", .valid.GeneModel)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### GeneModel API.
###
### From the CompressedList API:
### GeneModel objects inherit the CompressedList API i.e. anything that works
### on a CompressedList object works on a GeneModel object. But the only
### things we're using/supporting in the SplicingGraphs package are: length(),
### names(), [, [[, elementNROWS(), and unlist().
### From the GRanges API:
### We only need seqnames(), strand(), and seqinfo() from the GRanges API.
setMethod("seqnames", "GeneModel",
function(x)
{
x_unlisted <- unlist(x)
unlisted_seqnames <- commonSeqnames.GRangesList(x_unlisted)
relisted_seqnames <- relist(unlisted_seqnames, x)
ans <- commonSeqnames.RleList(relisted_seqnames)
names(ans) <- names(x)
ans
}
)
setMethod("strand", "GeneModel",
function(x)
{
x_unlisted <- unlist(x)
unlisted_strand <- commonStrand.GRangesList(x_unlisted)
relisted_strand <- relist(unlisted_strand, x)
ans <- commonStrand.RleList(relisted_strand)
names(ans) <- names(x)
ans
}
)
setMethod("seqinfo", "GeneModel",
function(x) seqinfo(unlist(x, use.names=FALSE))
)
### The only GeneModel setter supported at the moment.
setReplaceMethod("isCircular", "GeneModel",
function(x, value)
{
isCircular(x@unlistData) <- value
x
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### GeneModel() constructor.
###
### Used only in this file.
###
### 'ex_by_tx' must be a *named* GRangesList object containing exons grouped
### by transcript. The names must be the gene ids, NOT the transcript ids or
### transcript names.
GeneModel <- function(ex_by_tx)
{
ex_by_tx_names <- names(ex_by_tx)
if (is.null(ex_by_tx_names)) {
ans_partitioning <- PartitioningByEnd(length(ex_by_tx))
} else {
names(ex_by_tx) <- NULL
ans_end <- end(Rle(ex_by_tx_names))
ans_names <- ex_by_tx_names[ans_end]
ans_partitioning <- PartitioningByEnd(ans_end, names=ans_names)
}
IRanges:::newCompressedList0(getClass("GeneModel"),
ex_by_tx, ans_partitioning)
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### SplicingGraphs validity.
###
### A SplicingGraphs object 'x' should be validated with:
###
### validObject(x, complete=TRUE)
###
.valid.SplicingGraphs.in_by_tx <- function(x)
{
x_in_by_tx <- x@in_by_tx
x_in_by_tx_names <- names(x_in_by_tx)
if (is.null(x_in_by_tx_names))
return("'x@in_by_tx' must have names")
x_ex_by_tx <- unlist(x@genes)
if (length(x_in_by_tx) != length(x_ex_by_tx))
return("'x@in_by_tx' must have the same length as 'unlist(x@genes)'")
if (!identical(x_in_by_tx_names, names(x_ex_by_tx)))
return("'x@in_by_tx' must have the same names as 'unlist(x@genes)'")
if (!identical(elementNROWS(x_in_by_tx) + 1L,
elementNROWS(x_ex_by_tx))) {
msg <- c("the shape of 'x@in_by_tx' is not compatible ",
"with the shape of 'unlist(x@genes)'")
return(paste0(msg, collapse=""))
}
NULL
}
.valid.SplicingGraphs <- function(x)
{
c(.valid.SplicingGraphs.in_by_tx(x))
}
setValidity2("SplicingGraphs", .valid.SplicingGraphs)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### updateObject()
###
setMethod("updateObject", "SplicingGraphs",
function(object, ..., verbose=FALSE)
{
object@genes <- updateObject(object@genes, ..., verbose=verbose)
object@in_by_tx <- updateObject(object@in_by_tx, ..., verbose=verbose)
for (key in ls(object@.bubbles_cache, all.names=TRUE)) {
val <- object@.bubbles_cache[[key]]
object@.bubbles_cache[[key]] <- updateObject(val, ...,
verbose=verbose)
}
object
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Restricted SplicingGraphs API.
###
### Here we only implement:
### - a few core methods from the List API,
### - seqnames(), strand(), seqinfo() from the GRanges API.
###
setMethod("length", "SplicingGraphs", function(x) length(x@genes))
setMethod("names", "SplicingGraphs", function(x) names(x@genes))
setMethod("[", "SplicingGraphs",
function(x, i, j, ..., drop=TRUE)
{
if (!missing(j) || length(list(...)) > 0L)
stop("invalid subsetting")
if (missing(i))
return(x)
if (is.null(names(x)))
stop("subsetting a SplicingGraphs with no names is not supported")
x_genes <- x@genes
seqx <- seq_along(x_genes)
names(seqx) <- names(x_genes)
i <- seqx[i]
ii <- IRanges:::unlist_as_integer(IRanges(PartitioningByEnd(x_genes))[i])
x@genes <- x_genes[i]
x@in_by_tx <- x@in_by_tx[ii]
x
}
)
setMethod("[[", "SplicingGraphs",
function (x, i, j, ...)
{
if (!missing(j) || length(list(...)) > 0L)
stop("invalid subsetting")
x@genes[[i]]
}
)
setMethod("elementNROWS", "SplicingGraphs",
function(x) elementNROWS(x@genes)
)
setMethod("unlist", "SplicingGraphs",
function(x, recursive=TRUE, use.names=TRUE)
{
unlist(x@genes, recursive=recursive, use.names=use.names)
}
)
setMethod("seqnames", "SplicingGraphs", function(x) seqnames(x@genes))
setMethod("strand", "SplicingGraphs", function(x) strand(x@genes))
setMethod("seqinfo", "SplicingGraphs", function(x) seqinfo(x@genes))
### isCircular<- and txweight<- are the only SplicingGraphs setters supported
### at the moment.
setReplaceMethod("isCircular", "SplicingGraphs",
function(x, value)
{
isCircular(x@genes) <- isCircular(x@in_by_tx) <- value
x
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### "show" method.
###
setMethod("show", "SplicingGraphs",
function(object)
{
ngene <- length(object)
ntx <- length(unlist(object, use.names=FALSE))
cat(class(object), " object with ", ngene, " gene(s) ",
"and ", ntx, " transcript(s)\n", sep="")
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### SplicingGraphs() constructor
###
### 'exons_start' and 'exons_end' must be 2 integer vectors of the same length
### N representing the start and end positions of N exons that belong to the
### same gene. Returns 2 integer vectors of length N containing the splicing
### site ids assigned to the start and end positions, respectively.
.make_SSids <- function(exons_start, exons_end, on.minus.strand=FALSE)
{
if (!is.numeric(exons_start))
stop("'exons_start' must be an integer vector")
if (!is.integer(exons_start))
exons_start <- as.integer(exons_start)
if (!is.numeric(exons_end))
stop("'exons_end' must be an integer vector")
if (!is.integer(exons_end))
exons_end <- as.integer(exons_end)
N <- length(exons_start)
if (length(exons_end) != N)
stop("'exons_start' and 'exons_end' must have the same length")
## splicing sites
ssites <- data.frame(pos=c(exons_start, exons_end),
type=rep.int(1:2, c(N, N)))
ssites_sm <- S4Vectors:::selfmatchIntegerPairs(ssites$pos, ssites$type)
## unique splicing sites
ussites <- ssites[ssites_sm == seq_along(ssites_sm), , drop=FALSE]
oo <- S4Vectors:::orderIntegerPairs(ussites$pos, ussites$type,
decreasing=on.minus.strand)
ussites <- ussites[oo, , drop=FALSE]
SSid <- S4Vectors:::matchIntegerPairs(ssites$pos, ssites$type,
ussites$pos, ussites$type)
exons_start_SSid <- head(SSid, n=N)
exons_end_SSid <- tail(SSid, n=N)
list(start_SSid=exons_start_SSid, end_SSid=exons_end_SSid)
}
### 'gene' must be a GRangesList object containing the exons of a *single*
### gene grouped by transcript. More precisely, each top-level element
### in 'gene' contains the genomic ranges of the exons for a particular
### transcript of the gene.
### Should be able to deal with a GRangesList object of length 0 (i.e. a
### gene with no transcripts).
.set_splicing_graph_info <- function(gene, check.introns=TRUE)
{
if (!is(gene, "GRangesList"))
stop("'gene' must be a GRangesList object")
if (!isTRUEorFALSE(check.introns))
stop("'check.introns' must be TRUE or FALSE")
exons <- gene@unlistData
common_strand <- commonStrand.GRanges(exons, what="exons in the gene")
on.minus.strand <- common_strand == "-"
if (check.introns && length(exons) != 0L) {
## We check that, within each transcript, exons are ordered from
## 5' to 3' with gaps of at least 1 nucleotide between them.
ranges_by_tx <- ranges(gene)
if (on.minus.strand)
ranges_by_tx <- revElements(ranges_by_tx)
if (!all(isNormal(ranges_by_tx)))
stop("some transcripts in the gene don't have their exons ",
"ordered from 5' to 3' with gaps of at least 1 ",
"nucleotide between them (introns)")
}
## Set splicing site ids.
SSids <- .make_SSids(start(exons), end(exons),
on.minus.strand=on.minus.strand)
exons_mcols <- mcols(exons)
if (any(names(SSids) %in% colnames(exons_mcols))) {
in_1_string <- paste(names(SSids), collapse=", ")
stop("'unlist(gene)' already has metadata columns: ", in_1_string)
}
mcols(exons) <- cbind(mcols(exons), DataFrame(SSids))
gene@unlistData <- exons
gene_mcols <- mcols(gene)
## Set "tx_id" metadata col.
if ("tx_id" %in% colnames(gene_mcols))
stop("'gene' already has metadata column tx_id")
tx_id <- names(gene)
if (!is.null(tx_id))
gene_mcols$tx_id <- tx_id
## Set "txpath" metadata col.
if ("txpath" %in% colnames(gene_mcols))
stop("'gene' already has metadata column txpath")
if (on.minus.strand) {
txpath <- rbind(SSids$end_SSid, SSids$start_SSid)
} else {
txpath <- rbind(SSids$start_SSid, SSids$end_SSid)
}
txpath_partitioning_end <- end(PartitioningByEnd(gene)) * 2L
txpath_partitioning <- PartitioningByEnd(txpath_partitioning_end)
names(txpath_partitioning) <- tx_id
txpath <- relist(as.vector(txpath), txpath_partitioning)
gene_mcols$txpath <- txpath
mcols(gene) <- gene_mcols
gene
}
### 'x' must be a GRangesList object containing the exons of one or more
### genes grouped by transcript. More precisely, each top-level element
### in 'x' contains the genomic ranges of the exons for a particular
### transcript. Typically 'x' will be obtained from a TxDb object
### 'txdb' with 'exonsBy(txdb, by="tx")'.
### 'grouping' is an optional object that represents the grouping by gene of
### the top-level elements (i.e. transcripts) in 'x'. It can be either:
### (a) Missing (i.e. NULL). In that case, all the transcripts in 'x'
### are considered to belong to the same gene and the SplicingGraphs
### object returned by SplicingGraphs() will be unnamed.
### (b) A list of integer or character vectors, or an IntegerList, or a
### CharacterList object, of length the number of genes to process,
### and where 'grouping[[i]]' is a vector of valid subscripts in 'x'
### pointing to all the transcripts of the i-th gene.
### (c) A factor, character vector, or integer vector, of length 'x'
### with 1 level per gene.
### (d) A named GRangesList object containing transcripts grouped by gene
### i.e. each top-level element in 'grouping' contains the genomic ranges
### of the transcripts for a particular gene. In that case, the grouping
### is inferred from the tx_id (or alternatively tx_name) metadata
### column of 'unlist(grouping)' and all the values in that column must
### be in 'names(x)'.
### If 'x' was obtained with 'exonsBy(txdb, by="tx")', then the
### GRangesList object used for grouping would typically be obtained with
### 'transcriptsBy(txdb, by="gene")'.
### (e) A data.frame or DataFrame with 2 character vector columns: a
### gene_id column (factor, character vector, or integer vector),
### and a tx_id (or alternatively tx_name) column. In that case, 'x'
### must be named and all the values in the tx_id (or tx_name) column
### must be in 'names(x)'.
.checkOrMakeUniqueGroupingNames <- function(grouping)
{
grouping_names <- names(grouping)
if (is.null(grouping_names)) {
if (length(grouping) != 0L)
warning("'grouping' is unnamed. Setting names on it (with ",
"'names(grouping) <- seq_along(grouping)') as artificial ",
"gene ids.")
names(grouping) <- seq_along(grouping)
return(grouping)
}
if (anyDuplicated(grouping_names))
stop("'grouping' has duplicated names")
if (any(c(NA, "") %in% grouping_names))
stop("'grouping' names contains NAs or \"\" (empty string)")
grouping
}
### Returns a list or IntegerList or CharacterList. Always *named* with the
### gene ids.
.normarg_grouping <- function(grouping, x)
{
## (b)
if (is.list(grouping) || is(grouping, "IntegerList")
|| is(grouping, "CharacterList")) {
return(.checkOrMakeUniqueGroupingNames(grouping))
}
## (c)
if (is.factor(grouping) || is.character(grouping) || is.integer(grouping)) {
if (length(grouping) != length(x))
stop("when 'grouping' is a factor, character vector, or integer ",
"vector, it must have the same length as 'x'")
return(split(seq_along(x), grouping))
}
x_names <- names(x)
## (d)
if (is(grouping, "GRangesList")) {
if (is.null(x_names))
stop("when 'grouping' is a GRangesList, 'x' must be named ",
"with transcript ids (tx_id) or transcript names (tx_name)")
grouping <- .checkOrMakeUniqueGroupingNames(grouping)
mcols <- mcols(grouping@unlistData)
mcolnames <- colnames(mcols)
for (colname in c("tx_id", "tx_name")) {
idx <- which(mcolnames == colname)
if (length(idx) == 0L)
next
if (length(idx) >= 2L)
stop("'unlist(grouping)' has more than 1 ",
colname, " metadata column")
m <- match(mcols[[idx]], x_names)
if (any(is.na(m)))
next
return(relist(m, PartitioningByEnd(grouping)))
}
stop("'unlist(grouping)' has no tx_id or tx_name column, ",
"or they contain values that are not in 'names(x)'")
}
## (e)
if (is.data.frame(grouping) || is(grouping, "DataFrame")) {
if (is.null(x_names))
stop("when 'grouping' is a data.frame or a DataFrame, 'x' ",
"must be named with transcript ids (tx_id) or transcript ",
"names (tx_name)")
grouping_colnames <- colnames(grouping)
idx <- which(grouping_colnames == "gene_id")
if (length(idx) != 1L)
stop("when 'grouping' is a data.frame or a DataFrame, it must ",
"have exactly 1 gene_id column")
gene_id <- grouping[[idx]]
if (!is.factor(gene_id) && !is.character(gene_id)
&& !is.integer(gene_id))
stop("'grouping$gene_id' must be a factor, character vector, ",
"or integer vector")
for (colname in c("tx_id", "tx_name")) {
idx <- which(grouping_colnames == colname)
if (length(idx) == 0L)
next
if (length(idx) >= 2L)
stop("'grouping' has more than 1 ", colname, " column")
m <- match(grouping[[idx]], x_names)
if (any(is.na(m)))
next
return(split(m, gene_id))
}
stop("'grouping' has no tx_id or tx_name column, ",
"or they contain values that are not in 'names(x)'")
}
stop("invalid 'grouping'")
}
### TODO: Improve handling of invalid genes i.e. provide more details about
### which genes were considered invalid and why.
.make_ex_by_tx_from_GRangesList <- function(x, grouping=NULL,
min.ntx=2, max.ntx=NA,
check.introns=TRUE)
{
if (!is(x, "GRangesList"))
stop("'x' must be a GRangesList object")
metadata(x) <- list()
if (is.null(grouping)) {
if (!identical(min.ntx, 2) || !identical(max.ntx, NA))
stop("the 'min.ntx' and 'max.ntx' args are not supported ",
"when 'grouping' is not supplied or NULL")
ans <- .set_splicing_graph_info(x, check.introns=check.introns)
names(ans) <- NULL
return(ans)
}
## Check 'min.ntx'.
if (!isSingleNumber(min.ntx))
stop("'min.ntx' must be a single number")
if (!is.integer(min.ntx))
min.ntx <- as.integer(min.ntx)
if (min.ntx < 1L)
stop("'min.ntx' must be >= 1")
## Check 'max.ntx'.
if (!isSingleNumberOrNA(max.ntx))
stop("'max.ntx' must be a single number or NA")
if (!is.integer(max.ntx))
max.ntx <- as.integer(max.ntx)
if (!is.na(max.ntx) && max.ntx < min.ntx)
stop("'max.ntx' must be >= 'min.ntx'")
grouping <- .normarg_grouping(grouping, x)
## Keep genes with nb of transcripts >= min.ntx and <= max.ntx.
grouping_eltNROWS <- elementNROWS(grouping)
keep <- grouping_eltNROWS >= min.ntx
if (!is.na(max.ntx))
keep <- keep & grouping_eltNROWS <= max.ntx
grouping <- grouping[keep]
if (length(grouping) == 0L) {
## We need to return an empty GRangesList but it cannot be simply
## made with GRangesList() or it wouldn't have the expected metadata
## cols (outer and inner). So we call .set_splicing_graph_info() on
## an empty gene instead.
ans <- .set_splicing_graph_info(x[NULL], check.introns=check.introns)
names(ans) <- character(0)
return(ans)
}
## Main loop.
ans <- bplapply(seq_along(grouping),
function(i) {
ii <- grouping[[i]]
gene <- x[ii]
gene2 <- try(.set_splicing_graph_info(gene,
check.introns=check.introns),
silent=TRUE)
if (inherits(gene2, "try-error"))
return(NULL)
gene2
})
invalid_genes_idx <- which(sapply(ans, is.null))
nb_invalid_genes <- length(invalid_genes_idx)
if (nb_invalid_genes != 0L) {
warning(nb_invalid_genes, " invalid ",
ifelse(nb_invalid_genes == 1L, "gene was", "genes were"),
" dropped")
grouping <- grouping[-invalid_genes_idx]
ans <- ans[-invalid_genes_idx]
}
ans <- do.call(c, ans)
grouping_names <- names(grouping)
if (!is.null(grouping_names)) {
ans_names <- rep.int(grouping_names, elementNROWS(grouping))
names(ans) <- ans_names
}
ans
}
setGeneric("SplicingGraphs", signature="x",
function(x, grouping=NULL, min.ntx=2, max.ntx=NA, check.introns=TRUE)
standardGeneric("SplicingGraphs")
)
setMethod("SplicingGraphs", "GRangesList",
function(x, grouping=NULL, min.ntx=2, max.ntx=NA, check.introns=TRUE)
{
ans_ex_by_tx <- .make_ex_by_tx_from_GRangesList(x, grouping=grouping,
min.ntx=min.ntx, max.ntx=max.ntx,
check.introns=check.introns)
ans_genes <- GeneModel(ans_ex_by_tx)
mcols(ans_ex_by_tx) <- NULL
ans_in_by_tx <- setdiff(range(ans_ex_by_tx), ans_ex_by_tx)
common_strand <- commonStrand.GRangesList(ans_in_by_tx)
ans_in_by_tx <- revElements(ans_in_by_tx, common_strand == "-")
ans_bubbles_cache <- new.env(parent=emptyenv())
new("SplicingGraphs", genes=ans_genes,
in_by_tx=ans_in_by_tx,
.bubbles_cache=ans_bubbles_cache)
}
)
setMethod("SplicingGraphs", "TxDb",
function(x, grouping=NULL, min.ntx=2, max.ntx=NA, check.introns=TRUE)
{
if (!is.null(grouping))
stop("the 'grouping' arg is not supported ",
"when 'x' is a TxDb object")
ex_by_tx <- exonsBy(x, by="tx", use.names=TRUE)
tx_by_gn <- transcriptsBy(x, by="gene")
SplicingGraphs(ex_by_tx, tx_by_gn, min.ntx=min.ntx, max.ntx=max.ntx,
check.introns=check.introns)
}
)
### Not exported. Only used in the .onLoad hook (see zzz.R file) for fixing
### the prototypes of the GeneModel and SplicingGraphs classes.
make_empty_SplicingGraphs <- function()
{
ex_by_tx <- GRangesList()
names(ex_by_tx) <- character(0)
mcols(ex_by_tx@unlistData) <- DataFrame(exon_id=integer(0),
exon_name=character(0),
exon_rank=integer(0))
ans <- SplicingGraphs(ex_by_tx, IntegerList())
## We want to make sure 'ans' is *completely* valid.
validObject(ans, complete=TRUE)
ans
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.