SplicingGraphs: Create, manipulate, visualize splicing graphs, and assign RNA-seq reads to them

Documented in rsgedges rsgraph

### =========================================================================
### "rsgedgesByGene" (and related) methods
### -------------------------------------------------------------------------
###
### Analog to "sgedgesByGene" (and related) methods, but operate on the
### *reduced* splicing graphs.
###

### Return the fully qualified uninformative nodes.
.get_fq_uninfnodes <- function(sg)
{
    txpath <- txpath(unlist(sg))
    skeleton2 <- PartitioningByEnd(end(PartitioningByEnd(txpath)) * 2L)
    txpath2 <- relist(rep(txpath@unlistData, each=2L), skeleton2)

    R_nodes <- rep.int(CharacterList("R"), length(txpath2))
    L_nodes <- rep.int(CharacterList("L"), length(txpath2))
    tmp <- c(R_nodes, as(txpath2, "CharacterList"), L_nodes)
    f <- rep.int(seq_along(txpath2), 3L)
    txpath3 <- unlistAndSplit(tmp, f)
    names(txpath3) <- names(txpath)

    gene_id <- rep.int(names(txpath3), elementNROWS(txpath3))
    tmp <- txpath3@unlistData
    gene_id <- gene_id[c(TRUE, FALSE)]
    from <- tmp[c(TRUE, FALSE)]
    to <- tmp[c(FALSE, TRUE)]
    sgedge_id <- make_global_sgedge_id(gene_id, from, to)
    keep_idx <- which(!duplicated(sgedge_id))
    fq_from <- paste(gene_id, from, sep=":")[keep_idx]  # fully qualified ids
    fq_to <- paste(gene_id, to, sep=":")[keep_idx]  # fully qualified ids
    uninformative_sgnodes(fq_from, fq_to)
}

### 'f': factor. The "reverse factor" of 'f' is the named list of integer
### vectors that maps each level of 'f' to the positions in 'f' where that
### level is used. It can quickly be computed with:
###
###     revfactor <- split(seq_along(f), f)
###
### 'f' can be rebuilt from 'revfactor' with:
###
###     f2 <- .make_factor_from_revfactor(revfactor, length(f))
###     stopifnot(identical(f2, f)
###
.make_factor_from_revfactor <- function(revfactor, f_len)
{
    f_levels <- names(revfactor)
    idx <- integer(f_len)
    idx[] <- NA_integer_
    idx[unlist(revfactor, use.names=FALSE)] <- rep.int(seq_along(revfactor),
                                                   elementNROWS(revfactor))
    factor(f_levels[idx], levels=f_levels)
}

### 'from' and 'to' must have the same length N (nb of unique edges in the
### SplicingGraphs object before reduction).
.make_revfactor_from_uninfnodes <- function(uninfnodes, from, to)
{
    from_idx <- match(uninfnodes, from)
    to_idx <- match(uninfnodes, to)
    keep_idx <- which(!(is.na(from_idx) | is.na(to_idx)))
    from_idx <- from_idx[keep_idx]
    to_idx <- to_idx[keep_idx]
    stopifnot(all(from_idx == to_idx + 1L))  # sanity check
    if (length(from_idx) == 0L) {
        group1 <- integer(0)
    } else {
        group1 <- cumsum(c(TRUE, diff(from_idx) != 1L))
    }
    split_to_idx <- unname(splitAsList(to_idx, group1))
    split_from_idx <- unname(splitAsList(from_idx, group1))
    fancy_punion(split_to_idx, split_from_idx)
}

### 'revfactor' must be a "reverse factor" as returned by
### .make_revfactor_from_uninfnodes().
### Returns a character vector of length N containing the global rsgedge id
### (global reduced splicing graph edge id) corresponding to each input edge.
.build_sgedge2rsgedge_map_from_revfactor <- function(revfactor,
                                                     gene_id, from, to)
{
    ans <- make_global_sgedge_id(gene_id, from, to)

    unlisted_revfactor <- unlist(revfactor, use.names=FALSE)
    from_list <- relist(from[unlisted_revfactor], revfactor)
    to_list <- relist(to[unlisted_revfactor], revfactor)
    nodes_list <- fancy_punion(from_list, to_list)

    rsgedge_id <- sapply(nodes_list, paste0, collapse=",")
    rsgedge_id <- rep.int(rsgedge_id, elementNROWS(revfactor))
    rsgedge_id <- paste(gene_id[unlisted_revfactor], rsgedge_id, sep=":")

    ans[unlisted_revfactor] <- rsgedge_id
    ans
}

.build_sgedge2rsgedge_map <- function(uninfnodes, gene_id, from, to)
{
    fq_from <- paste(gene_id, from, sep=":")  # fully qualified ids
    fq_to <- paste(gene_id, to, sep=":")  # fully qualified ids
    revfactor <- .make_revfactor_from_uninfnodes(uninfnodes, fq_from, fq_to)
    .build_sgedge2rsgedge_map_from_revfactor(revfactor, gene_id, from, to)
}

### 'edges' must be a GRanges object.
.reduce_edges <- function(edges, f)
{
    if (!is(edges, "GRanges"))
        stop("'edges' must be a GRanges object")
    if (!is.factor(f))
        stop("'f' must be a factor")
    if (length(edges) != length(f))
        stop("'edges' and 'f' must have the same length")
    sm <- match(f, f)
    keep_idx <- sm == seq_along(sm)
    edges_names <- names(edges)
    edges_mcols <- mcols(edges)
    names(edges) <- mcols(edges) <- NULL
    edges_by_redge <- split(edges, f)
    partitioning <- PartitioningByEnd(edges_by_redge)
    partitioning_start <- start(partitioning)
    partitioning_end <- end(partitioning)

    ## The ranges within each top-level element of 'edges_by_redge' are an
    ## alternance of consecutive exons and introns from the same transcript
    ## and thus are adjacent (i.e. non-overlapping with no gaps between them).
    ## Therefore using reduce() or range() should be equivalent.
    redges_by_redge <- reduce(edges_by_redge)
    ans <- unlist(redges_by_redge)
    stopifnot(identical(names(ans), levels(f)))  # sanity check
    names(ans) <- edges_names[keep_idx]

    ## Reduce "from" and "to" metadata cols.
    edges_from <- edges_mcols[ , "from"]
    ans_from <- splitAsList(edges_from, f)
    edges_to <- edges_mcols[ , "to"]
    ans_to <- splitAsList(edges_to, f)
    stopifnot(identical(ans_from@unlistData[-partitioning_start],
                        ans_to@unlistData[-partitioning_end]))
    ans_from <- ans_from@unlistData[partitioning_start]
    ans_to <- ans_to@unlistData[partitioning_end]

    ## Reduce "ex_or_in" metadata col.
    edges_ex_or_in <- edges_mcols[ , "ex_or_in"]
    ans_ex_or_in <- splitAsList(edges_ex_or_in, f)
    levels(ans_ex_or_in@unlistData) <- EX_OR_IN_LEVELS2
    mixed_idx <- which(elementNROWS(ans_ex_or_in) != 1L)
    ans_ex_or_in[mixed_idx] <- factor("mixed", levels=EX_OR_IN_LEVELS2)
    ans_ex_or_in <- factor(EX_OR_IN_LEVELS2[ans_ex_or_in@unlistData],
                           levels=EX_OR_IN_LEVELS2)

    ## Reduce "tx_id" metadata col.
    edges_tx_id <- edges_mcols[ , "tx_id"]
    stopifnot(identical(edges_tx_id, edges_tx_id[sm]))  # sanity check
    ans_tx_id <- edges_tx_id[keep_idx]

    ans_mcols <- DataFrame(from=ans_from,
                           to=ans_to,
                           rsgedge_id=levels(f),
                           ex_or_in=ans_ex_or_in,
                           tx_id=ans_tx_id)

    ## Reduce hits metadata cols.
    hits_mcol_idx <- grep("hits$", colnames(edges_mcols))
    if (length(hits_mcol_idx) != 0L) {
        hits_mcols <- endoapply(edges_mcols[hits_mcol_idx],
                                function(hits)
                                  unname(unique(unlistAndSplit(hits, f))))

        ## Combine with 'ans_mcols'.
        ans_mcols <- cbind(ans_mcols, hits_mcols)
    }

    ## Set the metadata cols on 'ans' and return it.
    mcols(ans) <- ans_mcols
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### uninformativeSSids() extractor
###
### Uninformative splicing sites are nodes in the splicing graph for which 
### outdeg and indeg are 1. A straightforward implementation of
### uninformativeSSids() would be:
###
###   uninformativeSSids <- function(x)
###   {
###       is_uninfo <- outdeg(sg) == 1L & indeg(sg) == 1L
###       names(is_uninfo)[is_uninfo]
###   }
###
### but the implementation below is about 2x faster.
###

setGeneric("uninformativeSSids", signature="x",
    function(x) standardGeneric("uninformativeSSids")
)

setMethod("uninformativeSSids", "ANY",
    function(x)
    {
        sgedges <- sgedges(x)
        uninformativeSSids(sgedges)
    }
)

setMethod("uninformativeSSids", "DataFrame",
    function(x)
    {
        from <- x[ , "from"]
        to <- x[ , "to"]
        uninformative_sgnodes(from, to)
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### rsgedgesByTranscript()
###

setGeneric("rsgedgesByTranscript", signature="x",
    function(x, with.hits.mcols=FALSE)
        standardGeneric("rsgedgesByTranscript")
)

setMethod("rsgedgesByTranscript", "SplicingGraphs",
    function(x, with.hits.mcols=FALSE)
    {
        stop("not ready yet!")
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### rsgedgesByGene()
###

setGeneric("rsgedgesByGene", signature="x",
    function(x, with.hits.mcols=FALSE, keep.dup.edges=FALSE)
        standardGeneric("rsgedgesByGene")
)

setMethod("rsgedgesByGene", "SplicingGraphs",
    function(x, with.hits.mcols=FALSE, keep.dup.edges=FALSE)
    {
        if (!isTRUEorFALSE(keep.dup.edges))
            stop("'keep.dup.edges' must be TRUE or FALSE")
        if (keep.dup.edges)
            stop("'keep.dup.edges=TRUE' is not supported yet, sorry")
        edges_by_gene <- sgedgesByGene(x, with.hits.mcols=with.hits.mcols)
        edges0 <- unlist(edges_by_gene)
        edges0_mcols <- mcols(edges0)
        gene_id <- names(edges0)
        from <- edges0_mcols[ , "from"]
        to <- edges0_mcols[ , "to"]
        uninfnodes <- .get_fq_uninfnodes(x)
        sgedge2rsgedge_map <- .build_sgedge2rsgedge_map(uninfnodes,
                                                        gene_id, from, to)
        f <- factor(sgedge2rsgedge_map, levels=unique(sgedge2rsgedge_map))
        ans_flesh <- .reduce_edges(edges0, f)
        ans_flesh_names <- Rle(names(ans_flesh))
        breakpoints <- cumsum(runLength(ans_flesh_names))
        ans_names <- runValue(ans_flesh_names)
        ans_skeleton <- PartitioningByEnd(breakpoints, names=ans_names)
        relist(unname(ans_flesh), ans_skeleton)
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### rsgedges() extractor
###
### Same as sgedges() except that uninformative nodes are removed.
###

### 'sgedges' must be a DataFrame as returned by:
###     sgedges( , keep.dup.edges=FALSE)
.remove_uninformative_SSids <- function(sgedges)
{
    ex_or_in <- sgedges[ , "ex_or_in"]
    ex_or_in_levels <- levels(ex_or_in)
    if (!identical(ex_or_in_levels, EX_OR_IN_LEVELS))
        stop("Malformed input.\n",
             "  In the input data.frame (or DataFrame) representing the ",
             "original splicing graph, the \"ex_or_in\" column has invalid ",
             "levels. Could it be that it was obtained by a previous call ",
             "to rsgedges()?")
    levels(ex_or_in) <- EX_OR_IN_LEVELS2
    uninformative_SSids <- uninformativeSSids(sgedges)
    if (length(uninformative_SSids) == 0L) {
        col_idx <- match("sgedge_id", colnames(sgedges))
        colnames(sgedges)[col_idx] <- "rsgedge_id"
        sgedges$ex_or_in <- ex_or_in
        return(sgedges)
    }

    from <- sgedges[ , "from"]
    to <- sgedges[ , "to"]
    from_idx <- match(uninformative_SSids, from)
    to_idx <- match(uninformative_SSids, to)
    if (!all(from_idx == to_idx + 1L))
        stop("Malformed input.\n",
             "  In the input data.frame (or DataFrame) representing the ",
             "original splicing graph, each uninformative splicing site ",
             "id must appear in 2 consecutive rows (first in the \"to\" ",
             "column, then in the \"from\" column. Could it be that the ",
             "rows were subsetted before the data.frame (or DataFrame) ",
             "was passed to rsgedges()?")

    ## Reduce "from" and "to" cols.
    ans_from <- from[-from_idx]
    ans_to <- to[-to_idx]

    ## Reduce "sgedge_id" col.
    sgedges_id <- sgedges[ , "sgedge_id"]
    tmp <- unlist(strsplit(sgedges_id, ":", fixed=TRUE), use.names=FALSE)
    gene_id <- tmp[c(TRUE, FALSE)]
    revfactor <- .make_revfactor_from_uninfnodes(uninformative_SSids, from, to)
    sgedge2rsgedge_map <- .build_sgedge2rsgedge_map_from_revfactor(revfactor,
                                                             gene_id, from, to)
    f <- factor(sgedge2rsgedge_map, levels=unique(sgedge2rsgedge_map))
    ans_rsgedge_id <- levels(f)

    ## Reduce "ex_or_in" col.
    ex_or_in[from_idx] <- EX_OR_IN_LEVELS2[4L]
    ans_ex_or_in <- ex_or_in[-to_idx]

    ## Reduce "tx_id" col.
    tx_id <- sgedges[ , "tx_id"]
    if (!identical(unname(tx_id[from_idx]), unname(tx_id[to_idx])))
        stop("Malformed input.\n",
             "  In the input data.frame (or DataFrame) representing the ",
             "original splicing graph, the 2 rows containing a given ",
             "uninformative splicing site id must contain the same tx_id.",
             "Could it be that the \"tx_id\" column was manually altered ",
             "before the data.frame (or DataFrame) was passed to ",
             "rsgedges()?")
    ans_tx_id <- tx_id[-from_idx]

    DataFrame(from=ans_from,
              to=ans_to,
              rsgedge_id=ans_rsgedge_id,
              ex_or_in=ans_ex_or_in,
              tx_id=ans_tx_id)
}

rsgedges <- function(x)
{
    if (!is(x, "DataFrame"))
        x <- sgedges(x)
    .remove_uninformative_SSids(x)
}

### Alias for backward compatibility.
sgedges2 <- rsgedges


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### rsgraph() extractor
###
### Same as sgraph() except that uninformative nodes (i.e. SSids) are removed.
###

rsgraph <- function(x, tx_id.as.edge.label=FALSE, as.igraph=FALSE)
{
    sgraph(rsgedges(x),
           tx_id.as.edge.label=tx_id.as.edge.label, as.igraph=as.igraph)
}

### Alias for backward compatibility.
sgraph2 <- rsgraph

Bioconductor/SplicingGraphs documentation built on Nov. 3, 2024, 8:11 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Bioconductor/SplicingGraphs
Create, manipulate, visualize splicing graphs, and assign RNA-seq reads to them

R/rsgedgesByGene-methods.R
In Bioconductor/SplicingGraphs: Create, manipulate, visualize splicing graphs, and assign RNA-seq reads to them

Defines functions rsgraph rsgedges .remove_uninformative_SSids .reduce_edges .build_sgedge2rsgedge_map .build_sgedge2rsgedge_map_from_revfactor .make_revfactor_from_uninfnodes .make_factor_from_revfactor .get_fq_uninfnodes

Documented in rsgedges rsgraph

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/SplicingGraphs Create, manipulate, visualize splicing graphs, and assign RNA-seq reads to them

R/rsgedgesByGene-methods.R In Bioconductor/SplicingGraphs: Create, manipulate, visualize splicing graphs, and assign RNA-seq reads to them

Defines functions rsgraph rsgedges .remove_uninformative_SSids .reduce_edges .build_sgedge2rsgedge_map .build_sgedge2rsgedge_map_from_revfactor .make_revfactor_from_uninfnodes .make_factor_from_revfactor .get_fq_uninfnodes

Documented in rsgedges rsgraph

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/SplicingGraphs
Create, manipulate, visualize splicing graphs, and assign RNA-seq reads to them

R/rsgedgesByGene-methods.R
In Bioconductor/SplicingGraphs: Create, manipulate, visualize splicing graphs, and assign RNA-seq reads to them