gread: Fast Reading and Processing of Common Gene Annotation and Next Generation Sequencing Format Files

#' @title Generate non-overlapping intron coordinates
#'
#' @description This function takes an \code{gtf/gff} object. If the 
#' features \code{exon} and \code{intron} are present, then, it attempts 
#' to find a set of exon and intron coordinates that do NOT overlap with each 
#' other. If the intron coordinates are not present, it constructs the introns 
#' first using \code{gread::construct_introns} and then attempts to extract 
#' non-overlapping exonic and intronic coordinates. If \code{exon} feature is 
#' not present, it returns an error.
#' 
#' Note: This function is not exported at the moment.
#' 
#' @param x An object of class \code{gtf/gff} object which has to have 
#' features \code{intron} and \code{exon}.
#' @param transcript_id Column name in \code{x} corresponding to transcript id.
#' @param gene_id Column name in \code{x} corresponding to gene id.
#' @return Returns a list of \code{GRanges} objects containing 
#' \emph{non overlapping exons} and \emph{non overlapping introns}.
#' @examples
#' \dontrun{
#' path <- system.file("tests", package="gread")
#' gtf_file <- file.path(path, "sample.gtf")
#' gtf <- read_format(gtf_file)
#' non_overlaps(gtf)
#' }
non_overlaps <- function(x, transcript_id="transcript_id", gene_id="gene_id"){

    # taken from gffutils::non_overlaps    
    stopifnot(is.gtf(x) || is.gff(x))
    x = as_data_table(x)
    stopifnot("feature" %chin% names(x), 
                transcript_id %chin% names(x), 
                gene_id %chin% names(x))
    # to please R CMD CHECK
    feature=idx=epos=ecnt=keep=edge=check=V1=seqnames=pos=e.gid=i.gid=N=NULL
    uniq_features = unique(x[["feature"]])
    if (!"intron" %in% uniq_features) {
        xx = new(class(x)[1L], as_granges(x))
        x = as_data_table(construct_introns(xx))
    }
    uniq_features = unique(x[["feature"]])
    stopifnot(all(c("intron", "exon") %in% uniq_features))
    i = x[feature == "intron"]
    e = x[feature == "exon"]
    e[, `:=`(ecnt = .N, epos = 1:.N), by=c(transcript_id)]

    i_gr = as_granges(i)

    # non-overlapping intron
    # "within" contains the potential IR events. So we shouldn't delete them
    a = find_overlaps(i, e, type="any", ignore_strand=TRUE)    
    w = find_overlaps(i, e, type="within", ignore_strand=TRUE)
    
    # idea: remove all `any` events that are not as well `within`. Also, 
    # equate the corresponding intronic and exonic gene_id and obtain a 
    # logical vector idx
    a_minus_w = a[!w, on=names(a)
                ][, `:=`(idx = i[[gene_id]][queryHits] == 
                                    e[[gene_id]][subjectHits], 
                        epos = e$epos[subjectHits], 
                        ecnt = e$ecnt[subjectHits])]
    setkey(a_minus_w, queryHits)

    # if the logical vector idx sums to `0`, when grouped by `queryHits` then, 
    # this means that this intron coordinate overlaps only with an overlapping 
    # gene and so we'll have to keep it. Hence the `keep != 0`
    a_minus_w = a_minus_w[, list(N = .N, keep = sum(idx), edge = 
                    (all(epos == 1) | all(epos == ecnt))), by=queryHits]
    a_minus_w = a_minus_w[keep != 0][, keep := NULL]

    # Now resolve end-cases and special "both start,end unequal overlap" cases 
    # scenario A: we'll remove this intron and it'll be falsely detected as a 
    # CI event... We'll have to retain them!
    # ==========-------------========= (iso1) (will be removed usually)
    #                   ============== (iso2) 
    a_minus_w = a_minus_w[!(edge)][, edge := NULL]
    
    # scenario B: we'll remove introns of this fashion:
    # ============--------------============= (iso1)
    # ===============----------------======== (iso2) (will be removed usually)
    # In this case we won't identify the isoforms associated with these events 
    # here. So, we'll have to recover them as well. How? by finding intron 
    # overlaps with itself. Can't use find_overlaps here as it requires x,y 
    # arguments.
    io = findOverlaps(i_gr, drop.self=TRUE, type="any", drop.redundant=FALSE)
    io = setDT(list(queryHits=queryHits(io), subjectHits=subjectHits(io)))
    setkey(io)
    
    # you'll have to subtract "equal" or ("exact") overlaps:
    io.eq = findOverlaps(i_gr, drop.self=TRUE, type="equal", 
                drop.redundant=FALSE)
    io.eq = setDT(list(queryHits=queryHits(io.eq), 
                        subjectHits=subjectHits(io.eq)))
    setkey(io.eq)
    io = io[!io.eq]
    setkey(io, queryHits)
    io[, check := i$start[queryHits] != i$start[subjectHits] & 
                    i$end[queryHits] != i$end[subjectHits]]
    io.fil = io[, sum(check) == .N, by=queryHits][V1 == TRUE]

    # all these io.fil must be retained. So, we've to remove them from 
    # a_minus_w.
    a_minus_w = a_minus_w[!list(io.fil$queryHits)] 
    # old code: a_minus_w[!queryHits %in% io.fil$queryHits]
    # The above lines will take care of scenario B.
    # changes for 7th August ends here

    # Now, we have in a_minus_w all the indices of introns that we'd like 
    # to remove as, they overlap with at least one exon of their own gene, 
    # meaning there is a smaller intron at the same position for this gene 
    # that we've already accounted for
    ni = i[!a_minus_w$queryHits]
    setkey(ni, "seqnames", "start", "end")
    tmp__ = ni[[transcript_id]]
    ni[, c(transcript_id) := as.list(tmp__)]
    ni = ni[, c("seqnames", "start", "end", "strand", 
                    transcript_id, gene_id), with=FALSE]
    count = ni[, .N, by = list(seqnames, start, end)]
    # code here is moved to the end... July 25 (due to overlapping genes 
    # interpretation difference between intron and exon)

    # non_overlapping exon
    # edited May 30th 2013, redid the whole thing: July 17 2013
    # data.table ABSOLUTELY ROCKS!
    ne = e[, list(seqnames=seqnames[1L], pos=c(start, end)), by=c(gene_id)]
    setkey(ne)
    ne = ne[, list(seqnames=seqnames[1L], start=pos[1:(.N-1)], end=pos[2:.N]), 
                    by=c(gene_id)]

    nolaps = find_overlaps(ne, ni, type = "any")
    nolaps[, `:=`(e.gid=ne[[gene_id]][queryHits], 
                    i.gid=ni[[gene_id]][subjectHits])]
    nolaps = nolaps[e.gid == i.gid]

    ne = ne[!unique(nolaps$queryHits)]
    ne = ne[start != end]
    ne = ne[, list(seqnames=seqnames[1L], pos=c(start, end)), by=c(gene_id)]
    setkeyv(ne, c(gene_id, "pos"))
    ne = ne[ne[, .N, by=key(ne)][N == 1]][, N := NULL]
    ne = ne[, list(seqnames=seqnames[1L], start=pos[c(TRUE, FALSE)], 
                    end=pos[c(FALSE, TRUE)]), by=c(gene_id)]
    setkeyv(ne, c(gene_id))

    setnames(e, transcript_id, "transcript_id")
    on.exit(setnames(e, "transcript_id", transcript_id), add=TRUE)
    ne.rest = e[, list(strand=strand[1L], 
                    transcript_id=list(unique(transcript_id))), by=c(gene_id)]
    setnames(ne.rest, "transcript_id", transcript_id)
    ne = ne[ne.rest]
    setcolorder(ne, names(ni))
    setkey(ne, seqnames, start, end)
    
    # now get overlapping intron
    ni[count[N > 1], `:=`(strand=paste(unique(strand), sep="", collapse=""), 
                        transcript_id=list(unlist(transcript_id)), 
                        gene_id=list(unique(unlist(gene_id)))), 
       by=.EACHI] ## version 1.9.3+
    setnames(ni, c("transcript_id", "gene_id"), c(transcript_id, gene_id))
    ni = unique(shallow(ni, reset_class=TRUE))
    # calling unique on 'ni' is more efficient than this, since 'ni' is key'd. 
    # changed July 17 
    # .gff[["non_overlapping_intron"]] = ni[count, mult = "first"][, N := NULL]
    setkey(ne, NULL)
    setkey(ni, NULL)
    ne = as(setDF(ne), "GRanges")
    ni = as(setDF(ni), "GRanges")
    list(non_overlapping_exon = ne, non_overlapping_intron = ni)
}

openanalytics/gread documentation built on May 24, 2019, 2:29 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

openanalytics/gread
Fast Reading and Processing of Common Gene Annotation and Next Generation Sequencing Format Files

R/non-overlaps.R
In openanalytics/gread: Fast Reading and Processing of Common Gene Annotation and Next Generation Sequencing Format Files

R Package Documentation

Browse R Packages

We want your feedback!

openanalytics/gread Fast Reading and Processing of Common Gene Annotation and Next Generation Sequencing Format Files

R/non-overlaps.R In openanalytics/gread: Fast Reading and Processing of Common Gene Annotation and Next Generation Sequencing Format Files

R Package Documentation

Browse R Packages

We want your feedback!

openanalytics/gread
Fast Reading and Processing of Common Gene Annotation and Next Generation Sequencing Format Files

R/non-overlaps.R
In openanalytics/gread: Fast Reading and Processing of Common Gene Annotation and Next Generation Sequencing Format Files