#' @nord
matchGFGeneCollapse <- function(collapse, gene=NULL) {
choices <- c('cover', 'cover-bin', 'constitutive', 'first', 'longest', 'shortest')
if (is.numeric(collapse)) {
collapse <- as.integer(collapse)
if (collapse != 1L && !is.null(gene) && length(transcripts(x) < collapse)) {
stop("Transcript chosen for `exon.type` is out of bounds")
}
} else {
collapse <- match.arg(collapse, choices)
}
if (collapse == 'first') {
collapse <- 1L
}
collapse
}
##' Returns an annotated GRanges object by "compressing" the exons
##' across isoforms into one prototypical / idealized version of the gene..
##'
##' @param x The \code{GFGene} object
##' @param by How the caller wants the "idealized" gene to be calculated.
##' \code{all} exons? Just the coding (\code{cds}) exons? etc.
##' @param collapse How are the exons summarized? Is it the union of all exons
##' accross transcripts (\code{cover}), the constitutively spliced portions?
##' \code{constitutive}, just the \code{first} transcript? The caller can
##' also set this to an integer indicating which transcript to pick of them all.
##' cover-bin : Like a union, but the splicing boundaries of exons are maintained:
##'
##' ******** [transcript 1]
##' ****** [transcript 2]
##' **+***** [unified]
##'
##' @param cds.cover Some genes have isoforms with coding regions over utr's
##' over other isoforms. If set the \code{min} then the \code{cds} boundaries
##' are the narrowest possible (never overlapping a \code{utr}). \code{max}
##' will allos the \code{cds} to overlap \code{utr}'s.
##'
##' @return An annotated \code{GRanges} object, whose accomponied \code{values}
##' has an \code{exon.anno} collumn identifying which corresponding rantges are
##' \code{cds, utr3, utr5, utr}
##'
##' If \code{by} is either \code{utr3, utr5}, then it returns a
##' \code{GRangesList} containing ranges that are split by intermediadary
##' coding regions. The utr's are split this way to provide a "best guess"
##' of which utrs can be considered as a whole (and not spanning separate
##' transcripts).
##'
##' If this last point seems strange to you, you should just use \code{by='all'}
##' and \code{subset} the returned object by, for example,
##' \code{values(...)$exon.ann == 'utr3}.
setGeneric("idealized",
function(x, by=c('all', 'cds', 'utr5', 'utr3'),
collapse=c('cover', 'cover-bin', 'constitutive', 'first', 'longest', 'shortest'),
cds.cover=c('min', 'max'), which.chr=NULL, flank.up=0L, flank.down=0L,
...) {
standardGeneric("idealized")
})
setMethod("idealized", c(x="GFGene"),
function(x, by, collapse, cds.cover, which.chr, flank.up, flank.down, ...) {
by <- match.arg(by)
cds.cover <- match.arg(cds.cover)
collapse <- matchGFGeneCollapse(collapse, x)
if (collapse == 'cover-bin') {
stop("collapse=cover-bin not yet implemented!")
}
reducef <- switch(collapse, constitutive='interset', 'union')
metaD <- list()
## g.strand <- strand(x)
g.strand <- as.vector(strand(transcripts(x, which.chr=which.chr)[[1]][1]))
args <- list(...)
force.eval <- args$force.eval
var.name <- paste('idealized', collapse,
sprintf('by-%s', by),
sprintf('cdscover-%s', cds.cover),
sprintf('utr5*-%d', flank.up),
sprintf('utr3*-%d', flank.down),
sep=".")
if (!is.null(which.chr)) {
var.name <- paste(var.name, which.chr, sep=".")
}
cacheFetch(x, var.name, {
.idealizedBy <- switch(by, all=.idealizedByAll, cds=.idealizedByCds,
utr5=.idealizedByUtr5, utr3=.idealizedByUtr3,
longest=.idealizedByLongest)
is.pc <- isProteinCoding(x, which.chr=which.chr)
is.pc <- names(is.pc)[is.pc]
probably.rna <- length(is.pc) == 0
xcripts <- transcripts(x, which.chr=which.chr)
if (probably.rna) {
## A gene with non-protein coding transcripts is tagged as `utr` across
## across all of its exons in this scenario.
## TODO: Cleanup duplicated code in probably.rna if/else block
if (collapse %in% c('longest', 'shortest')) {
lens <- sapply(xcripts, function(xc) {
width(range(ranges(xc)))
})
collapse <- switch(collapse, longest=which.max(lens), which.min(lens))
metaD$tx_name <- values(xcripts)$tx_name[collapse]
}
if (is.numeric(collapse)) {
ranges <- ranges(xcripts[[collapse]])
} else {
ranges <- Reduce(reducef, lapply(xcripts, ranges))
}
exon.anno <- c(rep('utr', length(ranges)))
} else {
take.pc <- function(.list, namez) {
.list[names(.list) %in% namez]
}
takeOrEmpty <- function(x, idx) {
if (length(x) < idx) IRanges() else x[[idx]]
}
.cds <- lapply(take.pc(cds(x, which.chr=which.chr), is.pc), ranges)
.utr3 <- lapply(take.pc(utr3(x, which.chr=which.chr), is.pc), ranges)
.utr5 <- lapply(take.pc(utr5(x, which.chr=which.chr), is.pc), ranges)
if (collapse %in% c('longest', 'shortest')) {
lens <- sapply(take.pc(xcripts, is.pc), function(xc) {
width(range(ranges(xc)))
})
collapse <- switch(collapse, longest=which.max(lens), which.min(lens))
metaD$tx_name <- values(xcripts)$tx_name[collapse]
}
if (is.numeric(collapse)) {
## Perhaps the Nth transcript doesn't have all of the expected segments
## in which case, we want to return an empty IRanges
.exons <- list(utr5=takeOrEmpty(.utr5, collapse),
cds=takeOrEmpty(.cds, collapse),
utr3=takeOrEmpty(.utr3, collapse))
} else {
## `Reduce`-ing over an empty list (which can result from a gene not
## having a .utr3) will produce a NULL in insteand of an empty IRanges
## object, which will throw errors in the setdiff operations below
null2empty <- function(x) if (is.null(x)) IRanges() else x
.cds <- null2empty(Reduce(reducef, .cds))
.utr3 <- null2empty(Reduce(reducef, .utr3))
.utr5 <- null2empty(Reduce(reducef, .utr5))
if (cds.cover == 'min') {
.cds <- setdiff(.cds, union(.utr3, .utr5))
} else {
.utr3 <- setdiff(.utr3, .cds)
.utr5 <- setdiff(.utr5, .cds)
}
.exons <- .idealizedBy(.utr5, .cds, .utr3, cds.cover)
}
ranges <- c(.exons$utr5, .exons$cds, .exons$utr3)
exon.anno <- c(rep('utr5', length(.exons$utr5)),
rep('cds', length(.exons$cds)),
rep('utr3', length(.exons$utr3)))
}
chr <- if (!is.null(which.chr)) which.chr else chromosome(x)[1]
if (probably.rna && by == 'cds') {
gr <- GRanges()
values(gr) <- DataFrame(exon.anno=character())
} else if (probably.rna || by == 'all') {
gr <- GRanges(seqnames=rep(chr, length(ranges)),
ranges=ranges,
strand=rep(g.strand, length(ranges)))
values(gr) <- DataFrame(exon.anno=exon.anno)
gr <- gr[order(ranges(gr))]
} else {
gr <- lapply(.exons, function(e) {
g <- GRanges(seqnames=rep(chr, length(e)),
ranges=e,
strand=rep(g.strand, length(e)))
values(g) <- DataFrame(exon.anno=rep(by, length(g)))
if (length(g) > 0) {
g[order(ranges(g))]
}
g
})
gr <- GRangesList(gr)
}
if (flank.up > 0) {
if (probably.rna) {
} else {
take <- which(values(gr)$exon.anno == 'utr5')
if (length(take) > 0) {
if (g.strand == '+') {
ext.end <- start(gr[take[1L]]) - 1L
ext.start <- ext.end - flank.up + 1
utr.ext <- GRanges(seqnames=chr, strand=g.strand,
ranges=IRanges(start=ext.start, end=ext.end))
values(utr.ext) <- DataFrame(exon.anno='utr5*')
gr <- c(utr.ext, gr)
} else {
ext.start <- end(gr[take[length(take)]]) + 1L
ext.end <- ext.start + flank.up - 1L
utr.ext <- GRanges(seqnames=chr, strand=g.strand,
ranges=IRanges(start=ext.start, end=ext.end))
values(utr.ext) <- DataFrame(exon.anno='utr5*')
gr <- c(gr, utr.ext)
}
}
}
}
if (flank.down > 0) {
if (probably.rna) {
} else {
take <- which(values(gr)$exon.anno == 'utr3')
if (length(take) > 0) {
if (g.strand == '+') {
ext.start <- end(gr[take[length(take)]]) + 1L
ext.end <- ext.start + flank.down - 1L
utr.ext <- GRanges(seqnames=chr, strand=g.strand,
ranges=IRanges(start=ext.start, end=ext.end))
values(utr.ext) <- DataFrame(exon.anno='utr3*')
gr <- c(gr, utr.ext)
} else {
ext.end <- start(gr[take[1L]]) - 1L
ext.start <- ext.end - flank.down + 1
utr.ext <- GRanges(seqnames=chr, strand=g.strand,
ranges=IRanges(start=ext.start, end=ext.end))
values(utr.ext) <- DataFrame(exon.anno='utr3*')
gr <- c(utr.ext, gr)
}
}
}
}
metadata(gr) <- metaD
gr
}, force.eval=force.eval)
})
## Trying to implement cover-bin
## idealized2 <- function(x, by, collapse, cds.cover, which.chr, flank.up, flank.down, ...) {
## by <- match.arg(by)
## cds.cover <- match.arg(cds.cover)
## collapse <- matchGFGeneCollapse(collapse, x)
## reducef <- switch(collapse, constitutive='interset', 'union')
## metaD <- list()
## ## g.strand <- strand(x)
## g.strand <- as.vector(strand(transcripts(x, which.chr=which.chr)[[1]][1]))
## args <- list(...)
## force.eval <- args$force.eval
## var.name <- paste('idealized', collapse,
## sprintf('by-%s', by),
## sprintf('cdscover-%s', cds.cover),
## sprintf('utr5*-%d', flank.up),
## sprintf('utr3*-%d', flank.down),
## sep=".")
## if (!is.null(which.chr)) {
## var.name <- paste(var.name, which.chr, sep=".")
## }
## cacheFetch(x, var.name, {
## .idealizedBy <- switch(by, all=.idealizedByAll, cds=.idealizedByCds,
## utr5=.idealizedByUtr5, utr3=.idealizedByUtr3,
## longest=.idealizedByLongest)
## is.pc <- isProteinCoding(x, which.chr=which.chr)
## is.pc <- names(is.pc)[is.pc]
## probably.rna <- length(is.pc) == 0
## ## add a .txid. for unrolling later
## xcripts <- Map(function(xc, i) {
## values(xc)$.txid. <- rep(i, length(xc))
## xc
## }, transcripts(x), seq_along(xcripts))
## if (probably.rna) {
## ## A gene with non-protein coding transcripts is tagged as `utr` across
## ## across all of its exons in this scenario.
## ## TODO: Cleanup duplicated code in probably.rna if/else block
## if (collapse %in% c('longest', 'shortest')) {
## lens <- sapply(xcripts, function(xc) {
## width(range(ranges(xc)))
## })
## collapse <- switch(collapse, longest=which.max(lens), which.min(lens))
## metaD$tx_name <- values(xcripts)$tx_name[collapse]
## }
## if (is.numeric(collapse)) {
## ranges <- ranges(xcripts[[collapse]])
## } else {
## ranges <- Reduce(reducef, lapply(xcripts, ranges))
## }
## exon.anno <- c(rep('utr', length(ranges)))
## } else {
## take.pc <- function(.list, namez) {
## .list[names(.list) %in% namez]
## }
## takeOrEmpty <- function(x, idx) {
## if (length(x) < idx) IRanges() else x[[idx]]
## }
## .cds <- lapply(take.pc(cds(x, which.chr=which.chr), is.pc), ranges)
## .utr3 <- lapply(take.pc(utr3(x, which.chr=which.chr), is.pc), ranges)
## .utr5 <- lapply(take.pc(utr5(x, which.chr=which.chr), is.pc), ranges)
## if (collapse %in% c('longest', 'shortest')) {
## lens <- sapply(take.pc(xcripts, is.pc), function(xc) {
## width(range(ranges(xc)))
## })
## collapse <- switch(collapse, longest=which.max(lens), which.min(lens))
## metaD$tx_name <- values(xcripts)$tx_name[collapse]
## }
## if (is.numeric(collapse)) {
## ## Perhaps the Nth transcript doesn't have all of the expected segments
## ## in which case, we want to return an empty IRanges
## .exons <- list(utr5=takeOrEmpty(.utr5, collapse),
## cds=takeOrEmpty(.cds, collapse),
## utr3=takeOrEmpty(.utr3, collapse))
## } else {
## ## `Reduce`-ing over an empty list (which can result from a gene not
## ## having a .utr3) will produce a NULL in insteand of an empty IRanges
## ## object, which will throw errors in the setdiff operations below
## null2empty <- function(x) if (is.null(x)) IRanges() else x
## if (collapse == "cover-bin") {
## .cds <- null2empty(disjoin(do.call(c, .cds)))
## .utr3 <- null2empty(disjoin(do.call(c, .utr3)))
## .utr5 <- null2empty(disjoin(do.call(c, .utr5)))
## } else {
## .cds <- null2empty(Reduce(reducef, .cds))
## .utr3 <- null2empty(Reduce(reducef, .utr3))
## .utr5 <- null2empty(Reduce(reducef, .utr5))
## }
## if (cds.cover == 'min') {
## .cds <- setdiff(.cds, union(.utr3, .utr5))
## } else {
## .utr3 <- setdiff(.utr3, .cds)
## .utr5 <- setdiff(.utr5, .cds)
## }
## .exons <- .idealizedBy(.utr5, .cds, .utr3, cds.cover)
## }
## ranges <- c(.exons$utr5, .exons$cds, .exons$utr3)
## exon.anno <- c(rep('utr5', length(.exons$utr5)),
## rep('cds', length(.exons$cds)),
## rep('utr3', length(.exons$utr3)))
## }
## chr <- if (!is.null(which.chr)) which.chr else chromosome(x)[1]
## if (probably.rna && by == 'cds') {
## gr <- GRanges()
## values(gr) <- DataFrame(exon.anno=character())
## } else if (probably.rna || by == 'all') {
## gr <- GRanges(seqnames=rep(chr, length(ranges)),
## ranges=ranges,
## strand=rep(g.strand, length(ranges)))
## values(gr) <- DataFrame(exon.anno=exon.anno)
## gr <- gr[order(ranges(gr))]
## } else {
## gr <- lapply(.exons, function(e) {
## g <- GRanges(seqnames=rep(chr, length(e)),
## ranges=e,
## strand=rep(g.strand, length(e)))
## values(g) <- DataFrame(exon.anno=rep(by, length(g)))
## if (length(g) > 0) {
## g[order(ranges(g))]
## }
## g
## })
## gr <- GRangesList(gr)
## }
## if (flank.up > 0) {
## if (probably.rna) {
## } else {
## take <- which(values(gr)$exon.anno == 'utr5')
## if (length(take) > 0) {
## if (g.strand == '+') {
## ext.end <- start(gr[take[1L]]) - 1L
## ext.start <- ext.end - flank.up + 1
## utr.ext <- GRanges(seqnames=chr, strand=g.strand,
## ranges=IRanges(start=ext.start, end=ext.end))
## values(utr.ext) <- DataFrame(exon.anno='utr5*')
## gr <- c(utr.ext, gr)
## } else {
## ext.start <- end(gr[take[length(take)]]) + 1L
## ext.end <- ext.start + flank.up - 1L
## utr.ext <- GRanges(seqnames=chr, strand=g.strand,
## ranges=IRanges(start=ext.start, end=ext.end))
## values(utr.ext) <- DataFrame(exon.anno='utr5*')
## gr <- c(gr, utr.ext)
## }
## }
## }
## }
## if (flank.down > 0) {
## if (probably.rna) {
## } else {
## take <- which(values(gr)$exon.anno == 'utr3')
## if (length(take) > 0) {
## if (g.strand == '+') {
## ext.start <- end(gr[take[length(take)]]) + 1L
## ext.end <- ext.start + flank.down - 1L
## utr.ext <- GRanges(seqnames=chr, strand=g.strand,
## ranges=IRanges(start=ext.start, end=ext.end))
## values(utr.ext) <- DataFrame(exon.anno='utr3*')
## gr <- c(gr, utr.ext)
## } else {
## ext.end <- start(gr[take[1L]]) - 1L
## ext.start <- ext.end - flank.down + 1
## utr.ext <- GRanges(seqnames=chr, strand=g.strand,
## ranges=IRanges(start=ext.start, end=ext.end))
## values(utr.ext) <- DataFrame(exon.anno='utr3*')
## gr <- c(utr.ext, gr)
## }
## }
## }
## }
## metadata(gr) <- metaD
## gr
## }, force.eval=force.eval)
## })
.idealizedByUtr <- function(utr, .cds) {
if (length(utr) == 0) {
return(list(IRanges()))
}
cgaps <- gaps(.cds)
mm <- matchMatrix(findOverlaps(cgaps, utr))
if (nrow(mm) == 0) {
utr <- list(utr)
} else {
qh <- unique(mm[, 'query'])
sh <- unique(mm[, 'subject'])
utr.split <- lapply(qh, function(i) {
take <- mm[, 'query'] == i
utr[mm[take, 'subject']]
})
utr.rest <- setdiff(1:length(utr), sh)
utr <- list(c(utr.split, list(utr[utr.rest])))
}
utr
}
## Will split consecutive 3' UTRs into 2 GRanges if there is a cds region
## between the "idealized" 3'utr stretch
.idealizedByUtr3 <- function(.utr5, .cds, .utr3, cds.cover) {
.idealizedByUtr(.utr3, .cds)
}
.idealizedByUtr5 <- function(.utr5, .cds, .utr3, cds.cover) {
.idealizedByUtr(.utr5, .cds)
}
.idealizedByCds <- function(.utr5, .cds, .utr3, cds.cover) {
warning("idealized(gene, by='cds') is not really implemented yet.")
list(utr5=IRanges(), cds=.cds, utr3=IRanges())
}
.idealizedByAll <- function(.utr5, .cds, .utr3, cds.cover) {
list(utr5=.utr5, cds=.cds, utr3=.utr3)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.