#' @rdname truncateTxome
#' @param txdb an object representing a transcriptome
#' @param maxTxLength the maximum length of resulting transcripts
#' @param ... additional arguments
#'
#' @return a \code{TxDb} object
#' @export
#'
#' @importFrom methods setGeneric
setGeneric("truncateTxome", signature=c("txdb", "maxTxLength"),
function(txdb, maxTxLength=500, ...) standardGeneric("truncateTxome")
)
#' Truncate Transcriptome
#'
#' @rdname truncateTxome
#'
#' @param txdb a \code{TxDb} object
#' @param maxTxLength the maximum length of transcripts
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying whether
#' and how the method should be parallelized.
#' @return a \code{TxDb} object
#'
#' @examples
#' library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene)
#'
#' ## load annotation
#' txdb <- TxDb.Scerevisiae.UCSC.sacCer3.sgdGene
#'
#' ## restrict to 'chrI' transcripts
#' seqlevels(txdb) <- c("chrI")
#'
#' ## last 500 nts per tx
#' txdb_w500 <- truncateTxome(txdb)
#' txdb_w500
#'
#' ## last 100 nts per tx
#' txdb_w100 <- truncateTxome(txdb, maxTxLength=100)
#' txdb_w100
#'
#' @importFrom GenomicRanges GRangesList mcols
#' @importFrom GenomicFeatures exonsBy makeTxDbFromGRanges
#' @importFrom BiocParallel bplapply bpparam
#' @importFrom AnnotationDbi select taxonomyId
#' @importFrom S4Vectors queryHits subjectHits
#' @importFrom methods setMethod
#' @export
setMethod("truncateTxome", "TxDb", function(txdb,
maxTxLength=500,
BPPARAM=bpparam()) {
grlExons <- exonsBy(txdb, use.names=TRUE)
dfTxGene <- select(txdb, keys=names(grlExons),
keytype="TXNAME", columns="GENEID")
mapTxToGene <- setNames(dfTxGene$GENEID, dfTxGene$TXNAME)
message("Truncating transcripts...")
clipped <- bplapply(grlExons, .clipTranscript, maxTxLength=maxTxLength,
BPPARAM=BPPARAM)
clipped <- GRangesList(clipped)
message("Done.")
message("Checking for duplicate transcripts...")
overlaps <- findOverlaps(clipped, minoverlap=maxTxLength,
ignore.strand=FALSE,
drop.self=TRUE, drop.redundant=TRUE)
## ensure genes match
if (length(overlaps) > 0) {
idx_genes_match <- mapply(function (idx1, idx2) {
mapTxToGene[names(clipped[idx1])] == mapTxToGene[names(clipped[idx2])]
}, idx=queryHits(overlaps), idx2=subjectHits(overlaps))
overlaps <- overlaps[idx_genes_match]
}
## get duplicate indices
duplicates <- unique(queryHits(overlaps))
if (length(duplicates) > 0) {
clipped <- clipped[-duplicates]
}
message(sprintf("Removed %d duplicates.", length(duplicates)))
message("Creating exon ranges...")
## flatten with tx_id in metadata
grExons <- unlist(.mutateEach(clipped, transcript_id=names(clipped)))
names(grExons) <- NULL
mcols(grExons)["type"] <- "exon"
## add gene id
mcols(grExons)["gene_id"] <- mapTxToGene[mcols(grExons)$transcript_id]
## reindex exon info
grExons <- sort(grExons)
mcols(grExons)["exon_id"] <- seq_along(grExons)
mcols(grExons)["exon_name"] <- NULL
## TODO: include `exon_rank`
message("Done.")
message("Creating tx ranges...")
## generate transcripts GRanges with clipped bounds
grTxs <- unlist(GRangesList(bplapply(clipped, .fillReduce,
BPPARAM=BPPARAM)))
mcols(grTxs)['transcript_id'] <- names(grTxs)
mcols(grTxs)["type"] <- "transcript"
## add gene id
mcols(grTxs)["gene_id"] <- mapTxToGene[grTxs$transcript_id]
message("Done.")
message("Creating gene ranges...")
grGenes <- unlist(GRangesList(bplapply(split(grTxs, grTxs$gene_id),
.fillReduce, BPPARAM=BPPARAM)))
mcols(grGenes)['gene_id'] <- names(grGenes)
mcols(grGenes)["type"] <- "gene"
message("Done.")
dfMetadata <- data.frame(
name=c("Truncated by", "Maximum Transcript Length"),
value=c("txcutr", maxTxLength)
)
makeTxDbFromGRanges(c(grGenes, grTxs, grExons),
taxonomyId=taxonomyId(txdb),
metadata=dfMetadata)
}
)
#' Clip Transcript to Given Length
#'
#' Internal function for operating on individual \code{GRanges}, where ranges
#' represent exons in a transcript. This is designed to be used in an
#' \code{*apply} function over a \code{GRangesList} object.
#'
#' @param gr a \code{GRanges} object
#' @param maxTxLength a positive integer
#'
#' @return the clipped \code{GRanges} object
#'
#' @importFrom GenomicRanges GRanges width strand start end intersect
#' @importFrom IRanges IRanges
#'
.clipTranscript <- function (gr, maxTxLength) {
if (sum(width(gr)) <= maxTxLength) { ## already short enough
gr
} else { ## need to adjust
## adjustment is directed
txStrand <- strand(gr)
if (all(txStrand == "+")) {
## order by 3' ends
idx <- order(-end(gr))
## compute cumulative lengths
cumLength <- cumsum(width(gr[idx]))
## index of exon that exceeds maximum length
idxLast <- min(which(cumLength > maxTxLength))
## compute cutoff (genomic position)
startNew <- start(gr[idx[idxLast]]) + (cumLength[idxLast] - maxTxLength)
## new transcript interval
grMask <- GRanges(seqnames(gr[1]),
IRanges(startNew, max(end(gr))),
strand="+")
## clip exons with interval
intersect(gr, grMask)
} else if (all(txStrand == "-")) {
## order by 3' ends
idx <- order(start(gr))
## compute cumulative lengths
cumLength <- cumsum(width(gr[idx]))
## index of exon that exceeds maximum length
idxLast <- min(which(cumLength > maxTxLength))
## compute cutoff (genomic position)
endNew <- end(gr[idx[idxLast]]) - (cumLength[idxLast] - maxTxLength)
## new transcript interval
grMask <- GRanges(seqnames(gr[1]),
IRanges(min(start(gr)), endNew),
strand="-")
## clip exons with interval
intersect(gr, grMask)
} else {
warning("Skipping Transcript: Encountered inconsistent strand annotation!", gr)
gr
}
}
}
#' Convert GRanges to Single Range
#'
#' @param gr a \code{GRanges} with ranges to be merged.
#' @param validate logical determining whether entries should be checked for compatible
#' seqnames and strands.
#'
#' @return \code{GRanges} with single interval
#'
#' @details The validation assumes seqnames and strand are \code{Rle} objects.
#'
#' @importFrom GenomicRanges seqnames start end strand reduce start<- end<-
#' @importFrom S4Vectors nrun
.fillReduce <- function (gr, validate=TRUE) {
if (validate) {
stopifnot(nrun(seqnames(gr)) == 1,
nrun(strand(gr)) == 1)
}
## TODO: Check if faster to construct new GRanges
## Current implementation makes retention of seqinfo simple.
start(gr) <- min(start(gr))
end(gr) <- max(end(gr))
reduce(gr)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.