#' Prepare CDS
#' @description Prepare CDS library from a TxDb object.
#' @param txdb A TxDb object.
#' @param withUTR Including UTR information or not.
#' @return A GRanges object with metadata which include:
#' tx_id: transcript id;
#' tx_name: transcript name;
#' gene_id: gene id;
#' isFirstExonInCDS: is first exon in CDS or not;
#' idFirstExonInCDS: the id for the first exon;
#' isLastExonInCDS: is last exon in CDS or not;
#' wid.cumsu: cumulative sums of number of bases in CDS;
#' internalPos: offset position from 1 base;
#' @importClassesFrom GenomicFeatures TxDb
#' @importFrom GenomicFeatures cdsBy fiveUTRsByTranscript
#' @importFrom AnnotationDbi select
#' @importFrom methods as is
#' @import GenomicRanges
#' @export
#' @examples
#' library(GenomicFeatures)
#' txdb_file <- system.file("extdata", "Biomart_Ensembl_sample.sqlite",
#' package="GenomicFeatures")
#' txdb <- loadDb(txdb_file)
#' CDS <- prepareCDS(txdb)
#'
prepareCDS <- function(txdb, withUTR = FALSE){
stopifnot(is(txdb, "TxDb"))
if(withUTR){
cds <- cdsBy(txdb, by = "tx", use.names = TRUE)
utr5 <- fiveUTRsByTranscript(txdb, use.names = TRUE)
utr3 <- threeUTRsByTranscript(txdb, use.names = TRUE)
CDS <- unlist(cds)
CDS$cds_id <- NULL
CDS$cds_name <- NULL
CDS$feature <- "CDS"
CDS$tx_name <- rep(names(cds), lengths(cds))
utrs <- list(UTR5=utr5, UTR3=utr3)
utrs <- mapply(utrs, names(utrs), FUN=function(.ele, .name){
UTR <- unlist(.ele)
UTR$exon_id <- NULL
UTR$exon_name <- NULL
UTR$feature <- .name
UTR$tx_name <- rep(names(.ele), lengths(.ele))
UTR
})
utrs <- unlist(GRangesList(utrs), use.names = FALSE)
exons <- c(utrs, CDS)
exons <- sort(exons)
exons$tx_id <-
as.numeric(factor(exons$tx_name, levels = unique(exons$tx_name)))
exons <-
exons[order(exons$tx_id, start(exons)*ifelse(strand(exons)=="-", -1, 1))]
suppressMessages(id_map <-
select(txdb, keys=unique(exons$tx_name),
columns = c("TXNAME", "GENEID", "TXTYPE"),
keytype="TXNAME"))
exons$gene_id <- id_map[match(exons$tx_name, id_map$TXNAME), "GENEID"]
exons$tx_type <- id_map[match(exons$tx_name, id_map$TXNAME), "TXTYPE"]
exons$oid <- paste(exons$tx_name, exons$feature)
isCDS <- exons$feature == "CDS"
exons$isFirstExonInCDS <- isCDS & !duplicated(exons$oid)
exons$idFirstExonInCDS <- ## make sure that table is sorted
rep(which(exons$isFirstExonInCDS), table(exons$tx_id))
exons$isLastExonInCDS <- isCDS & rev(!duplicated(rev(exons$oid)))
exons$isFirstExonInTx <- !duplicated(exons$tx_id)
exons$idFirstExonInTx <-
rep(which(exons$isFirstExonInTx), table(exons$tx_id))
exons$isLastExonInTx <- rev(!duplicated(rev(exons$tx_id)))
exons.wid <- width(exons)
isUTR5 <- exons$feature == "UTR5"
CDS.wid.cumsum <- cumsum(exons.wid[!isUTR5])
exons$wid.cumsum <- 0
exons.sub <- exons[!isUTR5]
idFirstExonInCDS <-
rep(which(exons.sub$isFirstExonInCDS), table(exons.sub$tx_id))
exons$wid.cumsum[!isUTR5] <-
CDS.wid.cumsum - c(0, CDS.wid.cumsum)[idFirstExonInCDS]
exons$internalPos <- 0
exons$internalPos[!(isUTR5 | exons$isFirstExonInCDS)] <-
exons$wid.cumsum[!(isUTR5 | exons$isLastExonInTx)]
## for UTR5
exons.sub <- rev(exons[isUTR5])
exons.sub$tx_id <-
as.numeric(factor(exons.sub$tx_name, levels = unique(exons.sub$tx_name)))
exons.sub$isFirstUTR5 <- !duplicated(exons.sub$oid)
exons.sub$idFirstUTR5 <-
rep(which(exons.sub$isFirstUTR5), table(exons.sub$tx_id))
exons.sub$isLastUTR5 <- rev(!duplicated(rev(exons.sub$oid)))
exons.sub.wid <- width(exons.sub)
exons.sub.cumsum <- cumsum(exons.sub.wid)
exons.sub$wid.cumsum <-
exons.sub.cumsum - c(0, exons.sub.cumsum)[exons.sub$idFirstUTR5]
exons.sub$internalPos <- 0
exons.sub$internalPos[!exons.sub$isFirstUTR5] <-
exons.sub$wid.cumsum[!exons.sub$isLastUTR5]
exons$wid.cumsum[isUTR5] <- -1 * rev(exons.sub$wid.cumsum)
exons$internalPos[isUTR5] <- -1 * rev(exons.sub$internalPos)
exons$oid <- NULL
return(exons)
}
cds <- cdsBy(txdb, by="tx", use.names = TRUE)
CDS <- unlist(cds)
CDS$cds_name <- NULL
CDS$tx_id <- rep(seq_along(cds), lengths(cds))
CDS$tx_name <- rep(names(cds), lengths(cds))
suppressMessages(id_map <-
select(txdb, keys=unique(CDS$tx_name),
columns = c("TXNAME", "GENEID", "TXTYPE"),
keytype="TXNAME"))
CDS$gene_id <- id_map[match(CDS$tx_name, id_map$TXNAME), "GENEID"]
CDS$tx_type <- id_map[match(CDS$tx_name, id_map$TXNAME), "TXTYPE"]
CDS$isFirstExonInCDS <- !duplicated(CDS$tx_id)
CDS$idFirstExonInCDS <- rep(which(!duplicated(CDS$tx_id)), lengths(cds))
CDS$isLastExonInCDS <- rev(!duplicated(rev(CDS$tx_id)))
CDS.wid <- width(CDS)
CDS.wid.cumsum <- cumsum(CDS.wid)
CDS$wid.cumsum <- CDS.wid.cumsum - c(0, CDS.wid.cumsum)[CDS$idFirstExonInCDS]
CDS$internalPos <- 0
CDS$internalPos[!CDS$isFirstExonInCDS] <- CDS$wid.cumsum[!CDS$isLastExonInCDS]
CDS
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.