#' Obtain the genomic state per region from annotation
#'
#' This function summarizes the annotation contained in a
#' [TxDb][GenomicFeatures::TxDb-class] at each given base of the genome based
#' on annotated transcripts. It groups contiguous base pairs classified as the
#' same type into regions.
#'
#' @param txdb A [TxDb][GenomicFeatures::TxDb-class] object with chromosome lengths
#' (check `seqlengths(txdb)`). If you are using a
#' [TxDb][GenomicFeatures::TxDb-class] object created from a GFF/GTF file, you will
#' find this <https://support.bioconductor.org/p/93235/> useful.
#' @param chrs The names of the chromosomes to use as denoted in the
#' `txdb` object. Check [isActiveSeq][GenomicFeatures::TxDb-class].
#' @param ... Arguments passed to [extendedMapSeqlevels].
#'
#' @return A `GRangesList` object with two elements: `fullGenome` and
#' `codingGenome`. Both have metadata information for the type of region
#' (theRegion), transcript IDs (tx_id), transcript name (tx_name), and gene ID
#' (gene_id). `fullGenome` classifies each region as either being exon,
#' intron or intergenic. `codingGenome` classfies the regions as being
#' promoter, exon, intro, 5UTR, 3UTR or intergenic.
#'
#' @author Andrew Jaffe, Leonardo Collado-Torres
#' @seealso [TxDb][GenomicFeatures::TxDb-class]
#' @export
#'
#' @importFrom GenomicFeatures isActiveSeq 'isActiveSeq<-' intronsByTranscript
#' fiveUTRsByTranscript threeUTRsByTranscript exonsBy
#' @importFrom IRanges CharacterList IntegerList
#' @import S4Vectors
#' @importFrom GenomicRanges GRangesList seqnames
#' @importFrom GenomeInfoDb seqlengths seqlevels renameSeqlevels
#' @importMethodsFrom AnnotationDbi select
#' @importMethodsFrom GenomicRanges names 'names<-' reduce '$'
#' '$<-' '[' '[<-' sort disjoin length findOverlaps
#' strand 'strand<-' gaps width
#' @importMethodsFrom IRanges names unlist relist length lapply '['
#' @import S4Vectors
#' @importMethodsFrom GenomicFeatures promoters
#' @importFrom methods is
#'
#' @examples
#' ## Load the example data base from the GenomicFeatures vignette
#' library("GenomicFeatures")
#' samplefile <- system.file("extdata", "hg19_knownGene_sample.sqlite",
#' package = "GenomicFeatures"
#' )
#' txdb <- loadDb(samplefile)
#'
#' ## Generate genomic state object, only for chr6
#' sampleGenomicState <- makeGenomicState(txdb, chrs = "chr6")
#' #
#' \dontrun{
#' ## Create the GenomicState object for Hsapiens.UCSC.hg19.knownGene
#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
#'
#' ## Creating this GenomicState object takes around 8 min for all chrs and
#' ## around 30 secs for chr21
#' GenomicState.Hsapiens.UCSC.hg19.knownGene.chr21 <-
#' makeGenomicState(txdb = txdb, chrs = "chr21")
#'
#' ## For convinience, this object is already included in derfinder
#' library("testthat")
#' expect_that(
#' GenomicState.Hsapiens.UCSC.hg19.knownGene.chr21,
#' is_equivalent_to(genomicState)
#' )
#'
#' ## Hsapiens ENSEMBL GRCh37
#' library("GenomicFeatures")
#' ## Can take several minutes and speed will depend on your internet speed
#' xx <- makeTxDbPackageFromBiomart(
#' version = "0.99", maintainer = "Your Name",
#' author = "Your Name"
#' )
#' txdb <- loadDb(file.path(
#' "TxDb.Hsapiens.BioMart.ensembl.GRCh37.p11", "inst",
#' "extdata", "TxDb.Hsapiens.BioMart.ensembl.GRCh37.p11.sqlite"
#' ))
#'
#' ## Creating this GenomicState object takes around 13 min
#' GenomicState.Hsapiens.ensembl.GRCh37.p11 <- makeGenomicState(
#' txdb = txdb,
#' chrs = c(1:22, "X", "Y")
#' )
#'
#' ## Save for later use
#' save(GenomicState.Hsapiens.ensembl.GRCh37.p11,
#' file = "GenomicState.Hsapiens.ensembl.GRCh37.p11.Rdata"
#' )
#' }
#'
makeGenomicState <- function(txdb, chrs = c(seq_len(22), "X", "Y"), ...) {
stopifnot(is(txdb, "TxDb"))
## Use UCSC names for homo_sapiens by default
chrs <- extendedMapSeqlevels(chrs, ...)
## Select chrs to use
active_seq_original <- isActiveSeq(txdb)
isActiveSeq(txdb) <- names(isActiveSeq(txdb)) %in% chrs
## Check that the chrs have lengths. This happens when the TxDb was created
## from a GFF/GTF file.
all(!is.na(seqlengths(txdb)))
######### add introns
introns <- intronsByTranscript(txdb)
## Get gene_id and tx_name associated with tx_id
map <- select(txdb,
keys = names(introns), keytype = "TXID",
columns = c("TXID", "TXNAME", "GENEID")
) # for both R and R-devel
## Subset on transcripts that have a GENEID
map <- map[!is.na(map$GENEID), ]
## Group introns by gene
tx_names <- CharacterList(split(map$TXNAME, map$GENEID))
tx_id <- CharacterList(split(map$TXID, map$GENEID))
tx2gene <- data.frame(tx = unlist(unname(tx_id)), gene = as.integer(factor(rep(
names(tx_id),
elementNROWS(tx_id)
))))
txidRep <- rep(as.numeric(names(introns)), elementNROWS(introns))
txnameRep <- rep(
names(intronsByTranscript(txdb, use.names = TRUE)),
elementNROWS(introns)
)
geneRep <- tx2gene$gene[match(txidRep, tx2gene$tx)]
#### reduce introns
intronsRed <- reduce(introns@unlistData, with.revmap = TRUE)
introns2 <- introns@unlistData
mcols(introns2) <- DataFrame(txidRep, txnameRep, geneRep,
check.names = FALSE
)
mapping <- mcols(intronsRed)$revmap
addTx <- relist(
mcols(introns2)[unlist(mapping), "txidRep"],
mapping
)
addTxName <- relist(
mcols(introns2)[unlist(mapping), "txnameRep"],
mapping
)
addGene <- relist(
mcols(introns2)[unlist(mapping), "geneRep"],
mapping
)
iRegion <- rep("intronic", length(addTx))
values(intronsRed) <- DataFrame(iRegion, addTx, addTxName,
addGene,
check.names = FALSE
)
names(mcols(intronsRed)) <- c(
"region", "TxID", "TxName",
"Gene"
)
## promoters next
promoters <- promoters(txdb)
mcols(promoters)$Gene <- tx2gene$gene[match(
mcols(promoters)$tx_id,
tx2gene$tx
)]
promotersRed <- reduce(promoters, with.revmap = TRUE)
mapping <- mcols(promotersRed)$revmap
addTx <- relist(
mcols(promoters)[unlist(mapping), "tx_id"],
mapping
)
addTxName <- relist(
mcols(promoters)[unlist(mapping), "tx_name"],
mapping
)
addGene <- relist(
mcols(promoters)[unlist(mapping), "Gene"],
mapping
)
pRegion <- rep("promoter", length(addTx))
values(promotersRed) <- DataFrame(pRegion, addTx, addTxName,
addGene,
check.names = FALSE
)
names(mcols(promotersRed)) <- c(
"region", "TxID", "TxName",
"Gene"
)
##### UTRs 5'
utr5 <- fiveUTRsByTranscript(txdb)
txidRep5 <- rep(as.numeric(names(utr5)), elementNROWS(utr5))
geneRep5 <- tx2gene$gene[match(txidRep5, tx2gene$tx)]
txnameRep5 <- rep(
names(fiveUTRsByTranscript(txdb, use.names = TRUE)),
elementNROWS(utr5)
)
utr5a <- utr5@unlistData
mcols(utr5a) <- DataFrame(txidRep5, txnameRep5, geneRep5,
check.names = FALSE
)
utr5red <- reduce(utr5a, with.revmap = TRUE)
mapping <- mcols(utr5red)$revmap
addTx <- relist(
mcols(utr5a)[unlist(mapping), "txidRep5"],
mapping
)
addTxName <- relist(
mcols(utr5a)[unlist(mapping), "txnameRep5"],
mapping
)
addGene <- relist(
mcols(utr5a)[unlist(mapping), "geneRep5"],
mapping
)
uRegion <- rep("5UTR", length(addTx))
values(utr5red) <- DataFrame(uRegion, addTx, addTxName, addGene,
check.names = FALSE
)
names(mcols(utr5red)) <- c("region", "TxID", "TxName", "Gene")
# 3'
utr3 <- threeUTRsByTranscript(txdb)
txidRep3 <- rep(as.numeric(names(utr3)), elementNROWS(utr3))
geneRep3 <- tx2gene$gene[match(txidRep3, tx2gene$tx)]
txnameRep3 <- rep(
names(threeUTRsByTranscript(txdb, use.names = TRUE)),
elementNROWS(utr3)
)
utr3a <- utr3@unlistData
mcols(utr3a) <- DataFrame(txidRep3, txnameRep3, geneRep3,
check.names = FALSE
)
utr3red <- reduce(utr3a, with.revmap = TRUE)
mapping <- mcols(utr3red)$revmap
addTx <- relist(
mcols(utr3a)[unlist(mapping), "txidRep3"],
mapping
)
addTxName <- relist(
mcols(utr3a)[unlist(mapping), "txnameRep3"],
mapping
)
addGene <- relist(
mcols(utr3a)[unlist(mapping), "geneRep3"],
mapping
)
uRegion <- rep("3UTR", length(addTx))
values(utr3red) <- DataFrame(uRegion, addTx, addTxName, addGene,
check.names = FALSE
)
names(mcols(utr3red)) <- c("region", "TxID", "TxName", "Gene")
### exons
exons <- exonsBy(txdb)
txidRep <- rep(as.numeric(names(exons)), elementNROWS(exons))
geneRep <- tx2gene$gene[match(txidRep, tx2gene$tx)]
txnameRep <- rep(
names(exonsBy(txdb, use.names = TRUE)),
elementNROWS(exons)
)
exonsa <- exons@unlistData
mcols(exonsa) <- DataFrame(txidRep, txnameRep, geneRep, check.names = FALSE)
exonsRed <- reduce(exonsa, with.revmap = TRUE)
mapping <- mcols(exonsRed)$revmap
addTx <- relist(
mcols(exonsa)[unlist(mapping), "txidRep"],
mapping
)
addTxName <- relist(
mcols(exonsa)[unlist(mapping), "txnameRep"],
mapping
)
addGene <- relist(
mcols(exonsa)[unlist(mapping), "geneRep"],
mapping
)
eRegion <- rep("exon", length(addTx))
values(exonsRed) <- DataFrame(eRegion, addTx, addTxName,
addGene,
check.names = FALSE
)
names(mcols(exonsRed)) <- c("region", "TxID", "TxName", "Gene")
############ merge
codingGR <- c(
intronsRed, exonsRed, promotersRed, utr3red,
utr5red
)
codingGR <- sort(codingGR)
######## disjoin
dGR <- disjoin(codingGR)
theTx <- theGene <- IntegerList(vector("list", length(dGR)))
theTxName <- CharacterList(vector("list", length(dGR)))
theRegion <- rep(NA, length(dGR))
## introns
ooIntrons <- findOverlaps(dGR, intronsRed, type = "within")
theRegion[unique(queryHits(ooIntrons))] <- "intron"
theTx[unique(queryHits(ooIntrons))] <-
intronsRed$TxID[subjectHits(ooIntrons)]
theTxName[unique(queryHits(ooIntrons))] <-
intronsRed$TxName[subjectHits(ooIntrons)]
theGene[unique(queryHits(ooIntrons))] <-
intronsRed$Gene[subjectHits(ooIntrons)]
## promoters
ooPromoters <- findOverlaps(dGR, promotersRed, type = "within")
theRegion[unique(queryHits(ooPromoters))] <- "promoter"
theTx[unique(queryHits(ooPromoters))] <-
promotersRed$TxID[subjectHits(ooPromoters)]
theTxName[unique(queryHits(ooPromoters))] <-
promotersRed$TxName[subjectHits(ooPromoters)]
theGene[unique(queryHits(ooPromoters))] <-
promotersRed$Gene[subjectHits(ooPromoters)]
## Exons
ooExons <- findOverlaps(dGR, exonsRed, type = "within")
theRegion[unique(queryHits(ooExons))] <- "exon"
theTx[unique(queryHits(ooExons))] <- exonsRed$TxID[subjectHits(ooExons)]
theTxName[unique(queryHits(ooExons))] <-
exonsRed$TxName[subjectHits(ooExons)]
theGene[unique(queryHits(ooExons))] <- exonsRed$Gene[subjectHits(ooExons)]
## 5' UTRs
oo5UTR <- findOverlaps(dGR, utr5red, type = "within")
theRegion[unique(queryHits(oo5UTR))] <- "5UTR"
theTx[unique(queryHits(oo5UTR))] <- utr5red$TxID[subjectHits(oo5UTR)]
theTxName[unique(queryHits(oo5UTR))] <- utr5red$TxName[subjectHits(oo5UTR)]
theGene[unique(queryHits(oo5UTR))] <- utr5red$Gene[subjectHits(oo5UTR)]
## 3' UTRs
oo3UTR <- findOverlaps(dGR, utr3red, type = "within")
theRegion[unique(queryHits(oo3UTR))] <- "3UTR"
theTx[unique(queryHits(oo3UTR))] <- utr3red$TxID[subjectHits(oo3UTR)]
theTxName[unique(queryHits(oo3UTR))] <- utr3red$TxName[subjectHits(oo3UTR)]
theGene[unique(queryHits(oo3UTR))] <- utr3red$Gene[subjectHits(oo3UTR)]
## clean up vectors
theTx <- IntegerList(lapply(theTx, function(x) sort(unique(x))))
theTxName <- CharacterList(lapply(theTxName, function(x) sort(unique(x))))
theGene <- IntegerList(lapply(theGene, function(x) sort(unique(x))))
values(dGR) <- DataFrame(theRegion, theTx, theTxName, theGene,
check.names = FALSE
)
names(values(dGR)) <- c(
"theRegion", "tx_id", "tx_name",
"gene"
)
codingGR <- dGR
######## introns, exons, promoters
fullGR <- c(intronsRed, exonsRed)
######## disjoin
dGR <- disjoin(fullGR)
theTx <- theGene <- IntegerList(vector("list", length(dGR)))
theTxName <- CharacterList(vector("list", length(dGR)))
theRegion <- rep(NA, length(dGR))
## introns
ooIntrons <- findOverlaps(dGR, intronsRed, type = "within")
theRegion[unique(queryHits(ooIntrons))] <- "intron"
theTx[unique(queryHits(ooIntrons))] <-
intronsRed$TxID[subjectHits(ooIntrons)]
theTxName[unique(queryHits(ooIntrons))] <-
intronsRed$TxName[subjectHits(ooIntrons)]
theGene[unique(queryHits(ooIntrons))] <-
intronsRed$Gene[subjectHits(ooIntrons)]
## Exons
ooExons <- findOverlaps(dGR, exonsRed, type = "within")
theRegion[unique(queryHits(ooExons))] <- "exon"
theTx[unique(queryHits(ooExons))] <- exonsRed$TxID[subjectHits(ooExons)]
theTxName[unique(queryHits(ooExons))] <-
exonsRed$TxName[subjectHits(ooExons)]
theGene[unique(queryHits(ooExons))] <- exonsRed$Gene[subjectHits(ooExons)]
## clean up vectors
theTx <- IntegerList(lapply(theTx, function(x) sort(unique(x))))
theTxName <- CharacterList(lapply(theTxName, function(x) sort(unique(x))))
theGene <- IntegerList(lapply(theGene, function(x) sort(unique(x))))
values(dGR) <- DataFrame(theRegion, theTx, theTxName, theGene,
check.names = FALSE
)
names(values(dGR)) <- c(
"theRegion", "tx_id", "tx_name",
"gene"
)
fullGR <- dGR
######## reduce segments of the same type
rIndexes <- split(seq_len(length(fullGR)), fullGR$theRegion)
grList <- lapply(rIndexes, function(i) {
r <- fullGR[i]
rr <- reduce(r, with.revmap = TRUE)
mapping <- sapply(mcols(rr)$revmap, "[", 1)
values(rr) <- mcols(r)[mapping, ]
return(rr)
})
fullGR <- unlist(GRangesList(grList))
fullGR <- sort(fullGR)
rIndexes2 <- split(seq_len(length(codingGR)), codingGR$theRegion)
grList2 <- lapply(rIndexes2, function(i) {
r <- codingGR[i]
rr <- reduce(r, with.revmap = TRUE)
mapping <- sapply(mcols(rr)$revmap, "[", 1)
values(rr) <- mcols(r)[mapping, ]
return(rr)
})
codingGR <- unlist(GRangesList(grList2))
codingGR <- sort(codingGR)
#### gaps to get intergenic candidate regions
## full
mcols(fullGR)$Gene_Strand <- strand(fullGR)
strand(fullGR) <- Rle("*")
fullInterGR <- gaps(fullGR)
strand(fullGR) <- as.character(mcols(fullGR)$Gene_Strand)
mcols(fullGR)$Gene_Strand <- NULL
fList <- split(fullInterGR, seqnames(fullInterGR))
for (i in seq(along = fList)) {
x <- fList[[i]]
fList[[i]] <- x[width(x) != seqlengths(fullInterGR)[i]]
}
fullInterGR <- unlist(fList)
addTxName <- CharacterList(vector("list", length(fullInterGR)))
addTx <- addGene <- IntegerList(vector("list", length(fullInterGR)))
gRegion <- rep("intergenic", length(fullInterGR))
values(fullInterGR) <- DataFrame(gRegion, addTx, addTxName,
addGene,
check.names = FALSE
)
names(mcols(fullInterGR)) <- names(mcols(fullGR))
fullGenome <- c(fullInterGR, fullGR)
fullGenome <- sort(fullGenome)
names(fullGenome) <- seq(along = fullGenome)
## coding
mcols(codingGR)$Gene_Strand <- strand(codingGR)
strand(codingGR) <- Rle("*")
codingInterGR <- gaps(codingGR)
strand(codingGR) <- as.character(mcols(codingGR)$Gene_Strand)
mcols(codingGR)$Gene_Strand <- NULL
fList <- split(codingInterGR, seqnames(codingInterGR))
for (i in seq(along = fList)) {
x <- fList[[i]]
fList[[i]] <- x[width(x) != seqlengths(codingInterGR)[i]]
}
codingInterGR <- unlist(fList)
addTxName <- CharacterList(vector("list", length(codingInterGR)))
addTx <- addGene <- IntegerList(vector("list", length(codingInterGR)))
gRegion <- rep("intergenic", length(codingInterGR))
values(codingInterGR) <- DataFrame(gRegion, addTx, addTxName,
addGene,
check.names = FALSE
)
names(mcols(codingInterGR)) <- names(mcols(codingGR))
codingGenome <- c(codingInterGR, codingGR)
codingGenome <- sort(codingGenome)
names(codingGenome) <- seq(along = codingGenome)
## Use UCSC names for homo_sapiens by default
codingGenome <- renameSeqlevels(
codingGenome,
extendedMapSeqlevels(seqlevels(codingGenome), ...)
)
fullGenome <- renameSeqlevels(
fullGenome,
extendedMapSeqlevels(seqlevels(fullGenome), ...)
)
GenomicState <- GRangesList(
fullGenome = fullGenome,
codingGenome = codingGenome
)
## restore the active seqlevels
isActiveSeq(txdb) <- active_seq_original
## Done
return(GenomicState)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.