#' coverage of chromosome regions
#'
#' calculate the coverage of 5'UTR, CDS and 3'UTR per transcript per bin.
#'
#'
#' @param cvglists A list of \link[IRanges:AtomicList-class]{SimpleRleList} or
#' \link[IRanges:AtomicList-class]{RleList}. It represents the coverage for
#' samples.
#' @param TxDb An object of \code{\link[GenomicFeatures:TxDb-class]{TxDb}}. It
#' is used for extracting the annotations.
#' @param upstream.cutoff,downstream.cutoff cutoff length for upstream or
#' downstream of transcript.
#' @param nbinsCDS,nbinsUTR,nbinsUpstream,nbinsDownstream The number of bins
#' for CDS, UTR, upstream and downstream.
#' @param includeIntron A logical value which indicates including intron or
#' not.
#' @param minCDSLen,minUTRLen minimal length of CDS or UTR of transcript.
#' @param maxCDSLen,maxUTRLen maximal length of CDS or UTR of transctipt.
#' @author Jianhong Ou
#' @seealso \link{binOverGene}, \link{plotBinOverRegions}
#' @export
#' @import IRanges
#' @import GenomicRanges
#' @importFrom GenomicFeatures transcripts
#' @importFrom S4Vectors mcols elementMetadata
#' @importFrom BiocGenerics strand start
#' @importFrom GenomeInfoDb seqinfo seqlevels seqnames
#' @examples
#'
#' if(Sys.getenv("USER")=="jianhongou"){
#' path <- system.file("extdata", package="ChIPpeakAnno")
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' library(rtracklayer)
#' files <- dir(path, "bigWig")
#' if(.Platform$OS.type != "windows"){
#' cvglists <- lapply(file.path(path, files), import,
#' format="BigWig", as="RleList")
#' names(cvglists) <- sub(".bigWig", "", files)
#' d <- binOverRegions(cvglists, TxDb.Hsapiens.UCSC.hg19.knownGene)
#' plotBinOverRegions(d)
#' }
#' }
#'
binOverRegions <- function(cvglists, TxDb,
upstream.cutoff=1000L,
downstream.cutoff=upstream.cutoff,
nbinsCDS=100L, nbinsUTR=20L,
nbinsUpstream=20L,
nbinsDownstream=nbinsUpstream,
includeIntron=FALSE,
minCDSLen=nbinsCDS,
minUTRLen=nbinsUTR,
maxCDSLen=Inf,
maxUTRLen=Inf){
if(inherits(cvglists, c("SimpleRleList", "RleList", "CompressedRleList"))){
cvglistsName <- substitute(deparse(cvglists))
cvglists <- list(cvglists)
names(cvglists) <- cvglistsName
}
if(!is(cvglists, "list")){
stop("cvglists must be a list of SimpleRleList or RleList")
}
cls <- sapply(cvglists, inherits,
what = c("SimpleRleList", "RleList", "CompressedRleList"))
if(any(!cls))
stop("cvglists must be a list of SimpleRleList or RleList")
stopifnot(is(TxDb, "TxDb"))
stopifnot(maxCDSLen>minCDSLen)
stopifnot(minCDSLen>=nbinsCDS)
stopifnot(maxUTRLen>minUTRLen)
stopifnot(minUTRLen>=nbinsUTR)
features <- toGRanges(TxDb, feature="geneModel")
features.reduced <- reduce(features)
feature_type <- c("upstream", "5UTR", "CDS", "3UTR", "downstream")
features <- features[features$feature_type %in% c("5UTR", "CDS", "3UTR")]
transcripts <- transcripts(TxDb, columns=c("tx_name", "gene_id"))
trx <- unique(mcols(transcripts))
features$gene_id <- trx[match(features$tx_name, trx$tx_name), "gene_id"]
features <- features[lengths(features$gene_id)>0]
features$tx_name <- sapply(features$gene_id, function(.ele) .ele[1])
features$gene_id <- NULL
features.disjoin <- disjoin(features, with.revmap=TRUE)
features.disjoin.1 <-
features.disjoin[rep(seq_along(features.disjoin),
lengths(features.disjoin$revmap))]
features.disjoin.1$revmap <- unlist(features.disjoin$revmap)
features.disjoin.1$feature_type <-
features$feature_type[features.disjoin.1$revmap]
features.disjoin.1$tx_name <- features$tx_name[features.disjoin.1$revmap]
features.disjoin.1$revmap2 <-
rep(seq_along(features.disjoin), lengths(features.disjoin$revmap))
feature_type2 <-
split(features.disjoin.1$feature_type, features.disjoin.1$revmap2)
feature_type2 <- lapply(feature_type2, unique)
feature_type2 <- feature_type2[lengths(feature_type2)==1]
features.disjoin.1 <-
features.disjoin.1[match(as.numeric(names(feature_type2)),
features.disjoin.1$revmap2)]
features.disjoin.1$seqn <- paste(features.disjoin.1$tx_name,
features.disjoin.1$feature_type,
as.character(seqnames(features.disjoin.1)))
features1 <- GRanges(seqnames = features.disjoin.1$seqn,
ranges = ranges(features.disjoin.1),
strand = strand(features.disjoin.1))
features1 <- reduce(features1)
mcols(features1) <-
data.frame(do.call(rbind,
strsplit(as.character(seqnames(features1)), " ")),
stringsAsFactors=FALSE)
colnames(mcols(features1)) <- c("tx_name", "feature_type", "seqn")
features1 <- GRanges(seqnames = features1$seqn,
ranges = ranges(features1),
strand = strand(features1),
tx_name = features1$tx_name,
feature_type = features1$feature_type)
seqinfo(features1) <- seqinfo(features)[seqlevels(features1)]
features <- features1
rm(list=c("features.disjoin", "features.disjoin.1",
"feature_type2", "features1"))
txs <- unique(features$tx_name)
txs <- txs[!is.na(txs)]
if(includeIntron){
cds <- features[features$feature_type=="CDS"]
cds <- GRanges(seqnames = paste(as.character(seqnames(cds)),
cds$tx_name),
ranges = ranges(cds), strand = strand(cds))
cds <- reduce(cds, min.gapwidth=1e9)
mcols(cds) <-
data.frame(do.call(rbind,
strsplit(as.character(seqnames(cds)), " ")),
stringsAsFactors = FALSE)
colnames(mcols(cds)) <- c("seqn", "tx_name")
cds <- GRanges(seqnames = cds$seqn,
ranges = ranges(cds), strand = strand(cds),
tx_name = cds$tx_name, feature_type = "CDS")
seqinfo(cds) <- seqinfo(features)[seqlevels(cds)]
features <- features[features$feature_type %in% c("5UTR", "3UTR")]
features <- c(features, cds)
}
## resort by txs
features <- features[order(as.numeric(factor(features$tx_name,
levels = txs)))]
## features must contain 5UTR, CDS and 3UTR
txs <- split(features$feature_type, features$tx_name)
txs <- lapply(txs, unique)
txs <- names(txs)[lengths(txs)==3]
features <- features[features$tx_name %in% txs]
## add upstream and downstream
if(upstream.cutoff>0 && downstream.cutoff>0){
genes <- GRanges(seqnames = paste(as.character(seqnames(features)),
features$tx_name),
ranges = ranges(features), strand = strand(features))
genes <- reduce(genes, min.gapwidth=1e9)
mcols(genes) <-
data.frame(do.call(rbind,
strsplit(as.character(seqnames(genes)), " ")),
stringsAsFactors = FALSE)
colnames(mcols(genes)) <- c("seqn", "tx_name")
genes <-
GRanges(seqnames = genes$seqn,
ranges = ranges(genes), strand = strand(genes),
tx_name = genes$tx_name)
seqinfo(genes) <- seqinfo(features)[seqlevels(genes)]
suppressWarnings({
upstream <- promoters(genes,
upstream = upstream.cutoff,
downstream = 0)})
upstream <- trim(upstream)
upstream <- upstream[width(upstream)>=nbinsUpstream]
revert.strand <- function(gr){
l <- as.character(strand(gr))
l.plus <- which(l=="+")
l.minus <- which(l=="-")
if(length(l.plus)>=1){
l[l.plus] <- "-"
}
if(length(l.minus)>=1){
l[l.minus] <- "+"
}
strand(gr) <- l
gr
}
genes.rev.strand <- revert.strand(genes)
suppressWarnings({downstream <-
promoters(genes.rev.strand,
upstream = downstream.cutoff,
downstream = 0)})
downstream <- revert.strand(downstream)
downstream <- trim(downstream)
downstream <- downstream[width(downstream)>=nbinsDownstream]
upstream$feature_type <- "upstream"
downstream$feature_type <- "downstream"
#remove the overlapping parts
rmOverlaps <- function(gr, tobeRemoved=features.reduced){
gr.intersect <- intersect(gr, tobeRemoved, ignore.strand=TRUE)
gr.intersect$tx_name <- NA
gr.intersect$feature_type <- NA
gr.disjoin <- disjoin(c(gr, gr.intersect),
with.revmap=TRUE, ignore.strand=TRUE)
gr.disjoin <- gr.disjoin[lengths(gr.disjoin$revmap)==1]
gr.disjoin$revmap <- unlist(gr.disjoin$revmap)
strand(gr.disjoin) <- strand(gr[gr.disjoin$revmap])
mcols(gr.disjoin) <- mcols(gr[gr.disjoin$revmap])
gr.disjoin
}
upstream <- rmOverlaps(upstream)
downstream <- rmOverlaps(downstream)
features <- c(features, upstream, downstream)
}
## split the features by seqnames and generate Views for cvglist
seqn <- Reduce(intersect, lapply(cvglists, names))
seqn <- intersect(seqn, seqlevels(features))
if(length(seqn)<1){
stop("Please check the names of cvglist. ",
"None of them in the seqlevels of TxDb.")
}
features <- features[seqnames(features) %in% seqn]
## make sure no overlaps in all the region
features.disjoin <- disjoin(features, with.revmap=TRUE)
features.disjoin <- features.disjoin[lengths(features.disjoin$revmap)==1]
features.disjoin$revmap <- unlist(features.disjoin$revmap)
features.disjoin$feature_type <-
features$feature_type[features.disjoin$revmap]
features.disjoin$tx_name <- features$tx_name[features.disjoin$revmap]
features.disjoin$revmap <- NULL
features.disjoin.tx_name <-
split(features.disjoin$feature_type, features.disjoin$tx_name)
features.disjoin.tx_name <- lapply(features.disjoin.tx_name, unique)
features.disjoin.tx_name <-
names(features.disjoin.tx_name)[lengths(features.disjoin.tx_name) ==
max(lengths(features.disjoin.tx_name))]
features.disjoin <- features.disjoin[features.disjoin$tx_name %in%
features.disjoin.tx_name]
seqinfo(features.disjoin) <- seqinfo(features)[seqlevels(features.disjoin)]
features <- features.disjoin
rm(features.disjoin)
## make sure features are sorted by pos
features <-
features[order(
as.numeric(factor(features$tx_name,
levels = txs)),
as.numeric(factor(features$feature_type,
levels =
feature_type[feature_type %in%
unique(
features$feature_type)])),
start(features))]
## filter by minCDSLen, min5UTR, min3UTR
filterByLen <- function(type, minlen, maxLen){
x <- features[features$feature_type %in% type]
if(length(x)<1) return(x)
x.width <- rowsum(width(x), x$tx_name, reorder = FALSE)
rownames(x.width)[x.width[, 1] >= minlen & x.width[, 1] < maxLen]
}
txs <- Reduce(intersect, mapply(filterByLen,
c("CDS", "5UTR", "3UTR",
"upstream", "downstream"),
c(minCDSLen, minUTRLen, minUTRLen,
nbinsUpstream, nbinsDownstream),
c(maxCDSLen, maxUTRLen, maxUTRLen,
Inf, Inf)))
if(length(txs)<2){
stop("less than 2 CDS left.")
}
features <- features[features$tx_name %in% txs]
## features must contain 5UTR, CDS, 3UTR, and/or upstream, downstream.
txs <- split(features$feature_type, features$tx_name)
txs <- lapply(txs, unique)
txs <- names(txs)[lengths(txs)==length(unique(features$feature_type))]
if(length(txs)<2){
stop("less than 2 CDS left.")
}
features <- features[features$tx_name %in% txs]
## calculate fators to balance the region be count multiple times
len <- length(unique(features$tx_name))
features.s <- split(features, seqnames(features))
features.s <- features.s[seqn]
cvglists <- lapply(cvglists, function(.ele) .ele[seqn])
features.l <- as(features.s, "IntegerRangesList")
bins=c("upstream"=nbinsUpstream,
"5UTR"=nbinsUTR,
"CDS" =nbinsCDS,
"3UTR"=nbinsUTR,
"downstream"=nbinsDownstream)
features.view <- lapply(cvglists, function(.ele) {
vw <- Views(.ele, features.l)
cntByChr <- lapply(vw, function(.vw){
.df <- elementMetadata(.vw)
.gr <- GRanges(paste(.df$feature_type, .df$tx_name), ranges(.vw),
feature_type=.df$feature_type, tx_name=.df$tx_name)
.gr.s <- split(.gr, seqnames(.gr))
.cvg <- subject(.vw)
.cvgs <- rep(list(.cvg), length(.gr.s))
.gr.l <- as(.gr.s, "IntegerRangesList")
names(.cvgs) <- names(.gr.l)
.cvgs <- as(.cvgs, "SimpleRleList")
.cvg.sub <- .cvgs[.gr.l]
### split the RleList into bins
.ir <- IRanges(rep(1, length(.cvg.sub)), lengths(.cvg.sub))
.ir <- IRanges::tile(.ir, n=bins[sub("^(.*?) .*$", "\\1",
names(.cvg.sub))])
names(.ir) <- names(.cvg.sub)
.cnt <- viewMeans(Views(.cvg.sub, .ir))
.cnt <- split(.cnt, sub("^(.*?) .*$", "\\1", names(.cnt)))
## make sure 5'->3'
.cnt <- lapply(.cnt, function(x){
strd <-
as.character(strand(features))[match(sub("^(.*?) (.*$)", "\\2",
names(x)),
features$tx_name)]
x <- split(x, strd)
x <- lapply(x, do.call, what=rbind)
if("-" %in% names(x)){
x[["-"]] <- x[["-"]][, ncol(x[["-"]]):1, drop=FALSE]
}
do.call(rbind, x)
})
lapply(.cnt, colSums)
})
cntByFeature <- swapList(cntByChr)
cntByFeature <- lapply(cntByFeature, function(.ele){
colSums(do.call(rbind, .ele))/len
})
})
d <- swapList(features.view)
d <- d[feature_type[feature_type %in% names(d)]]
d <- lapply(d, do.call, what=cbind)
return(d)
}
#' plot the coverage of regions
#'
#' plot the output of \link{binOverRegions} or \link{binOverGene}
#'
#'
#' @param dat A list of matrix which indicate the coverage of regions per bin
#' @param ... Parameters could be used by \link[graphics]{matplot}
#' @author Jianhong Ou
#' @seealso \link{binOverRegions}, \link{binOverGene}
#' @export
#' @importFrom S4Vectors elementNROWS
#' @importFrom graphics abline axis legend matplot
#' @examples
#'
#' if(interactive()){
#' path <- system.file("extdata", package="ChIPpeakAnno")
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' library(rtracklayer)
#' files <- dir(path, "bigWig")
#' if(.Platform$OS.type != "windows"){
#' cvglists <- lapply(file.path(path, files), import,
#' format="BigWig", as="RleList")
#' names(cvglists) <- sub(".bigWig", "", files)
#' d <- binOverGene(cvglists, TxDb.Hsapiens.UCSC.hg19.knownGene)
#' plotBinOverRegions(d)
#' }
#' }
#'
plotBinOverRegions <- function(dat, ...){
feature_type <- c("upstream", "5UTR", "CDS", "gene", "3UTR", "downstream")
if(!all(names(dat) %in% feature_type)){
stop("names of dat must be upstream, 5UTR, CDS, 3UTR or downstream")
}
feature_type <- feature_type[feature_type %in% names(dat)]
dat <- dat[feature_type]
bins <- elementNROWS(dat)
dat <- do.call(rbind, dat)
matplot(dat, type = "l", xaxt="n", ...)
bins.sum <- cumsum(bins) + .5
bins.sum <- bins.sum[-length(bins.sum)]
abline(v = bins.sum, lty=2, col="gray")
axis(side = 1, at = bins.sum,
labels = rep("", length(bins.sum)),
tick = TRUE, lwd=-1, lwd.ticks=1, ...)
bins.sum2 <- c(0, bins.sum) + bins/2
axis(side = 1, at = bins.sum2,
labels = feature_type, lwd=-1, ...)
legend.arg <- formals("legend")
legend.arg.names <- names(legend.arg)
legend.arg$x <- "topright"
legend.arg$box.col <- NA
legend.arg$merge <- FALSE
legend.arg$legend <- colnames(dat)
matplot.arg <- formals("matplot")
for(arg in c("col", "lty", "lwd", "bg")){
legend.arg[[arg]] <- matplot.arg[[arg]]
}
legend.arg <- c(list(...), legend.arg)
legend.arg <- legend.arg[!duplicated(names(legend.arg))]
legend.arg <- legend.arg[legend.arg.names]
legend.arg <- legend.arg[lengths(legend.arg)>0]
cex <- 1
text.col <- par("col")
text.font <- NULL
if(is.null(legend.arg[["cex"]])){
legend.arg[["cex"]] <- cex
}
do.call(legend, legend.arg)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.