###########################################################################
##
## Copyright 2013, 2014 Charles Danko and Minho Chae.
##
## This program is part of the groHMM R package
##
## groHMM is free software: you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
## or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
## for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program. If not, see <http://www.gnu.org/licenses/>.
##
##########################################################################
#' getTxDensity Calculates transcript density.
#'
#' Calculates transcript density for transcripts which overlaps with
#' annotations.
#' For 'combined' or 'broken up' errors,
#' best overlapped transcripts or annotations are used.
#'
#' @param tx GRanges of transcripts.
#' @param annox GRanges of non-overlapping annotations.
#' @param plot Logical. If TRUE, plot transcript density. Default: FALSE
#' @param scale Numeric. Scaled size of a gene for transcript density
#' calculation.
#' Default: 1000L
#' @param BPPARAM Registered backend for BiocParallel.
#' Default: BiocParallel::bpparam()
#' @return Returns a list of FTD, TTD, PostTTS, and TUA.
#' @author Minho Chae
#' @examples
#' library(GenomicRanges)
#' tx <- GRanges("chr7", IRanges(start=seq(1000,4000, by=1000),
#' width=seq(1000, 1300, by=100)), strand=rep("+", 4))
#' annox <- GRanges("chr7", IRanges(start=seq(1100,4100, by=1000),
#' width=seq(900, 1200, by=100)), strand=rep("+", 4))
#' ## Not run:
#' # density <- getTxDensity(tx, annox)
getTxDensity <- function(tx, annox, plot=FALSE, scale=1000L,
BPPARAM=bpparam()) {
ol <- findOverlaps(tx, annox)
## Count tx
combinedGenes <- unique(queryHits(ol[duplicated(queryHits(ol)), ]))
## Count annox
brokenUp <- unique(subjectHits(ol[duplicated(subjectHits(ol)), ]))
message("Merged annotations: ", length(combinedGenes))
message("Dissociated a single annotation: ", length(brokenUp))
message("Overlaps between transcript and annotation:")
message("Total = ", length(ol))
## For each annox, find the best matching tx, combinedGenes case...
intx_rg <- pintersect(tx[queryHits(ol), ], annox[subjectHits(ol), ])
intx_rg_oRatio <- width(intx_rg) / width(tx[queryHits(ol), ])
## Tx matches multiple times on a same annox
remove_rg <-
unique(unlist(lapply(combinedGenes, function(x) {
inx <- which(queryHits(ol) == x)
m <- which.max(intx_rg_oRatio[inx])
inx[-m]
})))
if (length(remove_rg) > 0)
ol <- ol[-remove_rg, ]
## For each transcript, find the best matching annox, brokenUp case...,
brokenUp <- unique(subjectHits(ol[duplicated(subjectHits(ol)), ]))
intx_bu <- pintersect(tx[queryHits(ol), ], annox[subjectHits(ol), ])
intx_bu_oRatio <- width(intx_bu) / width(annox[subjectHits(ol), ])
## Annox matches multiple times on a same transcript
remove_bu <- unique(unlist(lapply(brokenUp, function(x) {
inx <- which(subjectHits(ol) == x)
m <- which.max(intx_bu_oRatio[inx])
inx[-m]
})))
if (length(remove_bu) > 0)
ol <- ol[-remove_bu, ]
message(" Used for density = ", length(ol))
olTx <- tx[queryHits(ol), ]
## Now get the coverage of selected transcripts
olStrand <- as.character(strand(tx[queryHits(ol), ]))
## Get the extended region for annox
up <- 1L
down <- 2L
message("Calculate overlapping ... ", appendLF=FALSE)
promo <- unlist(GRangesList(bplapply(subjectHits(ol), function(x) {
w <- width(annox[x, ])
promoters(annox[x, ], upstream=round(w*up), downstream=round(w*down))
}
, BPPARAM=BPPARAM)))
pintx <- pintersect(promo, olTx)
## Get the overlapped coverage
olcvg <- bplapply(1:length(ol), function(x) {
t <- olTx[x, ]
p <- promo[x, ]
i <- pintx[x, ]
## Position is relative to the minimum start
minStart <- min(start(t), start(p))
t <- shift(t, -minStart + 1)
p <- shift(p, -minStart + 1)
i <- shift(i, -minStart + 1)
r <- reduce(c(t, p, ignore.mcols=TRUE))
rTF <- logical(length=width(r))
rTF[start(i):end(i)] <- TRUE
if (olStrand[x] == "+")
Rle(rTF[start(p):end(p)])
else
rev(Rle(rTF[start(p):end(p)]))
}
, BPPARAM=BPPARAM)
message("OK")
message("Scale overlapping ... ", appendLF=FALSE)
## Get the scaled coverage
cvgWidth <- round(up * scale) + round(down * scale)
sccvg <- bplapply(olcvg, function(x) {
getLIValues(x, cvgWidth)
}
, BPPARAM=BPPARAM)
message("OK")
M <- sapply(sccvg, function(x) as.integer(x))
profile <- apply(M, 1, sum) / length(ol)
if (plot) {
## nocov start
plot(
- (up * scale):(down * scale-1), profile, ylim=c(0, 1), type="l",
xlab="Relative to TSS", ylab="Density")
abline(v=0, col="blue", lty=2)
abline(v=up*scale, col="blue", lty=2)
} # nocov end
FivePrimeFP <- sum(profile[1:scale]) / scale
TP <- sum(profile[(scale + 1):(scale * 2)]) / scale
PostTTS <- sum(profile[(scale * 2 + 1):(scale * 3)]) / scale
TUA <- (TP + (TP - FivePrimeFP)) / (1 + TP)
return(list(
FivePrimeFalsePositive=FivePrimeFP,
TruePositive=TP,
PostTranscriptionalTerminationSite=PostTTS,
TranscriptionUnitAccuracy=TUA))
}
getWP <- function (lv, lw) {
return( ( (lv-1) / (lw-1)) * (0:(lw-1)) + 1)
}
getLIValue <- function (x0, y0, x1, y1, x) {
alpha <- (x - x0) / (x1 - x0)
return(y0 + alpha * (y1 - y0))
}
## Get linear interpolation data
getLIValues <- function (vals, n) {
vals <- as.integer(vals)
wp <- getWP(length(vals), n)
result <- seq(n)
if (length(vals) == 1) {
result[1:length(result)] <- vals
return(result)
}
result[1] <- vals[1]
result[length(result)] <- vals[length(vals)]
if (n > 2) {
for (i in 2:(n-1)) {
x <- wp[i]
x0 <- floor(x)
result[i] <- getLIValue(x0, vals[x0], x0+1, vals[x0+1], x)
}
}
return(Rle(round(result)))
}
#' evaluateHMM Evaluates HMM calling.
#'
#' Evaluates HMM calling of transcripts compared to known annotations.
#'
#' @param tx GRanges of transcripts predicted by HMM.
#' @param annox GRanges of non-overlapping annotations.
#' @return a list of error information; merged annotations, dissociated
#' annotation, total, and rate.
#' @author Minho Chae
#' @examples
#' library(GenomicRanges)
#' tx <- GRanges("chr7", IRanges(start=seq(100, 1000, by=200),
#' width=seq(100, 1000, by=100)), strand="+")
#' annox <- GRanges("chr7", IRanges(start=seq(110, 1100, by=150),
#' width=seq(100, 1000, by=150)), strand="+")
#' error <- evaluateHMMInAnnotations(tx, annox)
evaluateHMMInAnnotations <- function (tx, annox) {
o <- findOverlaps(tx, annox)
## count tx
merged <- length(unique(queryHits(o[duplicated(queryHits(o)), ])))
## count annox
dissociated <- length(unique(subjectHits(o[duplicated(subjectHits(o)), ])))
eval <- data.frame(
merged=merged, dissociated=dissociated,
total=merged + dissociated,
errorRate=c(merged + dissociated) /
(length(tx) + length(annox)),
txSize=length(tx))
intx <- pintersect(tx[queryHits(o), ], annox[subjectHits(o), ])
overlap <- data.frame(
txRatio=width(intx)/width(tx[queryHits(o), ]),
annoxRatio= width(intx)/width(annox[subjectHits(o), ]),
overBases=width(intx),
similarity=width(intx)/
pmax(
width(tx[queryHits(o), ]),
width(annox[subjectHits(o), ])))
return(list(eval=eval, overlap=overlap))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.