###########################################################################
##
## Copyright 2013, 2014 Charles Danko and Minho Chae.
##
## This program is part of the groHMM R package
##
## groHMM is free software: you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
## or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
## for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program. If not, see <http://www.gnu.org/licenses/>.
##
##########################################################################
################################################################################
##
## 4/24/2012
##
## Thanks to Andre Martins for the following two functions, useful for
## calculating the confidence interval for a ratio of Poisson random variables.
##
## Based on: Ederer F, Mantel N (1974); Confidence Limits on the Ratio of Two
## Poisson Variables. AMERICAN JOURNAL OP EPIDEMIOLOGY.
##
###############################################################################
approx_ratio_CI <- function(x1, x2, alpha=0.05) {
t <- qnorm(1 - alpha/2)
n <- x1 + x2
zp <- (t^2/2 + x1 + 1/2)^2 - (n + t^2) / n * (x1 + 1/2)^2
zn <- (t^2/2 + x1 - 1/2)^2 - (n + t^2) / n * (x1 - 1/2)^2
a <- (t^2/2 + x1 + 1/2 + sqrt(zp)) / (n + t^2/2 - x1 - 1/2 - sqrt(zp))
b <- (t^2/2 + x1 - 1/2 - sqrt(zn)) / (n + t^2/2 - x1 + 1/2 + sqrt(zn))
return(c(b, a))
}
approx_ratios_CI <- function(num.counts, denom.counts, alpha=0.05) {
stopifnot(length(num.counts) == length(denom.counts))
N <- length(num.counts)
result <- matrix(data=0, nrow=N, ncol=2)
for (i in 1:N)
result[i, ] <- approx_ratio_CI(num.counts[i], denom.counts[i], alpha)
return(result)
}
#' Returns the pausing index for different genes. TODO: DESCRIBE THE PAUSING
#' INDEX.
#'
#' @param features A GRanges object representing a set of genomic coordinates.
#' @param reads A GRanges object representing a set of mapped reads.
#' @param size The size of the moving window.
#' @param up Distance upstream of each f to align and histogram.
#' @param down Distance downstream of each f to align and histogram (NULL).
#' @param BPPARAM Registered backend for BiocParallel.
#' Default: BiocParallel::bpparam()
#' @return Returns a data.frame of the pausing indices for the input genes.
#' @author Charles G. Danko and Minho Chae.
#' @return Returns the pausing index for different genes.
#' @examples
#' library(GenomicAlignments)
#' features <- GRanges("chr7", IRanges(2394474,2420377), strand="+")
#' reads <- as(readGAlignments(system.file("extdata", "S0mR1.bam",
#' package="groHMM")), "GRanges")
#' ## Not run:
#' # pi <- pausingIndex(features, reads)
##
## Arguments:
## f -> data.frame of: CHR, START, END, STRAND.
## p -> data.frame of: CHR, START, END, STRAND.
## size -> The size of the moving window.
## up -> Distance upstream of each f to align and histogram.
## down -> Distance downstream of each f to align and histogram (NULL).
##
## TODO:
## (1) Write C function.
## (2) Implement: ...
## (a) promWind-> As an alternative to up/down, specify a specific window
## in which to check for the pause-peak.
## Uses the same format as "f", expect that features are mapped with
## respect to the start of f, where 0 indicates the start of
## transcription, and negative numbers specify upstream sequence.
## This is likely useful for rate; perhaps for identifying internal
## paused-peaks...
pausingIndex <- function(features, reads, size=50, up=1000, down=1000,
BPPARAM=bpparam()) {
## make sure reads are sorted
reads <- reads[order(as.character(seqnames(reads)), start(reads)), ]
f <- data.frame(chrom=as.character(seqnames(features)),
start=as.integer(start(features)), end=as.integer(end(features)),
strand=as.character(strand(features)))
if ("symbol" %in% names(mcols(features))){
f <- cbind(f, symbol=features$symbol)
} else {
f <- cbind(f, symbol=GeneID <- as.character(seq_len(NROW(f))))
}
p <- data.frame(
chrom=as.character(seqnames(reads)), start=as.integer(start(reads)),
end=as.integer(end(reads)), strand=as.character(strand(reads)))
C <- sort(as.character(unique(f[[1]])))
Pause <- rep(0, NROW(f))
Body <- rep(0, NROW(f))
Fish <- rep(0, NROW(f))
GeneID <- rep("", NROW(f))
CIl <- rep(0, NROW(f))
CIu <- rep(0, NROW(f))
## Pass back information for the fisher test...
PauseCounts <- rep(0, NROW(f))
BodyCounts <- rep(0, NROW(f))
UpCounts <- rep(0, NROW(f))
UgCounts <- rep(0, NROW(f))
size <- as.integer(size)
up <- as.integer(up)
down <- as.integer(down)
###### Calculate PLUS and MINUS index, for DRY compliance.
PLUS_INDX <- which(f[[4]] == "+")
MINU_INDX <- which(f[[4]] == "-")
###### Identify TSS -- Start for '+' strand, End for '-' strand.
c_tss_indx <- rep(0, NROW(f))
c_tss_indx[PLUS_INDX] <- 2
c_tss_indx[MINU_INDX] <- 3
c_tss <- unlist(lapply(c(1:NROW(f)), function(x) {
f[x, c_tss_indx[x]]
}
))
###### Now calculate left and right position for gene body, based
### on '+' or '-'.
### Calculate gene end. Gene start is contiguous with the coordinates
###for the promoter.
c_gene_end_indx <- rep(0, NROW(f))
c_gene_end_indx[PLUS_INDX] <- 3
c_gene_end_indx[MINU_INDX] <- 2
c_gene_end <- unlist(lapply(c(1:NROW(f)), function(x) {
f[x, c_gene_end_indx[x]]
}
))
### Assign left and right.
gLEFT <- rep(0, NROW(c_tss))
gRIGHT <- rep(0, NROW(c_tss))
gLEFT[PLUS_INDX] <- c_tss[PLUS_INDX] + down
gRIGHT[PLUS_INDX] <- c_gene_end[PLUS_INDX]
gLEFT[MINU_INDX] <- c_gene_end[MINU_INDX]
gRIGHT[MINU_INDX] <- c_tss[MINU_INDX] - down
## Run parallel version.
mcp <- bplapply(
c(1:NROW(C)), pausingIndex_foreachChrom, C=C, f=f, p=p, gLEFT=gLEFT,
gRIGHT=gRIGHT, c_tss=c_tss, size=size, up=up, down=down,
BPPARAM=BPPARAM)
## Unlist and re-order values for printing in a nice data.frame.
for (i in 1:NROW(C)) {
## Which KG? prb?
indxF <- which(as.character(f[[1]]) == C[i])
indxPrb <- which(as.character(p[[1]]) == C[i])
if ( (NROW(indxF) >0) & (NROW(indxPrb) >0)) {
Pause[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["Pause"]]
Body[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["Body"]]
Fish[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["Fish"]]
GeneID[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["GeneID"]]
PauseCounts[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["PauseCounts"]]
BodyCounts[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["BodyCounts"]]
UpCounts[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["UpCounts"]]
UgCounts[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["UgCounts"]]
CIl[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["CIl"]]
CIu[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["CIu"]]
}
}
return(data.frame(
Pause=Pause, Body=Body, Fisher=Fish, GeneID=GeneID, CIlower=CIl,
CIupper=CIu, PauseCounts=PauseCounts, BodyCounts=BodyCounts,
uPCounts=UpCounts, uGCounts=UgCounts))
}
pausingIndex_foreachChrom <- function(i, C, f, p, gLEFT, gRIGHT, c_tss, size,
up, down) {
## Which KG? prb?
indxF <- which(as.character(f[[1]]) == C[i])
indxPrb <- which(as.character(p[[1]]) == C[i])
if ( (NROW(indxF) >0) & (NROW(indxPrb) >0)) {
## Order -- Make sure, b/c this is one of our main assumptions.
## Otherwise violated for DBTSS.
Ford <- order(f[indxF, 2])
Pord <- order(p[indxPrb, 2])
## Type coersions.
FeatureStart <- as.integer(gLEFT[indxF][Ford])
FeatureEnd <- as.integer(gRIGHT[indxF][Ford])
FeatureStr <- as.character(f[indxF, 4][Ford])
FeatureTSS <- as.integer(c_tss[indxF][Ford])
PROBEStart <- as.integer(p[indxPrb, 2][Pord])
PROBEEnd <- as.integer(p[indxPrb, 3][Pord])
PROBEStr <- as.character(p[indxPrb, 4][Pord])
## Set dimensions.
dim(FeatureStart) <- c(NROW(FeatureStart), NCOL(FeatureStart))
dim(FeatureEnd) <- c(NROW(FeatureEnd), NCOL(FeatureEnd))
dim(FeatureTSS) <- c(NROW(FeatureTSS), NCOL(FeatureTSS))
dim(FeatureStr) <- c(NROW(FeatureStr), NCOL(FeatureStr))
dim(PROBEStart) <- c(NROW(PROBEStart), NCOL(PROBEStart))
dim(PROBEEnd) <- c(NROW(PROBEEnd), NCOL(PROBEEnd))
dim(PROBEStr) <- c(NROW(PROBEStr), NCOL(PROBEStr))
## Run the calculations on the gene.
## Calculate the maximal 50 bp window.
HPause <- .Call(
"NumberOfReadsInMaximalSlidingWindow", FeatureTSS, FeatureStr,
PROBEStart, PROBEEnd, PROBEStr, size, up, down, PACKAGE="groHMM")
## Run the calculate on the gene body...
HGeneBody <- .Call(
"CountReadsInFeatures", FeatureStart, FeatureEnd, FeatureStr,
PROBEStart, PROBEEnd, PROBEStr, PACKAGE="groHMM")
## Get size of gene body
Difference <- FeatureEnd-FeatureStart
Difference[Difference < 0] <- 0
## Genes < 1kb, there is no suitable area in the body of the gene.
## Now use Fisher's Exact.
## Make uniform reads.
Up <- round(HPause + HGeneBody) * size / (size + Difference)
Ug <- round(HPause + HGeneBody) * Difference / (size + Difference)
HFish <- unlist(lapply(c(1:NROW(Ford)), function(x) {
fisher.test(
data.frame(
c(HPause[x], round(Up[x])),
c(HGeneBody[x], round(Ug[x]))
)
)$p.value
} ))
## Make return values.
Pause_c <- as.double(HPause / size)
Body_c <- as.double( (HGeneBody + 1) / Difference)
## 6-5-2012 -- Add a pseudocount here,
## forcing at least 1 read in the gene body.
Fish_c <- as.double(HFish)
GeneID_c <- as.character(f[indxF, 5][Ford])
PauseCounts_c <- HPause
BodyCounts_c <- HGeneBody
UpCounts_c <- round(Up)
UgCounts_c <- round(Ug)
aCI <- approx_ratios_CI(HPause, HGeneBody)
scaleFactor <- Difference/size ## Body/ pause, must be 1/ PI units.
CIl_c <- as.double(aCI[, 1]*scaleFactor)
CIu_c <- as.double(aCI[, 2]*scaleFactor)
return(list(
Pause=Pause_c, Body=Body_c, Fish=Fish_c, GeneID=GeneID_c,
PauseCounts=PauseCounts_c, BodyCounts=BodyCounts_c,
UpCounts=UpCounts_c, UgCounts=UgCounts_c, CIl=CIl_c, CIu=CIu_c,
ord=Ford))
}
return(integer(0))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.