#' @title GC Effects Aware Peak Calling
#'
#' @description
#' This function calls ChIP-seq peaks using potential GC effects information.
#' Enrichment scores are calculated on sliding windows of prefiltered
#' large regions, with GC effects considered. Permutation analysis is
#' used to determine significant binding peaks.
#'
#' @param coverage A list object returned by function \code{read5endCoverage}.
#'
#' @param gcbias A list object returned by function \code{gcEffects}.
#'
#' @param bdwidth A non-negative integer vector with two elements
#' specifying ChIP-seq binding width and peak detection half window size.
#' Usually generated by function \code{bindWidth}. A bad estimation of
#' bdwidth results no meaning of downstream analysis. The values
#' need to be the same as it is when calculating \code{gcbias}.
#'
#' @param flank A non-negative integer specifying the flanking width of
#' ChIP-seq binding. This parameter provides the flexibility that reads
#' appear in flankings by decreased probabilities as increased distance
#' from binding region. This paramter helps to define effective GC
#' content calculation. Default is NULL, which means this paramater will
#' be calculated from \code{bdwidth}. However, if customized numbers
#' provided, there won't be recalucation for this parameter; instead, the
#' 2nd elements of \code{bdwidth} will be recalculated based on \code{flank}.
#' The value needs to be the same as it is when calculating \code{gcbias}.
#'
#' @param prefilter A non-negative integer specifying the minimum of reads
#' to qualify a potential binding region. Regions with total of reads from
#' forward and reverse strands larger or equivalent to \code{prefilter} are
#' selected for downstream analysis. Default is 4.
#'
#' @param permute A non-negative integer specifying times of permutation to
#' be performed. Default is 5. When whole large genome is used, such as
#' human genome, 5 times of permutation could be enough.
#'
#' @param pv A numeric specifying p-value cutoff for significant
#' binding peaks. Default is 0.05.
#'
#' @param plot A logical vector which, when TRUE (default), returns density
#' plots of real and permutation enrichment scores.
#'
#' @param genome A \link[BSgenome]{BSgenome} object containing the sequences
#' of the reference genome that was used to align the reads, or the name of
#' this reference genome specified in a way that is accepted by the
#' \code{\link[BSgenome]{getBSgenome}} function defined in the \pkg{BSgenome}
#' software package. In that case the corresponding BSgenome data package
#' needs to be already installed (see \code{?\link[BSgenome]{getBSgenome}} in
#' the \pkg{BSgenome} package for the details). The value needs to be the same
#' as it is when calculating \code{gcbias}.
#'
#' @param gctype A character vector specifying choice of method to calculate
#' effective GC content. Default \code{ladder} is based on uniformed fragment
#' distribution. A more smoother method based on tricube assumption is also
#' allowed. However, tricube should be not used if estimated peak half size
#' is 3 times or more larger than estimated bind width. The value
#' needs to be the same as it is when calculating \code{gcbias}.
#'
#' @return A GRanges of peaks with meta columns:
#' \item{es}{Estimated enrichment score.}
#' \item{pv}{p-value.}
#'
#' @import S4Vectors
#' @import IRanges
#' @import GenomicRanges
#' @import Biostrings
#' @importFrom BSgenome getBSgenome
#' @importFrom BSgenome getSeq
#' @importFrom stats density
#' @importFrom graphics plot
#' @importFrom graphics lines
#' @importFrom graphics abline
#' @importFrom graphics legend
#' @importFrom methods as
#'
#' @export
#' @examples
#' bam <- system.file("extdata", "chipseq.bam", package="gcapc")
#' cov <- read5endCoverage(bam)
#' bdw <- bindWidth(cov)
#' gcb <- gcEffects(cov, bdw, sampling = c(0.15,1))
#' gcapcPeaks(cov, gcb, bdw)
gcapcPeaks <- function(coverage,gcbias,bdwidth,flank=NULL,prefilter=4L,
permute=5L,pv=0.05,plot=FALSE,genome="hg19",
gctype=c("ladder","tricube")){
genome <- getBSgenome(genome)
gctype <- match.arg(gctype)
bdw <- bdwidth[1]
halfbdw <- floor(bdw/2)
if(is.null(flank)){
pdwh <- bdwidth[2]
flank <- pdwh-bdw+halfbdw
}else{
pdwh <- flank+bdw-halfbdw
}
if(gctype=="ladder"){
weight <- c(seq_len(flank),rep(flank+1,bdw),rev(seq_len(flank)))
weight <- weight/sum(weight)
}else if(gctype=="tricube"){
w <- flank+halfbdw
weight <- (1-abs(seq(-w,w)/w)^3)^3
weight <- weight/sum(weight)
}
cat("Starting to call peaks.\n")
### prefiltering regions
cat("...... prefiltering regions\n")
seqs <- sapply(coverage$fwd,length)
seqs <- floor(seqs/bdw-10)*bdw
starts <- lapply(seqs, function(i) seq(1+bdw*10, i, bdw))
ends <- lapply(seqs, function(i) seq(bdw*11, i, bdw))
chrs <- rep(names(seqs), times=sapply(starts, length))
region <- GRanges(chrs, IRanges(start=unlist(starts),end=unlist(ends)))
regionsp <- resize(split(region,seqnames(region)),pdwh)
rcfwd <- unlist(viewSums(Views(coverage$fwd,
ranges(shift(regionsp,-flank)))))
rcrev <- unlist(viewSums(Views(coverage$rev,
ranges(shift(regionsp,halfbdw)))))
regions <- region[rcfwd+rcrev >= prefilter]
rm(seqs,starts,ends,chrs,region,regionsp,rcfwd,rcrev)
## extend both ends
regionsrc <- reduce(shift(resize(regions,halfbdw*6+flank*2+1),
-halfbdw*2-flank))
regionsgc <- shift(resize(regionsrc,width(regionsrc)+halfbdw*2),-halfbdw)
rm(regions)
### gc content
cat("...... caculating GC content\n")
nr <- shift(resize(regionsgc,width(regionsgc)+flank*2),-flank)
seqs <- getSeq(genome,nr)
gcpos <- startIndex(vmatchPattern("S", seqs, fixed="subject"))
gcposb <- vector("integer",sum(as.numeric(width(nr))))
gcposbi <- rep(seq_along(nr),times=width(nr))
gcposbsp <- split(gcposb,gcposbi)
for(i in seq_along(nr)){
gcposbsp[[i]][gcpos[[i]]] <- 1L
}
gcposbsprle <- as(gcposbsp,"RleList")
gcnuml <- width(regionsgc)-halfbdw*2
gcnum <- vector('numeric',sum(as.numeric(gcnuml)))
gcnumi <- rep(seq_along(nr),times=gcnuml)
gc <- split(gcnum,gcnumi)
for(i in seq_along(nr)){
gc[[i]] <- round(runwtsum(gcposbsprle[[i]],k=length(weight),
wt=weight),3)
if(i%%500 == 0) cat('.')
}
cat('\n')
rm(regionsgc,nr,seqs,gcpos,gcposb,gcposbi,
gcposbsp,gcposbsprle,gcnuml,gcnum,gcnumi)
### gc weight
cat("...... caculating GC effect weights\n")
gcwbase0 <- round(Rle(gcbias$mu0med1/gcbias$mu0),3)
gcw <- RleList(lapply(gc,function(x) gcwbase0[x*1000+1])) # slow
rm(gc,gcwbase0)
### reads count
regionrcsp <- split(regionsrc,seqnames(regionsrc))
rcfwd <- RleList(unlist(viewApply(Views(coverage$fwd,ranges(regionrcsp)),
runsum,k=pdwh)))
rcrev <- RleList(unlist(viewApply(Views(coverage$rev,ranges(regionrcsp)),
runsum,k=pdwh)))
### enrichment score
cat("...... estimating enrichment score\n")
esl <- width(regionsrc)-halfbdw*2-flank*2
ir1 <- IRangesList(start=IntegerList(as.list(rep(1,length(esl)))),
end=IntegerList(as.list(esl)))
ir2 <- IRangesList(start=IntegerList(as.list(rep(1+halfbdw+flank,
length(esl)))),end=halfbdw+flank+IntegerList(as.list(esl)))
ir3 <- IRangesList(start=IntegerList(as.list(rep(1+halfbdw*2+flank*2,
length(esl)))),
end=halfbdw*2+flank*2+IntegerList(as.list(esl)))
rc1 <- rcfwd[ir1]
rc2 <- rcfwd[ir2]
rc3 <- rcrev[ir1]
rc4 <- rcrev[ir2]
gcw1 <- gcw[ir2]
gcw2 <- gcw[ir3]
gcw3 <- gcw[ir1]
esrlt <- round(2*sqrt(rc1*rc4)*gcw1-rc3*gcw3-rc2*gcw2,3)
names(esrlt) <- seq_along(regionsrc)
rm(rcfwd,rcrev,gcw,rc1,rc2,rc3,rc4)
### permutation analysis
cat("...... permutation analysis\n")
perm <- table(c())
for(p in seq_len(permute)){
covfwdp <- coverage$fwd[ranges(regionrcsp)]
covrevp <- coverage$rev[ranges(regionrcsp)]
for(i in seq_along(coverage$fwd)){
covfwdp[[i]] <- covfwdp[[i]][sample.int(length(covfwdp[[i]]))]
covrevp[[i]] <- covrevp[[i]][sample.int(length(covrevp[[i]]))]
}
end <- cumsum(width(regionrcsp))
start <- end-width(regionrcsp)+1
regionrcspp <- IRangesList(start=start,end=end)
rcfwdp <- RleList(unlist(viewApply(Views(covfwdp,regionrcspp),
runsum,k=pdwh)))
rcrevp <- RleList(unlist(viewApply(Views(covrevp,regionrcspp),
runsum,k=pdwh)))
rcp1 <- rcfwdp[ir1]
rcp2 <- rcfwdp[ir2]
rcp3 <- rcrevp[ir1]
rcp4 <- rcrevp[ir2]
accum <- table(unlist(round(2*sqrt(rcp1*rcp4)*gcw1-rcp3*gcw3-
rcp2*gcw2,3),use.names=FALSE))
n <- intersect(names(perm), names(accum))
perm <- c(perm[!(names(perm) %in% n)],
accum[!(names(accum) %in% n)],
perm[n] + accum[n])
cat('......... permutation',p,'\n')
}
perm <- perm[order(as.numeric(names(perm)))]
pvs <- 1- cumsum(as.numeric(perm))/sum(as.numeric(perm))
names(pvs) <- names(perm)
perm <- Rle(as.numeric(names(perm)),perm)
rm(covfwdp,covrevp,start,end,regionrcspp,rcp1,rcp2,rcp3,rcp4,
rcfwdp,rcrevp,gcw1,gcw2,gcw3,esl,regionrcsp)
### report peaks
cat('...... reporting peaks\n')
sccut <- quantile(perm,1-pv)
minpv <- 1/length(perm)
cat('......... enrichment scores cut at',sccut,'\n')
if(plot){
cat('......... ploting enrichment scores\n')
es <- as.numeric(unlist(esrlt,use.names=FALSE))
perm <- as.numeric(perm)
plot(density(perm,bw=1),col='blue',xlab='enrichment score',
xlim=range(es),main=paste('es determined by pvalue',pv))
lines(density(es,bw=1),col='red')
abline(v=sccut,lty=2,col='purple')
legend('topright',c('real','perm'),lty=1,
col=c('red','blue'),bty='n')
rm(es)
}
rm(perm)
cat('......... reporting peak bumps\n')
esrltsl <- slice(esrlt,sccut,rangesOnly=TRUE)
esrltsl0 <- slice(esrlt,0,rangesOnly=TRUE)
esrltsle <- reduce(shift(resize(esrltsl,width(esrltsl)+halfbdw*2),
-halfbdw))
peaksir <- intersect(esrltsl0,esrltsle)
peaksir <- peaksir[width(peaksir)>=halfbdw]
rm(esrltsl,esrltsl0,esrltsle)
cat('......... summarizing peak score and pvalue\n')
peaksir <- unlist(peaksir)
regionids <- as.integer(names(peaksir))
peaksirlst <- IRangesList(start=IntegerList(as.list(start(peaksir))),
end=IntegerList(as.list(end(peaksir))))
if(length(regionids)>50000){
stepa <- seq(1,length(regionids),50000)
stepb <- seq(50000,length(regionids),50000)
if(length(stepb)+1 == length(stepa))
stepb <- c(stepb,length(regionids))
peaksc <- c()
for(i in seq_along(stepa)){
idx <- stepa[i]:stepb[i]
peaksc <- c(peaksc,max(esrlt[regionids[idx]][peaksirlst[idx]]))
}
}else{
peaksc <- max(esrlt[regionids][peaksirlst])
}
peaksir <- shift(peaksir,start(regionsrc)[regionids]+halfbdw+flank)
peaks <- GRanges(seqnames(regionsrc)[regionids],peaksir,es=peaksc)
peaksrd <- reduce(peaks,min.gapwidth=halfbdw,with.revmap=TRUE)
peaksrd$es <- sapply(peaksrd$revmap,function(i) max(peaks$es[i]))
peaksrd$revmap <- NULL
pvsn <- as.numeric(names(pvs))
pvsni <- sapply(peaksrd$es,function(x) sum(pvsn<=x))
pvsni[pvsni==0] <- NA
peaksrd$pv <- pvs[pvsni]
peaksrd$pv[peaksrd$es<min(pvsn)] <- 1
peaksrd$pv[peaksrd$pv==0] <- minpv
peaksrd
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.