gcapc: GC Aware Peak Caller

#' @title GC Effects Aware Peak Calling
#'
#' @description
#' This function calls ChIP-seq peaks using potential GC effects information.
#' Enrichment scores are calculated on sliding windows of prefiltered
#' large regions, with GC effects considered. Permutation analysis is
#' used to determine significant binding peaks.
#'
#' @param coverage A list object returned by function \code{read5endCoverage}.
#'
#' @param gcbias A list object returned by function \code{gcEffects}.
#'
#' @param bdwidth A non-negative integer vector with two elements
#' specifying ChIP-seq binding width and peak detection half window size.
#' Usually generated by function \code{bindWidth}. A bad estimation of
#' bdwidth results no meaning of downstream analysis. The values
#' need to be the same as it is when calculating \code{gcbias}. 
#'
#' @param flank A non-negative integer specifying the flanking width of
#' ChIP-seq binding. This parameter provides the flexibility that reads
#' appear in flankings by decreased probabilities as increased distance
#' from binding region. This paramter helps to define effective GC
#' content calculation. Default is NULL, which means this paramater will
#' be calculated from \code{bdwidth}. However, if customized numbers
#' provided, there won't be recalucation for this parameter; instead, the
#' 2nd elements of \code{bdwidth} will be recalculated based on \code{flank}.
#' The value needs to be the same as it is when calculating \code{gcbias}.
#'
#' @param prefilter A non-negative integer specifying the minimum of reads
#' to qualify a potential binding region. Regions with total of reads from
#' forward and reverse strands larger or equivalent to \code{prefilter} are
#' selected for downstream analysis. Default is 4.
#'
#' @param permute A non-negative integer specifying times of permutation to
#' be performed. Default is 5. When whole large genome is used, such as
#' human genome, 5 times of permutation could be enough.
#'
#' @param pv A numeric specifying p-value cutoff for significant
#' binding peaks. Default is 0.05.
#'
#' @param plot A logical vector which, when TRUE (default), returns density
#' plots of real and permutation enrichment scores.
#'
#' @param genome A \link[BSgenome]{BSgenome} object containing the sequences
#' of the reference genome that was used to align the reads, or the name of
#' this reference genome specified in a way that is accepted by the
#' \code{\link[BSgenome]{getBSgenome}} function defined in the \pkg{BSgenome}
#' software package. In that case the corresponding BSgenome data package
#' needs to be already installed (see \code{?\link[BSgenome]{getBSgenome}} in
#' the \pkg{BSgenome} package for the details). The value needs to be the same
#' as it is when calculating \code{gcbias}.
#'
#' @param gctype A character vector specifying choice of method to calculate
#' effective GC content. Default \code{ladder} is based on uniformed fragment
#' distribution. A more smoother method based on tricube assumption is also
#' allowed. However, tricube should be not used if estimated peak half size
#' is 3 times or more larger than estimated bind width. The value
#' needs to be the same as it is when calculating \code{gcbias}.
#'
#' @return A GRanges of peaks with meta columns:
#' \item{es}{Estimated enrichment score.}
#' \item{pv}{p-value.}
#'
#' @import S4Vectors
#' @import IRanges
#' @import GenomicRanges
#' @import Biostrings
#' @importFrom BSgenome getBSgenome
#' @importFrom BSgenome getSeq
#' @importFrom stats density
#' @importFrom graphics plot
#' @importFrom graphics lines
#' @importFrom graphics abline
#' @importFrom graphics legend
#' @importFrom methods as
#'
#' @export
#' @examples
#' bam <- system.file("extdata", "chipseq.bam", package="gcapc")
#' cov <- read5endCoverage(bam)
#' bdw <- bindWidth(cov)
#' gcb <- gcEffects(cov, bdw, sampling = c(0.15,1))
#' gcapcPeaks(cov, gcb, bdw)

gcapcPeaks <- function(coverage,gcbias,bdwidth,flank=NULL,prefilter=4L,
                       permute=5L,pv=0.05,plot=FALSE,genome="hg19",
                       gctype=c("ladder","tricube")){
    genome <- getBSgenome(genome)
    gctype <- match.arg(gctype)
    bdw <- bdwidth[1]
    halfbdw <- floor(bdw/2)
    if(is.null(flank)){
        pdwh <- bdwidth[2]
        flank <- pdwh-bdw+halfbdw
    }else{
        pdwh <- flank+bdw-halfbdw
    }
    if(gctype=="ladder"){
      weight <- c(seq_len(flank),rep(flank+1,bdw),rev(seq_len(flank)))
      weight <- weight/sum(weight)
    }else if(gctype=="tricube"){
      w <- flank+halfbdw
      weight <- (1-abs(seq(-w,w)/w)^3)^3
      weight <- weight/sum(weight)
    }
    cat("Starting to call peaks.\n")
    ### prefiltering regions
    cat("...... prefiltering regions\n")
    seqs <- sapply(coverage$fwd,length)
    seqs <- floor(seqs/bdw-10)*bdw
    starts <- lapply(seqs, function(i) seq(1+bdw*10, i, bdw))
    ends <- lapply(seqs, function(i) seq(bdw*11, i, bdw))
    chrs <- rep(names(seqs), times=sapply(starts, length))
    region <- GRanges(chrs, IRanges(start=unlist(starts),end=unlist(ends)))
    regionsp <- resize(split(region,seqnames(region)),pdwh)
    rcfwd <- unlist(viewSums(Views(coverage$fwd,
                 ranges(shift(regionsp,-flank)))))
    rcrev <- unlist(viewSums(Views(coverage$rev,
                 ranges(shift(regionsp,halfbdw)))))
    regions <- region[rcfwd+rcrev >= prefilter]
    rm(seqs,starts,ends,chrs,region,regionsp,rcfwd,rcrev)
    ## extend both ends
    regionsrc <- reduce(shift(resize(regions,halfbdw*6+flank*2+1),
                              -halfbdw*2-flank))
    regionsgc <- shift(resize(regionsrc,width(regionsrc)+halfbdw*2),-halfbdw)
    rm(regions)
    ### gc content
    cat("...... caculating GC content\n")
    nr <- shift(resize(regionsgc,width(regionsgc)+flank*2),-flank)
    seqs <- getSeq(genome,nr)
    gcpos <- startIndex(vmatchPattern("S", seqs, fixed="subject"))
    gcposb <- vector("integer",sum(as.numeric(width(nr))))
    gcposbi <- rep(seq_along(nr),times=width(nr))
    gcposbsp <- split(gcposb,gcposbi)
    for(i in seq_along(nr)){
        gcposbsp[[i]][gcpos[[i]]] <- 1L
    }
    gcposbsprle <- as(gcposbsp,"RleList")
    gcnuml <- width(regionsgc)-halfbdw*2
    gcnum <- vector('numeric',sum(as.numeric(gcnuml)))
    gcnumi <- rep(seq_along(nr),times=gcnuml)
    gc <- split(gcnum,gcnumi)
    for(i in seq_along(nr)){
        gc[[i]] <- round(runwtsum(gcposbsprle[[i]],k=length(weight),
                       wt=weight),3)
        if(i%%500 == 0) cat('.')
    }
    cat('\n')
    rm(regionsgc,nr,seqs,gcpos,gcposb,gcposbi,
        gcposbsp,gcposbsprle,gcnuml,gcnum,gcnumi)
    ### gc weight
    cat("...... caculating GC effect weights\n")
    gcwbase0 <- round(Rle(gcbias$mu0med1/gcbias$mu0),3)
    gcw <- RleList(lapply(gc,function(x) gcwbase0[x*1000+1])) # slow
    rm(gc,gcwbase0)
    ### reads count
    regionrcsp <- split(regionsrc,seqnames(regionsrc))
    rcfwd <- RleList(unlist(viewApply(Views(coverage$fwd,ranges(regionrcsp)),
                 runsum,k=pdwh)))
    rcrev <- RleList(unlist(viewApply(Views(coverage$rev,ranges(regionrcsp)),
                 runsum,k=pdwh)))
    ### enrichment score
    cat("...... estimating enrichment score\n")
    esl <- width(regionsrc)-halfbdw*2-flank*2
    ir1 <- IRangesList(start=IntegerList(as.list(rep(1,length(esl)))),
               end=IntegerList(as.list(esl)))
    ir2 <- IRangesList(start=IntegerList(as.list(rep(1+halfbdw+flank,
               length(esl)))),end=halfbdw+flank+IntegerList(as.list(esl)))
    ir3 <- IRangesList(start=IntegerList(as.list(rep(1+halfbdw*2+flank*2,
               length(esl)))),
               end=halfbdw*2+flank*2+IntegerList(as.list(esl)))
    rc1 <- rcfwd[ir1]
    rc2 <- rcfwd[ir2]
    rc3 <- rcrev[ir1]
    rc4 <- rcrev[ir2]
    gcw1 <- gcw[ir2]
    gcw2 <- gcw[ir3]
    gcw3 <- gcw[ir1]
    esrlt <- round(2*sqrt(rc1*rc4)*gcw1-rc3*gcw3-rc2*gcw2,3)
    names(esrlt) <- seq_along(regionsrc)
    rm(rcfwd,rcrev,gcw,rc1,rc2,rc3,rc4)
    ### permutation analysis
    cat("...... permutation analysis\n")
    perm <- table(c())
    for(p in seq_len(permute)){
        covfwdp <- coverage$fwd[ranges(regionrcsp)]
        covrevp <- coverage$rev[ranges(regionrcsp)]
        for(i in seq_along(coverage$fwd)){
            covfwdp[[i]] <- covfwdp[[i]][sample.int(length(covfwdp[[i]]))]
            covrevp[[i]] <- covrevp[[i]][sample.int(length(covrevp[[i]]))]
        }
        end <- cumsum(width(regionrcsp))
        start <- end-width(regionrcsp)+1
        regionrcspp <- IRangesList(start=start,end=end)
        rcfwdp <- RleList(unlist(viewApply(Views(covfwdp,regionrcspp),
                                         runsum,k=pdwh)))
        rcrevp <- RleList(unlist(viewApply(Views(covrevp,regionrcspp),
                                         runsum,k=pdwh)))
        rcp1 <- rcfwdp[ir1]
        rcp2 <- rcfwdp[ir2]
        rcp3 <- rcrevp[ir1]
        rcp4 <- rcrevp[ir2]
        accum <- table(unlist(round(2*sqrt(rcp1*rcp4)*gcw1-rcp3*gcw3-
                                    rcp2*gcw2,3),use.names=FALSE))
        n <- intersect(names(perm), names(accum))
        perm <- c(perm[!(names(perm) %in% n)],
                 accum[!(names(accum) %in% n)],
                 perm[n] + accum[n])
        cat('......... permutation',p,'\n')
    }
    perm <- perm[order(as.numeric(names(perm)))]
    pvs <- 1- cumsum(as.numeric(perm))/sum(as.numeric(perm))
    names(pvs) <- names(perm)
    perm <- Rle(as.numeric(names(perm)),perm)
    rm(covfwdp,covrevp,start,end,regionrcspp,rcp1,rcp2,rcp3,rcp4,
        rcfwdp,rcrevp,gcw1,gcw2,gcw3,esl,regionrcsp)
    ### report peaks
    cat('...... reporting peaks\n')
    sccut <- quantile(perm,1-pv)
    minpv <- 1/length(perm)
    cat('......... enrichment scores cut at',sccut,'\n')
    if(plot){
        cat('......... ploting enrichment scores\n')
        es <- as.numeric(unlist(esrlt,use.names=FALSE))
        perm <- as.numeric(perm)
        plot(density(perm,bw=1),col='blue',xlab='enrichment score',
            xlim=range(es),main=paste('es determined by pvalue',pv))
        lines(density(es,bw=1),col='red')
        abline(v=sccut,lty=2,col='purple')
        legend('topright',c('real','perm'),lty=1,
            col=c('red','blue'),bty='n')
        rm(es)
    }
    rm(perm)
    cat('......... reporting peak bumps\n')
    esrltsl <- slice(esrlt,sccut,rangesOnly=TRUE)
    esrltsl0 <- slice(esrlt,0,rangesOnly=TRUE)
    esrltsle <- reduce(shift(resize(esrltsl,width(esrltsl)+halfbdw*2),
                    -halfbdw))
    peaksir <- intersect(esrltsl0,esrltsle)
    peaksir <- peaksir[width(peaksir)>=halfbdw]
    rm(esrltsl,esrltsl0,esrltsle)
    cat('......... summarizing peak score and pvalue\n')
    peaksir <- unlist(peaksir)
    regionids <- as.integer(names(peaksir))
    peaksirlst <- IRangesList(start=IntegerList(as.list(start(peaksir))),
                      end=IntegerList(as.list(end(peaksir))))
    if(length(regionids)>50000){
        stepa <- seq(1,length(regionids),50000)
        stepb <- seq(50000,length(regionids),50000)
        if(length(stepb)+1 == length(stepa))
            stepb <- c(stepb,length(regionids))
        peaksc <- c()
        for(i in seq_along(stepa)){
            idx <- stepa[i]:stepb[i]
            peaksc <- c(peaksc,max(esrlt[regionids[idx]][peaksirlst[idx]]))
        }
    }else{
        peaksc <- max(esrlt[regionids][peaksirlst])
    }
    peaksir <- shift(peaksir,start(regionsrc)[regionids]+halfbdw+flank)
    peaks <- GRanges(seqnames(regionsrc)[regionids],peaksir,es=peaksc)
    peaksrd <- reduce(peaks,min.gapwidth=halfbdw,with.revmap=TRUE)
    peaksrd$es <- sapply(peaksrd$revmap,function(i) max(peaks$es[i]))
    peaksrd$revmap <- NULL
    pvsn <- as.numeric(names(pvs))
    pvsni <- sapply(peaksrd$es,function(x) sum(pvsn<=x))
    pvsni[pvsni==0] <- NA
    peaksrd$pv <- pvs[pvsni]
    peaksrd$pv[peaksrd$es<min(pvsn)] <- 1
    peaksrd$pv[peaksrd$pv==0] <- minpv
    peaksrd
}