NADfinder.R
In NADfinder: Call wide peaks for sequencing data

## ---- echo=FALSE, results="hide", warning=FALSE-------------------------------
suppressPackageStartupMessages({
  library(NADfinder)
  library(BSgenome.Mmusculus.UCSC.mm10)
  library(rtracklayer)
})
knitr::opts_chunk$set(warning=FALSE, message=FALSE, eval=TRUE)

## ----quickStart, fig.width=6, fig.height=4------------------------------------
## load the library
library(NADfinder)
library(SummarizedExperiment)
## test bam files in the package
path <- system.file("extdata", package = "NADfinder", lib.loc = NULL,
            mustWork = TRUE)
bamFiles <- dir(path, ".bam$")

## window size for tile of genome. Here we set it to a big window size,
## ie., 50k because of the following two reasons:
## 1. peaks of NADs are wide;
## 2. data will be smoothed better with bigger window size.
ws <- 50000
## step for tile of genome
step <- 5000
## Set the background. 
## 0.25 means 25% of the lower ratios will be used for background training.
backgroundPercentage <- 0.25
## Count the reads for each window with a given step in the genome.
## The output will be a GRanges object.
library(BSgenome.Mmusculus.UCSC.mm10)

## ----eval=FALSE---------------------------------------------------------------
#  se <- tileCount(reads=file.path(path, bamFiles),
#                  genome=Mmusculus,
#                  windowSize=ws,
#                  step=step,
#                  mode = IntersectionNotStrict,
#                  dataOverSamples = FALSE)

## -----------------------------------------------------------------------------
data(single.count)
se <- single.count

## -----------------------------------------------------------------------------

## Calculate ratios for peak calling. We use nucleoleus vs genomic DNA.
dat <- log2se(se, nucleolusCols = "N18.subsampled.srt.bam", 
              genomeCols ="G18.subsampled.srt.bam",
              transformation = "log2CPMRatio")
## Smooth the ratios for each chromosome.
## We found that for each chromosome, the ratios are higher in 5'end than 3'end.
datList <- smoothRatiosByChromosome(dat, chr = c("chr18", "chr19"), N = 100)
## check the difference between the cumulative percentage tag allocation 
## in genome and nucleoleus samples.
cumulativePercentage(datList[["chr18"]])

## -----------------------------------------------------------------------------
## check the results of smooth function
chr18 <- datList[["chr18"]] ## we only have reads in chr18 in test data.
chr18subset <- subset(chr18, seq.int(floor(length(chr18)/10))*10)
chr18 <- assays(chr18subset)
ylim <- range(c(chr18$ratio[, 1], 
                chr18$bcRatio[, 1], 
                chr18$smoothedRatio[, 1]))
plot(chr18$ratio[, 1], 
     ylim=ylim*c(.9, 1.1), 
     type="l", main="chr18")
abline(h=0, col="cyan", lty=2)
points(chr18$bcRatio[, 1], type="l", col="green")
points(chr18$smoothedRatio[, 1], type="l", col="red")
legend("topright", 
       legend = c("raw_ratio", "background_corrected", "smoothed"), 
       fill = c("black", "green", "red"))
## call peaks for each chromosome
peaks <- lapply(datList, trimPeaks, 
                backgroundPercentage=backgroundPercentage, 
                cutoffAdjPvalue=0.05, countFilter=1000)
## plot the peaks in "chr18"
peaks.subset <- countOverlaps(rowRanges(chr18subset), peaks$chr18)>=1
peaks.run <- rle(peaks.subset)
run <- cumsum(peaks.run$lengths)
run <- data.frame(x0=c(1, run[-length(run)]), x1=run)
#run <- run[peaks.run$values, , drop=FALSE]
rect(xleft = run$x0, ybottom = ylim[2]*.75, 
     xright = run$x1, ytop = ylim[2]*.8,
     col = "black")
## convert list to a GRanges object
peaks.gr <- unlist(GRangesList(peaks))

## -----------------------------------------------------------------------------
## output the peaks
write.csv(as.data.frame(unname(peaks.gr)), "peaklist.csv", row.names=FALSE)
## export peaks to a bed file.
library(rtracklayer)
#export(peaks.gr, "peaklist.bed", "BED")

## -----------------------------------------------------------------------------
library(NADfinder)
## bam file path
path <- system.file("extdata", package = "NADfinder", lib.loc = NULL,
            mustWork = TRUE)
bamFiles <- dir(path, ".bam$")

f <- bamFiles
ws <- 50000
step <- 5000
library(BSgenome.Mmusculus.UCSC.mm10)

## ----eval=FALSE---------------------------------------------------------------
#  se <- tileCount(reads=file.path(path, f),
#                  genome=Mmusculus,
#                  windowSize=ws,
#                  step=step)

## -----------------------------------------------------------------------------
data(triplicate.count)
se <- triplicate.count

## Calculate ratios for nucleoloar vs genomic samples.
se <- log2se(se, 
             nucleolusCols = c("N18.subsampled.srt-2.bam", 
                                "N18.subsampled.srt-3.bam",
                                "N18.subsampled.srt.bam" ),
             genomeCols = c("G18.subsampled.srt-2.bam",
                            "G18.subsampled.srt-3.bam",
                            "G18.subsampled.srt.bam"), 
             transformation = "log2CPMRatio")
seList<- smoothRatiosByChromosome(se, chr="chr18")
cumulativePercentage(seList[["chr18"]])
peaks <- lapply(seList, callPeaks, 
                cutoffAdjPvalue=0.05, countFilter=1)
peaks <- unlist(GRangesList(peaks))
peaks

## ----fig.height=1.5-----------------------------------------------------------
ideo <- readRDS(system.file("extdata", "ideo.mm10.rds", 
                            package = "NADfinder"))
plotSig(ideo=ideo, grList=GRangesList(peaks), mcolName="AveSig", 
        layout=list("chr18"), 
        parameterList=list(types="heatmap"))

## ----sessionInfo--------------------------------------------------------------
sessionInfo()