Nothing
#' plot ATAC-seq footprints infer factor occupancy genome wide
#' @description Aggregate ATAC-seq footprint for a given motif generated
#' over binding sites within the genome.
#' @param bamfiles A vector of characters indicates the file names of bams.
#' All the bamfiles will be pulled together.
#' @param index The names of the index file of the 'BAM' file being processed;
#' This is given without the '.bai' extension.
#' @param pfm A Position frequency Matrix represented as a numeric matrix
#' with row names A, C, G and T.
#' @param genome An object of \link[BSgenome:BSgenome-class]{BSgenome}.
#' @param min.score The minimum score for counting a match.
#' Can be given as a character string containing a
#' percentage (e.g. "95%") of the highest possible
#' score or as a single number.
#' See \link[Biostrings]{matchPWM}.
#' @param bindingSites A object of \link[GenomicRanges:GRanges-class]{GRanges} indicates
#' candidate binding sites (eg. the output of fimo). The GRanges object must have
#' score column in the metadata column.
#' @param seqlev A vector of characters indicates the sequence levels.
#' @param upstream,downstream numeric(1) or integer(1).
#' Upstream and downstream of the binding region for
#' aggregate ATAC-seq footprint.
#' @param maxSiteNum numeric(1). Maximal number of predicted binding sites.
#' if predicted binding sites is more than this number, top maxSiteNum binding
#' sites will be used.
#' @param anchor "cut site" or "fragment center". If "fragment center" is used,
#' the input bamfiles must be paired-end.
#' @param group Group information for the bamfiles. Default by strand. Otherwise,
#' a factor or vector of characters with same length of bamfiles. The
#' group is limited to 2 groups.
#' @param ... xlab, legTitle, xlim or ylim could be used by \link{plotFootprints}
#' @importFrom stats cor.test
#' @importFrom ChIPpeakAnno featureAlignedSignal reCenterPeaks
#' @importFrom GenomicAlignments readGAlignments
#' @importFrom Biostrings matchPWM maxScore
#' @importFrom Rsamtools ScanBamParam testPairedEndBam scanBamHeader
#' @import GenomeInfoDb
#' @import GenomicRanges
#' @import IRanges
#' @import S4Vectors
#' @return an invisible list of matrixes with the signals for plot.
#' It includes:
#' - signal mean values of coverage for positive strand and negative strand
#' in feature regions
#' - spearman.correlation spearman correlations of cleavage counts in the
#' highest 10-nucleotide-window and binding prediction score.
#' - bindingSites predicted binding sites.
#' @export
#' @references Chen, K., Xi, Y., Pan, X., Li, Z., Kaestner, K., Tyler, J.,
#' Dent, S., He, X. and Li, W., 2013.
#' DANPOS: dynamic analysis of nucleosome position and occupancy by sequencing.
#' Genome research, 23(2), pp.341-351.
#' @author Jianhong Ou, Julie Zhu
#' @examples
#'
#'bamfile <- system.file("extdata", "GL1.bam",
#' package="ATACseqQC")
#'library(MotifDb)
#'CTCF <- query(MotifDb, c("CTCF"))
#'CTCF <- as.list(CTCF)
#'library(BSgenome.Hsapiens.UCSC.hg19)
#'factorFootprints(bamfile, pfm=CTCF[[1]],
#' genome=Hsapiens,
#' min.score="95%", seqlev="chr1",
#' upstream=100, downstream=100)
#'##### Using user defined binding sites #####
#'bds <- readRDS(system.file("extdata", "jolma2013.motifs.bindingList.95.rds",
#' package="ATACseqQC"))
#'bindingSites <- bds[["Hsapiens-jolma2013-CTCF"]]
#'seqlev <- "chr1" #seqlevels(bindingSites)
#'bindingSites <- bindingSites[seqnames(bindingSites) %in% seqlev]
#'seqlevels(bindingSites) <- seqlev
#'seqinfo(bindingSites) <- seqinfo(Hsapiens)[seqlev]
#'factorFootprints(bamfile, pfm=CTCF[[1]],
#' genome=Hsapiens, seqlev=seqlev,
#' upstream=100, downstream=100)
#'
factorFootprints <- function(bamfiles, index=bamfiles, pfm, genome,
min.score="95%", bindingSites,
seqlev=paste0("chr", c(1:22, "X", "Y")),
upstream=100, downstream=100,
maxSiteNum=1e6, anchor="cut site",
group="strand",
...){
#stopifnot(length(bamfiles)==4)
stopifnot(is(genome, "BSgenome"))
stopifnot(all(round(colSums(pfm), digits=4)==1))
stopifnot(upstream>10 && downstream>10)
stopifnot(is.numeric(maxSiteNum))
stopifnot(all(seqlev %in% seqlevels(genome)))
maxSiteNum <- ceiling(maxSiteNum[1])
stopifnot(maxSiteNum>1)
anchor <- match.arg(anchor, choices = c("cut site", "fragment center"))
if(length(group) == 1){
if(group!="strand"){
stop("If length of group is 1, it must be strand.")
}
groupFlag <- TRUE
}else{
stopifnot(length(group)==length(bamfiles))
group <- as.factor(group)
if(length(levels(group))!=2){
stop("The length of levels of group must be 2.")
}
groupFlag <- FALSE
}
if(anchor=="fragment center"){
null <- mapply(function(.bam, .index){
suppressMessages(pe <- testPairedEndBam(.bam, .index))
if(!pe){
stop("If anchor is fragment center, the bamfiles must be paired end reads.")
}
}, bamfiles, index)
}
if(missing(bindingSites)){
pwm <- motifStack::pfm2pwm(pfm)
maxS <- maxScore(pwm)
if(!is.numeric(min.score)){
if(!is.character(min.score)){
stop("'min.score' must be a single number or string")
}else{
nc <- nchar(min.score)
if (substr(min.score, nc, nc) == "%"){
min.score <- substr(min.score, 1L, nc - 1L)
}else{
stop("'min.score' can be given as a character string containing a percentage",
"(e.g. '85%') of the highest possible score")
}
min.score <- maxS * as.double(min.score)/100
}
}else{
min.score <- min.score[1]
}
predefined.score <- maxS * as.double(0.85)
suppressWarnings({
mt <- tryCatch(matchPWM(pwm, genome, min.score = min(predefined.score, min.score),
with.score=TRUE,
exclude=paste0("^", names(genome)[!names(genome) %in% seqlev], "$")),
error=function(e){
message(e)
stop("No predicted binding sites available by giving seqlev. ")
})
})
if (min.score <= predefined.score){
mt$userdefined <- TRUE
} else {
mt$userdefined <- FALSE
mt$userdefined[mt$score >= min.score] <- TRUE
}
if(length(mt)>maxSiteNum){## subsample
mt$oid <- seq_along(mt)
mt <- mt[order(mt$score, decreasing = TRUE)]
mt <- mt[seq.int(maxSiteNum)]
mt <- mt[order(mt$oid)]
mt$oid <- NULL
}
subsampleNum <- min(10000, maxSiteNum)
if(length(mt)>subsampleNum && sum(mt$userdefined)<subsampleNum){## subsample
set.seed(seed = 1)
mt.keep <- seq_along(mt)[!mt$userdefined]
n <- subsampleNum-sum(mt$userdefined)
if(length(mt.keep)>n){
mt.keep <- mt.keep[order(mt[mt.keep]$score, decreasing = TRUE)]
mt.keep <- mt.keep[seq.int(n)]
mt.keep <- sort(mt.keep)
mt.keep <- seq_along(mt) %in% mt.keep
mt <- mt[mt$userdefined | mt.keep]
}
}
}else{
stopifnot(is(bindingSites, "GRanges"))
stopifnot(all(!is.na(seqlengths(bindingSites))))
stopifnot(length(bindingSites)>1)
stopifnot(length(bindingSites$score)==length(bindingSites))
mt <- bindingSites
mt$userdefined <- TRUE
}
if(sum(mt$userdefined)<2){
stop("less than 2 binding sites by input min.score")
}
wid <- ncol(pfm)
#mt <- mt[seqnames(mt) %in% seqlev]
seqlevels(mt) <- seqlev
seqinfo(mt) <- Seqinfo(seqlev, seqlengths = seqlengths(mt))
## read in bam file with input seqlev specified by users
which <- as(seqinfo(mt), "GRanges")
#param <- ScanBamParam(which=which)
if(anchor=="cut site"){
bamIn <- mapply(function(.b, .i) {
seqlevelsStyle(which) <- checkBamSeqStyle(.b, .i)[1]
param <- ScanBamParam(which=which)
readGAlignments(.b, .i, param = param)
}, bamfiles, index, SIMPLIFY = FALSE)
}else{
bamIn <- mapply(function(.b, .i) {
seqlevelsStyle(which) <- checkBamSeqStyle(.b, .i)[1]
param <- ScanBamParam(which=which)
readGAlignmentPairs(.b, .i, param = param)
}, bamfiles, index, SIMPLIFY = FALSE)
}
bamIn <- lapply(bamIn, as, Class = "GRanges")
bamIn <- split(bamIn, group)
bamIn <- lapply(bamIn, function(.ele){
if(!is(.ele, "GRangesList")) .ele <- GRangesList(.ele)
.ele <- unlist(.ele)
seqlevelsStyle(.ele) <- seqlevelsStyle(genome)[1]
if(anchor=="cut site"){
## keep 5'end as cutting sites
promoters(.ele, upstream=0, downstream=1)
}else{
## keep fragment center
reCenterPeaks(.ele, width=1)
}
})
libSize <- lengths(bamIn)
if(groupFlag){
libFactor <- mapply(bamIn, libSize, FUN=function(.ele, .libSize){
coverageSize <- sum(as.numeric(width(reduce(.ele, ignore.strand=TRUE))))
.libSize / coverageSize
})
}else{
libFactor <- libSize/mean(libSize)
}
if(groupFlag){
## split into positive strand and negative strand
bamIn <- split(bamIn[[1]], strand(bamIn[[1]]))
## get coverage
cvglist <- sapply(bamIn, coverage)
cvglist <- cvglist[c("+", "-")]
cvglist <- lapply(cvglist, function(.ele)
.ele[names(.ele) %in% seqlev])
## coverage of mt, must be filtered, otherwise too much
cvgSum <- cvglist[["+"]] + cvglist[["-"]] ## used for filter mt
}else{
## it was splitted by groups
## get coverage
cvglist <- sapply(bamIn, coverage)
cvglist <- lapply(cvglist, function(.ele)
.ele[names(.ele) %in% seqlev])
## coverage of mt, must be filtered, otherwise too much
cvgSum <- cvglist[[1]] + cvglist[[2]] ## used for filter mt
}
mt.s <- split(mt, seqnames(mt))
seqlev <- intersect(names(cvgSum), names(mt.s))
cvgSum <- cvgSum[seqlev]
mt.s <- mt.s[seqlev]
## too much if use upstream and downstream, just use 3*wid maybe better.
mt.s.ext <- promoters(mt.s, upstream=wid, downstream=wid+wid)
stopifnot(all(lengths(mt.s.ext)==lengths(mt.s)))
mt.v <- Views(cvgSum, mt.s.ext)
mt.s <- mt.s[viewSums(mt.v)>0]
mt <- unlist(mt.s)
mt.ids <- promoters(reCenterPeaks(mt, width=1),
upstream=upstream+floor(wid/2),
downstream=downstream+ceiling(wid/2)+1)
mt.ids <- paste0(as.character(seqnames(mt.ids)), ":", start(mt.ids), "-", end(mt.ids))
sigs <- featureAlignedSignal(cvglists=cvglist,
feature.gr=reCenterPeaks(mt, width=1),
upstream=upstream+floor(wid/2),
downstream=downstream+ceiling(wid/2),
n.tile=upstream+downstream+wid)
mt <- mt[match(rownames(sigs[[1]]), mt.ids)]
cor <- lapply(sigs, function(sig){
sig.colMeans <- colMeans(sig)
## calculate correlation of footprinting and binding score
windows <- slidingWindows(IRanges(1, ncol(sig)), width = wid, step = 1)[[1]]
# remove the windows with overlaps of motif binding region
windows <- windows[end(windows)<=upstream | start(windows)>=upstream+wid]
sig.windowMeans <- viewMeans(Views(sig.colMeans, windows))
windows.sel <- windows[which.max(sig.windowMeans)][1]
highest.sig.windows <-
rowMeans(sig[, start(windows.sel):end(windows.sel)])
predictedBindingSiteScore <- mt$score
if(length(predictedBindingSiteScore) == length(highest.sig.windows)){
suppressWarnings({
cor <- cor.test(x = predictedBindingSiteScore,
y = highest.sig.windows,
method = "spearman")
})
}else{
cor <- NA
}
cor
})
sigs <- lapply(sigs, function(.ele) .ele[mt$userdefined, ])
mt <- mt[mt$userdefined]
mt$userdefined <- NULL
## segmentation the signals
if(groupFlag){
## x2 because stranded.
Profile <- lapply(sigs, function(.ele) colMeans(.ele, na.rm = TRUE)*2/libFactor[1])
}else{
Profile <- mapply(sigs, libFactor, FUN = function(.ele, .libFac){
colMeans(.ele, na.rm = TRUE)/.libFac
}, SIMPLIFY = FALSE)
}
## upstream + wid + downstream
Profile.split <- lapply(Profile, function(.ele){
list(upstream=.ele[seq.int(upstream)],
binding=.ele[upstream+seq.int(wid)],
downstream=.ele[upstream+wid+seq.int(downstream)])
})
optimalSegmentation <- function(.ele){
.l <- length(.ele)
short_abun <- cumsum(.ele)/seq.int(.l)
long_abun <- cumsum(rev(.ele))/seq.int(.l)
long_abun <- rev(long_abun)
short_abun <- short_abun[-length(short_abun)]
long_abun <- long_abun[-1]
##long_abun should always greater than short_abun
long_abun <- long_abun - short_abun
long_abun[long_abun<0] <- 0
cov_diff <- numeric(length(short_abun))
for(i in seq_along(.ele)){
cov_diff_tmp <- .ele
cov_diff_tmp <- cov_diff_tmp-short_abun[i]
cov_diff_tmp[-seq.int(i)] <- cov_diff_tmp[-seq.int(i)] - long_abun[i]
cov_diff[i] <- mean(cov_diff_tmp^2)
}
.ids <- which(cov_diff==min(cov_diff, na.rm = TRUE))
data.frame(pos=.ids, short_abun=short_abun[.ids], long_abun=long_abun[.ids])
}
Profile.seg <- lapply(Profile.split, function(.ele){
ups <- optimalSegmentation(.ele$upstream)
downs <- optimalSegmentation(rev(.ele$downstream))
## find the nearest pair
.min <- c(max(rbind(ups, downs)), 0, 0)
for(i in seq.int(nrow(ups))){
for(j in seq.int(nrow(downs))){
tmp <- sum(abs(ups[i, -1] - downs[j, -1]))
if(tmp < .min[1]){
.min <- c(tmp, i, j)
}
}
}
c(colMeans(rbind(ups[.min[2], ], downs[.min[3], ])), binding=mean(.ele$binding, na.rm=TRUE))
})
Profile.seg <- colMeans(do.call(rbind, Profile.seg))
Profile.seg[3] <- Profile.seg[2]+Profile.seg[3]
names(Profile.seg)[2:3] <- c("distal_abun", "proximal_abun")
tryCatch({ ## try to avoid the error when ploting.
args <- list(...)
if(groupFlag){
args$Profile <- c(Profile[["+"]], Profile[["-"]])
args$legLabels <- c("For. strand", "Rev. strand")
}else{
args$Profile <- c(Profile[[1]], Profile[[2]])
args$legLabels <- names(Profile)
}
args$ylab <- ifelse(anchor=="cut site", "Cut-site probability", "reads density (arbitrary unit)")
args$Mlen <- wid
args$motif <- pwm2pfm(pfm)
args$segmentation <- Profile.seg
do.call(plotFootprints, args = args)
}, error=function(e){
message(e)
})
return(invisible(list(signal=sigs,
spearman.correlation=cor,
bindingSites=mt,
Mlen=wid,
estLibSize=libSize,
Profile.segmentation=Profile.seg)))
}
pwm2pfm <- function(pfm, name="motif"){
if(!all(round(colSums(pfm), digits=4)==1)){
return(NULL)
}
new("pfm", mat=as.matrix(pfm), name=name)
}
checkBamSeqStyle <- function(bamfile, index){
header <- scanBamHeader(bamfile, index=index)
which <- header[[1]]$targets
seqlevelsStyle(names(which))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.