Nothing
#' V-plot
#' @description Aggregate ATAC-seq Fragment Midpoint vs. Length for a given motif generated
#' over binding sites within the genome.
#' @param bamfiles A vector of characters indicates the file names of bams.
#' All the bamfiles will be pulled together.
#' @param index The names of the index file of the 'BAM' file being processed;
#' This is given without the '.bai' extension.
#' @param pfm A Position frequency Matrix represented as a numeric matrix
#' with row names A, C, G and T.
#' @param genome An object of \link[BSgenome:BSgenome-class]{BSgenome}.
#' @param min.score The minimum score for counting a match.
#' Can be given as a character string containing a
#' percentage (e.g. "95%") of the highest possible
#' score or as a single number.
#' See \link[Biostrings]{matchPWM}.
#' @param bindingSites A object of \link[GenomicRanges:GRanges-class]{GRanges} indicates
#' candidate binding sites (eg. the output of fimo).
#' @param seqlev A vector of characters indicates the sequence levels.
#' @param upstream,downstream numeric(1) or integer(1).
#' Upstream and downstream of the binding region for
#' aggregate ATAC-seq footprint.
#' @param maxSiteNum numeric(1). Maximal number of predicted binding sites.
#' if predicted binding sites is more than this number, top maxSiteNum binding
#' sites will be used.
#' @param draw Plot or not. Default TRUE.
#' @param ... parameters could be used by \link[graphics]{smoothScatter}
#' @importFrom ChIPpeakAnno reCenterPeaks
#' @importFrom GenomicAlignments readGAlignmentPairs
#' @importFrom Biostrings matchPWM maxScore
#' @importFrom Rsamtools ScanBamParam
#' @importFrom graphics smoothScatter
#' @importFrom KernSmooth bkde2D
#' @import GenomeInfoDb
#' @import GenomicRanges
#' @import IRanges
#' @import S4Vectors
#' @return an invisible data.frame for plot.
#' @export
#' @references Jorja G. Henikoff, Jason A. Belsky, Kristina Krassovsky,
#' David M. MacAlpine, and Steven Henikoff.
#' Epigenome characterization at single base-pair resolution.
#' PNAS 2011 108 (45) 18318-18323
#' @author Jianhong Ou
#' @examples
#'
#'bamfile <- system.file("extdata", "GL1.bam",
#' package="ATACseqQC")
#'library(MotifDb)
#'CTCF <- query(MotifDb, c("CTCF"))
#'CTCF <- as.list(CTCF)
#'library(BSgenome.Hsapiens.UCSC.hg19)
#'vPlot(bamfile, pfm=CTCF[[1]],
#' genome=Hsapiens,
#' min.score="95%", seqlev="chr1",
#' ylim=c(30, 250))
#'
vPlot <- function(bamfiles, index=bamfiles, pfm, genome,
min.score="95%", bindingSites,
seqlev=paste0("chr", c(1:22, "X", "Y")),
upstream=200, downstream=200,
maxSiteNum=1e6, draw=TRUE, ...){
stopifnot(is(genome, "BSgenome"))
stopifnot(upstream>10 && downstream>10)
stopifnot(is.numeric(maxSiteNum))
maxSiteNum <- ceiling(maxSiteNum[1])
stopifnot(maxSiteNum>1)
if(missing(bindingSites)){
stopifnot(all(round(colSums(pfm), digits=4)==1))
pwm <- motifStack::pfm2pwm(pfm)
maxS <- maxScore(pwm)
if(!is.numeric(min.score)){
if(!is.character(min.score)){
stop("'min.score' must be a single number or string")
}else{
nc <- nchar(min.score)
if (substr(min.score, nc, nc) == "%"){
min.score <- substr(min.score, 1L, nc - 1L)
}else{
stop("'min.score' can be given as a character string containing a percentage",
"(e.g. '85%') of the highest possible score")
}
min.score <- maxS * as.double(min.score)/100
}
}else{
min.score <- min.score[1]
}
predefined.score <- maxS * as.double(0.85)
suppressWarnings({
mt <- matchPWM(pwm, genome, min.score = min(predefined.score, min.score),
with.score=TRUE,
exclude=names(genome)[!names(genome) %in% seqlev])
})
if (min.score <= predefined.score){
mt$userdefined <- TRUE
} else {
mt$userdefined <- FALSE
mt$userdefined[mt$score >= min.score] <- TRUE
}
if(length(mt)>maxSiteNum){## subsample
mt$oid <- seq_along(mt)
mt <- mt[order(mt$score, decreasing = TRUE)]
mt <- mt[seq.int(maxSiteNum)]
mt <- mt[order(mt$oid)]
mt$oid <- NULL
}
subsampleNum <- min(10000, maxSiteNum)
if(length(mt)>subsampleNum && sum(mt$userdefined)<subsampleNum){## subsample
set.seed(seed = 1)
mt.keep <- seq_along(mt)[!mt$userdefined]
n <- subsampleNum-sum(mt$userdefined)
if(length(mt.keep)>n){
mt.keep <- mt.keep[order(mt[mt.keep]$score, decreasing = TRUE)]
mt.keep <- mt.keep[seq.int(n)]
mt.keep <- sort(mt.keep)
mt.keep <- seq_along(mt) %in% mt.keep
mt <- mt[mt$userdefined | mt.keep]
}
}
wid <- ncol(pfm)
}else{
stopifnot(is(bindingSites, "GRanges"))
stopifnot(all(!is.na(seqlengths(bindingSites))))
stopifnot(length(bindingSites)>1)
stopifnot(length(bindingSites$score)==length(bindingSites))
mt <- bindingSites
mt$userdefined <- TRUE
wid <- mean(width(mt))
}
if(sum(mt$userdefined)<2){
stop("less than 2 binding sites by input min.score")
}
#mt <- mt[seqnames(mt) %in% seqlev]
seqlevels(mt) <- seqlev
seqinfo(mt) <- Seqinfo(seqlev, seqlengths = seqlengths(mt))
## read in bam file with input seqlev specified by users
which <- as(seqinfo(mt), "GRanges")
bamIn <- mapply(function(.b, .i) {
seqlevelsStyle(which) <- checkBamSeqStyle(.b, .i)[1]
param <- ScanBamParam(which=which,
flag = scanBamFlag(isProperPair=TRUE,
isSecondaryAlignment=FALSE,
isNotPassingQualityControls=FALSE,
isDuplicate=FALSE))
readGAlignmentPairs(.b, .i, param = param)
}, bamfiles, index, SIMPLIFY = FALSE)
bamIn <- lapply(bamIn, as, Class = "GRanges")
if(!is(bamIn, "GRangesList")) bamIn <- GRangesList(bamIn)
bamIn <- unlist(bamIn, use.names = FALSE)
if(length(bamIn)<1){
stop("No paired reads. Please double check the inputs.")
}
seqlevelsStyle(bamIn) <- seqlevelsStyle(genome)[1]
mt.ext <- promoters(reCenterPeaks(mt, width=1),
upstream=upstream+floor(wid/2),
downstream=downstream+ceiling(wid/2))
bamIn$w <- width(bamIn)
bamIn <- reCenterPeaks(bamIn, width = 1)
ol <- findOverlaps(bamIn, mt.ext)
bam.query <- bamIn[queryHits(ol)]
mt.ext.subject <- mt.ext[subjectHits(ol)]
rel <- getRelationship(bam.query, mt.ext.subject)
rel$FragmentLength <- bam.query$w
rel$distanceToBindingSite <- rel$distanceToStart - upstream - floor(wid/2)
rel <- rel[order(rel$distanceToStart, rel$FragmentLength), c("distanceToBindingSite", "FragmentLength")]
if(draw) smoothScatter(rel, ...)
return(invisible(rel))
}
getRelationship <- function(queryHits, subjectHits){
if(!inherits(queryHits, "GRanges"))
stop("queryHits must be an object of GRanges")
if(!inherits(subjectHits, "GRanges"))
stop("subjectHits must be an object of GRanges")
strand <- strand(subjectHits)=="-"
FeatureStart <- as.numeric(ifelse(strand,
end(subjectHits),
start(subjectHits)))
FeatureEnd <- as.numeric(ifelse(strand,
start(subjectHits),
end(subjectHits)))
PeakStart <- as.numeric(ifelse(strand, end(queryHits),
start(queryHits)))
PeakEnd <- as.numeric(ifelse(strand, start(queryHits), end(queryHits)))
ss <- PeakStart - FeatureStart
ee <- PeakEnd - FeatureEnd
se <- PeakStart - FeatureEnd
es <- PeakEnd - FeatureStart
shortestDistance <- apply(cbind(ss, ee, se, es), 1,
function(.ele) min(abs(.ele)))
shortestDistanceToStart <- apply(cbind(ss, es), 1,
function(.ele) min(abs(.ele)))
data.frame(shortestDistance=shortestDistance,
ss=ss,
distanceToStart=shortestDistanceToStart)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.