#' @title split BAM files into nucleosome free, mononucleosome,
#' dinucleosome and trinucleosome
#' @description Use a random forest to split the reads into nucleosome free,
#' mononucleosome, dinucleosome and trinucleosome.
#' The features used by the random forest include
#' fragment length, GC content, and
#' UCSC phastCons conservation scores.
#' @param obj an object of \link[GenomicAlignments:GAlignments-class]{GAlignments}
#' @param txs GRanges of transcripts
#' @param genome an object of BSgenome
#' @param conservation an object of \link[GenomicScores:GScores-class]{GScores}.
#' @param outPath folder in which to save the split alignments. If outPath is set,
#' the returned alignments will not contain the seq and qual fields.
#' @param breaks a numeric vector of fragment-size breakpoints for nucleosome free,
#' mononucleosome, dinucleosome and trinucleosome reads. The default breaks
#' follow the description in Greenleaf's paper (see references).
#' @param labels a character vector of labels for the levels
#' of the resulting categories.
#' @param labelsOfNucleosomeFree,labelsOfMononucleosome character(1). The labels
#' used for the nucleosome-free and mononucleosome categories.
#' @param trainningSetPercentage numeric(1) between 0 and 1. Percentage of the
#' training set, selected from the regions with the highest coverage.
#' @param cutoff numeric(1) between 0 and 1. cutoff value for prediction.
#' @param halfSizeOfNucleosome numeric(1) or integer(1). The read length will
#' be adjusted to half of the nucleosome size to enhance the signal-to-noise
#' ratio.
#' @return A list of GAlignments objects.
#' @author Jianhong Ou
#' @import BiocGenerics
#' @import GenomeInfoDb
#' @import GenomicRanges
#' @import IRanges
#' @import S4Vectors
#' @importFrom Biostrings letterFrequency
#' @importFrom BSgenome getSeq
#' @importFrom ChIPpeakAnno annotatePeakInBatch
#' @importFrom GenomicAlignments coverage
#' @importFrom randomForest randomForest
#' @importFrom stats predict
#' @importFrom GenomicScores gscores
#' @export
#' @references Buenrostro, J.D., Giresi, P.G., Zaba, L.C., Chang, H.Y. and
#' Greenleaf, W.J., 2013. Transposition of native chromatin for fast and
#' sensitive epigenomic profiling of open chromatin, DNA-binding proteins and
#' nucleosome position. Nature methods, 10(12), pp.1213-1218.
#'
#' Chen, K., Xi, Y., Pan, X., Li, Z., Kaestner, K., Tyler, J., Dent, S.,
#' He, X. and Li, W., 2013. DANPOS: dynamic analysis of nucleosome position
#' and occupancy by sequencing. Genome research, 23(2), pp.341-351.
#' @examples
#' library(GenomicRanges)
#' bamfile <- system.file("extdata", "GL1.bam",
#' package="ATACseqQC", mustWork=TRUE)
#' tags <- c("AS", "XN", "XM", "XO", "XG", "NM", "MD", "YS", "YT")
#' gal1 <- readBamFile(bamFile=bamfile, tag=tags,
#' which=GRanges("chr1", IRanges(1, 1e6)),
#' asMates=FALSE)
#' names(gal1) <- mcols(gal1)$qname
#' library(BSgenome.Hsapiens.UCSC.hg19)
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' txs <- transcripts(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' library(phastCons100way.UCSC.hg19)
#' splitGAlignmentsByCut(gal1, txs=txs, genome=Hsapiens,
#' conservation=phastCons100way.UCSC.hg19)
#'
splitGAlignmentsByCut <- function(obj, txs, genome, conservation,
outPath,
breaks=c(0, 100, 180, 247, 315, 473, 558, 615, Inf),
labels = c("NucleosomeFree", "inter1",
"mononucleosome", "inter2",
"dinucleosome", "inter3",
"trinucleosome", "others"),
labelsOfNucleosomeFree="NucleosomeFree",
labelsOfMononucleosome="mononucleosome",
trainningSetPercentage=.15,
cutoff = .8, halfSizeOfNucleosome=80L){
stopifnot(length(labels)+1==length(breaks))
stopifnot(length(labelsOfMononucleosome)==1)
stopifnot(length(labelsOfNucleosomeFree)==1)
stopifnot(labelsOfMononucleosome %in% labels)
stopifnot(labelsOfNucleosomeFree %in% labels)
conservationFlag <- FALSE
if(!missing(conservation)){
if(length(conservation)){
stopifnot(is(conservation, "GScores"))
conservationFlag <- TRUE ## conservation is supplied.
}
}
stopifnot(is(obj, "GAlignments"))
if(length(obj)==0 && (!missing(outPath)) && (!conservationFlag)){
## big file mode
meta <- metadata(obj)
if(!all(c("file", "param") %in% names(meta))){
stop("length of obj could not be 0.")
}
ow <- getOption("warn")
on.exit(options(warn = ow))
options(warn=-1)
chunk <- 100000
index <- if(length(meta$index)>0) meta$index else meta$file
bamfile <- BamFile(meta$file, index=index, yieldSize=chunk, asMates = meta$asMates)
outfile <- list()
mergedfile <- list()
mpos <- NULL
open(bamfile)
if(length(meta$mpos)>0){
mpos <- meta$mpos
}
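## read the BAM file in chunks of `chunk` records and split each chunk recursively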
while (length(chunk0 <- readGAlignments(bamfile, param=meta$param))) {
if(length(mpos)){
mcols(chunk0)$MD <- NULL
names(chunk0) <- mcols(chunk0)$qname
chunk0 <- chunk0[order(names(chunk0))]
mcols(chunk0)$mpos <- mpos[paste(mcols(chunk0)$qname, start(chunk0))]
}
gal1 <- splitGAlignmentsByCut(chunk0, txs=txs, genome = genome,
breaks = breaks,
labels = labels, labelsOfNucleosomeFree = labelsOfNucleosomeFree,
labelsOfMononucleosome = labelsOfMononucleosome,
trainningSetPercentage = trainningSetPercentage,
cutoff = cutoff, halfSizeOfNucleosome = halfSizeOfNucleosome)
## gal1 is a list of GAlignments
## save the gal1 to tmpfiles
for(i in seq_along(gal1)){
outfile[[names(gal1)[i]]] <- c(tempfile(fileext = ".bam"), outfile[[names(gal1)[i]]])
gal1_data <- gal1[[i]]
if(length(meta$header)>0) metadata(gal1_data)$header <- meta$header
exportBamFile(gal1_data, outfile[[names(gal1)[i]]][1])
}
rm(gal1)
}
close(bamfile)
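## merge the per-category temporary BAM files into outPath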
for(i in seq_along(outfile)){
if(length(outfile[[i]])>1){
mergedfile[[names(outfile)[i]]] <- mergeBam(outfile[[i]],
destination=file.path(outPath,
paste0(names(outfile)[i], ".bam")),
indexDestination=TRUE)
unlink(outfile[[i]])
unlink(paste0(outfile[[i]], ".bai"))
}else{
file.copy(outfile[[i]],
file.path(outPath, paste0(names(outfile)[i], ".bam")))
file.copy(paste0(outfile[[i]], ".bai"),
file.path(outPath, paste0(names(outfile)[i], ".bam.bai")))
mergedfile[[names(outfile)[i]]] <-
file.path(outPath, paste0(names(outfile)[i], ".bam"))
unlink(outfile[[i]])
unlink(paste0(outfile[[i]], ".bai"))
}
}
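## reload the merged BAM files, keeping only the qname, flag, mapq and isize fields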
meta$param <- ScanBamParam(flag=meta$param@flag, what=c("qname", "flag", "mapq", "isize"))
objs <- lapply(mergedfile, function(.ele){
## read the bam files
chunk0 <- readGAlignments(.ele, param=meta$param)
names(chunk0) <- mcols(chunk0)$qname
chunk0 <- chunk0[order(names(chunk0))]
mcols(chunk0)$mpos <- mpos[paste(mcols(chunk0)$qname, start(chunk0))]
chunk0
})
return(objs)
}
if(length(obj)==0){
obj <- loadBamFile(obj)
}
stopifnot(length(names(obj))==length(obj))
seqlev <- unique(seqlevels(obj))
seqlev <- seqlev[seqlev %in% unique(seqnames(obj))]
if(length(seqlev)==0){
stop("no reads in the obj.")
}
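## classify fragments by absolute insert size using the supplied breaks and labels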
objs <- split(obj,
cut(abs(mcols(obj)$isize),
breaks = breaks,
labels = labels))
if(!conservationFlag){
if(!missing(outPath)){
writeListOfGAlignments(objs, outPath = outPath)
}
return(objs)
}
stopifnot(is(txs, "GRanges"))
stopifnot(is(genome, "BSgenome"))
nf.old <- objs[[labelsOfNucleosomeFree]]
nc.old <- objs[[labelsOfMononucleosome]]
rm(objs)
gc(verbose = FALSE, reset = TRUE, full = TRUE)
nf.cvg <- coverage(nf.old)
nc.cvg <- coverage(nc.old)
nf.cvg <- nf.cvg[seqlev]
nc.cvg <- nc.cvg[seqlev]
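## estimate per-seqlevel coverage cutoffs that keep roughly the top
## trainningSetPercentage of non-zero coverage positions for training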
nf.cvg.quantile <- sapply(nf.cvg, function(.ele) {
.ele <- table(.ele)
.ele <- .ele[names(.ele)!="0"] ## remove 0
.x <- cumsum(.ele)/sum(.ele)
w <- which(.x>=1-trainningSetPercentage)
if(length(w)==0) return(0)
as.numeric(names(.x[w[1]]))
})
nc.cvg.quantile <- sapply(nc.cvg, function(.ele) {
.ele <- table(.ele)
.ele <- .ele[names(.ele)!="0"] ## remove 0
.x <- cumsum(.ele)/sum(.ele)
w <- which(.x>=1-trainningSetPercentage)
if(length(w)==0) return(0)
as.numeric(names(.x[w[1]]))
})
cvg.quantile <- ifelse(nf.cvg.quantile < nc.cvg.quantile,
nf.cvg.quantile, nc.cvg.quantile)
#names(cvg.quantile) <- seqlev
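## keep the regions whose coverage exceeds the cutoff as candidate training regions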
nf.cvg.view <-
as(IRangesList(mapply(function(x, y) as(Views(x, x>y), "IRanges"),
nf.cvg, cvg.quantile)), "GRanges")
nc.cvg.view <-
as(IRangesList(mapply(function(x, y) as(Views(x, x>y), "IRanges"),
nc.cvg, cvg.quantile)), "GRanges")
if(length(nc.cvg.view)<1){
stop("not enough mononucleosome reads for training! Just try without conservation score.")
}
if(length(nf.cvg.view)<1){
stop("not enough nucleosome free reads for training! Just try without conservation score.")
}
## annotate the GRanges to get the strand information
TSS <- promoters(txs,
upstream = 0,
downstream = 1)
rm(txs)
gc(verbose = FALSE, reset = TRUE, full = TRUE)
TSS <- TSS[seqnames(TSS) %in% seqlev]
seqlev <- seqlev[seqlev %in% unique(seqnames(TSS))]
seqlevels(TSS) <- seqlev
seqinfo(TSS) <- Seqinfo(seqlev, seqlengths = seqlengths(TSS))
TSS <- unique(TSS)
nf.anno <- annotatePeakInBatch(nf.cvg.view, AnnotationData=TSS,
output="nearestLocation")
nc.anno <- annotatePeakInBatch(nc.cvg.view, AnnotationData=TSS,
output="nearestLocation")
nf.anno$feature_strand[is.na(nf.anno$feature_strand)] <- "*"
nc.anno$feature_strand[is.na(nc.anno$feature_strand)] <- "*"
strand(nf.anno) <- nf.anno$feature_strand
strand(nc.anno) <- nc.anno$feature_strand
mcols(nf.anno) <- DataFrame(score=1)
mcols(nc.anno) <- DataFrame(score=1)
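## subtract the two sets of regions: a positive score marks regions specific to the
## nucleosome-free signal, a negative score marks mononucleosome-specific regions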
nx <- GRoperator(nf.anno, nc.anno, operator = "-")
nx <- nx[width(nx)>=40]
nf <- nx[nx$score>0]
nc <- nx[nx$score<0]
if(length(nc)<1){
stop("not enough mononucleosome reads for training! Just try without conservation score.")
}
if(length(nf)<1){
stop("not enough nucleosome free reads for training! Just try without conservation score.")
}
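## recenter the training regions to half the nucleosome size to sharpen the signal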
block <- 100000
nf <- reCenterPeaks(nf, width=halfSizeOfNucleosome)
nc <- reCenterPeaks(nc, width=halfSizeOfNucleosome)
coverage.nf <-
Views(nf.cvg[seqlev], split(ranges(nf),
as.character(seqnames(nf)))[seqlev])
coverage.nf <- IRangesList(lapply(coverage.nf, function(.ele){
ir <- as(.ele, "IRanges")
mcols(ir)$coverage <- viewMeans(.ele)
ir
}))
nf <- as(coverage.nf, "GRanges")
nf$score <- unlist(lapply(coverage.nf, function(.ele)
as.numeric(mcols(.ele)$coverage)))
nf <- nf[order(nf$score, decreasing=TRUE)]
nf <- nf[seq_len(min(block, length(nf)))]
coverage.nc <-
Views(nc.cvg[seqlev], split(ranges(nc),
as.character(seqnames(nc)))[seqlev])
coverage.nc <- IRangesList(lapply(coverage.nc, function(.ele){
ir <- as(.ele, "IRanges")
mcols(ir)$coverage <- viewMeans(.ele)
ir
}))
nc <- as(coverage.nc, "GRanges")
nc$score <- unlist(lapply(coverage.nc, function(.ele)
as.numeric(mcols(.ele)$coverage)))
nc <- nc[order(nc$score, decreasing=TRUE)]
nc <- nc[seq_len(min(block, length(nc)))]
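## restrict to the selected seqlevels and build the prediction set:
## one range per read name spanning its alignments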
obj <- obj[seqnames(obj) %in% seqlev]
newdata <- split(as(obj, "GRanges"), names(obj))
newdata <- range(newdata, ignore.strand=TRUE)
newdata <- unlist(newdata)
seqlevels(newdata) <- seqlev
seqinfo(newdata) <- seqinfo(TSS)
nd <- GenomicRanges::trim(newdata)
rm(newdata)
nf.seq <- getSeq(genome, nf)
nc.seq <- getSeq(genome, nc)
# gc ratios
nf.gc <- letterFrequency(nf.seq, letters="CG", as.prob = TRUE)
nc.gc <- letterFrequency(nc.seq, letters="CG", as.prob = TRUE)
nd.gc <- lapply(split(nd, rep(as.character(seq.int(ceiling(length(nd)/block))),
each=block)[seq_along(nd)]),
function(.ele){
letterFrequency(getSeq(genome, .ele), letters="CG", as.prob = TRUE)
})
nd.gc <- nd.gc[order(as.numeric(names(nd.gc)))]
nd.gc <- do.call(rbind, nd.gc)
rm(nf.seq)
rm(nc.seq)
gc(verbose = FALSE, reset = TRUE, full = TRUE)
# conservation
getScoresFromCons <- function(cons, gr){## retrieve conservation scores in blocks; this step can be slow and the gscores output format may differ between GenomicScores versions
gr.cons <- lapply(split(gr, rep(as.character(seq.int(ceiling(length(gr)/block))),
each=block)[seq_along(gr)]),
function(.ele){
gscores(x=cons, ranges = .ele)
})
gr.cons <- gr.cons[order(as.numeric(names(gr.cons)))]
gr.cons <- unlist(GRangesList(gr.cons), use.names = FALSE)
stopifnot(identical(ranges(gr), ranges(gr.cons)))
gr.cons
}
nf.conservation <- getScoresFromCons(conservation, nf)
nc.conservation <- getScoresFromCons(conservation, nc)
nd.conservation <- getScoresFromCons(conservation, nd)
## determine the columns to be used for conservation
score.colnames <- c(nf.conservation, nc.conservation, nd.conservation)
score.colnames <- sapply(colnames(mcols(score.colnames)),
function(.ele){
sum(is.na(mcols(score.colnames)[, .ele]))
})
score.colnames <- names(score.colnames[order(score.colnames)])[1]
nf.conservation <- mcols(nf.conservation)[, score.colnames]
nc.conservation <- mcols(nc.conservation)[, score.colnames]
nd.conservation <- mcols(nd.conservation)[, score.colnames]
nf.conservation[is.na(nf.conservation)] <- 0
nc.conservation[is.na(nc.conservation)] <- 0
nd.conservation[is.na(nd.conservation)] <- 0
# median fragment length
nf.old <- as(nf.old, "GRanges")
nc.old <- as(nc.old, "GRanges")
nf.ol <- findOverlaps(nf, nf.old)
nc.ol <- findOverlaps(nc, nc.old)
nf.ol <- split(width(nf.old)[subjectHits(nf.ol)], queryHits(nf.ol))
nc.ol <- split(width(nc.old)[subjectHits(nc.ol)], queryHits(nc.ol))
rm(nf.old)
rm(nc.old)
gc(verbose = FALSE, reset = TRUE, full = TRUE)
stopifnot(length(nf.ol)==length(nf))
stopifnot(length(nc.ol)==length(nc))
nf.frag.len <- sapply(nf.ol, median)
nc.frag.len <- sapply(nc.ol, median)
nf.frag.len <- nf.frag.len[order(as.numeric(names(nf.frag.len)))]
nc.frag.len <- nc.frag.len[order(as.numeric(names(nc.frag.len)))]
nd.frag.len <- width(nd)
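## assemble the feature matrices (fragment length, conservation score, GC content)
## used for training (x) and prediction (testdata)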
x.nf <- cbind(nf.frag.len, nf.conservation, nf.gc)
x.nc <- cbind(nc.frag.len, nc.conservation, nc.gc)
x <- rbind(x.nf, x.nc)
testdata <- cbind(nd.frag.len, nd.conservation, nd.gc)
colnames(testdata) <- colnames(x) <- NULL
y <- as.factor(rep(c("f", "n"), c(length(nf), length(nc))))
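## train the random forest on the labelled nucleosome-free ('f') and
## mononucleosome ('n') regions, then predict class probabilities for each read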
fit <- randomForest(x, y,
ntree=2*ceiling(sqrt(length(y))),
keep.inbag=TRUE)
pred <- predict(fit, newdata=testdata, type='prob')
pred.free <- pred[, "f"]
pred.bind <- pred[, "n"]
rm(nd.gc, nf.gc, nc.gc)
rm(pred)
gc(verbose = FALSE, reset = TRUE, full = TRUE)
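## classify each read (by name) as nucleosome-bound or nucleosome free
## when its predicted probability reaches the cutoff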
nucleosome <- names(obj) %in% names(nd[pred.bind >= cutoff])
nucleosomefree <- names(obj) %in%
names(nd[pred.free>= cutoff & pred.bind < cutoff])
NucleosomeFree <- obj[nucleosomefree]
Nucleosome <- obj[nucleosome]
left <- obj[!(nucleosome | nucleosomefree)]
rm(nd)
rm(obj)
gc(verbose = FALSE, reset = TRUE, full = TRUE)
objs <- split(left,
cut(abs(mcols(left)$isize),
breaks = breaks,
labels = labels))
rm(left)
gc(verbose = FALSE, reset = TRUE, full = TRUE)
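## re-split the predicted nucleosome reads by insert size into the
## mono-/di-/tri-nucleosome categories, then add back the nucleosome-free reads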
Nucleosome <- split(Nucleosome,
cut(abs(mcols(Nucleosome)$isize),
breaks = breaks,
labels = labels))
Nucleosome[[labelsOfNucleosomeFree]] <- NucleosomeFree
rm(NucleosomeFree)
gc(verbose = FALSE, reset = TRUE, full = TRUE)
for(i in intersect(names(Nucleosome), names(objs))){
objs[[i]] <- c(Nucleosome[[i]], objs[[i]])
}
if(!missing(outPath)){
writeListOfGAlignments(objs, outPath = outPath)
}
objs
}
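## GRbin: project the metadata column `col` of A and B onto the disjoint
## ranges of their union so that the two tracks can be combined element-wise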
GRbin <- function(A, B, col){
A1 <- B1 <- C <- disjoin(c(A, B), ignore.strand=FALSE)
ol1 <- findOverlaps(C, A, minoverlap=1L,
type="any", ignore.strand=FALSE)
ol2 <- findOverlaps(C, B, minoverlap=1L,
type="any", ignore.strand=FALSE)
A1$value <- B1$value <- 0
A1$value[queryHits(ol1)] <- mcols(A)[subjectHits(ol1), col]
B1$value[queryHits(ol2)] <- mcols(B)[subjectHits(ol2), col]
GRangesList(A=A1, B=B1)
}
GRoperator <- function(A, B, col="score",
operator=c("+", "-", "*", "/", "^", "%%")){
if(!is(A, "GRanges") || !is(B, "GRanges")){
stop("A and B must be objects of GRanges")
}
operator <- match.arg(operator)
if(!(col %in% colnames(mcols(A)))){
stop("col is not in metadata of A")
}
if(!(col %in% colnames(mcols(B)))){
stop("col is not in metadata of B")
}
C <- GRbin(A, B, col)
A <- C$A
B <- C$B
out <- A
operator <- .Primitive(operator)
out$value <- operator(A$value, B$value)
colnames(mcols(out)) <- col
out
}
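## A minimal, hypothetical illustration of how GRoperator behaves (kept as a
## comment so nothing is executed when the package is loaded; the ranges and
## scores below are made up):
## a <- GRanges("chr1", IRanges(c(1, 50), width = 30), score = c(1, 1))
## b <- GRanges("chr1", IRanges(20, width = 30), score = 1)
## GRoperator(a, b, col = "score", operator = "-")
## ## pieces covered only by `a` keep score 1, pieces covered only by `b`
## ## get score -1, and the overlapping piece gets 0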