#' getCovStats
#'
#' Assembles read distribution statistics from a set of bigwig files based on
#' random windows.
#'
#' @param x A (named) vector of paths to bigwig files (all from the same genome)
#' @param binSize The size of bins
#' @param nbBins The approximate number of random bins. More bins gives more
#' accurate readouts but take longer to read and compute.
#' @param exclude Region to exclude
#' @param canonical.chr Logical; whether to restrict the sampling to standard
#' chromosomes.
#' @param maxCovQuant The quantile to use as maximum coverage (default 0.999)
#' @param BPPARAM BioParallel BPPARAM for multithreading across files.
#'
#' @return A named list of QC tables
#' @export
#' @importFrom BiocParallel bplapply SerialParam
#' @import GenomicRanges S4Vectors
#' @importFrom GenomeInfoDb seqlengths
#' @importFrom dplyr bind_rows
#' @importFrom rtracklayer import BigWigSelection BigWigFile
#' @importFrom Rsamtools BamFile scanBamFlag ScanBamParam
#'
#' @examples
#' # we use an example bigwig file
#' bwf <- system.file("extdata/example_atac.bw", package="epiwraps")
#' # because most of the file is empty, we'll exclude some of the ranges
#' cs <- getCovStats(bwf, exclude=GRanges("1", IRanges(1, 4300000)))
#' plotCovStats(cs)
getCovStats <- function(x, binSize=1000, nbBins=10000, exclude=NULL,
canonical.chr=TRUE, maxCovQuant=0.999,
BPPARAM=SerialParam()){
if(.parseFiletypeFromName(x[[1]])=="bam"){
maxes <- seqlengths(BamFile(x[[1]]))
}else{
maxes <- seqlengths(BigWigFile(x[[1]]))
}
stopifnot(length(maxes)>0)
if(canonical.chr) maxes <- maxes[grep("^Y$|^X$|^[0-9]$", ignore.case=TRUE,
gsub("chr|CHR","",names(maxes)))]
if(length(maxes)==0)
stop("No chromosome left after filtering! Consider using ",
"canonical.chr=FALSE")
if(is.null(names(x)))
names(x) <- make.unique(gsub("\\.bw$|\\.bigwig$|\\.bam$", "", basename(x),
ignore.case=TRUE))
chr <- table(sample(factor(names(maxes),names(maxes)), size=nbBins,
prob=maxes/sum(maxes), replace=TRUE))
pos <- lapply(seq_along(chr), FUN=function(x)
sample.int(maxes[x]-binSize,chr[[x]], replace=TRUE))
gr <- sort(GRanges(rep(names(chr),lengths(pos)),
IRanges(unlist(pos), width=binSize)))
if(!is.null(exclude)) gr <- gr[!overlapsAny(gr, exclude)]
covs <- bplapply(x, BPPARAM=BPPARAM, FUN=function(x){
ftype <- .parseFiletypeFromName(x)
if(ftype=="bam"){
flgs <- scanBamFlag(isDuplicate=FALSE, isSecondaryAlignment=FALSE)
params <- Rsamtools::ScanBamParam(which=gr, flag=flgs)
x <- as(GenomicAlignments::readGAlignmentPairs(x, param=params), "GRanges")
y <- as.integer(countOverlaps(gr,x))
}else if(ftype=="bw"){
x <- rtracklayer::import(x, format="BigWig", selection=BigWigSelection(gr))
o <- findOverlaps(gr,x)
tmp <- max(splitAsList(x$score[to(o)], from(o)))
if(length(tmp)!=length(gr)){
y <- rep(0,length(gr))
y[as.integer(names(tmp))] <- as.numeric(tmp)
}else{
y <- as.numeric(tmp)
}
}else{
stop("Unsupported file format ", x)
}
y
})
toExclude <- unique(unlist(lapply(covs, FUN=function(x){
which(x>quantile(x,maxCovQuant))
})))
if(length(toExclude)>0){
covs <- lapply(covs, FUN=function(x) x[-toExclude])
}
if(length(covs)>1){
tmp <- do.call(cbind, covs)
pea <- cor(tmp)
spea <- cor(tmp, method="spearman")
rm(tmp)
}
covs <- lapply(covs, sort)
d1 <- dplyr::bind_rows(lapply(covs, FUN=function(x){
data.frame(coverage=unique(x),
fraction.above=vapply(unique(x), FUN.VALUE=numeric(1),
FUN=function(y) sum(x>y)/length(x)))
}), .id="file")
d2 <- dplyr::bind_rows(lapply(covs, FUN=function(x){
data.frame(rank=rank(x,ties.method="random"), fraction.of.highest=x/max(x))
}), .id="file")
d1$file <- as.factor(d1$file)
d2$file <- as.factor(d2$file)
ll <- list(coverage=d1, enrichment=d2)
if(length(covs)>1) ll <- c(ll, list(cor.pearson=pea, cor.spearman=spea))
ll
}
#' plotCovStats
#'
#' Plots coverage statistics, such as as fingerprint plot.
#'
#' @param qc A list of coverage statistics, as produced by
#' \code{\link{getCovStats}}.
#' @param labels Passed to \code{\link[cowplot]{plot_grid}}.
#'
#' @return A grid object to be plotted.
#' @export
#' @import ggplot2
#' @importFrom cowplot plot_grid get_legend
#'
#' @examples
#' # we use an example bigwig file
#' bwf <- system.file("extdata/example_atac.bw", package="epiwraps")
#' # because most of the file is empty, we'll exclude some of the ranges
#' cs <- getCovStats(bwf, exclude=GRanges("1", IRanges(1, 4300000)))
#' plotCovStats(cs)
plotCovStats <- function(qc, labels="AUTO", show.legend=TRUE){
p1 <- ggplot(qc$coverage, aes(coverage, fraction.above, colour=file)) +
geom_line() + labs(x="Read density", y="Fraction of regions > density") +
theme(legend.position="top", legend.direction = "horizontal") +
xlim(0,min(max(qc$coverage$coverage),50))
qc$enrichment$rank <- qc$enrichment$rank/max(qc$enrichment$rank)
p2 <- ggplot(qc$enrichment, aes(rank, fraction.of.highest, colour=file)) +
geom_abline(slope=1, intercept=0, linetype="dashed", colour="grey") +
geom_line() + labs(x="Region rank", y="Fraction of highest density") +
theme(legend.position="none")
p <- plot_grid(p1 + theme(legend.position="none"), p2,
labels=labels, nrow=1, scale=0.95)
if(!show.legend || length(unique(qc$coverage$file))==1) return(p)
plot_grid(p, get_legend(p1), nrow=2, rel_heights = c(3,1))
}
#' plotCorFromCovStats
#'
#' @param qc A list of coverage statistics, as produced by
#' \code{\link{getCovStats}}.
#' @param method The correlation metrics to include
#' @param col Optional heatmap colors
#' @param na_col Color for the diagonal; passed to
#' \code{\link[ComplexHeatmap]{Heatmap}}.
#' @param column_title Column title (if NULL, uses the metric)
#' @param ... Passed to \code{\link[ComplexHeatmap]{Heatmap}}.
#'
#' @return A `Heatmap` or `HeatmapList` object ready to be plotted.
#' @export
#' @import ComplexHeatmap
#' @importFrom viridisLite viridis inferno
#' @importFrom stats hclust dist as.dendrogram relevel
plotCorFromCovStats <- function(qc, method=c("pearson","spearman"), col=NULL,
na_col="white", column_title=NULL, ...){
if(is.null(qc$cor.pearson) || is.null(qc$cor.spearman))
stop("The object does not contain the necessary information - was
getCovStats run on multiple samples?")
method <- match.arg(method, several.ok=TRUE)
dat <- lapply(setNames(method, method), FUN=function(x){
m <- switch(x,
pearson=qc$cor.pearson,
spearman=qc$cor.spearman)
diag(m) <- NA
m
})
o <- tryCatch( as.dendrogram(hclust(dist(do.call(cbind, dat)))),
error=function(e) FALSE )
h <- lapply(names(dat), FUN=function(x){
if(is.null(col)){
col = switch(x,
pearson=viridisLite::viridis(50),
spearman=viridisLite::inferno(50))
}
if(is.null(column_title)) column_title <- x
Heatmap(dat[[x]], name=x, col=col, column_title=column_title,
cluster_rows=o, cluster_columns=o, na_col="white", ...)
})
if(length(h)==1) return(h[[1]])
hl <- NULL
for(hi in h) hl <- hl + hi
hl
}
#' fragSizesDist
#'
#' @param x A (named) vector of paths to bam files.
#' @param what Either a positive integer (length 1) indicating how many reads
#' to randomly sample, or a character vector (of length 1) indicating which
#' chromosome to read.
#' @param flags A `scanBamFlag` object (see \link[Rsamtools]{ScanBamParam})
#' @param BPPARAM A \link[BiocParallel]{BiocParallel} BPPARAM object for
#' multithreading.
#' @param returnPlot Logical; whether to return a plot.
#'
#' @return If `returnPlot=TRUE`, returns a ggplot object, otherwise a
#' data.frame of fragment lengths.
#' @export
#' @importFrom Rsamtools BamFile ScanBamParam scanBamFlag
#' @importFrom GenomicFiles reduceByYield REDUCEsampler
#' @importFrom GenomicAlignments readGAlignmentPairs
#' @importFrom BiocParallel bplapply SerialParam MulticoreParam SnowParam
#' @importFrom ggplot2 ggplot aes geom_density xlab
#' @import GenomicRanges
#'
#' @examples
#' # example bam file:
#' bam <- system.file("extdata", "ex1.bam", package="Rsamtools")
#' fragSizesDist(bam, what=100)
fragSizesDist <- function(x, what=10000, flags=scanBamFlag(isProperPair=TRUE),
BPPARAM=SerialParam(), returnPlot=TRUE){
stopifnot(length(what)==1)
if(is.null(names(x))) names(x) <- .cleanFileNames(x)
flen <- bplapply(x, BPPARAM=BPPARAM, FUN=function(x){
if(is.numeric(what)){
what <- as.integer(what)
stopifnot(what>1)
x <- GenomicFiles::reduceByYield(
BamFile(x), MAP=identity, YIELD=function(x)
readGAlignmentPairs(x, param = ScanBamParam(flag=flags)),
REDUCE=GenomicFiles::REDUCEsampler(what, verbose=FALSE))
}else{
x <- readGAlignmentPairs(x, param=ScanBamParam(flag=flags,
which=GRanges(what, IRanges(1L,10^8))))
}
width(as(x, "GRanges"))
})
d <- data.frame(sample=rep(factor(names(flen)), lengths(flen)),
length=unlist(flen), row.names=NULL)
if(!returnPlot) return(d)
ggplot(d, aes(length, colour=sample)) + geom_density() +
scale_x_continuous(breaks=c(100*0:5,750,1000), limits=c(0,1000)) +
xlab("Fragment length")
}
#' TSSenrichment
#'
#' A common quality metric for ATAC-seq data (see
#' [definition](https://www.encodeproject.org/data-standards/terms/#enrichment)
#' ). The interpretation of the enrichment score depends on the annotation
#' (the score tends to increase when only fewer, more common TSS are used), but
#' according to ENCODE guidelines anything below 5 is of concerning quality,
#' while a score >8 is ideal.
#'
#' @param tracks A (named) vector of paths to bigwig files.
#' @param ensdb An `ensembldb` object. Alternatively, a GRanges object of
#' regions centered around TSS.
#' @param useSeqLevels Optional seqlevels to use. If NULL, all are used.
#'
#' @return A list with the slots `score` (numeric vector of TSS enrichment
#' scores per sample) and `data` (per bin enrichment, for plotting)
#'
#' @export
#' @examples
#' # we first fetch the path to the example bigwig file:
#' bw <- system.file("extdata/example_atac.bw", package="epiwraps")
#' ## normally, we would load an ensembldb object using AnnotationHub. For the
#' ## purpose of this example, we'll pretend that the following set of regions
#' ## represent TSS:
#' tss <- system.file("extdata/example_peaks.bed", package="epiwraps")
#' tss <- rtracklayer::import(tss)
#' en <- TSSenrichment(bw, tss)
#' en$score
#' ## you can also plot using something like this:
#' ## ggplot(en$data, aes(position, enrichment, colour=sample)) + geom_line()
TSSenrichment <- function(tracks, ensdb, useSeqLevels=NULL){
stopifnot(is(ensdb, "EnsDb") || is(ensdb, "GRanges"))
if(!is(tracks, "character") || !all(file.exists(tracks)) ||
!all(.parseFiletypeFromName(tracks)=="bw"))
stop("`tracks` should be a character vector giving the paths to existing",
"bigwig files.")
if(is.null(names(tracks))) names(tracks) <- .cleanFileNames(tracks)
if(is(ensdb,"GRanges")){
txp <- ensdb
}else{
txp <- promoters(transcripts(ensdb), upstream = 1000, downstream = 1000)
}
if(!is.null(useSeqLevels))
txp <- keepSeqlevels(txp, intersect(useSeqLevels, seqlevels(txp)),
pruning.mode="coarse")
txp <- granges(txp)
txp <- txp[!duplicated(txp)]
if(length(txp)==0)
stop("No TSS found. Are you sure you are providing adequate seqlevels?")
sm <- signal2Matrix(tracks, txp, extend = 1000L, w = 100L, ret="list")
df <- dplyr::bind_rows(lapply(sm, FUN=function(x){
x <- unclass(x)
nf <- rowMeans(x[,c(1,ncol(x))])
w <- which(nf>0)
if(length(w)<20) warning("Too few TSS for adequate estimate.")
data.frame(position=round(seq(from=-1000L, to=1000L, length.out=ncol(x))),
enrichment=colMeans((x[w,])/nf[w]))
}), .id="sample")
df$sample <- factor(df$sample)
w <- which(df$position %in% unique(df$position)[10:11])
ag <- aggregate(df[w,"enrichment",drop=FALSE], by=list(sample=df$sample[w]),
FUN=mean)
ag <- setNames(ag$enrichment, ag$sample)
list(score=ag, data=df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.