#' make_coverage_matrix
#'
#' Makes coverage matrix based on bam or bigwig inputs
#'
#' @param inputs filenames of bigwig or bam
#' @param ranges ranges for which to compute coverage within
#' @param input_names names to associate with input bigwig or bam files, used
#' for naming assays in resulting SummarizedExperiment object
#' @param binsize binsize to bin coverage
#' @param format format of files, default is auto
#' @param up basepairs upstream of center to use
#' @param down basepairs downstream of center to use
#' @details up and down are 0 by default -- if not specified, actual range is
#' used. All ranges must be of equal width. If up and/or down are provided,
#' then the center of the range and up basepairs upstream and down basepairs
#' downstream are used.
#' @return RangedSummarizedExperiment
#' @import GenomicRanges
#' @export
#' @author Alicia Schep
#' @examples
#'
#' library(GenomicRanges)
#' # First read in some sample data
#' genomation_dir <- system.file("extdata", package = "genomationData")
#'
#' samp.file <- file.path(genomation_dir,'SamplesInfo.txt')
#' samp.info <- read.table(samp.file, header=TRUE, sep='\t',
#' stringsAsFactors = FALSE)
#' samp.info$fileName <- file.path(genomation_dir, samp.info$fileName)
#'
#' ctcf.peaks = genomation::readBroadPeak(system.file("extdata",
#' "wgEncodeBroadHistoneH1hescCtcfStdPk.broadPeak.gz",
#' package = "genomationData"))
#' ctcf.peaks = ctcf.peaks[seqnames(ctcf.peaks) == "chr21"]
#' ctcf.peaks = ctcf.peaks[order(-ctcf.peaks$signalValue)]
#' ctcf.peaks = resize(ctcf.peaks, width = 1000, fix = "center")
#'
#' # Make the coverage matrices
#' mats <- make_coverage_matrix(samp.info$fileName[1:3], ctcf.peaks,
#' up = 500, down = 500, binsize = 25)
#'
#' # Benchmarking speed of make_coverage_matrix compared to ScoreMatrixList
#' # function from genomation
#' \dontrun{
#' bm <- microbenchmark::microbenchmark(ctcf_mats =
#' make_coverage_matrix(samp.info$fileName[1:3],
#' ctcf.peaks,
#' up = 500, down = 500,
#' binsize = 25),
#' geno = ScoreMatrixList(samp.info$fileName[1:3],
#' ctcf.peaks, bin.num = 1000/25),
#' times = 5)
#'
#' bm
#'
#' plot(bm)}
#'
make_coverage_matrix <- function(inputs,
ranges,
input_names = names(inputs),
binsize = 1,
format = c("auto","bigwig","bam"),
up = 0,
down = 0){
format <- match.arg(format)
strand(ranges) <- ifelse(strand(ranges) == "-", "-","+")
if (up > 0 || down > 0){
ranges <- resize(ranges, fix = "center", width = 1)
ranges <- promoters(ranges, upstream = up, downstream = down)
}
#check for equal widths
stopifnot(sum(width(ranges) == width(ranges[1])) == length(ranges))
ranges <- adjust_ranges_for_bin(ranges, binsize)
rn <- paste(as.character(seqnames(ranges)),
paste(end(ranges),start(ranges), sep="-"),
sep=":")
names(ranges) <- rn
if (up > 0 || down > 0){
coln <- seq(-up,down - binsize,binsize)
} else{
coln <- seq(1,width(ranges[1]),binsize)
}
names(inputs) <- input_names
#Determine format
if (format == "auto"){
if (length(inputs) == 1){
tmp <- inputs
} else{
tmp <- inputs[[1]]
}
if (is.character(tmp)){
if (substr(tmp, nchar(tmp) - 6 , nchar(tmp)) == ".bigwig" ||
substr(tmp, nchar(tmp) - 2 , nchar(tmp)) == ".bw"){
format <- "bigwig"
} else if (substr(tmp, nchar(tmp) - 3 , nchar(tmp)) == ".bam"){
format <- "bam"
} else{
stop("Cannot determine format of inputs.")
}
}
}
if (format == "bigwig"){
# bw input
if (length(inputs) == 1){
out <- coverage_mat_from_bigwig(inputs, ranges, binsize, coln)
rownames(out) <- rn
out <- list(out)
names(out) <- input_names
} else{
out <- lapply(inputs, coverage_mat_from_bigwig, ranges, binsize, coln)
out <- lapply(out, function(x) {rownames(x) <- rn; x})
}
} else if (format == "bam"){
if (length(inputs) == 1){
out <- coverage_mat_from_bam(inputs, ranges, binsize, coln)
rownames(out) <- rn
out <- list(out)
names(out) <- input_names
} else{
out <- lapply(inputs, coverage_mat_from_bam, ranges, binsize, coln)
out <- lapply(out, function(x) {rownames(x) <- rn; x})
}
} else {
stop("Format not recognized")
}
return(SummarizedExperiment::SummarizedExperiment(out, rowRanges = ranges))
}
adjust_ranges_for_bin <- function(ranges, binsize){
if (binsize > 1 && width(ranges[1]) %% binsize != 0){
ranges <- resize(ranges, fix = "center",
width = (width(ranges[1]) %/% binsize +1)*binsize)
}
return(ranges)
}
#' normalize_coverage_matrix
#'
#' Normalizes coverage matrices using one of several methods.
#' @param mats matrix, list of matrix, or SummarizedExperiment
#' @param method normalization method option, see Details
#' @param pct Percentile, only used if PercentileMax is method
#' @param scalar vector of scalars used for normalizing each mat, only
#' used if scalar is method
#' @param digits number of significant digits of result to keep.
#' @param ... additional arguments to normalize_coverage_matrix
#' @details Normalization choices are "localRms", "localMean",
#' "localNonZeroMean", "PercentileMax", "scalar", and "none". localRMS will
#' divide each row by the root mean squared values of that row. localMean will
#' divide each row by the mean of that row. localNonZeroMean will divide each
#' row by nonzero values in that row. PercentileMax will divide values based on
#' percentile (given by pct argument) of the entire matrix. scalar will divide
#' entire matrix by a scalar, given by scalar argument. This scalar could for
#' example be a measure of the sequencing depth.
#' @return Should return data in the same format as input, but now with values
#' normalized according to the method chosen.
#' @export
#' @rdname normalize_coverage_matrix
#' @name normalize_coverage_matrix
#' @aliases normalize_coverage_matrix,list-method
#' normalize_coverage_matrix,matrix-method
#' normalize_coverage_matrix,SummarizedExperiment-method
#' @author Alicia Schep
#'
#' @examples
#' ## First we'll make some coverage matrices
#'
#' library(GenomicRanges)
#' # First read in some sample data
#' genomation_dir <- system.file("extdata", package = "genomationData")
#'
#' samp.file <- file.path(genomation_dir,'SamplesInfo.txt')
#' samp.info <- read.table(samp.file, header=TRUE, sep='\t',
#' stringsAsFactors = FALSE)
#' samp.info$fileName <- file.path(genomation_dir, samp.info$fileName)
#'
#' ctcf.peaks = genomation::readBroadPeak(system.file("extdata",
#' "wgEncodeBroadHistoneH1hescCtcfStdPk.broadPeak.gz",
#' package = "genomationData"))
#' ctcf.peaks = ctcf.peaks[seqnames(ctcf.peaks) == "chr21"]
#' ctcf.peaks = ctcf.peaks[order(-ctcf.peaks$signalValue)]
#' ctcf.peaks = resize(ctcf.peaks, width = 1000, fix = "center")
#'
#' # Make the coverage matrices
#' mats <- make_coverage_matrix(samp.info$fileName[1:3], ctcf.peaks,
#' up = 500, down = 500, binsize = 25)
#'
#' # Now normalize:
#' norm_mats <- normalize_coverage_matrix(mats)
#'
setMethod(normalize_coverage_matrix, "list",
function(mats,
method = c("localRms",
"localMean",
"localNonZeroMean",
"PercentileMax",
"scalar",
"none"),
pct = 0.95,
scalar = NULL,
digits = 3){
method <- match.arg(method)
if (method == "scalar"){
stopifnot(!is.null(scalar))
out <- lapply(seq_len(length(mats)), function(x) {
normalize_coverage_matrix_single(mats[[x]],
method = "scalar",
scalar = scalar[x],
digits = digits)
})
names(out) <- names(mats)
} else{
out <- lapply(mats, normalize_coverage_matrix_single,
method = method,
pct = pct,
digits = digits)
}
return(out)
})
#' @rdname normalize_coverage_matrix
#' @export
setMethod(normalize_coverage_matrix, "matrix",
function(mats,
method = c("localRms",
"localMean",
"localNonZeroMean",
"PercentileMax",
"scalar", "none"),
pct = 0.95,
scalar = NULL,
digits = 3){
method <- match.arg(method)
out <- normalize_coverage_matrix_single(mats, method, pct, scalar,
digits)
return(out)
})
#' @export
#' @rdname normalize_coverage_matrix
setMethod(normalize_coverage_matrix, "SummarizedExperiment",
function(mats,
...){
out <- mats
assays(out) <-
normalize_coverage_matrix(as.list(assays(mats)), ...)
return(out)
})
## Modified from gChipseq package
## not exported
normalize_coverage_matrix_single <- function(mat,
method = c("localRms",
"localMean",
"localNonZeroMean",
"PercentileMax",
"scalar",
"none"),
pct = 0.95,
scalar = NULL,
digits = 3)
{
method <- match.arg(method)
if (method == "localRms") {
scaler <- sqrt(rowSums(mat^2))
scaler[scaler == 0] <- 1
res <- mat/scaler
} else if (method == "localMean") {
scaler <- rowMeans(mat)
scaler[scaler == 0] <- 1
res <- mat/scaler
} else if (method == "localNonZeroMean") {
scaler <- apply(mat, 1, function(x) {
mean(x[x != 0])
})
scaler[is.nan(scaler)] <- 1
res <- mat/scaler
} else if (method == "PercentileMax") {
scaler <- quantile(mat, pct)
if (scaler == 0) {
scaler <- 1
}
res <- mat/scaler
res[res[] > 1] <- 1
} else if (method == "scalar") {
res <- mat/scalar
} else if (method == "none") {
res <- mat
}
#res <- signif(res, digits = digits)
return(res)
}
#### Helper Functions (not exported) ---------------------------------------------------------
#Function to reduce size of matrix by binning columns
bin_mat <- function(mat, binsize, digits = 3){
out <- vapply(seq(1, ncol(mat),binsize),
function(x) rowMeans(mat[,x:(x+binsize - 1),drop =FALSE]),
rep(0,nrow(mat)))
out <- signif(out, digits = digits)
if (dim(mat)[1] == 1) out <- matrix(out, nrow = 1)
return(out)
}
coverage_mat_from_bigwig <- function(bigwig_file, ranges, binsize, coln){
#check for equal widths
stopifnot(sum(width(ranges) == width(ranges[1])) == length(ranges))
tmp <- rtracklayer::import.bw(bigwig_file, selection = ranges,
as = "NumericList")
tmp_list <- lapply(seq_along(ranges), function(x){
if (as.character(strand(ranges[x])) == "-"){
return(rev(tmp@listData[[x]]))
} else{
return(tmp@listData[[x]])
}
})
tmp_mat <- t(simplify2array(tmp_list))
if (binsize == 1){
colnames(tmp_mat) <- coln
return(tmp_mat)
} else{
binned <- bin_mat(tmp_mat, binsize)
colnames(binned) <- coln
return(binned)
}
}
# This does not do read extension -- to do that you would need to use
# bamProfile and then smooth with flat window.
#' @importFrom bamsignals bamCoverage alignSignals
coverage_mat_from_bam <- function(bam_file, ranges, binsize, coln){
#check for equal widths
stopifnot(sum(width(ranges) == width(ranges[1])) == length(ranges))
tmp_mat <- t(alignSignals(bamCoverage(bam_file, ranges, verbose = FALSE)))
#tmp_mat <- t(bamsignals::alignSignals(cvg))
if (binsize == 1){
colnames(tmp_mat) <- coln
return(tmp_mat)
} else{
binned <- bin_mat(tmp_mat, binsize)
colnames(binned) <- coln
return(binned)
}
}
# Function from Johannes Helmuth,
# https://github.com/lamortenera/bamsignals/issues/16
bamCoverageFrags <- function(bampath, gr, fragLength=200, ss = TRUE, ...) {
prf <- bamsignals::bamProfile(bampath, gr, binsize=1, ss = TRUE, ...)
cov <- lapply(as.list(prf), function(p) {
l <- dim(p)[2]
x1 <- cumsum(as.numeric(p[1,]))
y1 <- c(rep(0, fragLength), x1[seq_len(l-200)])
x2 <- cumsum(as.numeric(rev(p[2,])))
y2 <- c(rep(0, fragLength), x2[seq_len(l-200)])
matrix(c(x1-y1, rev(x2-y2)), ncol = l, byrow = TRUE,
dimnames = list(c("sense", "antisense"), NULL))
})
if (ss == TRUE) {
prf@signals <- cov
} else {
prf@signals <- lapply(cov, colSums)
}
invisible(prf)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.