#' Make fixed-width bins
#'
#' Make fixed-width bins based on given bin size.
#'
#' @param bamfile A BAM file from which the header is read to determine the chromosome lengths. If a \code{bamfile} is specified, option \code{assembly} is ignored.
#' @param assembly An assembly from which the chromosome lengths are determined. Please see \code{\link[GenomeInfoDb]{getChromInfoFromUCSC}} for available assemblies. This option is ignored if \code{bamfile} is specified. Alternatively a data.frame generated by \code{\link[GenomeInfoDb]{getChromInfoFromUCSC}}.
#' @param chrom.lengths A named character vector with chromosome lengths. Names correspond to chromosomes.
#' @param chromosome.format A character specifying the format of the chromosomes if \code{assembly} is specified. Either 'NCBI' for (1,2,3 ...) or 'UCSC' for (chr1,chr2,chr3 ...). If a \code{bamfile} or \code{chrom.lengths} is supplied, the format will be chosen automatically.
#' @param binsizes A vector of bin sizes in base pairs.
#' @param chromosomes A subset of chromosomes for which the bins are generated.
#' @return A \code{list()} of \code{\link{GRanges-class}} objects with fixed-width bins.
#' @author Aaron Taudt
#' @importFrom Rsamtools BamFile
#' @export
#'
#'@examples
#'## Make fixed-width bins of size 500kb and 1Mb
#'data(rn4_chrominfo)
#'chrom.lengths <- rn4_chrominfo$length
#'names(chrom.lengths) <- rn4_chrominfo$chromosome
#'bins <- fixedWidthBins(chrom.lengths=chrom.lengths, binsizes=c(5e5,1e6))
#'bins
#'
#'## Make bins using NCBI server (requires internet connection)
#'# bins <- fixedWidthBins(assembly='mm10', chromosome.format='NCBI', binsizes=c(5e5,1e6))
#'
fixedWidthBins <- function(bamfile=NULL, assembly=NULL, chrom.lengths=NULL, chromosome.format, binsizes=1e6, chromosomes=NULL) {
### Check user input ###
if (is.null(bamfile) & is.null(assembly) & is.null(chrom.lengths)) {
stop("Please specify either a 'bamfile', 'assembly' or 'chrom.lengths'")
}
if (is.null(bamfile) & is.null(chrom.lengths)) {
trigger.error <- chromosome.format
}
### Get chromosome lengths ###
if (!is.null(bamfile)) {
ptm <- startTimedMessage(paste0("Reading header from ", bamfile, " ..."))
chrom.lengths <- GenomeInfoDb::seqlengths(Rsamtools::BamFile(bamfile))
stopTimedMessage(ptm)
} else if (!is.null(assembly)) {
if (is.character(assembly)) {
ptm <- startTimedMessage("Fetching chromosome lengths from UCSC ...")
df <- GenomeInfoDb::getChromInfoFromUCSC(assembly)
stopTimedMessage(ptm)
} else if (is.data.frame(assembly)) {
df <- assembly
} else {
stop("Unknown assembly")
}
chrom.lengths <- df$size
if (chromosome.format=='UCSC') {
} else if (chromosome.format=='NCBI') {
df$chrom = sub('^chr', '', df$chrom)
}
names(chrom.lengths) <- df$chrom
chrom.lengths <- chrom.lengths[!is.na(names(chrom.lengths))]
chrom.lengths <- chrom.lengths[!is.na(chrom.lengths)]
} else if (!is.null(chrom.lengths)) {
chrom.lengths <- chrom.lengths[!is.na(names(chrom.lengths))]
chrom.lengths <- chrom.lengths[!is.na(chrom.lengths)]
}
chroms.in.data <- names(chrom.lengths)
if (is.null(chromosomes)) {
chromosomes <- chroms.in.data
}
chroms2use <- intersect(chromosomes, chroms.in.data)
## Stop if none of the specified chromosomes exist
if (length(chroms2use)==0) {
chrstring <- paste0(chromosomes, collapse=', ')
stop('Could not find length information for any of the specified chromosomes: ', chrstring)
}
## Issue warning for non-existent chromosomes
diff <- setdiff(chromosomes, chroms.in.data)
if (length(diff)>0) {
diffs <- paste0(diff, collapse=', ')
warning('Could not find length information for the following chromosomes: ', diffs)
}
### Making fixed-width bins ###
bins.list <- list()
for (binsize in binsizes) {
ptm <- startTimedMessage("Making fixed-width bins for bin size ", binsize, " ...")
chrom.lengths.floor <- floor(chrom.lengths / binsize) * binsize
clfloor2use <- chrom.lengths.floor[chroms2use]
clfloor2use <- clfloor2use[clfloor2use >= binsize]
if (length(clfloor2use) == 0) {
stop("All selected chromosomes are smaller than binsize ", binsize)
}
bins <- unlist(GenomicRanges::tileGenome(clfloor2use, tilewidth=binsize), use.names=FALSE)
if (any(width(bins)!=binsize)) {
stop("tileGenome failed")
}
seqlevels(bins) <- chroms2use
seqlengths(bins) <- chrom.lengths[seqlevels(bins)]
skipped.chroms <- setdiff(chromosomes, as.character(unique(seqnames(bins))))
bins <- dropSeqlevels(bins, skipped.chroms, pruning.mode = 'coarse')
bins.list[[as.character(binsize)]] <- bins
if (length(skipped.chroms)>0) {
warning("The following chromosomes were dropped because they are smaller than binsize ", binsize, ": ", paste0(skipped.chroms, collapse=', '))
}
stopTimedMessage(ptm)
}
return(bins.list)
}
#' Make variable-width bins
#'
#' Make variable-width bins based on a reference BAM file. This can be a simulated file (produced by TODO: insert link and aligned with your favourite aligner) or a real reference.
#'
#' Variable-width bins are produced by first binning the reference BAM file with fixed-width bins and selecting the desired number of reads per bin as the (non-zero) maximum of the histogram. A new set of bins is then generated such that every bin contains the desired number of reads.
#'
#' @param reads A \code{\link{GRanges-class}} with reads. See \code{\link{readBamFileAsGRanges}} and \code{\link{readBedFileAsGRanges}}.
#' @param binsizes A vector with binsizes. Resulting bins will be close to the specified binsizes.
#' @param chromosomes A subset of chromosomes for which the bins are generated.
#' @return A \code{list()} of \code{\link{GRanges-class}} objects with variable-width bins.
#' @author Aaron Taudt
#' @importFrom S4Vectors endoapply
#' @export
#'
#'@examples
#'## Get an example BAM file with ChIP-seq reads
#'bamfile <- system.file("extdata", "euratrans", "lv-H3K4me3-BN-female-bio1-tech1.bam",
#' package="chromstaRData")
#'## Read the file into a GRanges object
#'reads <- readBamFileAsGRanges(bamfile, chromosomes='chr12', pairedEndReads=FALSE,
#' min.mapq=10, remove.duplicate.reads=TRUE)
#'## Make variable-width bins of size 1000bp
#'bins <- variableWidthBins(reads, binsizes=1000)
#'## Plot the distribution of binsizes
#'hist(width(bins[['1000']]), breaks=50)
#'
variableWidthBins <- function(reads, binsizes, chromosomes=NULL) {
### Check user input ###
chroms.in.data <- unique(seqnames(reads))
if (is.null(chromosomes)) {
chromosomes <- chroms.in.data
}
chroms2use <- intersect(chromosomes, chroms.in.data)
## Stop if non of the specified chromosomes exist
if (length(chroms2use)==0) {
chrstring <- paste0(chromosomes, collapse=', ')
stop('Could not find length information for any of the specified chromosomes: ', chrstring)
}
## Issue warning for non-existent chromosomes
diff <- setdiff(chromosomes, chroms.in.data)
if (length(diff)>0) {
diffs <- paste0(diff, collapse=', ')
warning('Could not find length information for the following chromosomes: ', diffs)
}
## Drop unwanted seqlevels
reads <- reads[seqnames(reads) %in% chroms2use]
reads <- keepSeqlevels(reads, chroms2use)
## Make fixed width bins
ptm <- startTimedMessage("Binning reads in fixed-width windows ...")
binned.list <- suppressMessages( binReads(reads, assembly=NULL, binsizes=binsizes, chromosomes=chromosomes) )
stopTimedMessage(ptm)
## Sort the reads
strand(reads) <- '*'
reads <- sort(reads)
## Loop over binsizes
bins.list <- list()
for (i1 in 1:length(binsizes)) {
binsize <- binsizes[i1]
ptm <- startTimedMessage("Making variable-width windows for bin size ", binsize, " ...")
if (is(binned.list,'GRanges')) {
binned <- binned.list
} else {
binned <- binned.list[[i1]]
}
## Get mode of histogram
tab <- table(binned$counts)
modecount <- as.integer(names(which.max(tab[names(tab)!=0])))
## Pick only every modecount read
subreads <- GRangesList()
skipped.chroms <- character()
for (chrom in chroms2use) {
reads.chr <- reads[seqnames(reads)==chrom]
if (length(reads.chr) >= modecount) {
idx <- seq(modecount, length(reads.chr), by=modecount)
subreads[[chrom]] <- reads.chr[idx]
} else {
skipped.chroms[chrom] <- chrom
}
}
if (length(skipped.chroms)>0) {
warning("The following chromosomes were dropped because they are smaller than binsize ", binsize, ": ", paste0(skipped.chroms, collapse=', '))
}
subreads <- unlist(subreads, use.names=FALSE)
## Adjust length of reads to get consecutive bins
subreads <- resize(subreads, width=1)
## Make new bins
bins <- gaps(subreads, start=1L, end=seqlengths(subreads)-1L) # gaps until seqlengths-1 because we have to add 1 later to get consecutive bins
bins <- bins[strand(bins)=='*']
end(bins) <- end(bins) + 1
## We don't want incomplete bins at the end
bins.split <- split(bins, seqnames(bins))
bins.split <- S4Vectors::endoapply(bins.split, function(x) { x[-length(x)] })
bins <- unlist(bins.split, use.names=FALSE)
## Remove skipped chromosomes
bins <- bins[!seqnames(bins) %in% skipped.chroms]
bins <- dropSeqlevels(bins, skipped.chroms, pruning.mode = 'coarse')
bins.list[[as.character(binsize)]] <- bins
stopTimedMessage(ptm)
}
return(bins.list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.