Nothing
#' Make a blacklist for genomic regions
#'
#' Produce a blacklist of genomic regions with a high ratio of duplicate to unique reads. This blacklist can be used to exclude reads for analysis in \code{\link{Aneufinder}}, \code{\link{bam2GRanges}} and \code{\link{bed2GRanges}}. This function produces a pre-blacklist which has to manually be filtered with a sensible cutoff. See the examples section for details.
#'
#' @param files A character vector of either BAM or BED files.
#' @param bins A list with one \code{\link{GRanges-class}} with binned read counts generated by \code{\link{fixedWidthBins}}.
#' @inheritParams bed2GRanges
#' @inheritParams binReads
#' @return A \code{\link{GRanges-class}} with the same coordinates as \code{bins} with metadata columns ratio, duplicated counts and deduplicated counts.
#' @importFrom S4Vectors DataFrame
#' @export
#'@examples
#'## Get an example BAM file with single-cell-sequencing reads
#'bamfile <- system.file("extdata", "BB150803_IV_074.bam", package="AneuFinderData")
#'## Prepare the blacklist
#'bins <- fixedWidthBins(assembly='mm10', binsizes=1e6, chromosome.format='NCBI')
#'pre.blacklist <- blacklist(bamfile, bins=bins)
#'## Plot a histogram to decide on a sensible cutoff
#'qplot(pre.blacklist$ratio, binwidth=0.1)
#'## Make the blacklist with cutoff = 1.9
#'blacklist <- pre.blacklist[pre.blacklist$ratio > 1.9]
blacklist <- function(files, assembly, bins, min.mapq=10, pairedEndReads=FALSE) {
## Input checks ##
if (length(bins) > 1 & is.list(bins)) {
stop("argument 'bins' must only contain one element")
}
if (class(bins) == 'GRanges') {
bins <- list(bins)
names(bins) <- width(bins[[1]])[1]
}
## Bin the reads with and without duplicates kept
dupcounts.matrix <- array(dim=c(length(bins[[1]]), length(files)), dimnames=list(bin=1:length(bins[[1]]), file=basename(files)))
counts.matrix <- dupcounts.matrix
for (file in files) {
dup.binned <- binReads(file, assembly = assembly, min.mapq = min.mapq, pairedEndReads = pairedEndReads, binsizes = NULL, bins = bins, chromosomes = seqlevels(bins[[1]]), calc.complexity = FALSE, remove.duplicate.reads = FALSE)
dupcounts.matrix[,basename(file)] <- dup.binned[[1]]$counts
binned <- binReads(file, assembly = assembly, min.mapq = min.mapq, pairedEndReads = pairedEndReads, binsizes = NULL, bins = bins, chromosomes = seqlevels(bins[[1]]), calc.complexity = FALSE, remove.duplicate.reads = TRUE)
counts.matrix[,basename(file)] <- binned[[1]]$counts
}
dupcounts <- rowSums(dupcounts.matrix)
counts <- rowSums(counts.matrix)
ratio <- dupcounts / counts
ratio[is.na(ratio)] <- 1
pre.blacklist <- bins[[1]]
mcols(pre.blacklist) <- S4Vectors::DataFrame(ratio, dupcounts, counts)
return(pre.blacklist)
}
#' Merge Strand-seq libraries
#'
#' Merge strand libraries to generate a high-coverage Strand-seq library.
#'
#' @param files A character vector with files with aligned reads.
#' @inheritParams bam2GRanges
#' @inheritParams bed2GRanges
#' @return A \code{\link{GRanges-class}} object with reads.
# #' @examples
# #' files <- list.files('~/work_ERIBA/test/aneufinder_files/DH161028_WT', full.names = TRUE)
# #' reads <- mergeStrandseqFiles(files, assembly='hg38')
# #'
mergeStrandseqFiles <- function(files, assembly, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000) {
## Determine format
files.clean <- sub('\\.gz$','', files)
formats <- sapply(strsplit(files.clean, '\\.'), function(x) { rev(x)[1] })
datafiles <- files[formats %in% c('bam','bed')]
files.clean <- sub('\\.gz$','', datafiles)
formats <- sapply(strsplit(files.clean, '\\.'), function(x) { rev(x)[1] })
if (any(formats == 'bed') & is.null('assembly')) {
stop("Please specify 'assembly' if you have BED files in your inputfolder.")
}
### Get chromosome lengths ###
## Get first bam file
bamfile <- grep('bam$', datafiles, value=TRUE)[1]
if (!is.na(bamfile)) {
ptm <- startTimedMessage("Obtaining chromosome length information from file ", bamfile, " ...")
chrom.lengths <- GenomeInfoDb::seqlengths(Rsamtools::BamFile(bamfile))
stopTimedMessage(ptm)
} else {
## Read chromosome length information
if (is.character(assembly)) {
if (file.exists(assembly)) {
ptm <- startTimedMessage("Obtaining chromosome length information from file ", assembly, " ...")
df <- utils::read.table(assembly, sep='\t', header=TRUE)
stopTimedMessage(ptm)
} else {
ptm <- startTimedMessage("Obtaining chromosome length information from UCSC ...")
df.chroms <- GenomeInfoDb::getChromInfoFromUCSC(assembly)
## Get first bed file
bedfile <- grep('bed$|bed.gz$', datafiles, value=TRUE)[1]
if (!is.na(bedfile)) {
firstline <- read.table(bedfile, nrows=1)
df <- df.chroms[,c('chrom','size')]
if (grepl('^chr',firstline[1,1])) {
} else {
df$chrom = sub('^chr', '', df$chrom)
}
}
stopTimedMessage(ptm)
}
} else if (is.data.frame(assembly)) {
df <- assembly
} else {
stop("'assembly' must be either a data.frame with columns 'chromosome' and 'length' or a character specifying the assembly.")
}
chrom.lengths <- df[,2]
names(chrom.lengths) <- df[,1]
chrom.lengths <- chrom.lengths[!is.na(chrom.lengths) & !is.na(names(chrom.lengths))]
}
chrom.lengths.df <- data.frame(chromosome=names(chrom.lengths), length=chrom.lengths)
### Loop through files ###
reads.list <- GRangesList()
for (file in files) {
message("Importing file ", basename(file))
reads <- suppressMessages( binReads(file, assembly = chrom.lengths.df, reads.return = TRUE) )
reads.split <- split(reads, reads@seqnames)
numreads <- sapply(reads.split, function(x) { table(strand(x)) })
absreadratio <- exp(abs(log(numreads['+',] / numreads['-',])))
readratio <- numreads['+',] / numreads['-',]
which.keep <- absreadratio > 10
which.swap <- readratio < 0.1
## Swap strands
for (chrom in names(which.keep)) {
if (which.keep[chrom]) {
if (which.swap[chrom]) {
sreads <- reads.split[[chrom]]
mask <- strand(sreads) == '+'
strand(sreads)[mask] <- '-'
strand(sreads)[!mask] <- '+'
}
reads.split[[chrom]] <- sreads
} else {
reads.split[[chrom]] <- NULL
}
}
reads.list[[basename(file)]] <- unlist(reads.split, use.names = FALSE)
}
reads <- unlist(reads.list, use.names=FALSE)
reads <- sort(reads)
return(reads)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.