R/blacklist.R
In AneuFinder: Analysis of Copy Number Variation in Single-Cell-Sequencing Data

Documented in blacklist mergeStrandseqFiles

#' Make a blacklist for genomic regions
#' 
#' Produce a blacklist of genomic regions with a high ratio of duplicate to unique reads. This blacklist can be used to exclude reads for analysis in \code{\link{Aneufinder}}, \code{\link{bam2GRanges}} and \code{\link{bed2GRanges}}. This function produces a pre-blacklist which has to manually be filtered with a sensible cutoff. See the examples section for details.
#' 
#' @param files A character vector of either BAM or BED files.
#' @param bins A list with one \code{\link{GRanges-class}} with binned read counts generated by \code{\link{fixedWidthBins}}.
#' @inheritParams bed2GRanges
#' @inheritParams binReads
#' @return A \code{\link{GRanges-class}} with the same coordinates as \code{bins} with metadata columns ratio, duplicated counts and deduplicated counts.
#' @importFrom S4Vectors DataFrame
#' @export
#'@examples
#'## Get an example BAM file with single-cell-sequencing reads
#'bamfile <- system.file("extdata", "BB150803_IV_074.bam", package="AneuFinderData")
#'## Prepare the blacklist
#'bins <- fixedWidthBins(assembly='mm10', binsizes=1e6, chromosome.format='NCBI')
#'pre.blacklist <- blacklist(bamfile, bins=bins)
#'## Plot a histogram to decide on a sensible cutoff
#'qplot(pre.blacklist$ratio, binwidth=0.1)
#'## Make the blacklist with cutoff = 1.9
#'blacklist <- pre.blacklist[pre.blacklist$ratio > 1.9]
blacklist <- function(files, assembly, bins, min.mapq=10, pairedEndReads=FALSE) {
	
	## Input checks ##
	if (length(bins) > 1 & is.list(bins)) {
		stop("argument 'bins' must only contain one element")
	}
	if (class(bins) == 'GRanges') {
		bins <- list(bins)
		names(bins) <- width(bins[[1]])[1]
	}
	## Bin the reads with and without duplicates kept
	dupcounts.matrix <- array(dim=c(length(bins[[1]]), length(files)), dimnames=list(bin=1:length(bins[[1]]), file=basename(files)))
	counts.matrix <- dupcounts.matrix
	for (file in files) {
		dup.binned <- binReads(file, assembly = assembly, min.mapq = min.mapq, pairedEndReads = pairedEndReads, binsizes = NULL, bins = bins, chromosomes = seqlevels(bins[[1]]), calc.complexity = FALSE, remove.duplicate.reads = FALSE)
		dupcounts.matrix[,basename(file)] <- dup.binned[[1]]$counts
		binned <- binReads(file, assembly = assembly, min.mapq = min.mapq, pairedEndReads = pairedEndReads, binsizes = NULL, bins = bins, chromosomes = seqlevels(bins[[1]]), calc.complexity = FALSE, remove.duplicate.reads = TRUE)
		counts.matrix[,basename(file)] <- binned[[1]]$counts
	}
	dupcounts <- rowSums(dupcounts.matrix)
	counts <- rowSums(counts.matrix)
	ratio <- dupcounts / counts
	ratio[is.na(ratio)] <- 1
	
	pre.blacklist <- bins[[1]]
	mcols(pre.blacklist) <- S4Vectors::DataFrame(ratio, dupcounts, counts)
	
	return(pre.blacklist)
	
}


#' Merge Strand-seq libraries
#' 
#' Merge strand libraries to generate a high-coverage Strand-seq library.
#' 
#' @param files A character vector with files with aligned reads.
#' @inheritParams bam2GRanges
#' @inheritParams bed2GRanges
#' @return A \code{\link{GRanges-class}} object with reads.
# #' @examples
# #' files <- list.files('~/work_ERIBA/test/aneufinder_files/DH161028_WT', full.names = TRUE)
# #' reads <- mergeStrandseqFiles(files, assembly='hg38')
# #'
mergeStrandseqFiles <- function(files, assembly, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000) {
  
  	## Determine format
    files.clean <- sub('\\.gz$','', files)
    formats <- sapply(strsplit(files.clean, '\\.'), function(x) { rev(x)[1] })
    datafiles <- files[formats %in% c('bam','bed')]
    files.clean <- sub('\\.gz$','', datafiles)
    formats <- sapply(strsplit(files.clean, '\\.'), function(x) { rev(x)[1] })
    if (any(formats == 'bed') & is.null('assembly')) {
    	stop("Please specify 'assembly' if you have BED files in your inputfolder.")
    }

    ### Get chromosome lengths ###
    ## Get first bam file
    bamfile <- grep('bam$', datafiles, value=TRUE)[1]
    if (!is.na(bamfile)) {
        ptm <- startTimedMessage("Obtaining chromosome length information from file ", bamfile, " ...")
        chrom.lengths <- GenomeInfoDb::seqlengths(Rsamtools::BamFile(bamfile))
        stopTimedMessage(ptm)
    } else {
        ## Read chromosome length information
        if (is.character(assembly)) {
            if (file.exists(assembly)) {
                ptm <- startTimedMessage("Obtaining chromosome length information from file ", assembly, " ...")
                df <- utils::read.table(assembly, sep='\t', header=TRUE)
                stopTimedMessage(ptm)
            } else {
                ptm <- startTimedMessage("Obtaining chromosome length information from UCSC ...")
                df.chroms <- GenomeInfoDb::getChromInfoFromUCSC(assembly)
                ## Get first bed file
                bedfile <- grep('bed$|bed.gz$', datafiles, value=TRUE)[1]
                if (!is.na(bedfile)) {
                    firstline <- read.table(bedfile, nrows=1)
                    df <- df.chroms[,c('chrom','size')]
                    if (grepl('^chr',firstline[1,1])) {
                    } else {
                        df$chrom = sub('^chr', '', df$chrom)
                    }
                }
                stopTimedMessage(ptm)
            }
        } else if (is.data.frame(assembly)) {
            df <- assembly
        } else {
            stop("'assembly' must be either a data.frame with columns 'chromosome' and 'length' or a character specifying the assembly.")
        }
        chrom.lengths <- df[,2]
        names(chrom.lengths) <- df[,1]
        chrom.lengths <- chrom.lengths[!is.na(chrom.lengths) & !is.na(names(chrom.lengths))]
    }
    chrom.lengths.df <- data.frame(chromosome=names(chrom.lengths), length=chrom.lengths)
    
    ### Loop through files ###
    reads.list <- GRangesList()
    for (file in files) {
        message("Importing file ", basename(file))
        reads <- suppressMessages( binReads(file, assembly = chrom.lengths.df, reads.return = TRUE) )
        reads.split <- split(reads, reads@seqnames)
        numreads <- sapply(reads.split, function(x) { table(strand(x)) })
        absreadratio <- exp(abs(log(numreads['+',] / numreads['-',])))
        readratio <- numreads['+',] / numreads['-',]
        which.keep <- absreadratio > 10
        which.swap <- readratio < 0.1
        ## Swap strands
        for (chrom in names(which.keep)) {
            if (which.keep[chrom]) {
                if (which.swap[chrom]) {
                    sreads <- reads.split[[chrom]]
                    mask <- strand(sreads) == '+'
                    strand(sreads)[mask] <- '-'
                    strand(sreads)[!mask] <- '+'
                }
                reads.split[[chrom]] <- sreads
            } else {
                reads.split[[chrom]] <- NULL
            }
        }
        reads.list[[basename(file)]] <- unlist(reads.split, use.names = FALSE)
    }
    reads <- unlist(reads.list, use.names=FALSE)
    reads <- sort(reads)
  
    return(reads)
  
}