R/importReads.R
In chromstaR: Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

Documented in readBamFileAsGRanges readBedFileAsGRanges

#' Import BAM file into GRanges
#'
#' Import aligned reads from a BAM file into a \code{\link{GRanges-class}} object.
#'
#' @param bamfile A sorted BAM file.
#' @param bamindex BAM index file. Can be specified without the .bai ending. If the index file does not exist it will be created and a warning is issued.
#' @param chromosomes If only a subset of the chromosomes should be imported, specify them here.
#' @param pairedEndReads Set to \code{TRUE} if you have paired-end reads in your BAM files (not implemented for BED files).
#' @param remove.duplicate.reads A logical indicating whether or not duplicate reads should be removed.
#' @param min.mapq Minimum mapping quality when importing from BAM files. Set \code{min.mapq=0} to keep all reads.
#' @param max.fragment.width Maximum allowed fragment length. This is to filter out erroneously wrong fragments due to mapping errors of paired end reads.
#' @param blacklist A \code{\link{GRanges-class}} or a bed(.gz) file with blacklisted regions. Reads falling into those regions will be discarded.
#' @param what A character vector of fields that are returned. Uses the \code{Rsamtools::scanBamWhat} function. See \code{Rsamtools::ScanBamParam} to see what is available.
#' @return A \code{\link{GRanges-class}} object containing the reads.
#' @importFrom Rsamtools indexBam BamFile ScanBamParam scanBamFlag
#' @importFrom GenomicAlignments readGAlignmentPairs readGAlignments first
#' @importFrom S4Vectors queryHits
#' @export
#'
#'@examples
#'## Get an example BAM file with ChIP-seq reads
#'bamfile <- system.file("extdata", "euratrans", "lv-H3K4me3-BN-female-bio1-tech1.bam",
#'                       package="chromstaRData")
#'## Read the file into a GRanges object
#'reads <- readBamFileAsGRanges(bamfile, chromosomes='chr12', pairedEndReads=FALSE,
#'                     min.mapq=10, remove.duplicate.reads=TRUE)
#'print(reads)
#'
readBamFileAsGRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndReads=FALSE, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000, blacklist=NULL, what='mapq') {

    ## Input checks
    if (!is.null(blacklist)) {
        if ( !(is.character(blacklist) | is(blacklist,'GRanges')) ) {
            stop("'blacklist' has to be either a bed(.gz) file or a GRanges object")
        }
    }

    ## Check if bamindex exists
    bamindex.raw <- sub('\\.bai$', '', bamindex)
    bamindex <- paste0(bamindex.raw,'.bai')
    if (!file.exists(bamindex)) {
        ptm <- startTimedMessage("Making bam-index file ...")
        bamindex.own <- Rsamtools::indexBam(bamfile)
        # warning("Couldn't find BAM index-file ",bamindex,". Creating our own file ",bamindex.own," instead.")
        bamindex <- bamindex.own
        stopTimedMessage(ptm)
    }
    chrom.lengths <- GenomeInfoDb::seqlengths(Rsamtools::BamFile(bamfile))
    chroms.in.data <- names(chrom.lengths)
    if (is.null(chromosomes)) {
        chromosomes <- chroms.in.data
    }
    chroms2use <- intersect(chromosomes, chroms.in.data)
    ## Stop if none of the specified chromosomes exist
    if (length(chroms2use)==0) {
        chrstring <- paste0(chromosomes, collapse=', ')
        stop('The specified chromosomes ', chrstring, ' do not exist in the data.')
    }
    ## Issue warning for non-existent chromosomes
    diff <- setdiff(chromosomes, chroms.in.data)
    if (length(diff)>0) {
        diffs <- paste0(diff, collapse=', ')
        warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data.'))
    }

    ## Import the file into GRanges
    ptm <- startTimedMessage("Reading file ",basename(bamfile)," ...")
    gr <- GenomicRanges::GRanges(seqnames=chroms2use, ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use]))
    if (!remove.duplicate.reads) {
        if (pairedEndReads) {
            data.raw <- GenomicAlignments::readGAlignmentPairs(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what))
        } else {
            data.raw <- GenomicAlignments::readGAlignments(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what))
        }
    } else {
        if (pairedEndReads) {
            data.raw <- GenomicAlignments::readGAlignmentPairs(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what, flag=Rsamtools::scanBamFlag(isDuplicate=FALSE)))
        } else {
            data.raw <- GenomicAlignments::readGAlignments(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what, flag=Rsamtools::scanBamFlag(isDuplicate=FALSE)))
        }
    }
    stopTimedMessage(ptm)

    if (length(data.raw) == 0) {
        if (pairedEndReads) {
            stop(paste0("No reads imported. Does your file really contain paired end reads? Try with 'pairedEndReads=FALSE'"))
        }
        stop(paste0('No reads imported! Check your BAM-file ', bamfile))
    }
    
    ## Filter by mapping quality
    if (pairedEndReads) {
        ptm <- startTimedMessage("Converting to GRanges ...")
        data <- methods::as(data.raw, 'GRanges') # treat as one fragment
        stopTimedMessage(ptm)

        ptm <- startTimedMessage("Filtering reads (mapq >= ", min.mapq, ") ...")
        if (min.mapq>0) {
            mapq.first <- mcols(GenomicAlignments::first(data.raw))$mapq
            mapq.last <- mcols(GenomicAlignments::last(data.raw))$mapq
            mapq.mask <- mapq.first >= min.mapq & mapq.last >= min.mapq
            if (any(is.na(mapq.mask))) {
                warning(paste0(bamfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads."))
            }
            data <- data[which(mapq.mask)]
        }
        # Filter out too long fragments
        data <- data[width(data)<=max.fragment.width]
        stopTimedMessage(ptm)
    } else {
        ptm <- startTimedMessage("Converting to GRanges ...")
        data <- methods::as(data.raw, 'GRanges')
        stopTimedMessage(ptm)

        ptm <- startTimedMessage("Filtering reads (mapq >= ", min.mapq, ") ...")
        if (min.mapq>0) {
            if (any(is.na(mcols(data)$mapq))) {
                warning(paste0(bamfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads."))
                mcols(data)$mapq[is.na(mcols(data)$mapq)] <- -1
            }
            data <- data[mcols(data)$mapq >= min.mapq]
        }
        # Filter out too long fragments
        data <- data[width(data)<=max.fragment.width]
        stopTimedMessage(ptm)
    }

    if (length(data)==0) {
        stop("No reads present after filtering. Please lower your 'min.mapq'.")
    }

    ## Exclude reads falling into blacklisted regions
    if (!is.null(blacklist)) {
        ptm <- startTimedMessage("Filtering blacklisted regions ...")
        if (is.character(blacklist)) {
            if (grepl('^chr', seqlevels(data)[1])) {
                chromosome.format <- 'UCSC'
            } else {
                chromosome.format <- 'NCBI'
            }
            black <- readCustomBedFile(blacklist, skip=0, chromosome.format=chromosome.format)
        } else if (is(blacklist,'GRanges')) {
            black <- blacklist
        } else {
            stop("'blacklist' has to be either a bed(.gz) file or a GRanges object")
        }
        overlaps <- findOverlaps(data, black)
        idx <- setdiff(1:length(data), S4Vectors::queryHits(overlaps))
        data <- data[idx]
        stopTimedMessage(ptm)
    }
    
    ## Select only desired chromosomes
    ptm <- startTimedMessage("Subsetting chromosomes ...")
    data <- data[seqnames(data) %in% chroms2use]
    data <- keepSeqlevels(data, as.character(unique(seqnames(data))))
    ## Drop seqlevels where seqlength is NA
    na.seqlevels <- seqlevels(data)[is.na(seqlengths(data))]
    data <- data[seqnames(data) %in% seqlevels(data)[!is.na(seqlengths(data))]]
    data <- keepSeqlevels(data, as.character(unique(seqnames(data))))
    if (length(na.seqlevels) > 0) {
        warning("Dropped seqlevels because no length information was available: ", paste0(na.seqlevels, collapse=', '))
    }
    stopTimedMessage(ptm)

    if (length(data) == 0) {
        stop(paste0('No reads imported!'))
    }

    

    return(data)

}


#' Import BED file into GRanges
#'
#' Import aligned reads from a BED file into a \code{\link{GRanges-class}} object.
#'
#' @param bedfile A file with aligned reads in BED-6 format. The columns have to be c('chromosome','start','end','description','mapq','strand').
#' @param assembly Please see \code{\link[GenomeInfoDb]{getChromInfoFromUCSC}} for available assemblies. Only necessary when importing BED files. BAM files are handled automatically. Alternatively a data.frame with columns 'chromosome' and 'length'.
#' @param chromosomes If only a subset of the chromosomes should be imported, specify them here.
#' @param remove.duplicate.reads A logical indicating whether or not duplicate reads should be removed.
#' @param min.mapq Minimum mapping quality when importing from BAM files. Set \code{min.mapq=0} to keep all reads.
#' @param max.fragment.width Maximum allowed fragment length. This is to filter out erroneously wrong fragments.
#' @param blacklist A \code{\link{GRanges-class}} or a bed(.gz) file with blacklisted regions. Reads falling into those regions will be discarded.
#' @return A \code{\link{GRanges-class}} object containing the reads.
#' @importFrom utils read.table
#' @importFrom S4Vectors queryHits
#' @export
#'
#'@examples
#'## Get an example BED file with single-cell-sequencing reads
#'bedfile <- system.file("extdata", "liver-H3K4me3-BN-male-bio2-tech1.bed.gz",
#'                        package="chromstaRData")
#'## Read the file into a GRanges object
#'data(rn4_chrominfo)
#'reads <- readBedFileAsGRanges(bedfile, assembly=rn4_chrominfo, chromosomes='chr12',
#'                     min.mapq=10, remove.duplicate.reads=TRUE)
#'print(reads)
#'
readBedFileAsGRanges <- function(bedfile, assembly, chromosomes=NULL, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000, blacklist=NULL) {

    ## Input checks
    if (!is.null(blacklist)) {
        if ( !(is.character(blacklist) | is(blacklist,'GRanges')) ) {
            stop("'blacklist' has to be either a bed(.gz) file or a GRanges object")
        }
    }

    # File with reads, specify classes for faster import (0-based)
    ptm <- startTimedMessage("Reading file ",basename(bedfile)," ...")
    classes <- c('character','numeric','numeric','NULL','integer','character')
    # Determine if data has trackline
    firstline <- utils::read.table(bedfile, nrows = 1)
    if (grepl('^track', firstline[1,1])) {
        skip <- 1
    } else {
        skip <- 0
    }
    data.raw <- utils::read.table(bedfile, colClasses=classes, skip=skip)
    # Convert to GRanges object
    data <- GenomicRanges::GRanges(seqnames=data.raw[,1], ranges=IRanges(start=data.raw[,2]+1, end=data.raw[,3]), strand=data.raw[,5])    # start+1 to go from [0,x) -> [1,x]
    mcols(data)$mapq <- data.raw[,4]
    remove(data.raw)
    stopTimedMessage(ptm)
    ## Read chromosome length information
    if (is.character(assembly)) {
        if (file.exists(assembly)) {
            df <- utils::read.table(assembly, sep='\t', header=TRUE)
        } else {
            ptm <- startTimedMessage("Fetching chromosome lengths from UCSC ...")
            df.chroms <- GenomeInfoDb::getChromInfoFromUCSC(assembly)
            stopTimedMessage(ptm)
            df <- df.chroms[,c('chrom','size')]
            if (grepl('^chr',seqlevels(data)[1])) {
            } else {
                df$chrom = sub('^chr', '', df$chrom)
            }
        }
    } else if (is.data.frame(assembly)) {
        df <- assembly
    } else {
        stop("'assembly' must be either a data.frame with columns 'chromosome' and 'length' or a character specifying the assembly.")
    }
    chrom.lengths <- df[,2]
    names(chrom.lengths) <- df[,1]
    seqlengths(data) <- as.numeric(chrom.lengths[names(seqlengths(data))])

    chroms.in.data <- seqlevels(data)
    if (is.null(chromosomes)) {
        chromosomes <- chroms.in.data
    }
    chroms2use <- intersect(chromosomes, chroms.in.data)
    ## Stop if none of the specified chromosomes exist
    if (length(chroms2use)==0) {
        chrstring <- paste0(chromosomes, collapse=', ')
        stop('The specified chromosomes ', chrstring, ' do not exist in the data.')
    }
    ## Issue warning for non-existent chromosomes
    diff <- setdiff(chromosomes, chroms.in.data)
    if (length(diff)>0) {
        diffs <- paste0(diff, collapse=', ')
        warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data.'))
    }

    ## Select only desired chromosomes
    ptm <- startTimedMessage("Subsetting chromosomes ...")
    data <- data[seqnames(data) %in% chroms2use]
    data <- keepSeqlevels(data, as.character(unique(seqnames(data))))
    ## Drop seqlevels where seqlength is NA
    na.seqlevels <- seqlevels(data)[is.na(seqlengths(data))]
    data <- data[seqnames(data) %in% seqlevels(data)[!is.na(seqlengths(data))]]
    data <- keepSeqlevels(data, as.character(unique(seqnames(data))))
    if (length(na.seqlevels) > 0) {
        warning("Dropped seqlevels because no length information was available: ", paste0(na.seqlevels, collapse=', '))
    }
    stopTimedMessage(ptm)

    if (length(data) == 0) {
        stop(paste0('No reads imported!'))
    }

    ## Filter by mapping quality
    ptm <- startTimedMessage("Filtering reads (mapq >= ", min.mapq, ") ...")
    if (min.mapq>0) {
        if (any(is.na(mcols(data)$mapq))) {
            warning(paste0(bedfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads."))
            mcols(data)$mapq[is.na(mcols(data)$mapq)] <- -1
        }
        data <- data[mcols(data)$mapq >= min.mapq]
    }
    # Filter out too long fragments
    data <- data[width(data)<=max.fragment.width]
    stopTimedMessage(ptm)
    
    if (length(data)==0) {
        stop("No reads present after filtering. Please lower your 'min.mapq'.")
    }

    ## Exclude reads falling into blacklisted regions
    if (!is.null(blacklist)) {
        ptm <- startTimedMessage("Filtering blacklisted regions ...")
        if (is.character(blacklist)) {
            if (grepl('^chr', seqlevels(data)[1])) {
                chromosome.format <- 'UCSC'
            } else {
                chromosome.format <- 'NCBI'
            }
            black <- readCustomBedFile(blacklist, skip=0, chromosome.format=chromosome.format)
        } else if (is(blacklist,'GRanges')) {
            black <- blacklist
        } else {
            stop("'blacklist' has to be either a bed(.gz) file or a GRanges object")
        }
        overlaps <- findOverlaps(data, black)
        idx <- setdiff(1:length(data), S4Vectors::queryHits(overlaps))
        data <- data[idx]
        stopTimedMessage(ptm)
    }

    return(data)

}
Any scripts or data that you put into this service are public.
chromstaR documentation built on Nov. 8, 2020, 8:29 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
chromstaR
Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

R/importReads.R
In chromstaR: Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

Defines functions readBedFileAsGRanges readBamFileAsGRanges

Documented in readBamFileAsGRanges readBedFileAsGRanges

Try the chromstaR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

chromstaR Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

R/importReads.R In chromstaR: Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

Defines functions readBedFileAsGRanges readBamFileAsGRanges

Documented in readBamFileAsGRanges readBedFileAsGRanges

Try the chromstaR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

chromstaR
Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

R/importReads.R
In chromstaR: Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data