R/Coverage.R

Defines functions .bin_ranges .bin_df .internal_get_coverage_as_df getCoverageBins getCoverageRegions getCoverage_DF .cov_process_regions getCoverage

Documented in getCoverage getCoverageBins getCoverage_DF getCoverageRegions

#' Calls SpliceWiz's C++ function to retrieve coverage from a COV file
#'
#' This function returns an RLE / RLEList or data.frame
#' containing coverage data from the given COV file\cr\cr
#' COV files are generated by SpliceWiz's [processBAM] and [BAM2COV] functions.
#' It records alignment coverage for each nucleotide in the given BAM file.
#' It stores this data in "COV" format, which is an indexed BGZF-compressed
#' format specialised for the storage of unstranded and stranded alignment
#' coverage in RNA sequencing.\cr\cr
#' Unlike BigWig files, COV files store coverage for both positive and negative
#' strands.\cr\cr
#' These functions retrieves coverage data from the specified COV file. They
#' are computationally efficient as they utilise random-access to rapidly
#' search for the requested data from the COV file.\cr\cr
#'
#' @param file (Required) The file name of the COV file
#' @param seqname (Required for `getCoverage_DF`) A string denoting the
#'  chromosome name. If left blank in `getCoverage`, retrieves RLEList
#' containing coverage of the entire file.
#' @param start,end 1-based genomic coordinates. If `start = 0` and
#'   `end = 0`, will retrieve RLE of specified chromosome.
#' @param strand Either "*", "+", or "-"
#' @param strandMode The stranded-ness of the RNA-seq experiment. "unstranded"
#'   means that an unstranded protocol was used. Stranded protocols can be
#'   either "forward", where the first read is the same strand as the
#'   expressed transcript, or "reverse" where the second strand is the same
#'   strand as the expressed transcript.
#' @param regions A GRanges object for a set of regions to obtain mean / total
#'   coverage from the given COV file.
#' @param region In `getCoverageBins`, a single query region as a GRanges object
#' @param bins In `getCoverageBins`, the number of bins to divide the given
#'   `region`. If `bin_size` is given, overrides this parameter
#' @param bin_size In `getCoverageBins`, the number of nucleotides per bin
#' @return
#' For `getCoverage`: If seqname is left as "", returns an RLEList of the
#'   whole BAM file, with each RLE in the list containing coverage data for
#'   one chromosome. Otherwise, returns an RLE containing coverage data for
#'   the requested genomic region
#'
#' For `getCoverage_DF`: Returns a two-column data frame, with the first column
#' `coordinate` denoting genomic coordinate, and the second column `value`
#' containing the coverage depth for each coordinate nucleotide.
#'
#' For `getCoverageRegions`: Returns a GRanges object with an extra metacolumn:
#'   `cov_mean`, which gives the mean coverage of each of the given ranges.
#'
#' For `getCoverageBins`: Returns a GRanges object which spans the given
#'   `region`, divided by the number of `bins` or by width as given by
#'   `bin_size`. Mean coverage in each bin is calculated (returned by the
#'   `cov_mean` metadata column). This function is useful for retrieving
#'   coverage of a large region for visualisation, especially when the
#'   size of the region vastly exceeds the width of the figure.
#'
#' @examples
#' se <- SpliceWiz_example_NxtSE()
#'
#' cov_file <- covfile(se)[1]
#'
#' # Retrieve Coverage as RLE
#'
#' cov <- getCoverage(cov_file, seqname = "chrZ",
#'   start = 10000, end = 20000,
#'   strand = "*"
#' )
#'
#' # Retrieve Coverage as data.frame
#'
#' cov.df <- getCoverage_DF(cov_file, seqname = "chrZ",
#'   start = 10000, end = 20000,
#'   strand = "*"
#' )
#'
#' # Retrieve mean coverage of 100-nt window regions as defined
#' # in a GRanges object:
#'
#' gr <- GenomicRanges::GRanges(
#'     seqnames = "chrZ",
#'     ranges = IRanges::IRanges(
#'         start = seq(1, 99901, by = 100),
#'         end = seq(100, 100000, by = 100)
#'     ), strand = "-"
#' )
#'
#' gr.unstranded <- getCoverageRegions(cov_file,
#'     regions = gr,
#'     strandMode = "unstranded"
#' )
#'
#' gr.stranded <- getCoverageRegions(cov_file,
#'     regions = gr,
#'     strandMode = "reverse"
#' )
#'
#' # Retrieve binned coverage of a large region
#'
#' gr.fetch <- getCoverageBins(
#'     cov_file,
#'     region = GenomicRanges::GRanges(seqnames = "chrZ",
#'         ranges = IRanges::IRanges(start = 100, end = 100000),
#'         strand = "*"
#'     ),
#'     bins = 2000
#' )
#'
#' # Plot coverage using ggplot:
#'
#' require(ggplot2)
#'
#' ggplot(cov.df, aes(x = coordinate, y = value)) +
#'     geom_line() + theme_white
#'
#' ggplot(as.data.frame(gr.unstranded),
#'     aes(x = (start + end) / 2, y = cov_mean)) +
#'     geom_line() + theme_white
#'
#' ggplot(as.data.frame(gr.fetch), 
#'     aes(x = (start + end)/2, y = cov_mean)) +
#'     geom_line() + theme_white
#'
#' # Export COV data as BigWig
#'
#' cov_whole <- getCoverage(cov_file)
#' bw_file <- file.path(tempdir(), "sample.bw")
#' rtracklayer::export(cov_whole, bw_file, "bw")
#' @name Coverage
#' @md
NULL

#' @describeIn Coverage Retrieves alignment coverage as an RLE or RLElist
#' @export
getCoverage <- function(file, seqname = "", start = 0, end = 0,
        strand = c("*", "+", "-")) {
    strand <- match.arg(strand)
    if (!(strand %in% c("*", "+", "-"))) {
        .log(paste("In getCoverage(),",
            "Invalid strand. '*', '+' or '-'"))
    }
    strand_int <- ifelse(strand == "*", 2,
        ifelse(strand == "+", 1, 0))

    if (!is.numeric(start) || !is.numeric(end) ||
            (as.numeric(start) > as.numeric(end) & as.numeric(end) > 0)) {
        .log(paste("In getCoverage(),",
            "Zero or negative regions are not allowed"))
    }
    if (seqname == "") {
        raw_list <- c_RLEList_From_Cov(normalizePath(file), strand_int)
        final_list <- list()
        if (length(raw_list) > 0) {
            for (i in seq_len(length(raw_list))) {
                final_list[[i]] <- S4Vectors::Rle(
                    raw_list[[i]]$values, raw_list[[i]]$lengths
                )
            }
        } else {
            return(NULL)
        }
        final_RLE <- as(final_list, "RleList")
        names(final_RLE) <- names(raw_list)
        return(final_RLE)
    } else if (end == 0) {
        raw_RLE <- c_RLE_From_Cov(
            normalizePath(file), as.character(seqname),
            0, 0, strand_int
        )
        final_RLE <- S4Vectors::Rle(raw_RLE$values, raw_RLE$lengths)
    } else {
        raw_RLE <- c_RLE_From_Cov(
            normalizePath(file), as.character(seqname),
            round(as.numeric(start)), round(as.numeric(end)),
            strand_int
        )
        final_RLE <- S4Vectors::Rle(raw_RLE$values, raw_RLE$lengths)
    }
}

.cov_process_regions <- function(file, gr, seq, strand_gr, strand_cov) {
    # adds cov_mean from cov file to gr, only for given seqname seq
    # strand_gr and strand_cov are matching strand info for gr and cov
    if (!any(
        as.character(seqnames(gr)) %in% seq &
        as.character(strand(gr)) %in% strand_gr
    )) {
        return(gr)
    }

    todo <- which(
        as.character(seqnames(gr)) %in% seq &
        as.character(strand(gr)) %in% strand_gr
    )

    cov_data <- getCoverage(
        file, seq,
        min(start(gr[todo])) - 1,
        max(end(gr[todo])),
        strand_cov
    )

    if (is.null(gr$cov_mean)) gr$cov_mean <- 0
    gr$cov_mean[todo] <- round(
        aggregate(
            cov_data, IRanges(start(gr[todo]), end(gr[todo])), FUN = mean
        ), 2
    )

    return(gr)
}

#' @describeIn Coverage Retrieves alignment coverage as a data.frame
#' @export
getCoverage_DF <- function(file, seqname = "", start = 0, end = 0,
        strand = c("*", "+", "-")
) {
    if (seqname == "") .log("seqname must not be omitted in getCoverage_DF")
    cov <- getCoverage(file, seqname, start, end, strand)
    view <- IRanges::Views(cov, start + 1, end)
    view.df <- as.data.frame(view[[1]])
    return(data.frame(
        coordinate = seq(start + 1, end), value = view.df$value
    ))
}

#' @describeIn Coverage Retrieves total and mean coverage of a GRanges object
#' from a COV file
#' @export
getCoverageRegions <- function(file, regions,
        strandMode = c("unstranded", "forward", "reverse")
) {
    strandMode <- match.arg(strandMode)
    if (strandMode == "") strandMode <- "unstranded"

    if (!is(regions, "GRanges")) .log("regions must be a GRanges object")
    if (!isCOV(file)) .log("Given file is not of COV format")
    seqlevels <- c_Cov_Seqnames(normalizePath(file))

    # trim regions by available seqlevels
    if (!any(seqlevels %in% seqlevels(regions)))
        .log("COV file and given regions have incompatible seqnames")

    seqlevels(regions, pruning.mode = "coarse") <- seqlevels
    if (length(regions) == 0) {
        return(regions)
    }

    if (strandMode == "unstranded") {
        for (seq in unique(seqnames(regions))) {
            regions <- .cov_process_regions(file, regions, seq, 
                c("+", "-", "*"), "*")
        }
    } else if (strandMode == "forward") {
        for (seq in unique(seqnames(regions))) {
            regions <- .cov_process_regions(file, regions, seq, "*", "*")
            regions <- .cov_process_regions(file, regions, seq, "+", "+")
            regions <- .cov_process_regions(file, regions, seq, "-", "-")
        }
    } else {
        for (seq in unique(seqnames(regions))) {
            regions <- .cov_process_regions(file, regions, seq, "*", "*")
            regions <- .cov_process_regions(file, regions, seq, "-", "+")
            regions <- .cov_process_regions(file, regions, seq, "+", "-")
        }
    }

    return(regions)
}

#' @describeIn Coverage Retrieves coverage of a single region from a COV file,
#'   binned by the given number of bins or bin_size
#' @export
getCoverageBins <- function(file, region, bins = 2000, 
        strandMode = c("unstranded", "forward", "reverse"),
        bin_size
) {
    strandMode <- match.arg(strandMode)
    if (strandMode == "") strandMode <- "unstranded"

    if (!is(region, "GRanges")) .log("region must be a GRanges object")
    region <- region[1]
    
    if (!isCOV(file)) .log("Given file is not of COV format")
    seqlevels <- c_Cov_Seqnames(normalizePath(file))
    if(!(as.character(seqnames(region)) %in% seqlevels))
        .log("Given region is on a chromosome that is missing in COV file")

    if(!is.numeric(bins) || bins < 1) .log("`bins` must be a numeric value")

    if(missing(bin_size) || !is.numeric(bin_size) ||
            bin_size > width(region) || bin_size < 1) {
        bin_size <- ceiling(width(region) / bins)
    }

    if (strandMode == "unstranded") {
        strand <- "*"
    } else if (strandMode == "reverse") {
        if(strand(region) == "+") {
            strand <- "-"
        } else if(strand(region == "-")) {
            strand <- "+"
        } else {
            strand <- "*"
        }
    } else {
        strand <- as.character(strand(region))
    }

    df <- as.data.frame(.internal_get_coverage_as_df(
        "sample", file,
        as.character(seqnames(region)), 
        start(region), end(region), strand)
    )
    bin_gr <- .bin_ranges(df$x, binwidth = bin_size)
    bin_gr$seqnames <- as.character(GenomeInfoDb::seqnames(region))
    bin_gr$strand <- strand
    gr.fetch <- .grDT(bin_gr)

    df <- .bin_df(df, bin_size)
    gr.fetch$cov_mean <- df$sample

    return(gr.fetch)
}

################################# PLOT TRACKS INTERNALS ########################

.internal_get_coverage_as_df <- function(samples, files, seqname, start, end,
        strand = c("*", "+", "-")) {
    strand <- match.arg(strand)
    if (!(strand %in% c("*", "+", "-"))) {
        .log(paste("In getCoverage(),",
            "Invalid strand. '*', '+' or '-'"))
    }
    covData <- list()
    for (i in seq_len(length(files))) {
        cov <- getCoverage(files[i], seqname, start - 1, end, strand)
        view <- IRanges::Views(cov, start, end)
        view.df <- as.data.frame(view[[1]])
        covData[[i]] <- view.df
    }
    df <- do.call(cbind, covData)
    colnames(df) <- samples
    x <- seq(start, end)
    df <- cbind(x, df)
    return(df)
}

.bin_df <- function(df, binwidth = 3, exon_gr = NULL) {
    DT <- as.data.table(df)
    brks <- seq(1, 
        nrow(DT) + 1, 
        length.out = (nrow(DT) + 1) / binwidth
    )
    
    # Use single nucleotide resolution for exon bins
    if(!is.null(exon_gr)) {
        for(i in seq_len(length(exon_gr))) {
            brks <- c(brks, which(
                DT$x >= BiocGenerics::start(exon_gr[i]) &
                DT$x <= BiocGenerics::end(exon_gr[i])
            ))
        }
    }
    brks <- sort(unique(brks))
    
    bin <- NULL
    DT[, c("bin") := findInterval(seq_len(nrow(DT)), brks)]
    DT2 <- DT[, lapply(.SD, mean, na.rm = TRUE), by = "bin"]
    DT2[, c("bin") := NULL]
    return(as.data.frame(DT2))
}

.bin_ranges <- function(coords, binwidth = 3, exon_gr = NULL) {
    # coords is a vector of coordinates
    brks <- seq(1, 
        length(coords) + 1, 
        length.out = (length(coords) + 1) / binwidth
    )
    if(!is.null(exon_gr)) {
        for(i in seq_len(length(exon_gr))) {
            brks <- c(brks, which(
                coords >= BiocGenerics::start(exon_gr[i]) &
                coords <= BiocGenerics::end(exon_gr[i])
            ))
        }
    }
    brks <- sort(unique(brks))
    starts <- ceiling(brks)[-length(brks)]
    ends <- c(starts[-1] - 1, length(coords))
    return(data.frame(
        start = coords[starts],
        end = coords[ends]
    ))
}
alexchwong/SpliceWiz documentation built on Oct. 15, 2024, 10:12 a.m.