R/hotspotter.R
In breakpointR: Find breakpoints in Strand-seq data

Documented in hotspotter

#' Find hotspots of genomic events
#'
#' Find hotspots of genomic events by using kernel \link{density} estimation.
#'
#' The hotspotter uses \code{\link[stats]{density}} to perform a KDE. A p-value is calculated by comparing the density profile of the genomic events with the density profile of a randomly subsampled set of genomic events. Due to this random sampling, the result can vary for each function call, most likely for hotspots whose p-value is close to the specified \code{pval}.
#' 
#' @param gr.list A list or \code{\link{GRangesList-class}} with \code{\link{GRanges-class}} object containing the coordinates of the genomic events.
#' @param bw Bandwidth used for kernel density estimation (see \code{\link[stats]{density}}).
#' @param pval P-value cutoff for hotspots.
#' @return A \code{\link{GRanges-class}} object containing coordinates of hotspots with p-values.
#' @importFrom stats density runif ecdf
#' @importFrom S4Vectors endoapply
#' @author Aaron Taudt
#' @export
#' @examples 
#'## Get example BreakPoint objects
#'exampleFolder <- system.file("extdata", "example_results", package="breakpointRdata")
#'exampleFiles <- list.files(exampleFolder, full.names=TRUE)
#'breakpoint.objects <- loadFromFiles(exampleFiles)
#'## Extract breakpoint coordinates
#'breaks <- lapply(breakpoint.objects, '[[', 'breaks')
#'## Get hotspot coordinates
#'hotspots <- hotspotter(gr.list=breaks, bw=1e6)
#'
hotspotter <- function(gr.list, bw, pval=1e-8) {

    ptm <- startTimedMessage("Searching for breakpoint hotspots ...")
    #set.seed(0) # fix seed for random permutations of bootstrapping
  
    ## Coerce into one GRanges
    names(gr.list) <- NULL
    gr <- do.call(c, gr.list)
    gr <- GenomicRanges::sort(gr)
  
    ## Iterate over chromosomes and calculate p-values
    pranges.list <- GenomicRanges::GRangesList()
    for (chrom in seqlevels(gr)) {
        grc <- gr[seqnames(gr)==chrom]
        if (length(grc)>1) {
            midpoints <- (start(grc)+end(grc))/2
            kde <- stats::density(midpoints,bw=bw,kernel='gaussian')
            # Random distribution of genomic events
            kde.densities <- numeric()
            for (i1 in seq_len(100)) {
                midpoints.r <- round(stats::runif(length(midpoints),1,seqlengths(gr)[chrom]))
                kde.r <- stats::density(midpoints.r,bw=bw,kernel='gaussian')
                kde.densities <- c(kde.densities, kde.r$y)
            }
            # Use ecdf to calculate p-values 
            p <- 1-stats::ecdf(kde.densities)(kde$y)
            pvalues <- data.frame(chromosome=chrom,start=kde$x,pvalue=p)
            # Make GRanges
            pvalues$end <- pvalues$start
            pvalues$chromosome <- factor(pvalues$chromosome, levels=seqlevels(gr))
            pvalues <- as(pvalues,'GRanges')
            seqlevels(pvalues) <- seqlevels(gr)
            suppressWarnings(
                seqlengths(pvalues) <- seqlengths(gr)[names(seqlengths(pvalues))]
            )
            # Resize from pointsize to bandwidth
            suppressWarnings(
                pvalues <- GenomicRanges::resize(pvalues, width=bw, fix='center')
            )
            pvalues <- trim(pvalues)
            ## Find regions where p-value is below specification
            mask <- pvalues$pvalue <= pval
            rle.pvals <- rle(mask)
            rle.pvals$values <- cumsum(rle.pvals$values+1)
            pvalues$group <- inverse.rle(rle.pvals)
            if (length(which(mask))>0) {
                pvalues.split <- split(pvalues[mask],pvalues$group[mask])
                pranges <- unlist(endoapply(pvalues.split, function(x) { y <- x[1]; end(y) <- end(x)[length(x)]; y$pvalue <- min(x$pvalue); return(y) }))
                pranges$group <- NULL
                pranges$num.events <- GenomicRanges::countOverlaps(pranges,grc)
                pranges.list[[chrom]] <- pranges
            }
        }
    }
    pranges <- unlist(pranges.list, use.names=FALSE)
    names(pranges) <- NULL

    stopTimedMessage(ptm)
    return(pranges)
}