R/clusterWindows.R
In csaw: ChIP-Seq Analysis with Windows

Documented in clusterWindows

#' Cluster DB windows into clusters
#' 
#' Clusters significant windows into clusters while controlling the cluster-level FDR.
#' 
#' @param ranges A \linkS4class{GRanges} or \linkS4class{RangedSummarizedExperiment} object containing window coordinates.
#' @param tab A data.frame of results with a \code{PValue} field for each window.
#' @param target A numeric scalar indicating the desired cluster-level FDR.
#' @param pval.col A string or integer scalar specifying the column of \code{tab} with the p-values.
#' Defaults to \code{"PValue"}.
#' @param fc.col A string or integer scalar specifying the column of \code{tab} with the log-fold changes.
#' Defaults to \code{"logFC"}.
#' @param signs A logical scalar indicating whether the sign of the log-fold change (specified by \code{fc.col}) should be used in \code{\link{mergeWindows}}.
#' @param tol,... Arguments to be passed to \code{\link{mergeWindows}}.
#' @param weights,grid.length,iterations Arguments to be passed to \code{\link{controlClusterFDR}}.
#' 
#' @return
#' A named list containing:
#' \itemize{
#' \item \code{ids}, an integer vector of cluster IDs for each window in \code{ranges}.
#' Non-significant windows that were not used to construct \code{regions} are marked with \code{NA} values.
#' \item \code{regions}, a \link{GRanges} containing the coordinates for each cluster.
#' \item \code{FDR}, a numeric scalar containing the estimate of the cluster-level FDR for the returned regions.
#' \item \code{threshold}, a numeric scalar containing the per-window FDR threshold used to identify significant windows.
#' \item \code{stats}, a \linkS4class{DataFrame} containing some descriptive per-cluster statistics.
#' }
#' 
#' @details
#' In this function, windows are identified as significantly DB based on the BH-adjusted p-values in \code{tab}.
#' Only those windows are then used directly for clustering via \code{\link{mergeWindows}},
#' which subsequently yields DB regions consisting solely of DB windows.
#' If \code{tol} is not specified, it is set to 100 bp by default and a warning is raised.
#' If \code{fc.col} is used to specify the column of log-fold changes, clusters are formed according to the sign of the log-fold change in \code{\link{mergeWindows}}.
#' 
#' DB-based clustering is obviously not blind to the DB status, so standard methods for FDR control are not valid.
#' Instead, post-hoc control of the cluster-level FDR is applied by using \code{\link{controlClusterFDR}},
#' which attempts to control the cluster-level FDR at \code{target} (which is set to 0.05 if not specified).
#' Our aim here is to provide some interpretable results when DB-blind clustering is not appropriate, e.g., for diffuse marks involving long stretches of the genome.
#' Reporting each marked stretch in its entirety would be cumbersome, so this method allows the DB subintervals to be identified directly.
#' 
#' The output \code{stats} DataFrame is generated by running \code{\link{combineTests}} on the \code{ids} and \code{tab} for only the significant windows.
#' Here, the \code{fc.threshold} argument is set to the p-value threshold used to identify significant windows.
#' We also remove the \code{FDR} field from the output as this has little meaning when the clusters are not blind to the clustering.
#' Indeed, the p-value is only retained for purposes of ranking.
#'
#' @examples
#' set.seed(10)
#' x <- round(runif(100, 100, 1000))
#' gr <- GRanges("chrA", IRanges(x, x+5))
#' tab <- data.frame(PValue=rbeta(length(x), 1, 50), logFC=rnorm(length(x)))
#' 
#' clusterWindows(gr, tab, target=0.05, tol=10)
#' clusterWindows(gr, tab, target=0.05, tol=10, fc.col="logFC")
#' 
#' @seealso
#' \code{\link{mergeWindows}},
#' \code{\link{controlClusterFDR}}
#' 
#' @author Aaron Lun
#' 
#' @keywords clustering
#'
#' @export
clusterWindows <- function(ranges, tab, target, pval.col=NULL, fc.col=NULL, signs=FALSE, tol, 
    ..., weights=NULL, grid.length=21, iterations=4)
{
    ranges <- .toGRanges(ranges)
    if (nrow(tab)!=length(ranges)) { 
        stop("number of ranges is not consistent with entries in 'tab'") 
    }
    if (missing(tol)) {
        tol <- 100
        warning("'tol' for 'mergeWindows' set to a default of 100 bp")
    }
    if (missing(target)) { 
        target <- 0.05
        warning("unspecified 'target' for the cluster-level FDR set to 0.05")
    }

    # Computing a frequency-weighted adjusted p-value.
    pval.col <- .getPValCol(pval.col, tab)
    if (is.null(weights)) { 
        weights <- rep(1, nrow(tab)) 
    }
    adjp <- .weightedFDR(tab[,pval.col], weights)

    # Getting the sign.
    fc.col <- .parseFCcol(fc.col, tab) 
    if (isFALSE(signs)) { 
        signs <- NULL
    } else {
        signs <- tab[,fc.col] > 0
    }

    # Controlling the cluster-level FDR
    FUN <- function(sig) { 
        mergeWindows(ranges[sig], tol=tol, signs=signs[sig], ...) 
    }
    out <- controlClusterFDR(target=target, adjp=adjp, 
        FUN=function(sig) { FUN(sig)$ids }, 
        weights=weights, grid.length=grid.length, iterations=iterations)
    sig <- adjp <= out$threshold
    clusters <- FUN(sig)

    # Creating the output object.
    full.ids <- rep(NA_integer_, nrow(tab))
    full.ids[sig] <- clusters$ids
    clusters$ids <- full.ids
    clusters$FDR <- out$FDR
    clusters$threshold <- out$threshold

    clusters$stats <- combineTests(clusters$ids[sig], tab[sig,,drop=FALSE], 
        weights=weights[sig], pval.col=pval.col, fc.col=fc.col, 
        fc.threshold=out$threshold)
    clusters$stats$FDR <- NULL
    clusters$stats$rep.test <- which(sig)[clusters$stats$rep.test]

    clusters
}

.weightedFDR <- function(p, w) {
    if (length(p)!=length(w)) { stop("weights and p-value vector are not of same length") }
    o <- order(p)
    p <- p[o]
    w <- w[o]
    adjp <- numeric(length(o))
    adjp[o] <- rev(cummin(rev(sum(w)*p/cumsum(w))))
    pmin(adjp, 1)
}