#' @title Merge Sliding Windows using the Harmonic Mean P
#'
#' @description Merge overlapping windows using harmonic mean p-values from significance
#' testing
#'
#' @details
#' When using sliding windows to test for differential signal, overlapping
#' windows can be merged based on the significance of results.
#' `mergeByHMP()` merges overlapping windows using the asymptotically exact
#' harmonic mean p-value \link[harmonicmeanp]{p.hmp} from the individual,
#' window-level tests. This tests the Null Hypothesis that there is no
#' significance amongst the initial set of p-values, and returns a summarised
#' value which controls the FDR within a set of tests (Wilson, PNAS, 2019).
#' Multilevel testing across the set of results is currently implemented using
#' `p_adj_method = "fwer"`
#'
#' Given that the harmonic mean p-value is calculated from the inverse p-values,
#' these are used to provide a *weighted average* of expression and logFC values
#' in the returned object. Any weights provided in `w` are ignored for these
#' values as they are simple representative estimates.
#' The representative range returned in `keyval_range` corresponds to the window
#' with the lowest p-value.
#'
#' The total number of windows is also returned in the final object, with the
#' summarised values n_up and n_down indicating the number of windows with raw
#' p-values below the calculated harmonic mean p-value, and with the
#' corresponding direction of change.
#'
#' The column containing the harmonic mean p-values is returned as 'hmp'.
#' An additional column with adjusted hmp-values is returned with the suffix
#' '_*' added where the p-value adjustment method is added after the underscore.
#'
#' @param x GenomicRanges object
#' @param df data.frame with results of differential binding analysis performed
#' using a sliding window strategy. If not provided, the columns in the
#' `mcols()` element of `x` will be used
#' @param w vector of weights to applied when calculating harmonic mean p-values
#' @param logfc,pval,cpm Column names for the values holding window specific
#' estimates of change in binding (logfc), overall signal intensity (cpm) and
#' the significance from statistical testing (pval).
#' @param inc_cols (Optional) Character vector of any additional columns in
#' `df` to return. Values will correspond to the range in the `keyval_range`
#' column
#' @param p_adj_method One of `p.adjust.methods` or "fwer". If "fwer" is
#' specified the adjusted harmonic-mean p-value will be returned in a form
#' which strictly controls the experiment-wide FWER. Please see
#' vignette("harmonicmeanp") for more details
#' @param merge_within Merge any non-overlapping windows within this distance
#' @param ignore_strand Passed internally to \link[GenomicRanges]{reduce} and
#' \link[GenomicRanges]{findOverlaps}
#' @param min_win Only keep merged windows derived from at least this number
#' @param keyval Return the key-value range as the window associated with the
#' minimum p-value, or by merging the ranges from all windows with raw p-values
#' below the merged harmonic-mean p-value
#' @param hm_pre Prefix to add to the beginning of all HMP-derived columns
#' @param ... Not used
#'
#' @return
#' A GenomicRanges object with merged ranges from the original object along with
#' summarised or representative values from the relevant columns. The range
#' corresponding to a representative values is also returned as described above
#'
#' @examples
#' x <- GRanges(c("chr1:1-10", "chr1:6-15", "chr1:51-60"))
#' set.seed(1001)
#' df <- DataFrame(logFC = rnorm(3), logCPM = rnorm(3,8), p = rexp(3, 10))
#' mergeByHMP(x, df, pval = "p")
#' mcols(x) <- df
#' x
#' mergeByHMP(x, pval = "p", p_adj_method = "fwer")
#'
#' @name mergeByHMP
#' @rdname mergeByHMP-methods
#' @export
setGeneric("mergeByHMP", function(x, ...) standardGeneric("mergeByHMP"))
#' @importFrom S4Vectors DataFrame mcols mcols<- subjectHits
#' @importFrom dplyr group_by summarise arrange distinct left_join bind_rows
#' @importFrom dplyr across
#' @importFrom tidyselect all_of
#' @importFrom rlang := !! sym
#' @importFrom stats weighted.mean
#' @import GenomicRanges
#' @rdname mergeByHMP-methods
#' @export
setMethod(
"mergeByHMP",
signature = signature(x = "GenomicRanges"),
function(
x, df = NULL, w = NULL,
logfc = "logFC", pval = "P", cpm = "logCPM", inc_cols = NULL,
p_adj_method = "fdr", merge_within = 1L, ignore_strand = TRUE,
min_win = 1, keyval = c("min", "merged"), hm_pre = "hm", ...
){
## Checks & defining the key columns
if (is.null(df)) df <- mcols(x)
stopifnot(nrow(df) == length(x))
df <- as.data.frame(df)
df_cols <- colnames(df)
logfc <- match.arg(logfc, df_cols)
pcol <- match.arg(pval, df_cols, several.ok = TRUE)
cpm <- match.arg(cpm, df_cols)
p_adj_method <- match.arg(p_adj_method, c(p.adjust.methods, "fwer"))
keyval <- match.arg(keyval)
if (is.null(w)) {
## Need to sum \leq one using the HMP algorithms
df[["weights"]] <- 1 / nrow(df)
} else {
df[["weights"]] <- w / sum(w)
}
## Define the columns to return
ret_cols <- c("keyval_range", cpm, logfc, pcol)
if (!is.null(inc_cols)) {
inc_cols <- vapply(
inc_cols, match.arg, character(1), choices = df_cols
)
ret_cols <- unique(c(inc_cols, ret_cols))
}
## Always return the columns counting the total, up & down windows
n_cols <- c("n_windows", "n_up", "n_down")
if ("index" %in% inc_cols)
warning("The column named 'index' will not be returned")
## Merge the ranges and get the map back to the original windows
ranges_out <- GenomicRanges::reduce(
x, min.gapwidth = merge_within, ignore.strand = ignore_strand
)
ol <- findOverlaps(x, ranges_out, ignore.strand = ignore_strand)
stopifnot(length(ol) == length(x))
## Summarise the key values
df[["subjectHits"]] <- subjectHits(ol)
grp_df <- group_by(df, subjectHits)
ref_p <- pcol[[1]]
ref_hmp <- paste0(hm_pre, ref_p)
ret_df <- summarise(
grp_df,
n_windows = dplyr::n(),
across(
all_of(c(cpm, logfc)), \(x) weighted.mean(x, 1 / !!sym(ref_p))
),
across(
all_of(pcol), \(x) .ec_HMP(x, !!sym("weights")),
.names = "{hm_pre}{.col}"
),
n_up = sum(!!sym(logfc) > 0 & !!sym(ref_p) < !!sym(ref_hmp)),
n_down = sum(!!sym(logfc) < 0 & !!sym(ref_p) < !!sym(ref_hmp))
)
## Replace the 'pval' column in the return columns with 'hmp'
new_cols <- paste0(hm_pre, ret_cols[ret_cols %in% pcol])
ret_cols[ret_cols %in% pcol] <- new_cols
## Remaining columns, including the keyval range are based on the
## minimal p-value. Form a separate df with these columns, then merge
inc_df <- as.data.frame(df[c(pcol, inc_cols)])
inc_df[["keyval_range"]] <- as.character(x)
inc_df[["subjectHits"]] <- subjectHits(ol)
inc_df <- arrange(inc_df, !!sym("subjectHits"), !!sym(ref_p))
inc_df <- distinct(inc_df, !!sym("subjectHits"), .keep_all = TRUE)
inc_df <- inc_df[!names(inc_df) %in% pcol]
## If keyval = "merged" (as opposed to "min")
if (keyval == "merged") { ## Reform the keyvalue range by merging
df$range <- as.character(x)
## If a blank prefix has been passed ref_p & ref_hmp will be the same &
## the subsequent left_join will add suffixes causing a failure
df <- left_join(
df, ret_df[c("subjectHits", new_cols)], by = "subjectHits",
suffix = c("_raw", "_hmp")
)
if (ref_p == ref_hmp) {
ref_p <- paste0(ref_p, "_raw")
ref_hmp <- paste0(ref_hmp, "_hmp")
}
df <- dplyr::filter(
df,
(!!sym(ref_p) <= !!sym(ref_hmp) | !!sym(ref_p) == min(!!sym(ref_p))),
.by = !!sym("subjectHits")
)
kv_gr <- granges(colToRanges(df, "range"))
kv_grl <- splitAsList(kv_gr, df$subjectHits)
kv_grl <- range(kv_grl)
kv_gr <- unlist(kv_grl)
mcols(kv_gr)[["subjectHits"]] <- as.integer(names(kv_gr))
names(kv_gr) <- NULL
kv_df <- as_tibble(kv_gr, name = "keyval_range")
inc_df <- inc_df[names(inc_df) != "keyval_range"]
inc_df <- left_join(
inc_df, kv_df, by = "subjectHits", relationship = "one-to-one"
)
}
## Merge the two, select the final columns, adjust p & return
ret_df <- left_join(
ret_df, inc_df, by = "subjectHits", relationship = "one-to-one"
)
ret_df <- ret_df[c(n_cols, ret_cols)]
if (p_adj_method == "fwer") {
## Apply the FWER adjusted version if requested
L <- length(x)
adj_df <- summarise(
grp_df,
across(
all_of(pcol), \(x) .ec_HMP_adj(x, !!sym("weights"), L),
.names = "{hm_pre}{.col}_fwer"
), .groups = "drop"
)
## This could be revisited for better integration with filtering
stopifnot(nrow(ret_df) == nrow(adj_df))
ret_df <- bind_cols(ret_df, adj_df)
}
## Apply the filter based on window size
stopifnot(is.numeric(min_win))
keep_win <- ret_df$n_windows >= min_win
ranges_out <- ranges_out[keep_win]
ret_df <- ret_df[keep_win,]
## Adjust p-values using p.adj.methods (after window size filtering)
if (p_adj_method != "fwer") {
## This can now create multiple p-adjust columns.
ret_df <- mutate(
ret_df,
across(
all_of(new_cols), \(x) p.adjust(x, method = p_adj_method),
.names = "{.col}_{p_adj_method}"
)
)
}
## Add to mcols to the ranges
mcols(ranges_out) <- as.data.frame(ret_df)
ranges_out$keyval_range <- GRanges(
ranges_out$keyval_range, seqinfo = seqinfo(ranges_out)
)
ranges_out
}
)
#' @rdname mergeByHMP-methods
#' @export
setMethod(
"mergeByHMP",
signature = signature(x = "RangedSummarizedExperiment"),
function(
x, df = NULL, w = NULL,
logfc = "logFC", pval = "P", cpm = "logCPM", inc_cols = NULL,
p_adj_method = "fdr", merge_within = 1L, ignore_strand = FALSE,
hm_pre = "hm", ...
) {
gr <- rowRanges(x)
mergeByHMP(
gr, df, w, logfc, pval, cpm, inc_cols, p_adj_method, merge_within,
ignore_strand, hm_pre = hm_pre, ...
)
}
)
# This is a modified version of harmonicmeanp::p.hmp developed by Prof Daniel
# Wilson, and hardwired to simply return a combined asymptotically exact HMP.
# Hardwiring like this gives a 10-fold speed-up. Further modifications may be
# possible, but this seems enough for now
# @param p vector of p-values
# @param w vector of weights
#' @useDynLib extraChIPs, .registration = TRUE
#' @keywords internal
.ec_HMP <- function(p, w) {
n <- length(p)
hmp <- sum(w) / sum(w / p)
loc <- log(n) + 1 + digamma(1) - log(2/pi)
dbl <- double(1)
# cout <- .C(
# "RtailsMSS", 1, 0, 1, loc, log(pi/2), 1.0, 1/hmp,
# out1 = dbl, out2 = dbl, out3 = dbl, out4 = dbl,
# out5 = dbl, out6 = dbl, COPY = rep(c(FALSE, TRUE), c(7, 6)),
# PACKAGE = "FMStable"
# )
# Still need to figure out if all of those outputs above can be dropped
Rcout <- .C(
"my_RtailsMSS", loc, 1/hmp, d = dbl, logd = dbl, `F` = dbl, logF = dbl,
cF = dbl, logcF = dbl,
COPY = rep(c(FALSE, TRUE), c(2, 6)), PACKAGE = "extraChIPs"
)
Rcout$cF
}
# Similar to the above, this produces the FWER-controlled version in a
# streamlined way
# @param p vector of p-values
# @param w vector of weights
# @param L Number of global tests
#' @useDynLib extraChIPs, .registration = TRUE
#' @keywords internal
.ec_HMP_adj <- function(p, w, L) {
hmp <- sum(w) / sum(w / p)
w.sum <- sum(w)
loc <- log(L[[1]]) + 1 + digamma(1) - log(2/pi)
dbl <- double(1)
Rcout <- .C(
"my_RtailsMSS", loc, w.sum/hmp, d = dbl, logd = dbl, `F` = dbl,
logF = dbl, cF = dbl, logcF = dbl,
COPY = rep(c(FALSE, TRUE), c(2, 6)), PACKAGE = "extraChIPs"
)
Rcout$cF
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.