#' @description
#'
#' Utility function to group rows/columns in an nxn logical matrix which
#' have a `TRUE`. Note that this groups also rows/columns together that are not
#' direclty *linked* with a `TRUE`, but that are linked via other rows/columns
#' they have in common (i.e. if between row 2 and 4 is a `TRUE`, but also
#' between 3 and 4, all 3 of them are joined together, even if they are not
#' directly linked).
#'
#' @param x `logical` `matrix` with same number of rows and columns. See below
#' for examples.
#'
#' @return `list` of `integer` indexes of rows (columns) that are grouped.
#'
#' @noRd
#'
#' @author Johannes Rainer
#'
#' @examples
#'
#' ## Find groups of rows with a correlation higher than 0.9
#' x <- rbind(
#' c(1, 3, 2, 5),
#' c(2, 6, 4, 7),
#' c(1, 1, 3, 1),
#' c(1, 3, 3, 6),
#' c(0, 4, 3, 1))
#' xcor <- cor(t(x))
#'
#' .group_logic_matrix(xcor > 0.9)
#'
#' xcor <- matrix(FALSE, ncol = 13, nrow = 13)
#' for (i in 1:13)
#' xcor[i, i] <- TRUE
#' xcor[8, 6] <- TRUE
#' xcor[8, 7] <- TRUE
#' xcor[9, 7] <- TRUE
#' xcor[11, 7] <- TRUE
#' xcor[6, 8] <- TRUE
#' xcor[7, 8] <- TRUE
#' xcor[10, 8] <- TRUE
#' xcor[13, 8] <- TRUE
#' xcor[7, 9] <- TRUE
#' xcor[8, 10] <- TRUE
#' xcor[7, 11] <- TRUE
#' xcor[12, 11] <- TRUE
#' xcor[11, 12] <- TRUE
#' xcor[8, 13] <- TRUE
#' .group_logic_matrix(xcor)
#'
#' xcor <- matrix(FALSE, ncol = 10, nrow = 10)
#' for (i in seq_len(ncol(xcor))) {
#' xcor[i, i] <- TRUE
#' }
#' xcor[1, 4] <- TRUE
#' xcor[4, 1] <- TRUE
#' xcor[2, 8] <- TRUE
#' xcor[8, 2] <- TRUE
#' xcor[3, 9] <- TRUE
#' xcor[9, 3] <- TRUE
#' xcor[8, 9] <- TRUE
#' xcor[9, 8] <- TRUE
#' .group_logic_matrix(xcor)
.group_logic_matrix <- function(x) {
x <- unname(x)
nr <- nrow(x)
if (nr != ncol(x))
stop("'x' is supposed to be a symmetric matrix")
groups <- vector("list", nr)
idx <- which(x, arr.ind = TRUE)
is_in_group <- rep(FALSE, nr)
for (i in seq_len(nr)) {
if (is_in_group[i])
next
elms <- idx[idx[, 1] == i, 2]
if (length(elms) == 1) {
groups[[i]] <- i
is_in_group[i] <- TRUE
} else {
while(!all(is_in_group[elms])) {
is_in_group[elms] <- TRUE
## Get all rows containing these elements
elms <- unique(idx[idx[, 1] %in% elms, 2])
}
groups[[i]] <- elms
}
}
groups[lengths(groups) > 0]
}
#' @note this expects a `list` of `integer` were each index is only present
#' **once** and in addition no indices should be missing!
#'
#' @noRd
.index_list_to_factor <- function(x) {
len <- sum(lengths(x))
res <- integer(len)
for (i in seq_along(x))
res[x[[i]]] <- i
as.factor(res)
}
#' @title Group rows in a matrix based on their correlation
#'
#' @description
#'
#' The `groupByCorrelation` allows to group rows in a numeric matrix based on
#' their correlation with each other.
#'
#' Two types of groupings are available:
#'
#' - `inclusive = FALSE` (the default): the algorithm creates small groups of
#' highly correlated members, all of which have a correlation with each other
#' that are `>= threshold`. Note that with this algorithm, rows in `x` could
#' still have a correlation `>= threshold` with one or more elements of a
#' group they are not part of. See notes below for more information.
#' - `inclusive = TRUE`: the algorithm creates large groups containing rows that
#' have a correlation `>= threshold` with at least one element of that group.
#' For example, if row 1 and 3 have a correlation above the threshold and
#' rows 3 and 5 too (but correlation between 1 and 5 is below the threshold)
#' all 3 are grouped into the same group (i.e. rows 1, 3 **and** 5).
#'
#' Note that with parameter `f` it is also possible to pre-define groups of
#' rows that should be further sub-grouped based on correlation with each other.
#' In other words, if `f` is provided, correlations are calculated only between
#' rows with the same value in `f` and hence these pre-defined groups of rows
#' are further sub-grouped based on pairwise correlation. The returned `factor`
#' is then `f` with the additional subgroup appended (and separated with a
#' `"."`). See examples below.
#'
#' @note
#'
#' Implementation note of the grouping algorithm:
#'
#' - all correlations between rows in `x` which are `>= threshold` are
#' identified and sorted decreasingly.
#' - starting with the pair with the highest correlation groups are defined:
#' - if none of the two is in a group, both are put into the same new group.
#' - if one of the two is already in a group, the other is put into the same
#' group if **all** correlations of it to that group are `>= threshold`
#' (and are not `NA`).
#' - if both are already in the same group nothing is done.
#' - if both are in different groups: an element is put into the group of the
#' other if a) all correlations of it to members of the other's group
#' are not `NA` and `>= threshold` **and** b) the average correlation to the
#' other group is larger than the average correlation to its own group.
#'
#' This ensures that groups are defined in which all elements have a correlation
#' `>= threshold` with each other and the correlation between members of the
#' same group is maximized.
#'
#' @param x `numeric` `matrix` where rows should be grouped based on
#' correlation of their values across columns being larger than `threshold`.
#'
#' @param method `character(1)` with the method to be used for correlation. See
#' [corr()] for options.
#'
#' @param use `character(1)` defining which values should be used for the
#' correlation. See [corr()] for details.
#'
#' @param threshold `numeric(1)` defining the cut of value above which
#' rows are considered to be correlated and hence grouped.
#'
#' @param f optional vector of length equal to `nrow(x)` pre-defining groups
#' of rows in `x` that should be further sub-grouped. See description for
#' details.
#'
#' @param inclusive `logical(1)` whether a version of the grouping algorithm
#' should be used that leads to larger, more loosely correlated, groups. The
#' default is `inclusive = FALSE`. See description for more information.
#'
#' @return `factor` with same length than `nrow(x)` with the group each row
#' is assigned to.
#'
#' @author Johannes Rainer
#'
#' @importFrom stats cor median quantile residuals sd
#'
#' @importFrom MsFeatures groupSimilarityMatrix
#'
#' @family grouping operations
#'
#' @export
#'
#' @examples
#'
#' x <- rbind(
#' c(1, 3, 2, 5),
#' c(2, 6, 4, 7),
#' c(1, 1, 3, 1),
#' c(1, 3, 3, 6),
#' c(0, 4, 3, 1),
#' c(1, 4, 2, 6),
#' c(2, 8, 2, 12))
#'
#' ## define which rows have a high correlation with each other
#' groupByCorrelation(x)
#'
#' ## assuming we have some prior grouping of rows, further sub-group them
#' ## based on pairwise correlation.
#' f <- c(1, 2, 2, 1, 1, 2, 2)
#' groupByCorrelation(x, f = f)
groupByCorrelation <- function(x, method = "pearson",
use = "pairwise.complete.obs",
threshold = 0.9, f = NULL, inclusive = FALSE) {
if (length(threshold) > 1)
stop("'threshold' has to be of length 1")
if (!is.null(f)) {
if (length(f) != nrow(x))
stop("If 'f' is provided its length has to be equal to 'nrow(x)'")
if (!is.factor(f))
f <- factor(f, levels = unique(f))
fnew <- rep(NA_character_, length(f))
for (fg in levels(f)) {
idx <- which(f == fg)
idxl <- length(idx)
if (idxl > 1) {
cors <- cor(t(x[idx, ]), method = method, use = use)
if (inclusive) {
## Ensure diagonal matrix is TRUE so that even if some
## features have a correlation value of NA they are not
## dropped
cors <- cors >= threshold
cors[cbind(1:idxl, 1:idxl)] <- TRUE
fids <- .index_list_to_factor(.group_logic_matrix(cors))
} else
fids <- groupSimilarityMatrix(cors, threshold = threshold)
fnew[idx] <- paste0(fg, ".", MsFeatures:::.format_id(fids))
} else
fnew[idx] <- paste0(fg, ".001")
}
as.factor(fnew)
} else {
cors <- cor(t(x), method = method, use = use)
if (inclusive)
.index_list_to_factor(.group_logic_matrix(cors >= threshold))
else
as.factor(groupSimilarityMatrix(cors, threshold = threshold))
}
}
#' @title Sub-group allowing only single positive/polarity pairs per group
#'
#' @description
#'
#' Based on a grouping `f` and the polarity of each element `polarity`, the
#' function ensures each feature group to consist of only a single
#' positive/negative feature pair. Thus, each of the groups in `f` is further
#' sub-grouped into positive/negative feature pairs with the highest correlation
#' of feature values across samples. In the returned grouping no group will
#' have more than one feature of the same polarity.
#'
#' This function can be helpful for merging feature grouping results from
#' positive and negative polarity.
#'
#' @param f vector defining the initial grouping of elements (e.g. such as
#' returned by `groupByCorrelation`.
#'
#' @param polarity vector (same length than `f`) with the polarity for each
#' element (feature).
#'
#' @param fvals `numeric` `matrix` (number of rows matching `length(f)`) with
#' feature values across e.g. samples. Correlations will be calculated
#' between rows of this matrix.
#'
#' @return `factor` with the grouping, ensuring that each group consists of only
#' a single positive/negative pair.
#'
#' @author Johannes Rainer
#'
#' @family grouping operations
#'
#' @export
#'
#' @examples
#'
#' ## Define a simple matrix where the first 4 and the last 3 rows are highly
#' ## correlated with each other.
#' x <- rbind(
#' c(4, 3, 5, 1),
#' c(4, 2, 5, 1),
#' c(4, 3, 4, 1),
#' c(4, 3, 4, 1),
#' c(4, 4, 4, 9),
#' c(4, 4, 4, 9),
#' c(4, 4, 4, 9))
#'
#' ## Determin the expected grouping by correlation
#' grp <- groupByCorrelation(x)
#' grp
#'
#' ## Each second row represents however a feature measured in a different
#' ## polarity. From another grouping (e.g. based on EIC correlation) we know
#' ## however that none of the positive polarity features should be
#' ## grouped together. We apply now the function to further subgroup `grp`
#' ## keeping only single positive/negative pairs in each group of `grp`.
#' pol <- c("NEG", "POS", "NEG", "POS", "NEG", "POS", "NEG")
#'
#' groupToSinglePolarityPairs(grp, pol, x)
groupToSinglePolarityPairs <- function(f, polarity = rep(1, length(f)), fvals) {
if (missing(f) || missing(polarity) || missing(fvals))
stop("Parameters 'f', 'polarity' and 'fvals' are required")
if (length(f) != length(polarity))
stop("Length of 'f' and 'polarity' has to match")
if (length(f) != nrow(fvals))
stop("Number of rows of 'fvals' and length of 'f' has to match")
if (!is.factor(f))
f <- factor(f, levels = unique(f))
res <- rep(NA_character_, length(f))
for (fg in levels(f)) {
idx <- which(f == fg)
idxl <- length(idx)
if (idxl > 1) {
pols <- polarity[idx]
if (length(unique(pols)) > 1) {
cors <- cor(t(fvals[idx, ]), use = "pairwise.complete.obs",
method = "pearson")
cors[!upper.tri(cors)] <- NA
grpl <- list()
while(any(!is.na(cors))) {
idx_max <- which(cors == max(cors, na.rm = TRUE),
arr.ind = TRUE)[1, , drop = FALSE]
cors[idx_max] <- NA
idx_max <- as.numeric(idx_max)
## Only consider pairing if we have pos AND neg polarity
if (length(unique(pols[idx_max])) > 1) {
grpl[[(length(grpl) + 1)]] <- idx_max
cors[idx_max, ] <- NA
cors[, idx_max] <- NA
}
}
grps_tmp <- rep(NA_integer_, idxl)
for (i in seq_along(grpl))
grps_tmp[grpl[[i]]] <- i
nas <- is.na(grps_tmp)
if (any(nas))
grps_tmp[nas] <- seq((length(grpl) + 1),
length.out = sum(nas))
res[idx] <- paste0(fg, ".", grps_tmp)
} else
res[idx] <- paste0(fg, ".", seq_len(idxl))
} else
res[idx] <- paste0(fg, ".1")
}
as.factor(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.