# mistyR utility functions
# Copyleft (ɔ) 2020-2021 Jovan Tanevski <jovan.tanevski@uni-heidelberg.de>
#' Aggregate collected results
#'
#' Helper function
#'
#' @param improvements collected improvements.
#' @param contributions collected contributions.
#' @param importances collected importances.
#'
#' @return a list of improvement stats, contribution stats and
#' aggregated importances.
#'
#' @seealso \code{\link{collect_results}()} to collect results.
#'
#' @noRd
aggregate_results <- function(improvements, contributions, importances) {
improvements.stats <- improvements %>%
dplyr::filter(!stringr::str_starts(measure, "p\\.")) %>%
dplyr::group_by(target, measure) %>%
dplyr::summarise(
mean = mean(value), sd = stats::sd(value),
cv = sd / mean, .groups = "drop"
)
contributions.stats <- dplyr::inner_join(
# mean coefficients
(contributions %>%
dplyr::filter(!stringr::str_starts(view, "p\\.") &
view != "intercept") %>%
dplyr::group_by(target, view) %>%
dplyr::summarise(mean = mean(value), .groups = "drop_last") %>%
dplyr::mutate(fraction = abs(mean) / sum(abs(mean))) %>%
dplyr::ungroup()),
# p values
(contributions %>%
dplyr::filter(stringr::str_starts(view, "p\\.") &
!stringr::str_detect(view, "intercept")) %>%
dplyr::group_by(target, view) %>%
dplyr::mutate(view = stringr::str_remove(view, "^p\\.")) %>%
dplyr::summarise(
p.mean = mean(value),
p.sd = stats::sd(value),
.groups = "drop"
)),
by = c("target", "view")
)
importances.aggregated <- importances %>%
tidyr::unite(".PT", "Predictor", "Target", sep = "&") %>%
dplyr::group_by(view, .PT) %>%
dplyr::summarise(
Importance = mean(Importance),
nsamples = dplyr::n(), .groups = "drop"
) %>%
tidyr::separate(".PT", c("Predictor", "Target"), sep = "&")
return(list(
improvements.stats = improvements.stats,
contributions.stats = contributions.stats,
importances.aggregated = importances.aggregated
))
}
#' Collect and aggregate results
#'
#' Collect and aggregate performance, contribution and importance estimations
#' of a set of raw results produced by \code{\link{run_misty}()}.
#'
#' @param folders Paths to folders containing the raw results from
#' \code{\link{run_misty}()}.
#'
#' @return List of collected performance, contributions and importances per sample,
#' performance and contribution statistics and aggregated importances.
#' \describe{
#' \item{\var{improvements}}{Long format \code{tibble} with measurements
#' of performance for each \var{target} and each \var{sample}.
#' Available performance measures are RMSE and variance explained
#' (R2) for a model containing only an intrinsic view
#' (\var{intra.RMSE}, \var{intra.R2}), model with all views
#' (\var{multi.RMSE}, \var{multi.R2}), gain of RMSE and gain of
#' variance explained of multi-view model over the intrisic model
#' where \var{gain.RMSE} is the relative decrease of RMSE in percent,
#' while \var{gain.R2} is the absolute increase of variance explained
#' in percent. Each \var{value} represents the mean performance across
#' folds (k-fold cross-validation). The p values of a one sided
#' t-test of improvement of performance (\var{p.RMSE}, \var{p.R2})
#' are also available as a measure.}
#' \item{\var{improvements.stats}}{Long format \code{tibble} with summary
#' statistics (mean, standard deviation and coefficient of variation)
#' for all performance measures for each {target} over all samples.}
#' \item{\var{contributions}}{Long format \code{tibble} with the values
#' of the coefficients for each \var{view} in the meta-model, for each
#' \var{target} and each \var{sample}. The p values for the coefficient
#' for each view, under the null hypothesis of zero contribution to the
#' meta model are also available.}
#' \item{\var{contributions.stats}}{Long format \code{tibble} with summary
#' statistics for all views per target over all samples. Including
#' mean coffecient value, fraction of contribution, mean and standard
#' deviation of p values.}
#' \item{\var{importances}}{List of view-specific predictor-target
#' importance tables per sample. The importances in each table are
#' standardized per target and weighted by the quantile of the coefficient
#' for the target in that view. Columns other than \var{Predictor}
#' represent target markers.}
#' \item{\var{importances.aggregated}}{A list of aggregated view-specific
#' predictor-target importance tables . Aggregation is
#' reducing by mean over all samples.}
#' }
#'
#' @seealso \code{\link{run_misty}()} to train models and
#' generate results.
#'
#' @examples
#' # Train and collect results for 3 samples in synthetic
#'
#' library(dplyr)
#' library(purrr)
#'
#' data("synthetic")
#'
#' misty.results <- synthetic[seq_len(3)] %>%
#' imap_chr(~ create_initial_view(.x %>% select(-c(row, col, type))) %>%
#' add_paraview(.x %>% select(row, col), l = 10) %>%
#' run_misty(paste0("results/", .y))) %>%
#' collect_results()
#' str(misty.results)
#' @export
collect_results <- function(folders) {
samples <- R.utils::getAbsolutePath(folders)
message("\nCollecting improvements")
improvements <- samples %>%
furrr::future_map_dfr(function(sample) {
performance <- readr::read_table(paste0(sample, .Platform$file.sep, "performance.txt"),
na = c("", "NA", "NaN"), col_types = readr::cols()
) %>% dplyr::distinct()
performance %>%
dplyr::mutate(
sample = sample,
gain.RMSE = 100 * (intra.RMSE - multi.RMSE) / intra.RMSE,
gain.R2 = multi.R2 - intra.R2
)
}, .progress = TRUE) %>%
tidyr::pivot_longer(-c(sample, target), names_to = "measure")
message("\nCollecting contributions")
contributions <- samples %>% furrr::future_map_dfr(function(sample) {
coefficients <- readr::read_table(paste0(sample, .Platform$file.sep, "coefficients.txt"),
na = c("", "NA", "NaN"), col_types = readr::cols()
) %>% dplyr::distinct()
coefficients %>%
dplyr::mutate(sample = sample, .after = "target") %>%
tidyr::pivot_longer(cols = -c(sample, target), names_to = "view")
}, .progress = TRUE)
message("\nCollecting importances")
importances <- samples %>%
furrr::future_map_dfr(function(sample) {
targets <- contributions %>%
dplyr::filter(sample == !!sample) %>%
dplyr::pull(target) %>%
unique() %>%
sort(method = "radix")
views <- contributions %>%
dplyr::pull(view) %>%
unique() %>%
stringr::str_subset("^p\\.", negate = TRUE) %>%
stringr::str_subset("^intercept$", negate = TRUE)
# one heatmap per view
maps <- views %>%
furrr::future_map_dfr(function(view) {
all.importances <- targets %>% purrr::map(~ readr::read_csv(paste0(
sample, .Platform$file.sep, "importances_", .x, "_", view, ".txt"
),
col_types = readr::cols()
) %>%
dplyr::distinct() %>%
dplyr::rename(feature = target))
features <- all.importances %>%
purrr::map(~ .x$feature) %>%
unlist() %>%
unique() %>%
sort(method = "radix")
pvalues <- contributions %>%
dplyr::filter(sample == !!sample, view == paste0("p.", !!view)) %>%
dplyr::mutate(value = 1 - value)
# importances are standardized for each target
# and multiplied by 1-pval(view)
all.importances %>%
purrr::imap_dfc(~
tibble::tibble(feature = features, zero.imp = 0) %>%
dplyr::left_join(.x, by = "feature") %>%
dplyr::arrange(feature) %>%
dplyr::mutate(
imp = scale(imp)[, 1],
!!targets[.y] := zero.imp + (imp *
(pvalues %>%
dplyr::filter(target == targets[.y]) %>%
dplyr::pull(value)))
)
%>%
dplyr::select(targets[.y])) %>%
dplyr::mutate(Predictor = features) %>%
tidyr::pivot_longer(
names_to = "Target",
values_to = "Importance",
-Predictor
) %>%
dplyr::mutate(Importance = replace(
Importance,
is.nan(Importance), 0
)) %>%
dplyr::mutate(view = view, .before = 1)
}) %>%
dplyr::mutate(sample = sample, .before = 1)
}, .progress = TRUE)
message("\nAggregating")
misty.results <- c(
list(
improvements = improvements,
contributions = contributions,
importances = importances
),
aggregate_results(improvements, contributions, importances)
)
return(misty.results)
}
#' Aggregate a subset of results
#'
#' @inheritParams collect_results
#'
#' @param misty.results a results list generated by
#' \code{\link{collect_results}()}.
#'
#' @return the \code{misty.results} list with an added list item
#' \code{importances.aggregated.subset} containing the aggregated importances
#' for the subset of \code{folders}.
#'
#' @seealso \code{\link{collect_results}()} to generate a
#' results list from raw results.
#'
#' @noRd
aggregate_results_subset <- function(misty.results, folders) {
assertthat::assert_that(("importances" %in% names(misty.results)),
msg = "The provided result list is malformed. Consider using collect_results()."
)
normalized.folders <- R.utils::getAbsolutePath(folders)
# check if folders are in names of misty.results
assertthat::assert_that(all(normalized.folders %in%
(misty.results$importances %>% dplyr::pull(sample))),
msg = "The provided results list doesn't contain information about some of
the requested result folders. Consider using collect_results()."
)
message("Aggregating subset")
importances.aggregated.subset <- misty.results$importances %>%
dplyr::filter(sample %in% normalized.folders) %>%
tidyr::unite(".PT", "Predictor", "Target", sep = "&") %>%
dplyr::group_by(view, .PT) %>%
dplyr::summarise(
Importance = mean(Importance),
nsamples = dplyr::n(), .groups = "drop"
) %>%
tidyr::separate(".PT", c("Predictor", "Target"), sep = "&")
misty.results[["importances.aggregated.subset"]] <- importances.aggregated.subset
return(misty.results)
}
#' Clear cached objects
#'
#' Purge the cache or clear the cached objects for a single sample.
#'
#' The cached objects are removed from disk and cannot be retrieved. Whenever
#' possible specifying an \code{id} is reccomended. If \code{id = NULL} all
#' contents of the folder \file{.misty.temp} will be removed.
#'
#' @param id the unique id of the sample.
#'
#' @return None (\code{NULL})
#'
#' @examples
#' clear_cache("b98ad35f4e671871cba35f2155228612")
#'
#' clear_cache()
#' @export
clear_cache <- function(id = NULL) {
cache.folder <- R.utils::getAbsolutePath(".misty.temp")
if (is.null(id)) {
if (dir.exists(cache.folder)) {
unlink(cache.folder, recursive = TRUE)
} else {
warning("Cache folder doesn't exist.")
}
} else {
sample.cache.folder <- paste0(cache.folder, .Platform$file.sep, id)
if (dir.exists(sample.cache.folder)) {
unlink(sample.cache.folder, recursive = TRUE)
} else {
warning("Cache folder for requested id doesn't exist.")
}
}
}
#' Removes empty cache folders.
#'
#' @return None (\code{NULL})
#'
#' @noRd
sweep_cache <- function() {
cache.folder <- R.utils::getAbsolutePath(".misty.temp")
if (dir.exists(cache.folder)) {
list.files(cache.folder, full.names = TRUE) %>%
purrr::walk(function(path) {
if (length(list.files(path)) == 0) {
unlink(path, recursive = TRUE)
}
})
if (length(list.files(cache.folder, full.names = TRUE)) == 0) {
clear_cache()
}
}
}
#' Extract signatures from the results
#'
#' Signature is a representation of each sample in the space of mistyR results.
#'
#' The performance signature of each sample is a concatenation of the estimated
#' values of variance explained using only the intraview, the variance explained
#' by the multiview model and the gain in variance explained for each marker.
#' The performance signature vector for each sample available in
#' \code{misty.results} is of length \eqn{\textrm{markers} \cdot 3}{markers x 3}.
#'
#' The contribution signature of each sample is a concatenation of the estimated
#' fraction of contribution of each view for each marker.
#' The contribution signature vector for each sample available in
#' \code{misty.results} is of length
#' \eqn{\textrm{markers} \cdot \textrm{views}}{markers x views}.
#'
#' The importance signature of each sample is a concatenation of the estimated
#' and weighted importances for each predictor-target marker pair from all views.
#' The importance signature vector for each sample available in
#' \code{misty.results} is of length
#' \eqn{\textrm{markers}^2 \cdot \textrm{views}}{markers^2 x views}.
#'
#' @inheritParams plot_interaction_heatmap
#' @param type type of signature to extract from the results.
#'
#' @return A table with one row per sample from \code{misty.results} representing
#' its signature.
#'
#' @seealso \code{\link{collect_results}()} to generate a
#' results list from raw results.
#'
#' @examples
#' library(dplyr)
#'
#' misty.results <-
#' list.files("results", full.names = TRUE) %>% collect_results()
#'
#' extract_signature(misty.results, "performance")
#' @export
extract_signature <- function(misty.results,
type = c(
"performance", "contribution", "importance"
), trim = -Inf, trim.measure = c(
"gain.R2", "multi.R2", "intra.R2",
"gain.RMSE", "multi.RMSE", "intra.RMSE"
)) {
signature.type <- match.arg(type)
trim.measure.type <- match.arg(trim.measure)
assertthat::assert_that(
all(c(
"improvements", "contributions",
"importances", "importances.aggregated"
) %in%
names(misty.results)),
msg = "The provided result list is malformed.
Consider using collect_results()."
)
inv <- sign((stringr::str_detect(trim.measure.type, "gain") |
stringr::str_detect(trim.measure.type, "RMSE", negate = TRUE)) - 0.5)
targets <- misty.results$improvements.stats %>%
dplyr::filter(
measure == trim.measure.type,
inv * mean >= inv * trim
) %>%
dplyr::pull(target)
switch(signature.type,
"performance" = {
target.intersection <- misty.results$improvements %>%
dplyr::group_by(sample) %>%
dplyr::summarize(ts = list(unique(target))) %>%
dplyr::pull(ts) %>%
purrr::reduce(intersect) %>%
intersect(targets)
misty.results$improvements %>%
dplyr::filter(
target %in% target.intersection,
stringr::str_ends(measure, "R2"),
!stringr::str_ends(measure, "p.R2")
) %>%
tidyr::unite(".Feature", target, measure) %>%
# grouping necessary?
dplyr::group_by(sample) %>%
tidyr::pivot_wider(names_from = ".Feature", values_from = "value") %>%
dplyr::ungroup()
},
"contribution" = {
target.intersection <- misty.results$contributions %>%
dplyr::group_by(sample) %>%
dplyr::summarize(ts = list(unique(target))) %>%
dplyr::pull(ts) %>%
purrr::reduce(intersect) %>%
intersect(targets)
misty.results$contributions %>%
dplyr::filter(
target %in% target.intersection,
!stringr::str_starts(view, "p\\."),
!stringr::str_detect(view, "intercept")
) %>%
dplyr::group_by(sample, target) %>%
dplyr::mutate(frac = abs(value) / sum(abs(value)), value = NULL) %>%
tidyr::unite(".Feature", view, target) %>%
tidyr::pivot_wider(names_from = ".Feature", values_from = "frac") %>%
dplyr::ungroup()
},
"importance" = {
views <- misty.results$importances %>%
dplyr::pull(view) %>%
unique()
views %>%
purrr::map(function(view) {
view.importances <- misty.results$importances %>%
dplyr::filter(view == !!view, !is.na(Importance))
target.intersection <- view.importances %>%
dplyr::group_by(sample) %>%
dplyr::summarize(ts = list(unique(Target))) %>%
dplyr::pull(ts) %>%
purrr::reduce(intersect) %>%
intersect(targets)
predictor.intersection <- view.importances %>%
dplyr::group_by(sample) %>%
dplyr::summarize(ts = list(unique(Predictor))) %>%
dplyr::pull(ts) %>%
purrr::reduce(intersect)
view.importances %>%
dplyr::filter(
Predictor %in% predictor.intersection,
Target %in% target.intersection
) %>%
tidyr::unite(".vPT", view, Predictor, Target) %>%
tidyr::pivot_wider(names_from = ".vPT", values_from = "Importance")
}) %>%
purrr::reduce(dplyr::full_join, by = "sample")
}
)
}
#' Function to merge named arguments from two lists without removing NULL entries
#'
#' @param l1 list 1
#' @param l2 list 2
#'
#' @noRd
merge_two <- function(l1, l2) {
n1 <- names(l1)
n2 <- names(l2)
diff <- n1[!(n1 %in% n2)]
n1_list <- diff %>%
purrr::set_names() %>%
purrr::map(function(name) l1[[name]])
union <- n2[!(n2 %in% diff)]
n2_list <- union %>%
purrr::set_names() %>%
purrr::map(function(name) l2[[name]])
return(c(n1_list, n2_list))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.