#------------------------------------------------------------------------------#
# Internal functions
#------------------------------------------------------------------------------#
## All functions in this file are NOT exported, to be used internally only.
#### ---- General utilities ----####
# Splits a data frame into a predefined number of chunks of roughly equal size
#' @importFrom purrr map2 list_c
#' @importFrom dplyr mutate group_by group_split select
.split_df_in_chunks <- function(df, n_chunks, keep_chunk_id = TRUE) {
    # NOTE: ceiling (rather than round) guarantees the initial chunk loads
    # sum to at least nrow(df), so the loop below only ever shrinks chunks
    max_rows <- ceiling(nrow(df) / n_chunks)
chunk_load <- rep(max_rows, n_chunks)
names(chunk_load) <- seq_len(n_chunks)
delta <- sum(chunk_load) - nrow(df)
i <- length(chunk_load)
while (delta > 0) {
chunk_load[i] <- chunk_load[i] - 1
delta <- sum(chunk_load) - nrow(df)
i <- i - 1
}
chunk_id_vec <- purrr::map2(names(chunk_load), chunk_load, ~ rep(.x, .y)) |>
purrr::list_c()
df <- df |>
dplyr::mutate(
chunk_id = chunk_id_vec
)
chunks <- df |>
dplyr::group_by(.data$chunk_id) |>
dplyr::group_split()
if (!keep_chunk_id) {
chunks <- purrr::map(chunks, ~ .x |> dplyr::select(-"chunk_id"))
}
return(chunks)
}
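# Illustrative usage (a sketch, `df` being any data frame):
#   chunks <- .split_df_in_chunks(df, n_chunks = 3)
#   purrr::map_int(chunks, nrow) # chunk sizes differ by at most 1 row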
#### ---- Internals for dynamic variables ----####
# Internal for setting mandatory or annotation IS vars
# tbl_type must be in {mand_vars, annot_vars, af_vars, iss_vars}
#' @importFrom purrr map_lgl is_empty
#' @importFrom rlang is_formula abort warn .data
#' @importFrom dplyr filter
#' @importFrom tibble tibble_row
.new_IS_vars_checks <- function(specs, err, tbl_type) {
new_vars <- if (is.data.frame(specs)) {
# if data frame supplied
colnames_ok <- all(c("names", "types", "transform", "flag", "tag")
%in% colnames(specs))
col_types_ok <- if (colnames_ok) {
all(purrr::map_lgl(
specs[c("names", "types", "flag", "tag")],
is.character
))
} else {
FALSE
}
types_ok <- if (colnames_ok & col_types_ok) {
all(specs$types %in% .types_mapping()[["types"]])
} else {
FALSE
}
flags_ok <- if (colnames_ok & col_types_ok) {
all(specs$flag %in% c("required", "optional"))
} else {
FALSE
}
transform_col_ok <- if (colnames_ok & col_types_ok) {
is.list(specs$transform) &&
all(purrr::map_lgl(
specs$transform,
~ {
is.null(.x) || rlang::is_formula(.x) || is.function(.x)
}
))
} else {
FALSE
}
if (!(colnames_ok & col_types_ok & types_ok & transform_col_ok &
flags_ok)) {
rlang::abort(err)
}
specs
} else {
# if vector supplied
if (is.null(names(specs)) || !all(specs %in%
.types_mapping()[["types"]])) {
rlang::abort(err)
}
purrr::map2(
names(specs), specs,
~ tibble::tibble_row(
names = .x,
types = .y,
transform = list(NULL),
flag = "required",
tag = NA_character_
)
) |>
purrr::list_rbind()
}
# -- check critical tags
if (!is.null(tbl_type)) {
available_tags_tbl <- available_tags()
to_check <- available_tags_tbl |>
dplyr::filter(
.data$dyn_vars_tbl == tbl_type,
!purrr::map_lgl(.data$needed_in, purrr::is_empty)
)
if (!all(to_check$tag %in% new_vars$tag)) {
missing_tags <- to_check$tag[!to_check$tag %in% new_vars$tag]
inspect_cmd <- paste0(
"inspect_tags(c(", paste0("'", missing_tags,
"'",
collapse = ","
),
"))"
)
warn <- c("Warning: important tags missing",
i = paste(
"Some tags are required for proper execution",
"of some functions. If these tags are not",
"provided, execution of dependent functions",
"might fail.",
"Review your inputs carefully."
),
i = paste("Missing tags:", paste0(missing_tags,
collapse = ", "
)),
i = paste0(
"To see where these are involved type `",
inspect_cmd, "`"
)
)
rlang::warn(warn, class = "missing_crit_tags")
}
}
new_vars
}
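# Illustrative usage (a sketch, assuming "char" and "int" are valid entries
# of `.types_mapping()[["types"]]`):
#   specs <- c(chr = "char", integration_locus = "int", strand = "char")
#   .new_IS_vars_checks(specs, err = "Invalid specs", tbl_type = NULL)
#   # -> a tibble with columns names/types/transform/flag/tag, one row per
#   #    variable, all flagged as "required"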
# Applies transformations on columns as specified in variables specs
# expects specs to be in data frame format
.apply_col_transform <- function(df, specs) {
    # Extract and associate names and transformations
    non_null <- purrr::pmap(specs, function(
        names, types,
        transform, flag, tag) {
if (is.null(transform)) {
return(NULL)
}
return(transform)
}) |>
purrr::set_names(specs$names)
# Retain only non-null transform
non_null <- non_null[purrr::map_lgl(non_null, ~ !is.null(.x))]
if (length(non_null) > 0) {
# if there are transf to apply
apply_transform <- function(col, col_name) {
if (!col_name %in% names(non_null)) {
return(col)
}
transformation <- non_null[[col_name]]
            if (rlang::is_formula(transformation)) {
                unlist(purrr::map(col, transformation))
            } else {
                # if it is a function
                do.call(what = transformation, args = list(col))
            }
        }
df <- purrr::map2(df, colnames(df), apply_transform) |>
tibble::as_tibble()
}
df
}
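# Illustrative usage (a sketch): a specs row carrying a purrr-style formula
# that upper-cases the values of the `strand` column
#   specs <- tibble::tibble(
#       names = "strand", types = "char",
#       transform = list(~ toupper(.x)),
#       flag = "required", tag = NA_character_
#   )
#   .apply_col_transform(tibble::tibble(strand = c("f", "r")), specs)
#   # -> tibble with strand = c("F", "R")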
# Internal to quickly convert columns marked as dates with lubridate
.convert_dates <- function(date_vec, format) {
if (format %in% date_formats()) {
parsed <- lubridate::parse_date_time(date_vec, format)
if (format %in% c(
"ymd_hms", "ymd_hm", "ymd_h", "dmy_hms", "dmy_hm",
"dmy_h", "mdy_hms", "mdy_hm", "mdy_h",
"ydm_hms", "ydm_hm", "ydm_h"
)) {
return(parsed)
} else {
return(lubridate::as_date(parsed))
}
}
if (format == "date") {
# Guesses the format
return(lubridate::as_date(as.character(date_vec)))
}
}
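# Illustrative usage (a sketch, assuming "ymd" is listed in `date_formats()`):
#   .convert_dates(c("2020-05-14", "2020-06-01"), format = "ymd")
#   # -> a Date vector; formats with a time component ("ymd_hms", ...)
#   #    return a date-time instead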
# Generates a data frame holding file suffixes combinations for
# matrices
.generate_suffix_specs <- function(
quantification_suffix,
annotation_suffix,
file_ext,
glue_file_spec) {
# Calculate combinations
combinations <- tidyr::expand_grid(
quantification_suffixes = purrr::list_c(quantification_suffix),
annotation_suffixes = purrr::list_c(annotation_suffix),
file_ext = file_ext
)
final_specs <- combinations |>
dplyr::mutate(
file_suffix = paste(.data$quantification_suffixes, "_matrix",
.data$annotation_suffixes, ".",
.data$file_ext,
sep = ""
),
matrix_type = purrr::map_chr(
.data$annotation_suffixes,
~ names(annotation_suffix[annotation_suffix == .x])
),
quantification = purrr::map_chr(
.data$quantification_suffixes,
~ names(quantification_suffix[quantification_suffix == .x])
)
) |>
dplyr::select(dplyr::all_of(c(
"quantification", "matrix_type",
"file_suffix"
)))
final_specs
}
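# Illustrative usage (a sketch with hypothetical suffix sets; note that
# `glue_file_spec` is accepted but not used in the body above):
#   .generate_suffix_specs(
#       quantification_suffix = list(seqCount = "seqCount"),
#       annotation_suffix = list(annotated = ".no0.annotated"),
#       file_ext = "tsv.gz",
#       glue_file_spec = NULL
#   )
#   # -> tibble with file_suffix "seqCount_matrix.no0.annotated.tsv.gz"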
#### ---- Internals for utilities ----####
#---- USED IN : generate_default_folder_structure ----
.process_af_for_gen <- function(af) {
association_file <- if (!is.data.frame(af) && all(af == "default")) {
af_sym <- "association_file"
utils::data(
list = af_sym, envir = rlang::current_env(),
package = "ISAnalytics"
)
rlang::eval_tidy(rlang::sym(af_sym))
} else {
af
}
required_af_tags <- c(
"pcr_repl_id" = "char",
"vispa_concatenate" = "char",
"tag_seq" = "char",
"project_id" = "char"
)
tag_to_cols_af <- .check_required_cols(required_af_tags,
vars_df = association_file_columns(TRUE),
duplicate_politic = "error"
)
if (!all(tag_to_cols_af$names %in% colnames(association_file))) {
rlang::abort(.missing_req_cols(
tag_to_cols_af$names,
tag_to_cols_af$names[!tag_to_cols_af$names %in%
colnames(association_file)]
), class = "missing_req_col_err")
}
tag_to_cols_af_list <- purrr::map(tag_to_cols_af$tag, ~ tag_to_cols_af |>
dplyr::filter(.data$tag == .x) |>
dplyr::pull(.data$names)) |>
purrr::set_names(tag_to_cols_af$tag)
return(list(af = association_file, tag_list = tag_to_cols_af_list))
}
.process_m_for_gen <- function(matrices, af, tag_list) {
integration_matrices <- if (!is.data.frame(matrices) &&
all(matrices == "default")) {
matrices_sym <- "integration_matrices"
utils::data(
list = matrices_sym, envir = rlang::current_env(),
package = "ISAnalytics"
)
rlang::eval_tidy(rlang::sym(matrices_sym))
} else {
matrices
}
matrix_required_cols <- c(
mandatory_IS_vars(),
tag_list$pcr_repl_id
)
if (!all(matrix_required_cols %in% colnames(integration_matrices))) {
rlang::abort(.missing_req_cols(
matrix_required_cols,
matrix_required_cols[!matrix_required_cols %in%
colnames(integration_matrices)]
), class = "missing_req_col_err")
}
# First separate by project
sep_matrices <- integration_matrices |>
dplyr::left_join(
af |>
dplyr::select(
dplyr::all_of(c(
tag_list$pcr_repl_id,
tag_list$vispa_concatenate,
tag_list$project_id
))
),
by = tag_list$pcr_repl_id
) |>
dplyr::group_by(dplyr::across(dplyr::all_of(tag_list$project_id)))
proj_names <- sep_matrices |>
dplyr::group_keys() |>
dplyr::pull(dplyr::all_of(tag_list$project_id))
sep_matrices <- sep_matrices |>
dplyr::group_split(.keep = FALSE) |>
purrr::set_names(proj_names)
# Then separate by pool
sep_matrices <- purrr::map(sep_matrices, ~ {
tmp <- .x |>
dplyr::group_by(dplyr::across(
dplyr::all_of(tag_list$vispa_concatenate)
))
pool_names <- tmp |>
dplyr::group_keys() |>
dplyr::pull(dplyr::all_of(tag_list$vispa_concatenate))
tmp <- tmp |>
dplyr::group_split(.keep = FALSE) |>
purrr::set_names(pool_names)
return(tmp)
})
return(sep_matrices)
}
# NOTE: the function expects vispa stats to be embedded in the association
# file, not provided as separate files
.process_iss_for_gen <- function(association_file, tag_list) {
# Find VISPA stats minimal required columns
minimal_iss_cols <- iss_stats_specs(TRUE) |>
dplyr::filter(
.data$flag == "required",
!.data$tag %in% c(
"vispa_concatenate",
"tag_seq"
)
) |>
dplyr::pull(.data$names)
# If the minimal cols are not found in af, nothing to do, return
if (!all(minimal_iss_cols %in% colnames(association_file))) {
return(NULL)
}
# Check required tags
iss_specs <- .check_required_cols(
required_tags = c(
"vispa_concatenate" = "char",
"tag_seq" = "char"
),
vars_df = iss_stats_specs(TRUE),
duplicate_politic = "error"
)
iss_tags <- purrr::map(iss_specs$tag, ~ iss_specs |>
dplyr::filter(.data$tag == .x) |>
dplyr::pull(.data$names)) |>
purrr::set_names(iss_specs$tag)
iss_cols <- iss_stats_specs(TRUE) |>
dplyr::filter(!.data$tag %in% c("vispa_concatenate", "tag_seq")) |>
dplyr::pull(.data$names)
iss_cols <- colnames(association_file)[colnames(association_file) %in%
iss_cols]
iss_data <- association_file |>
dplyr::select(
dplyr::all_of(c(
tag_list$project_id,
tag_list$vispa_concatenate,
tag_list$tag_seq,
iss_cols
))
) |>
dplyr::rename(
!!iss_tags$tag_seq := tag_list$tag_seq,
!!iss_tags$vispa_concatenate :=
tag_list$vispa_concatenate
) |>
dplyr::distinct() |>
dplyr::filter(!dplyr::if_all(dplyr::all_of(iss_cols), is.na))
# Separate by project
stats_split <- iss_data |>
dplyr::group_by(dplyr::across(dplyr::all_of(tag_list$project_id)))
proj_names <- stats_split |>
dplyr::group_keys() |>
dplyr::pull(dplyr::all_of(tag_list$project_id))
stats_split <- stats_split |>
dplyr::group_split(.keep = FALSE) |>
purrr::set_names(proj_names)
# Separate by pool
stats_split <- purrr::map(stats_split, ~ {
tmp <- .x |>
dplyr::group_by(dplyr::across(
dplyr::all_of(iss_tags$vispa_concatenate)
))
pool_names <- tmp |>
dplyr::group_keys() |>
dplyr::pull(dplyr::all_of(iss_tags$vispa_concatenate))
tmp <- tmp |>
dplyr::group_split() |>
purrr::set_names(pool_names)
return(tmp)
})
return(stats_split)
}
# Generates a correct standard file folder structure (either with package
# included data or provided)
.generate_correct <- function(dir, sep_matrices, sep_stats) {
corr_fold <- fs::path(dir, "fs")
fs::dir_create(corr_fold)
## -- for each project
for (proj in names(sep_matrices)) {
proj_fold <- fs::path(corr_fold, proj)
quant_fold <- fs::path(proj_fold, "quantification")
fs::dir_create(quant_fold)
## --- Write matrices
purrr::walk2(sep_matrices[[proj]], names(sep_matrices[[proj]]), ~ {
sparse <- as_sparse_matrix(.x)
pool_fold <- fs::path(quant_fold, .y)
fs::dir_create(pool_fold)
fe_suffix <- dplyr::if_else(.is_annotated(.x),
"fragmentEstimate_matrix.no0.annotated.tsv.gz",
"fragmentEstimate_matrix.tsv.gz"
)
sc_suffix <- dplyr::if_else(.is_annotated(.x),
"seqCount_matrix.no0.annotated.tsv.gz",
"seqCount_matrix.tsv.gz"
)
prefix <- paste(proj, .y, sep = "_")
readr::write_tsv(sparse$fragmentEstimate,
file = fs::path(pool_fold, paste(prefix,
fe_suffix,
sep = "_"
)),
na = ""
)
readr::write_tsv(sparse$seqCount,
file = fs::path(pool_fold, paste(prefix,
sc_suffix,
sep = "_"
)),
na = ""
)
})
}
# --- Write iss
if (any(purrr::map_lgl(sep_stats, ~ !is.null(.x)))) {
for (proj in names(sep_stats)) {
proj_fold <- fs::path(corr_fold, proj)
iss_fold <- fs::path(proj_fold, "iss")
fs::dir_create(iss_fold)
purrr::walk2(sep_stats[[proj]], names(sep_stats[[proj]]), ~ {
pool_fold <- fs::path(iss_fold, .y)
fs::dir_create(pool_fold)
if (nrow(.x) > 0) {
filename <- paste0(
"stats.sequence_", proj,
".", .y, ".tsv"
)
readr::write_tsv(.x,
file = fs::path(pool_fold, filename),
na = ""
)
} else {
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
warn_empty_iss <- c(
paste0(
"Warning: empty stats for project '",
proj, "', pool '", .y, "'"
),
i = paste(
"No files will be produced for",
"this pool, if this behaviour is",
"not desired check the association",
"file provided in input"
)
)
rlang::inform(warn_empty_iss, class = "warn_empty_iss")
}
}
})
}
}
return(corr_fold)
}
# Generates an incorrect standard file folder structure (either with package
# included data or provided) - intended for testing purposes only
.generate_incorrect <- function(dir, sep_matrices, sep_stats) {
err_fold <- fs::path(dir, "fserr")
fs::dir_create(err_fold)
# -- Generate error for projects
if (length(sep_matrices) > 1) {
names(sep_matrices)[1] <- paste0(names(sep_matrices)[1], "-err")
}
## -- for each project
for (proj in names(sep_matrices)) {
proj_fold <- fs::path(err_fold, proj)
quant_fold <- fs::path(proj_fold, "quantification")
fs::dir_create(quant_fold)
### -- Generate pool error
if (length(sep_matrices[[proj]]) > 1) {
names(sep_matrices[[proj]])[1] <- paste0(
names(sep_matrices[[proj]])[1], "-err"
)
}
### -- Choose a pool for matrix error
pool_to_mod <- purrr::detect(
names(sep_matrices[[proj]]),
~ stringr::str_detect(.x,
"-err",
negate = TRUE
)
)
## --- Write matrices
purrr::walk2(sep_matrices[[proj]], names(sep_matrices[[proj]]), ~ {
sparse <- as_sparse_matrix(.x)
pool_fold <- fs::path(quant_fold, .y)
fs::dir_create(pool_fold)
fe_suffix <- dplyr::if_else(.is_annotated(.x),
"fragmentEstimate_matrix.no0.annotated.tsv.gz",
"fragmentEstimate_matrix.tsv.gz"
)
sc_suffix <- dplyr::if_else(.is_annotated(.x),
"seqCount_matrix.no0.annotated.tsv.gz",
"seqCount_matrix.tsv.gz"
)
prefix <- paste(proj, .y, sep = "_")
if (!.y == pool_to_mod) {
readr::write_tsv(sparse$fragmentEstimate,
file = fs::path(pool_fold, paste(prefix,
fe_suffix,
sep = "_"
)),
na = ""
)
}
readr::write_tsv(sparse$seqCount,
file = fs::path(pool_fold, paste(prefix,
sc_suffix,
sep = "_"
)),
na = ""
)
})
}
# --- Write iss
if (any(purrr::map_lgl(sep_stats, ~ !is.null(.x)))) {
for (proj in names(sep_stats)) {
proj_fold <- fs::path(err_fold, proj)
iss_fold <- fs::path(proj_fold, "iss")
fs::dir_create(iss_fold)
### -- Generate pool error
if (length(sep_stats[[proj]]) > 1) {
names(sep_stats[[proj]])[1] <- paste0(
names(sep_stats[[proj]])[1], "-err"
)
}
purrr::walk2(sep_stats[[proj]], names(sep_stats[[proj]]), ~ {
pool_fold <- fs::path(iss_fold, .y)
fs::dir_create(pool_fold)
if (!.y == pool_to_mod) {
if (nrow(.x) > 0) {
filename <- paste0(
"stats.sequence_", proj,
".", .y, ".tsv"
)
readr::write_tsv(.x,
file = fs::path(pool_fold, filename),
na = ""
)
} else {
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
warn_empty_iss <- c(
paste0(
"Warning: empty stats for project '",
proj, "', pool '", .y, "'"
),
i = paste(
"No files will be produced for",
"this pool, if this behaviour is",
"not desired check the association",
"file provided in input"
)
)
rlang::inform(warn_empty_iss,
class = "warn_empty_iss"
)
}
}
}
})
}
}
return(err_fold)
}
#### ---- Internals for multiple functions/general purpose ----####
# Returns the file format for each of the file paths passed as a parameter.
#' @importFrom fs path_ext
#' @importFrom tools file_path_sans_ext
#' @importFrom purrr is_empty
.check_file_extension <- function(file_path) {
## Get last portion of file name
last <- fs::path_ext(file_path)
compressed <- which(last %in% .compressed_formats())
if (!purrr::is_empty(compressed)) {
## for compressed files try extracting the true extension
file_path[compressed] <- tools::file_path_sans_ext(
file_path[compressed]
)
last <- fs::path_ext(file_path)
}
return(last)
}
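# Illustrative usage (a sketch, assuming "gz" is in `.compressed_formats()`):
#   .check_file_extension(c("m.tsv", "m.tsv.gz", "af.xlsx"))
#   # -> c("tsv", "tsv", "xlsx") - the true extension is extracted for
#   #    compressed files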
# Internal to check if required tags are in the specified variables.
# * required_tags is a vector of char, if supplied with names
# in the form of c("tag" = "col_type") also types are checked
# * duplicate_politic is used to determine what to do in case a tag
# is found more than once:
# - "error": raises an error and returns
# - "keep": keeps all columns, no error or msgs
# - "first": keeps the first row it finds
# - named vector with names as tags and values either "keep" for
# keeping all duplicates, "error" to raise an error if duplicates
# are found for that specific column, "first" to keep only first
.check_required_cols <- function(required_tags, vars_df, duplicate_politic) {
    error_msg <- function(missing_cols) {
        c("Missing required tags or wrong types",
            x = paste(
                "Missing tags:",
                paste0(missing_cols, collapse = ", ")
            )
        )
    }
# Check just colnames
tags_names <- if (!is.null(names(required_tags))) {
names(required_tags)
} else {
required_tags
}
cols_in_tags <- vars_df |>
dplyr::filter(.data$tag %in% tags_names)
# No tags found
if (nrow(cols_in_tags) == 0) {
rlang::abort(error_msg(tags_names), class = "missing_tags_err")
}
missing_tags <- tags_names[!tags_names %in% cols_in_tags$tag]
if (length(missing_tags) > 0) {
rlang::abort(error_msg(missing_tags), class = "missing_tags_err")
}
# ---- If all tags are found
### Evaluate types if necessary
if (!is.null(names(required_tags))) {
cols_in_tags <- .eval_tag_types(cols_in_tags, required_tags)
}
### Evaluate duplicates
cols_in_tags <- .eval_tag_duplicates(cols_in_tags,
duplicate_politic = duplicate_politic
)
cols_in_tags
}
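# Illustrative usage (a sketch, assuming the "project_id" tag is defined
# with type "char" in the association file specs):
#   .check_required_cols(
#       required_tags = c("project_id" = "char"),
#       vars_df = association_file_columns(TRUE),
#       duplicate_politic = "error"
#   )
#   # -> the rows of `vars_df` associated with the requested tags; aborts
#   #    if tags are missing, have wrong types or are duplicated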
# * duplicate_politic is used to determine what to do in case a tag
# is found more than once:
# - "error": raises an error and returns
# - "keep": keeps all columns, no error or msgs
# - "first": keeps the first row it finds
# - named vector with names as tags and values either "keep" for
# keeping all duplicates, "error" to raise an error if duplicates
# are found for that specific column, "first" to keep only first
.eval_tag_duplicates <- function(df, duplicate_politic) {
if (any(duplicate_politic != "keep")) {
tags_split <- df |>
dplyr::group_by(dplyr::across(dplyr::all_of("tag"))) |>
dplyr::group_split()
first_politic <- function(sub_df) {
first_row <- sub_df[1, ]
warn <- c(
paste0(
"--> Duplicates found for tag '",
sub_df$tag[1], "', only first match kept"
),
i = paste(
"Found:",
paste0(sub_df$names, collapse = ", ")
),
i = paste("Kept:", first_row$names)
)
return(list(type = "MOD", result = first_row, warning = warn))
}
error_politic <- function(sub_df) {
err <- c(
paste0(
"Duplicates not allowed for tag '", sub_df$tag[1],
"', found ", nrow(sub_df)
),
                utils::capture.output(print(sub_df))
)
return(list(type = "ERROR", result = err))
}
# If named vector
single_tag_eval <- function(sub_df) {
if (nrow(sub_df) == 1) {
return(list(type = "DF", result = sub_df))
}
choice <- duplicate_politic[sub_df$tag[1]]
if (choice == "error") {
return(error_politic(sub_df))
}
if (choice == "first") {
return(first_politic(sub_df))
}
if (choice == "keep") {
return(list(type = "DF", result = sub_df))
}
}
dupl_eval <- if (!is.null(names(duplicate_politic))) {
purrr::map(tags_split, single_tag_eval)
} else if (all(duplicate_politic == "error")) {
purrr::map(tags_split, ~ {
if (nrow(.x) == 1) {
return(list(type = "DF", result = .x))
}
return(error_politic(.x))
})
} else {
purrr::map(tags_split, ~ {
if (nrow(.x) == 1) {
return(list(type = "DF", result = .x))
}
return(first_politic(.x))
})
}
errors <- purrr::map(dupl_eval, ~ {
if (.x$type == "ERROR") {
return(.x$result)
}
NULL
})
if (any(purrr::map_lgl(errors, ~ !is.null(.x)))) {
single_err <- c("Duplicates found for some tags",
purrr::reduce(errors, c),
i = paste(
"Check the function documentation",
"to know more on how to set variables",
"correctly"
)
)
rlang::abort(single_err, class = "tag_dupl_err")
}
warnings <- purrr::map(dupl_eval, ~ {
if (.x$type == "MOD") {
return(.x$warning)
} else {
NULL
}
})
if (any(purrr::map_lgl(warnings, ~ !is.null(.x)))) {
single_warn <- c("Warning: duplicates found for some tags",
purrr::reduce(warnings, c),
i = paste(
"Check the function documentation",
"to know more on how to set variables",
"correctly"
)
)
rlang::inform(single_warn, class = "tag_dupl_warn")
}
out_df <- purrr::map(dupl_eval, ~ .x$result) |>
purrr::reduce(dplyr::bind_rows)
return(out_df)
}
return(df)
}
# Types is a named vec in the form of c("tag" = "col_type")
.eval_tag_types <- function(df, types) {
tag_split <- df |>
dplyr::group_by(dplyr::across(dplyr::all_of("tag"))) |>
dplyr::group_split()
single_type_eval <- function(sub_df) {
tag_type <- types[[sub_df$tag[1]]]
if (is.null(tag_type) || purrr::is_empty(tag_type)) {
return(list(type = "DF", result = sub_df))
}
out <- sub_df |>
dplyr::filter(.data$types %in% tag_type)
if (nrow(out) == 0) {
            err <- c(paste0("Wrong col class for tag '", sub_df$tag[1], "'"),
x = paste0(
"Expected: ",
paste0(tag_type, collapse = ", "),
" - Found: ",
paste0(sub_df$types, collapse = ", ")
)
)
return(list(type = "ERROR", result = err))
}
return(list(type = "DF", result = out))
}
result <- purrr::map(tag_split, single_type_eval)
errors <- purrr::map(result, ~ {
if (.x$type == "ERROR") {
return(.x$result)
}
return(NULL)
})
if (any(purrr::map_lgl(errors, ~ !is.null(.x)))) {
compact_msg <- c(
"Wrong column classes for some tags",
purrr::reduce(errors, c)
)
rlang::abort(compact_msg, class = "tag_type_err")
}
result <- purrr::map(result, ~ .x$result) |>
purrr::reduce(dplyr::bind_rows)
return(result)
}
.check_parallel_packages <- function() {
if (getOption("ISAnalytics.parallel_processing", default = TRUE) == TRUE) {
required_parallel_pkgs <- list(
BiocParallel = "BIOC",
doFuture = "CRAN",
future = "CRAN",
foreach = "CRAN"
)
pkgs_present <- purrr::map_lgl(
names(required_parallel_pkgs),
~ requireNamespace(.x, quietly = TRUE)
)
if (any(pkgs_present == FALSE)) {
missing_pkgs <- required_parallel_pkgs[!pkgs_present]
options(ISAnalytics.parallel_processing = FALSE)
info_msg <- c(.missing_pkg_error(missing_pkgs),
i = paste(
"Packages for parallel computation are not available,",
"switching to default sequential computation",
"(certain operations might be slower).",
"To re-activate the functionality, install the",
"packages and set",
"`options(ISAnalytics.parallel_processing = TRUE)`"
)
)
rlang::inform(info_msg)
}
}
}
# Gets the default number of parallel workers
# If pkgs_check = FALSE, checks the presence of parallel packages first;
# otherwise assumes the check has already been done
.get_def_workers <- function(pkgs_check = FALSE) {
if (!pkgs_check) {
.check_parallel_packages()
}
def_workers <- if (getOption("ISAnalytics.parallel_processing",
default = TRUE
) == TRUE) {
# Manage workers
if (.Platform$OS.type == "windows") {
BiocParallel::snowWorkers()
} else {
BiocParallel::multicoreWorkers()
}
} else {
1
}
return(def_workers)
}
# Establishes whether a map job can be executed in parallel (multi-threading)
# or needs to be executed sequentially and calls appropriate functions.
#
# Functions in "fun_to_apply" should always have at least 2 args:
# - the data
# - a `progress` arg
# The list of args must contain everything needed by the function except for
# the first argument (passed in data_list) and progress (internally managed)
#' @importFrom purrr map map_lgl
.execute_map_job <- function(
data_list,
fun_to_apply,
fun_args,
stop_on_error,
max_workers = NULL,
progrs = NULL) {
# Set up progressor if package available
prog <- if (rlang::is_installed("progressr") & is.null(progrs)) {
progressr::progressor(steps = length(data_list))
} else {
progrs
}
.check_parallel_packages()
if (length(data_list) == 1) {
max_workers <- 1
} else {
if (getOption("ISAnalytics.parallel_processing",
default = TRUE
) == TRUE) {
bioc_workers <- .get_def_workers(TRUE)
if (is.null(max_workers)) {
max_workers <- min(length(data_list), bioc_workers)
} else {
max_workers <- min(length(data_list), max_workers)
}
}
}
if (getOption("ISAnalytics.parallel_processing", default = TRUE) == TRUE &
(!is.null(max_workers) && max_workers > 1)) {
# Set up parallel workers
old_be <- doFuture::registerDoFuture()
old_plan <- future::plan(future::multisession, workers = max_workers)
on.exit(
{
future::plan(old_plan)
foreach::setDoPar(
fun = old_be$fun,
data = old_be$data, info = old_be$info
)
},
add = TRUE
)
p <- BiocParallel::DoparParam()
if (!stop_on_error) {
fun_to_apply <- purrr::safely(fun_to_apply)
}
# Execute
arg_list <- append(fun_args, list(
X = data_list,
FUN = fun_to_apply,
BPPARAM = p,
progress = prog
))
results <- rlang::exec(
BiocParallel::bplapply,
!!!arg_list
)
if (!stop_on_error) {
return(list(
res = purrr::map(results, ~ .x$result),
mode = "par", err = purrr::map(results, ~ .x$error)
))
}
return(list(res = results, mode = "par"))
}
# Sequential
arg_list <- append(
list(
.x = data_list,
.f = fun_to_apply,
progress = prog
),
fun_args
)
if (stop_on_error == TRUE) {
results <- rlang::exec(purrr::map, !!!arg_list)
return(list(res = results, mode = "seq"))
} else {
arg_list[[".f"]] <- purrr::safely(fun_to_apply)
results <- rlang::exec(purrr::map, !!!arg_list)
errs <- purrr::map(results, ~ .x$error)
results <- purrr::map(results, ~ .x$result)
return(list(res = results, mode = "seq", err = errs))
}
}
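# Illustrative usage (a sketch, `df1` and `df2` being any data frames):
#   out <- .execute_map_job(
#       data_list = list(df1, df2),
#       fun_to_apply = function(x, progress) {
#           if (!is.null(progress)) progress()
#           nrow(x)
#       },
#       fun_args = list(),
#       stop_on_error = FALSE
#   )
#   out$res # list of results (NULL where a chunk failed)
#   out$err # list of captured errors (NULL where a chunk succeeded)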
#### ---- Internals for checks on integration matrices----####
# Internal helper function for checking mandatory vars presence in x.
#
# Checks if the elements of `mandatory_IS_vars` are present as column names
# in the data frame.
# @param x A data.frame object (or any extending class)
# @keywords internal
#
# @return FALSE if all or some elements are not found in the data frame, TRUE
# otherwise
.check_mandatory_vars <- function(x) {
    stopifnot(is.data.frame(x))
    return(all(mandatory_IS_vars() %in% colnames(x)))
}
# Internal helper function for checking the presence of the pcr replicate
# column in x.
#
# @param x A data.frame object (or any extending class)
# @keywords internal
#
# @return FALSE if not found, TRUE otherwise
.check_sample_col <- function(x) {
    stopifnot(is.data.frame(x))
    return(pcr_id_column() %in% colnames(x))
}
# Finds experimental columns in an integration matrix.
#
# The function checks if there are numeric columns which are not
# standard integration matrix columns, if there are returns their names.
#
# @param x A data.frame
#' @importFrom purrr map_lgl
#' @importFrom rlang expr eval_tidy
#' @keywords internal
#
# @return A character vector of column names
.find_exp_cols <- function(x, to_exclude) {
stopifnot(is.data.frame(x))
remaining <- colnames(x)[!colnames(x) %in% to_exclude]
remaining_numeric <- purrr::map_lgl(remaining, function(y) {
        exp <- rlang::expr(`$`(x, !!y))
        col <- rlang::eval_tidy(exp)
        is.numeric(col) || is.integer(col)
})
remaining[remaining_numeric]
}
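# Illustrative usage (a sketch):
#   df <- tibble::tibble(chr = "1", seqCount = 42L, fragmentEstimate = 10.5)
#   .find_exp_cols(df, to_exclude = c("chr"))
#   # -> c("seqCount", "fragmentEstimate") (the numeric, non-standard cols)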
# Checks if the integration matrix is annotated or not.
#
# @param x A data.frame
# @keywords internal
#
# @return A logical value
.is_annotated <- function(x) {
    stopifnot(is.data.frame(x))
    return(all(annotation_IS_vars() %in% colnames(x)))
}
#### ---- Internals for matrix import ----####
#---- USED IN : import_single_Vispa2Matrix ----
# Reads an integration matrix using data.table::fread
# note: this function is never called if data.table is not installed
#' @importFrom purrr map2 map_lgl is_empty
#' @importFrom dplyr bind_rows filter mutate pull
.read_with_fread <- function(path, additional_cols, annotated, sep) {
col_types <- .mandatory_IS_types("fread")
if (annotated) {
annot_types <- .annotation_IS_types("fread")
col_types <- purrr::map2(
col_types, names(col_types),
~ c(.x, annot_types[[.y]])
)
        # keep annotation type groups that are not already in col_types
        add_types <- annot_types[!names(annot_types) %in% names(col_types)]
if (!all(purrr::map_lgl(add_types, is.null))) {
col_types <- append(col_types, add_types)
}
}
to_drop <- NULL
if (!is.null(additional_cols) && !purrr::is_empty(additional_cols)) {
to_drop <- names(additional_cols[additional_cols == "_"])
others <- additional_cols[!names(additional_cols) %in% to_drop]
others <- purrr::map(others, ~ {
.types_mapping() |>
dplyr::filter(.data$types == .x) |>
dplyr::pull(.data$fread)
})
col_types <- purrr::map2(
col_types, names(col_types),
~ c(.x, names(others[others == .y]))
)
}
tmp <- data.table::fread(
file = path,
sep = sep,
na.strings = c("NONE", "NA", "NULL", "NaN", ""),
verbose = FALSE,
drop = to_drop,
colClasses = col_types,
showProgress = getOption("ISAnalytics.verbose", TRUE),
data.table = TRUE
)
vars <- mandatory_IS_vars(TRUE)
if (annotated) {
vars <- vars |>
dplyr::bind_rows(annotation_IS_vars(TRUE))
}
# Check if there are dates to convert
dates <- vars |>
dplyr::filter(.data$types %in% c(date_formats(), "date"))
if (nrow(dates) > 0) {
as_list <- setNames(dates$types, dates$names)
for (col in names(as_list)) {
tmp <- tmp |>
dplyr::mutate(!!col := .convert_dates(
.data[[col]],
as_list[[col]]
))
}
}
## Apply transformations if present
tmp <- .apply_col_transform(tmp, vars)
tmp <- data.table::setDT(tmp)
return(tmp)
}
# Reads an integration matrix using readr::read_delim
#' @importFrom readr read_delim cols
#' @importFrom purrr is_empty map
#' @importFrom dplyr bind_rows filter mutate pull
.read_with_readr <- function(path, additional_cols, annotated, sep) {
col_types <- .mandatory_IS_types("classic")
if (annotated) {
col_types <- append(
col_types,
.annotation_IS_types("classic")
)
}
if (!is.null(additional_cols) && !purrr::is_empty(additional_cols)) {
to_drop <- additional_cols[additional_cols == "_"]
others <- additional_cols[!names(additional_cols) %in% names(to_drop)]
others <- purrr::map(others, ~ {
.types_mapping() |>
dplyr::filter(.data$types == .x) |>
dplyr::pull(.data$mapping)
})
for (x in names(to_drop)) {
col_types[[x]] <- "_"
}
for (x in names(others)) {
col_types[[x]] <- others[[x]]
}
}
col_types[[".default"]] <- "n"
df <- readr::read_delim(
file = path,
delim = sep,
col_types = do.call(readr::cols, col_types),
na = c("NONE", "NA", "NULL", "NaN", ""),
trim_ws = TRUE,
progress = getOption("ISAnalytics.verbose", TRUE)
)
vars <- mandatory_IS_vars(TRUE)
if (annotated) {
vars <- vars |>
dplyr::bind_rows(annotation_IS_vars(TRUE))
}
# Check if there are dates to convert
dates <- vars |>
dplyr::filter(.data$types %in% c(date_formats(), "date"))
if (nrow(dates) > 0) {
as_list <- setNames(dates$types, dates$names)
for (col in names(as_list)) {
df <- df |>
dplyr::mutate(!!col := .convert_dates(
.data[[col]],
as_list[[col]]
))
}
}
## Apply transformations if present
df <- .apply_col_transform(df, vars)
return(df)
}
# Melts dataframe (chunks) with data.table melt
# This function is never called if data.table is not installed
.melt_datatable <- function(data, id_vars, sample_col, value_col, progress) {
mtd <- data.table::melt.data.table(data,
id.vars = id_vars,
variable.name = sample_col,
value.name = value_col,
na.rm = TRUE,
verbose = FALSE,
variable.factor = FALSE,
value.factor = FALSE
)
if (!is.null(progress)) {
progress()
}
return(mtd)
}
.melt_tidyr <- function(data, id_vars, sample_col, value_col, progress) {
mtd <- tidyr::pivot_longer(
data = data,
cols = !dplyr::all_of(id_vars),
names_to = sample_col,
values_to = value_col,
values_drop_na = TRUE
)
if (!is.null(progress)) {
progress()
}
return(mtd)
}
# Melts a big matrix divided in chunks in parallel if possible (otherwise
# does the job sequentially after notifying)
#' @importFrom purrr reduce
#' @importFrom dplyr bind_rows
.melt_in_parallel <- function(df, n_chunks, melt_fun, id_vars,
id_col_name, val_col_name) {
### - Split in chunks
chunks <- .split_df_in_chunks(df, n_chunks,
keep_chunk_id = FALSE
)
### - Distribute work
tidy_chunks <- .execute_map_job(
data_list = chunks,
fun_to_apply = melt_fun,
fun_args = list(
id_vars = id_vars,
sample_col = id_col_name,
value_col = val_col_name
),
stop_on_error = TRUE,
max_workers = n_chunks
)
tidy <- purrr::reduce(tidy_chunks$res, dplyr::bind_rows)
return(tidy)
}
# - call_mode: either INTERNAL or EXTERNAL. The first is used when the
#   function is called from the parallel importing function, the second
#   when calling import_single_Vispa2Matrix directly
#' @importFrom rlang abort inform is_installed
#' @importFrom readr read_delim cols
.import_single_matrix <- function(
path,
separator = "\t",
additional_cols = NULL,
transformations = NULL,
call_mode = "INTERNAL",
id_col_name = pcr_id_column(),
val_col_name = "Value") {
## - Check additional cols specs: must be named vector or list or NULL
stopifnot(is.null(additional_cols) ||
(is.character(additional_cols) &
!is.null(names(additional_cols))) ||
(is.list(additional_cols) & !is.null(names(additional_cols))))
correct_types <- additional_cols %in% c(.types_mapping()$types, "_")
if (any(correct_types == FALSE)) {
add_types_err <- c(
"Unknown column type in specified additional columns",
x = paste("Types must be in the allowed types"),
i = paste(
"Unknown formats: ",
paste0(unique(
additional_cols[correct_types == FALSE]
), collapse = ", ")
),
i = paste(
"See documentation for details",
"?import_single_Vispa2Matrix"
)
)
rlang::abort(add_types_err, class = "add_types_err")
}
    ### Peek headers
peek_headers <- readr::read_delim(path,
delim = separator, n_max = 0,
col_types = readr::cols()
)
## - Mandatory vars should always be present (as declared in specs)
if (!all(mandatory_IS_vars() %in% colnames(peek_headers))) {
missing_mand <- mandatory_IS_vars()[!mandatory_IS_vars() %in%
colnames(peek_headers)]
rlang::abort(.missing_req_cols(mandatory_IS_vars(), missing_mand),
class = "im_single_miss_mand_vars"
)
}
is_annotated <- .is_annotated(peek_headers)
additional_cols <- additional_cols[names(additional_cols) %in%
colnames(peek_headers)]
mode <- "classic"
add_pkg_installed <- rlang::is_installed("data.table")
## OPTIONAL - if data.table is available
if (add_pkg_installed) {
mode <- "fread"
## Is the file compressed?
is_compressed <- fs::path_ext(path) %in% .compressed_formats()
if (is_compressed) {
## The compression type is supported by data.table::fread?
compression_type <- fs::path_ext(path)
if (!compression_type %in% .supported_fread_compression_formats()) {
### If not, switch to classic for reading
mode <- "classic"
if (call_mode == "EXTERNAL" &&
getOption("ISAnalytics.verbose", TRUE) == TRUE) {
rlang::inform(.unsupported_comp_format_inf(),
class = "unsup_comp_format"
)
}
}
}
}
## - Start reading
if (call_mode == "EXTERNAL" &&
getOption("ISAnalytics.verbose", TRUE) == TRUE) {
rlang::inform(c("Reading file...", i = paste0("Mode: ", mode)))
}
df <- if (mode == "fread") {
.read_with_fread(
path = path, additional_cols = additional_cols,
annotated = is_annotated, sep = separator
)
} else {
.read_with_readr(
path = path, additional_cols = additional_cols,
annotated = is_annotated, sep = separator
)
}
df_dim <- dim(df)
## - Melt
id_vars <- c(
mandatory_IS_vars(),
names(additional_cols[additional_cols != "_"])
)
if (is_annotated) {
id_vars <- c(id_vars, annotation_IS_vars())
}
mt <- if (mode == "fread") {
.melt_datatable
} else {
.melt_tidyr
}
if (call_mode == "EXTERNAL" &&
getOption("ISAnalytics.verbose", TRUE) == TRUE) {
rlang::inform("Reshaping...")
}
max_workers <- trunc(.get_def_workers())
if (call_mode == "INTERNAL" || nrow(df) <= 500000 || max_workers <= 1) {
tidy <- mt(df, id_vars, id_col_name, val_col_name, NULL)
} else if (nrow(df) > 500000 & max_workers > 1) {
## - Melt in parallel
tidy <- .melt_in_parallel(
df = df, n_chunks = max_workers,
melt_fun = mt, id_vars = id_vars,
id_col_name = id_col_name,
val_col_name = val_col_name
)
}
tidy <- tidy |>
dplyr::filter(.data[[val_col_name]] > 0) |>
tibble::as_tibble()
## Transform cols
if (call_mode == "EXTERNAL" &&
getOption("ISAnalytics.verbose", TRUE) == TRUE &&
!is.null(transformations)) {
rlang::inform("Applying column transformations...")
}
if (!is.null(transformations)) {
tidy <- transform_columns(tidy, transf_list = transformations)
}
## - Report summary
if (call_mode == "EXTERNAL" &&
getOption("ISAnalytics.verbose", TRUE) == TRUE) {
rlang::inform(
.summary_ism_import_msg(
is_annotated,
df_dim,
mode,
length(unique(tidy[[id_col_name]]))
),
class = "ism_import_summary"
)
}
return(tidy)
}
#---- USED IN : import_association_file ----
# Checks if the association file contains at least the columns flagged
# as required in `association_file_columns(TRUE)`
#
# @param df The imported association file
.check_af_correctness <- function(df) {
af_cols_specs <- association_file_columns(TRUE)
### Retrieves the column names flagged as required
min_required <- af_cols_specs |>
dplyr::filter(.data$flag == "required") |>
dplyr::pull(.data$names)
    if (length(min_required) == 0) {
        return(TRUE) # No required columns
    }
    return(all(min_required %in% colnames(df)))
}
# Imports association file from disk. Converts dates and pads timepoints,
# reporting parsing problems.
#' @importFrom rlang is_installed inform abort
#' @importFrom readr read_delim cols problems
#' @importFrom purrr map_chr pluck map2_lgl is_empty
#' @importFrom tibble tibble
.read_af <- function(path, date_format, delimiter) {
mode <- "readr"
## - Check file extension
file_ext <- .check_file_extension(path)
if (file_ext %in% c("xls", "xlsx")) {
mode <- "readxl"
if (!rlang::is_installed("readxl")) {
rlang::abort(.missing_pkg_error("readxl"))
}
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
rlang::inform(.xls_file_warning(),
class = "xls_file"
)
}
}
## - Peek headers to know column types
col_types <- if (mode != "readxl") {
headers_peek <- readr::read_delim(path,
delim = delimiter,
n_max = 0,
col_types = readr::cols()
)
t_readr <- .af_col_types("readr")
t_readr[names(t_readr) %in% colnames(headers_peek)]
} else {
headers_peek <- readxl::read_excel(path, n_max = 0)
t_readr <- .af_col_types("readr")
ordered <- purrr::map_chr(
colnames(headers_peek),
function(x) {
el <- getElement(t_readr, x)
                if (is.null(el)) {
"guess"
} else if (el == "c") {
"text"
} else if (el %in% c("i", "d")) {
"numeric"
} else {
"guess"
}
}
)
ordered
}
suppressWarnings({
as_file <- if (mode == "readr") {
df <- readr::read_delim(path,
delim = delimiter,
na = c("NONE", "NA", "NULL", "NaN", ""),
col_types = do.call(readr::cols, col_types),
trim_ws = TRUE,
progress = getOption("ISAnalytics.verbose", TRUE)
)
problems <- readr::problems(df)
df
} else {
df <- readxl::read_excel(path,
col_types = col_types,
na = c("NONE", "NA", "NULL", "NaN", ""),
trim_ws = TRUE,
progress = getOption("ISAnalytics.verbose", TRUE)
)
problems <- NULL
df
}
vars <- association_file_columns(TRUE)
# Check if there are dates to convert
dates <- vars |>
dplyr::filter(.data$types %in% c(date_formats(), "date")) |>
dplyr::mutate(types = dplyr::if_else(.data$types == "date",
true = date_format,
false = .data$types
))
dates <- dates |>
dplyr::filter(.data$names %in% colnames(as_file))
date_failures <- NULL
if (nrow(dates) > 0) {
before <- as_file |>
dplyr::select(dplyr::all_of(dates$names))
as_list <- setNames(dates$types, dates$names)
for (col in names(as_list)) {
as_file <- as_file |>
dplyr::mutate(!!col := .convert_dates(
.data[[col]],
as_list[[col]]
))
}
date_failures <- purrr::map(dates$names, function(col) {
before_col <- purrr::pluck(before, col)
row_failed <- which(
purrr::map2_lgl(
before_col, as_file[[col]],
function(d1, d2) {
if (!is.na(d1) & is.na(d2)) {
TRUE
} else {
FALSE
}
}
)
)
if (!is.null(row_failed) && !purrr::is_empty(row_failed)) {
tibble::tibble(
row = row_failed, col = col,
original = before_col[row_failed],
parsed = NA, format = date_format
)
} else {
return(NULL)
}
}) |>
purrr::list_rbind()
}
## Apply transformations if present
as_file <- .apply_col_transform(as_file, vars)
})
return(list(af = as_file, probs = problems, date_fail = date_failures))
}
# Internal function to check alignment between association file and file
# system starting from the root. The alignment is checked at folder level.
#
# - df: The imported association file (data.frame or tibble)
# - root_folder: Path to the root folder
#' @importFrom rlang .data `:=` abort inform
#' @importFrom fs path dir_ls
# Returns a data frame with this format:
# ProjectID - ConcatenatePoolIDSeqRun - PathToFolderProjectID - Found - Path -
# Path_quant - Path_iss (NOTE: headers are dynamic!)
.check_file_system_alignment <- function(
df,
root_folder,
proj_fold_col,
concat_pool_col,
project_id_col) {
if (!proj_fold_col %in% colnames(df)) {
rlang::abort(.af_missing_pathfolder_error(proj_fold_col))
}
temp_df <- df |>
dplyr::select(dplyr::all_of(c(
project_id_col,
concat_pool_col,
proj_fold_col
))) |>
dplyr::distinct()
path_cols <- .path_cols_names()
proj_folders_exist <- temp_df |>
dplyr::select(dplyr::all_of(proj_fold_col)) |>
dplyr::distinct() |>
dplyr::mutate(Found = !is.na(
.data[[proj_fold_col]]
) &
unname(fs::dir_exists(
fs::path(
fs::path(root_folder),
.data[[proj_fold_col]]
)
)))
partial_check <- temp_df |>
dplyr::left_join(proj_folders_exist, by = proj_fold_col)
temp_df <- partial_check |>
dplyr::filter(.data$Found == TRUE)
partial_check <- partial_check |>
dplyr::filter(.data$Found == FALSE) |>
dplyr::mutate(
!!path_cols$project := NA_character_,
!!path_cols$quant := NA_character_,
!!path_cols$iss := NA_character_
)
FUN <- function(...) {
cur <- tibble::tibble(...)
project_folder <- fs::path(
fs::path(root_folder),
cur[[proj_fold_col]]
)
quant_folder <- if (!is.na(cur[[concat_pool_col]])) {
paste0(fs::path(
"quantification",
fs::path(cur[[concat_pool_col]])
), "$")
} else {
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
                msg <- c(
                    paste(
                        "Warning: found NA in",
                        paste0(concat_pool_col, " field")
                    ),
                    i = "Check association file for possible issues"
                )
                rlang::inform(msg, class = "na_concat")
}
NA_character_
}
iss_folder <- if (!is.na(cur[[concat_pool_col]])) {
paste0(fs::path(
"iss",
fs::path(cur[[concat_pool_col]])
), "$")
} else {
NA_character_
}
quant_found <- unique(fs::dir_ls(
path = project_folder, recurse = TRUE,
type = "directory", fail = FALSE,
regexp = quant_folder
))
if (length(quant_found) == 0 || all(is.na(quant_found))) {
quant_found <- NA_character_
}
iss_found <- unique(fs::dir_ls(
path = project_folder, recurse = TRUE,
type = "directory", fail = FALSE,
regexp = iss_folder
))
if (length(iss_found) == 0 || all(is.na(iss_found))) {
iss_found <- NA_character_
}
return(
cur |>
dplyr::mutate(
!!path_cols$project := project_folder,
!!path_cols$quant := quant_found,
!!path_cols$iss := iss_found
)
)
}
results_df <- purrr::pmap(
temp_df,
FUN
) |>
purrr::list_rbind()
checker_df <- results_df |>
dplyr::bind_rows(partial_check)
checker_df
}
# Updates the association file after the alignment check --> adds the columns
# containing paths to project folder, quant folders and iss folders
.update_af_after_alignment <- function(
as_file, checks,
proj_fold_col,
concat_pool_col,
project_id_col) {
as_file <- as_file |>
dplyr::left_join(
checks |>
dplyr::select(
-dplyr::all_of(c(proj_fold_col, "Found"))
),
by = c(project_id_col, concat_pool_col)
)
as_file
}
# Helper function to be used internally to treat association file.
#' @importFrom rlang enexpr parse_expr eval_tidy
.manage_association_file <- function(
af_path,
root,
format,
delimiter,
filter,
proj_fold_col,
concat_pool_col,
project_id_col) {
# Import the association file
association_file <- .read_af(
path = af_path,
date_format = format,
delimiter = delimiter
)
parsing_probs <- association_file$probs
date_probs <- association_file$date_fail
association_file <- association_file$af
if (!is.null(filter)) {
# Pre-filtering of association file
if (!all(names(filter) %in% colnames(association_file))) {
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
                missing_filt <- c(
                    "Some or all of the names in the filter were not found",
                    i = paste(
                        "Columns not found: ",
                        paste0(
                            names(filter)[!names(filter) %in%
                                colnames(association_file)],
                            collapse = ", "
                        )
                    ),
                    "Ignoring the missing columns"
                )
rlang::inform(
missing_filt,
class = "filter_warn"
)
}
filter <- filter[names(filter) %in% colnames(association_file)]
}
if (!purrr::is_empty(filter)) {
predicate <- purrr::map2_chr(
filter, names(filter),
function(x, y) {
if (is.character(x)) {
paste0(
y, " %in% c(", paste0("'",
rlang::enexpr(x),
"'",
collapse = ", "
),
")"
)
} else {
paste0(
y, " %in% c(", paste0(rlang::enexpr(x),
collapse = ", "
),
")"
)
}
}
)
predicate <- paste0(c(
"dplyr::filter(association_file, ",
paste0(predicate, collapse = ", "),
")"
), collapse = "")
association_file <- rlang::eval_tidy(
rlang::parse_expr(predicate)
)
}
}
checks <- NULL
if (!is.null(root) & nrow(association_file) > 0) {
checks <- .check_file_system_alignment(
df = association_file,
root_folder = root,
proj_fold_col = proj_fold_col,
concat_pool_col = concat_pool_col,
project_id_col = project_id_col
)
association_file <- .update_af_after_alignment(
as_file = association_file,
checks = checks,
proj_fold_col = proj_fold_col,
concat_pool_col = concat_pool_col,
project_id_col = project_id_col
)
}
res <- list(
af = association_file, check = checks,
parsing_probs = parsing_probs, date_probs = date_probs
)
return(res)
}
# Converts a vector of timepoints expressed as days to a vector
# of timepoints expressed as months with this logic:
# - Month is 0 if and only if timepoint is 0
# - If timepoint is strictly less than 30 returns the smallest integer
# NOT LESS THAN tp / 30
# ---> ex: 29 becomes ceiling(29/30) = ceiling(0.9666667) = 1
# - If timepoint is greater than or equal to 30 returns the rounding of
# tp / 30 to the closest integer
# ---> ex: 35 becomes round(35/30) = round(1.166667) = 1
# - If for any reason the timepoint is negative NA is returned instead
.timepoint_to_months <- function(tp) {
dplyr::if_else(
condition = as.numeric(tp) > 0,
true = dplyr::if_else(
            condition = as.numeric(tp) < 30,
true = ceiling(as.numeric(tp) / 30),
false = round(as.numeric(tp) / 30)
),
false = dplyr::if_else(
condition = as.numeric(tp) != 0,
true = NA_real_,
false = 0
)
)
}
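# Illustrative examples:
#   .timepoint_to_months(c(0, 29, 35, -5)) # -> 0, 1, 1, NA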
# Converts a vector of timepoints expressed as days to a vector
# of timepoints expressed as years with this logic:
# - Year is 0 if and only if tp is 0
# - otherwise take the ceiling of tp / 360
.timepoint_to_years <- function(tp) {
dplyr::if_else(
condition = as.numeric(tp) == 0,
true = 0,
false = dplyr::if_else(
condition = as.numeric(tp) < 0,
true = NA_real_,
false = ceiling(as.numeric(tp) / 360)
)
)
}
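# Illustrative examples:
#   .timepoint_to_years(c(0, 180, 360, 361)) # -> 0, 1, 1, 2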
#---- USED IN : import_Vispa2_stats ----
# Finds automatically the path on disk to each stats file.
#' @importFrom purrr detect_index
#' @importFrom tibble tibble
#' @importFrom fs dir_ls
#' @importFrom rlang .data
# @keywords internal
# @return A tibble with columns: ProjectID, concatenatePoolIDSeqRun,
# Path_iss (or designated dynamic name), stats_files, info
.stats_report <- function(
association_file,
prefixes,
proj_col,
pool_col,
path_iss_col) {
temp <- association_file |>
dplyr::select(
dplyr::all_of(c(proj_col, pool_col, path_iss_col))
) |>
dplyr::distinct() |>
dplyr::mutate(
!!proj_col := as.factor(.data[[proj_col]]),
!!pool_col := as.factor(.data[[pool_col]])
)
# If paths are all NA return
if (all(is.na(temp[[path_iss_col]]))) {
return(temp |> dplyr::mutate(
stats_files = NA_character_,
info = list("NO FOLDER FOUND")
))
}
match_pattern <- function(pattern, temp_row) {
if (is.na(temp_row[[path_iss_col]])) {
return(tibble::tibble(pattern = pattern, file = NA_character_))
}
# For each prefix pattern search in the iss folder
# Note: there can be
# - a single file matching the prefix (ideal)
# - multiple files matching
# - no file matching
files <- fs::dir_ls(temp_row[[path_iss_col]],
type = "file", fail = FALSE,
regexp = pattern
)
if (length(files) == 0) {
files <- NA_character_
}
tibble::tibble(pattern = pattern, file = files)
}
stats_paths <- purrr::pmap(temp, function(...) {
temp_row <- tibble::tibble(...)
matches <- purrr::map(prefixes, ~ match_pattern(.x, temp_row)) |>
purrr::list_rbind()
if (all(is.na(matches$file))) {
# No stats files found for all prefixes
return(temp_row |>
dplyr::mutate(
stats_files = NA_character_,
info = list("NOT FOUND")
))
}
if (length(which(!is.na(matches$file))) == 1) {
return(temp_row |>
dplyr::mutate(
stats_files = (matches |>
dplyr::filter(!is.na(.data$file)) |>
dplyr::pull(.data$file)),
info = list("NULL")
))
}
# Get the count of files found for each pattern and order them
# as preference - preferred pattern is the first in the prefixes
# parameter
pattern_counts <- matches |>
dplyr::mutate(pattern = factor(.data$pattern,
levels = prefixes
)) |>
dplyr::filter(!is.na(.data$file)) |>
dplyr::group_by(dplyr::across(dplyr::all_of("pattern"))) |>
dplyr::tally() |>
dplyr::arrange(.data$pattern)
one_index <- purrr::detect_index(pattern_counts$n, ~ .x == 1)
if (one_index == 0) {
## Means there are duplicates for all patterns, gets reported
## but no files will be imported
return(temp_row |>
dplyr::mutate(
stats_files = NA_character_,
info = list(matches |>
dplyr::filter(!is.na(.data$file)) |>
dplyr::mutate(info = "DUPLICATE"))
))
} else {
## Pick the file with the pattern that has a count of 1 (no
## duplicates)
chosen_pattern <- pattern_counts$pattern[one_index]
return(temp_row |>
dplyr::mutate(
stats_files = matches |>
dplyr::filter(.data$pattern == chosen_pattern) |>
dplyr::pull(.data$file),
info = list("NULL")
))
}
}) |>
purrr::list_rbind()
stats_paths
}
# Checks if the stats file contains the minimal set of columns.
.check_stats <- function(x, required_cols) {
    all(required_cols %in% colnames(x))
}
# Imports all found Vispa2 stats files.
#
# - association_file: imported and aligned association file
# - prefixes: vector of regex to match file names
# - pool_col: the column containing the pool as specified in input
# - path_iss_col: name of the column that contains the path
# - tags: injected from parent function - collection of required tags and
# corresponding column names
#
# @return A list with the imported stats and a report of imported files. If no
# files were imported returns NULL instead
.import_stats_iss <- function(
association_file,
prefixes,
pool_col,
path_iss_col,
tags) {
proj_col <- tags |>
dplyr::filter(.data$tag == "project_id") |>
dplyr::pull(.data$names)
# Obtain paths
stats_paths <- .stats_report(association_file,
prefixes,
proj_col = proj_col,
pool_col = pool_col,
path_iss_col = path_iss_col
)
stats_paths <- stats_paths |>
tidyr::unnest(dplyr::all_of("info"))
if (all(is.na(stats_paths$stats_files))) {
stats_paths <- stats_paths |>
dplyr::mutate(
Imported = FALSE,
reason = as.factor(.data$info)
) |>
            dplyr::select(-dplyr::all_of("info"))
return(list(stats = NULL, report = stats_paths))
}
vispa_stats_req <- iss_stats_specs(TRUE) |>
dplyr::filter(.data$flag == "required") |>
dplyr::pull(.data$names)
FUN <- function(x, req_cols, progress) {
if (is.na(x)) {
return(NULL)
}
stats <- readr::read_delim(
file = x, delim = "\t", na = c("", "NA", "na", "NONE"),
col_types = readr::cols()
)
ok <- .check_stats(stats, req_cols)
if (ok == TRUE) {
# Apply column transformations if present
stats <- .apply_col_transform(stats, iss_stats_specs(TRUE))
if (!is.null(progress)) {
progress()
}
return(stats)
} else {
if (!is.null(progress)) {
progress()
}
return("MALFORMED")
}
}
stats_dfs <- .execute_map_job(
data_list = stats_paths$stats_files,
fun_to_apply = FUN,
fun_args = list(
req_cols = vispa_stats_req
), stop_on_error = FALSE
)
correct <- purrr::map_chr(stats_dfs$res, function(x) {
        if (is.data.frame(x)) {
"TRUE"
} else if (is.null(x)) {
"FALSE"
} else {
x
}
})
stats_paths <- stats_paths |>
dplyr::mutate(Imported = correct)
stats_paths <- purrr::pmap(stats_paths, function(...) {
row <- tibble::tibble(...)
condition <- if (row$Imported == "MALFORMED") {
"MALFORMED"
} else if (row$Imported == "TRUE") {
NA_character_
} else {
row$info
}
row |>
dplyr::mutate(reason = as.factor(condition)) |>
dplyr::mutate(Imported = dplyr::if_else(
condition = (.data$Imported == "MALFORMED"),
true = FALSE,
false = as.logical(.data$Imported)
))
}) |>
purrr::list_rbind()
stats_paths <- stats_paths |>
dplyr::select(-dplyr::all_of("info"))
stats_dfs <- stats_dfs$res[stats_paths$Imported]
# Bind rows in single tibble for all files
if (purrr::is_empty(stats_dfs)) {
return(list(stats = NULL, report = stats_paths))
}
stats_dfs <- purrr::reduce(stats_dfs, function(x, y) {
x |>
dplyr::bind_rows(y) |>
dplyr::distinct()
})
list(stats = stats_dfs, report = stats_paths)
}
#---- USED IN : import_parallel_Vispa2Matrices ----
# Base arg check (valid for both versions)
.base_param_check <- function(
association_file,
quantification_type,
matrix_type,
workers,
multi_quant_matrix) {
# Check parameters
stopifnot(is.data.frame(association_file))
stopifnot(is.numeric(workers) & length(workers) == 1)
stopifnot(!missing(quantification_type))
stopifnot(all(quantification_type %in% quantification_types()))
stopifnot(is.logical(multi_quant_matrix) & length(multi_quant_matrix) == 1)
}
# Check AF is aligned with file system
.pre_manage_af <- function(association_file) {
## Check there are the appropriate columns
if (!.path_cols_names()$quant %in% colnames(association_file)) {
rlang::abort(.af_missing_path_error(.path_cols_names()$quant),
class = "missing_path_col"
)
}
association_file <- association_file |>
dplyr::filter(!is.na(.data[[.path_cols_names()$quant]]))
return(association_file)
}
# Updates a files_found tibble with Files_count and Anomalies column.
#
# @param lups A files_found tibble obtained in a lookup function. Must contain
# the Files column (nested table quantification type and files)
# @keywords internal
#
# @return Updated files_found with Anomalies and Files_count columns
.trace_anomalies <- function(lups) {
files_count <- purrr::pmap(lups, function(...) {
temp <- tibble::tibble(...)
an <- temp$Files
an <- an |>
dplyr::group_by(
dplyr::across(dplyr::all_of("Quantification_type"))
) |>
dplyr::summarise(
Found = sum(!is.na(.data$Files_found)),
.groups = "drop_last"
)
an
})
lups <- lups |>
dplyr::mutate(Files_count = files_count) |>
dplyr::mutate(
Anomalies = purrr::map_lgl(
.data$Files_count,
~ any(.x["Found"] != 1)
),
.before = dplyr::all_of("Files")
)
lups
}
# Looks up matrices to import given the association file and the
# root of the file system.
#
# @param association_file Tibble representing the association file
# @param quantification_type The type of quantification matrices to look for
# (one in `quantification_types()`)
# @param matrix_type The matrix_type to lookup (one between "annotated" or
# "not_annotated")
# @keywords internal
#' @importFrom tibble tibble
#' @importFrom fs dir_ls as_fs_path
#' @importFrom stringr str_replace_all
#' @importFrom tidyr nest
#
# @return A tibble containing all found files, including duplicates and missing
.lookup_matrices <- function(
association_file,
quantification_type,
m_type,
proj_col,
pool_col) {
path_col_names <- .path_cols_names()
temp <- association_file |>
dplyr::select(
dplyr::all_of(c(proj_col, pool_col, path_col_names$quant))
) |>
dplyr::distinct()
## Obtain a df with all possible combination of suffixes for each
## quantification
suffixes <- matrix_file_suffixes() |>
dplyr::filter(
.data$matrix_type == m_type,
.data$quantification %in% quantification_type
) |>
dplyr::mutate(suffix_regex = paste0(stringr::str_replace_all(
.data$file_suffix, "\\.", "\\\\."
), "$"))
## For each row in temp (aka for each ProjectID and
## concatenatePoolIDSeqRun) scan the quantification folder
lups <- purrr::pmap(temp, function(...) {
temp_row <- tibble::tibble(...)
found <- purrr::pmap(suffixes, function(...) {
## For each quantification scan the folder for the
## corresponding suffixes
cross_row <- tibble::tibble(...)
matches <- fs::dir_ls(temp_row[[path_col_names$quant]],
type = "file", fail = FALSE,
regexp = cross_row$suffix_regex
)
if (length(matches) == 0) {
matches <- NA_character_
}
tibble::tibble(
Quantification_type = cross_row$quantification,
Files_found = matches
)
}) |>
purrr::list_rbind()
found <- found |>
dplyr::group_by(
dplyr::across(dplyr::all_of("Quantification_type"))
) |>
dplyr::distinct() |>
dplyr::group_modify(~ {
if (nrow(.x) > 1) {
if (any(is.na(.x$Files_found)) &
!all(is.na(.x$Files_found))) {
.x |>
dplyr::filter(!is.na(.data$Files_found))
} else {
.x
}
} else {
.x
}
})
tibble::tibble(
!!proj_col := temp_row[[proj_col]],
!!pool_col := temp_row[[pool_col]],
found
)
}) |>
purrr::list_rbind()
lups <- lups |>
tidyr::nest(Files = dplyr::all_of(
c("Quantification_type", "Files_found")
))
lups <- .trace_anomalies(lups)
lups
}
# Internal function to match user defined patterns on a vector
# of file names.
#
# For each pattern specified by the user, the function tries to find a match
# on all the file names and combines the results as a tibble in which column
# names are the patterns and the values are TRUE if the pattern matched on the
# element at that index or FALSE otherwise.
#
# @param filenames A character vector of file names
# @param patterns A character vector of patterns to be matched
#
# @return A tibble
.pattern_matching <- function(filenames, patterns) {
p_matches <- purrr::map(patterns, function(x) {
mtc <- stringr::str_detect(filenames, x)
tibble::as_tibble_col(mtc, column_name = x)
})
p_matches <- purrr::reduce(p_matches, function(x, y) {
x |> dplyr::bind_cols(y)
})
p_matches
}
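# Example (illustrative):
# .pattern_matching(
#     filenames = c("PJ01_sc_matrix.tsv", "PJ01_fe_matrix.tsv"),
#     patterns = c("sc", "fe")
# )
# #> a 2x2 tibble of logicals: column "sc" is c(TRUE, FALSE),
# #> column "fe" is c(FALSE, TRUE)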
# Helper function for checking if any of the elements of the list is true.
#
# @param ... A list of logical values
# @keywords internal
#
# @return TRUE if any of the parameters is true
.any_match <- function(...) {
l <- unlist(list(...))
any(l)
}
# Helper function for checking if all of the elements of the list are true.
#
# @param ... A list of logical values
# @keywords internal
#
# @return TRUE if all of the parameters are true
.all_match <- function(...) {
l <- unlist(list(...))
all(l)
}
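# Example (illustrative): these helpers reduce a row of pattern-match flags
# to a single logical, typically through `dplyr::c_across()` in a rowwise
# context (see `.update_as_option()` below).
# .any_match(c(TRUE, FALSE)) # TRUE
# .all_match(c(TRUE, FALSE)) # FALSE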
# Updates files_found tibble according to pattern and matching options.
#
# @param files_nested The tibble containing Quantification_type and Files_found
# columns relative to a single project/pool pair
# @param p_matches The tibble representing the pattern matches resulting from
# `.pattern_matching`
# @param matching_opt The matching option
#' @importFrom purrr map reduce
#' @importFrom rlang .data
#' @importFrom fs as_fs_path
#
# @return An updated files_found tibble according to the matching option
.update_as_option <- function(files_nested, p_matches, matching_opt) {
patterns <- colnames(p_matches)
# Bind columns of Files nested tbl with pattern matches results
p_matches <- files_nested |> dplyr::bind_cols(p_matches)
# Find the different quantification types in the files_found
types <- p_matches |>
dplyr::select(dplyr::all_of("Quantification_type")) |>
dplyr::distinct() |>
dplyr::pull(.data$Quantification_type)
# For each quantification type
select_matches <- function(x) {
# Obtain a summary of matches (matches ALL patterns or ANY pattern)
temp <- p_matches |>
dplyr::filter(.data$Quantification_type == x) |>
dplyr::rowwise() |>
dplyr::mutate(
ALL = .all_match(
dplyr::c_across(dplyr::all_of(patterns))
),
ANY = .any_match(dplyr::c_across(dplyr::all_of(patterns)))
) |>
dplyr::ungroup()
# If the matching option is "ANY" - preserve files that match any of the
# given patterns
if (matching_opt == "ANY") {
if (!all(is.na(temp$ANY)) & any(temp$ANY)) {
# If there is at least 1 match in the group, files that match
# are kept
files_keep <- temp |>
dplyr::filter(.data$ANY == TRUE) |>
dplyr::select(
dplyr::all_of(c(
"Quantification_type", "Files_found"
))
)
} else {
# If none of the files match, none is preserved and
# Files_found is set to NA
files_keep <- temp |>
dplyr::slice(1) |>
dplyr::mutate(Files_found = fs::as_fs_path(
NA_character_
)) |>
dplyr::select(dplyr::all_of(c(
"Quantification_type", "Files_found"
)))
}
files_keep
# If the matching option is "ALL" - preserve only files that match
# all the given patterns
} else if (matching_opt == "ALL") {
if (!all(is.na(temp$ALL)) & any(temp$ALL)) {
# If there is at least 1 match in the group, files that match
# are kept
files_keep <- temp |>
dplyr::filter(.data$ALL == TRUE) |>
dplyr::select(dplyr::all_of(c(
"Quantification_type", "Files_found"
)))
} else {
# If none of the files match, none is preserved and
# Files_found is set to NA
files_keep <- temp |>
dplyr::slice(1) |>
dplyr::mutate(Files_found = fs::as_fs_path(
NA_character_
)) |>
dplyr::select(dplyr::all_of(c(
"Quantification_type", "Files_found"
)))
}
files_keep
# If the matching option is "OPTIONAL" - preferentially preserve
# files that match all or any of the given patterns; if none
# match, preserve the files found anyway
} else {
# Check first if some files match all the patterns
if (!all(is.na(temp$ALL)) & any(temp$ALL)) {
# Preserve only the ones that match
files_keep <- temp |>
dplyr::filter(.data$ALL == TRUE) |>
dplyr::select(dplyr::all_of(c(
"Quantification_type", "Files_found"
)))
} else if (!all(is.na(temp$ANY)) & any(temp$ANY)) {
# If none match all the patterns check files that match any of
# the patterns and preserve only those
files_keep <- temp |>
dplyr::filter(.data$ANY == TRUE) |>
dplyr::select(dplyr::all_of(c(
"Quantification_type", "Files_found"
)))
} else {
# If there are no files that match any of the patterns simply
# preserve the files found
files_keep <- temp |>
dplyr::select(dplyr::all_of(c(
"Quantification_type", "Files_found"
)))
}
files_keep
}
}
to_keep <- purrr::map(types, select_matches)
to_keep <- purrr::reduce(to_keep, dplyr::bind_rows)
to_keep
}
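# Example (illustrative, hypothetical file names): with matching_opt = "ALL"
# only files matching every pattern survive.
# files_nested <- tibble::tibble(
#     Quantification_type = "seqCount",
#     Files_found = c("a_p1.tsv", "a_p1_p2.tsv")
# )
# p_matches <- .pattern_matching(files_nested$Files_found, c("p1", "p2"))
# .update_as_option(files_nested, p_matches, matching_opt = "ALL")
# #> keeps only "a_p1_p2.tsv", the only file matching both patterns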
# Looks up matrices to import given the association file and the
# root of the file system for AUTO mode.
#
# @return A tibble containing all found files, including duplicates and missing
.lookup_matrices_auto <- function(
association_file,
quantification_type,
matrix_type,
patterns,
matching_opt,
proj_col,
pool_col) {
files_found <- .lookup_matrices(
association_file,
quantification_type,
matrix_type,
proj_col,
pool_col
)
if (nrow(files_found) > 0 & !is.null(patterns)) {
pre_filtering <- function(...) {
l <- list(...)
files_nested <- l[["Files"]]
split <- stringr::str_split(files_nested$Files_found, "\\/")
filenames <- unlist(purrr::map(split, function(x) {
utils::tail(x, n = 1)
}))
p_matches <- .pattern_matching(filenames, patterns)
to_keep <- .update_as_option(files_nested, p_matches, matching_opt)
to_keep
}
matching_patterns <- purrr::pmap(files_found, pre_filtering)
files_found <- files_found |>
dplyr::mutate(Files = matching_patterns) |>
dplyr::select(
dplyr::all_of(c(proj_col, pool_col, "Files"))
)
files_found <- .trace_anomalies(files_found)
}
files_found
}
# Manages anomalies for files found.
#
# The function manages anomalies found for files after scanning appropriate
# folders by:
# * Removing files not found (files for which Files_count$Found == 0 and
# Files_found is NA) and printing a message to notify the user
# * Removing duplicates
#
# @param files_found The tibble obtained via calling `.lookup_matrices_auto`
# @keywords internal
#' @importFrom dplyr filter select rename bind_rows arrange
#' @importFrom rlang .data
#' @importFrom tidyr unnest
#' @importFrom tibble as_tibble tibble
#
# @return A tibble containing for each project, pool and quantification type
# the files chosen
.manage_anomalies_auto <- function(
files_found, proj_col,
pool_col) {
# Isolate anomalies in files found
anomalies <- files_found |> dplyr::filter(.data$Anomalies == TRUE)
files_found <- files_found |> dplyr::filter(.data$Anomalies == FALSE)
# If there are no anomalies to fix, return chosen files
if (nrow(anomalies) == 0) {
files_to_import <- files_found |>
dplyr::select(
dplyr::all_of(c(proj_col, pool_col, "Files"))
) |>
tidyr::unnest(dplyr::all_of("Files")) |>
dplyr::rename(Files_chosen = "Files_found")
return(files_to_import)
}
# For each ProjectID and PoolID
process_anomaly <- function(...) {
l <- list(...)
current <- tibble::as_tibble(l[c(
proj_col,
pool_col
)])
current_files <- l[["Files"]]
current_count <- l[["Files_count"]]
reported_missing_or_dupl <- tibble::tibble(
!!proj_col := character(),
!!pool_col := character(),
Quantification = character(),
Missing = logical(),
Duplicated = logical()
)
# Find missing files first
missing <- current_count |> dplyr::filter(.data$Found == 0)
if (nrow(missing) > 0) {
current_files <- current_files |>
dplyr::filter(!.data$Quantification_type %in%
missing$Quantification_type)
reported_missing_or_dupl <- dplyr::bind_rows(
reported_missing_or_dupl,
missing |>
dplyr::select(dplyr::all_of("Quantification_type")) |>
dplyr::mutate(
!!proj_col := current[[proj_col]],
!!pool_col := current[[pool_col]],
Missing = TRUE,
Duplicated = FALSE
) |>
dplyr::rename(Quantification = "Quantification_type")
)
}
# Manage duplicates
duplicate <- current_count |> dplyr::filter(.data$Found > 1)
if (nrow(duplicate) > 0) {
current_files <- current_files |>
dplyr::filter(!.data$Quantification_type %in%
duplicate$Quantification_type)
reported_missing_or_dupl <- dplyr::bind_rows(
reported_missing_or_dupl,
duplicate |>
dplyr::select(dplyr::all_of("Quantification_type")) |>
dplyr::mutate(
!!proj_col := current[[proj_col]],
!!pool_col := current[[pool_col]],
Missing = FALSE,
Duplicated = TRUE
) |>
dplyr::rename(Quantification = "Quantification_type")
)
}
        # use the dynamic column names so the result binds cleanly with
        # the non-anomalous files later on
        to_import <- tibble::tibble(
            !!proj_col := current[[proj_col]],
            !!pool_col := current[[pool_col]],
            current_files
        )
to_import <- to_import |> dplyr::rename(Files_chosen = "Files_found")
return(list(to_import = to_import, reported = reported_missing_or_dupl))
}
files_to_import <- purrr::pmap(anomalies, process_anomaly)
reported_anomalies <- purrr::map(files_to_import, ~ .x$reported) |>
purrr::list_rbind()
files_to_import <- purrr::map(files_to_import, ~ .x$to_import) |>
purrr::list_rbind()
# Report missing or duplicated
if (getOption("ISAnalytics.verbose", TRUE) == TRUE &&
nrow(reported_anomalies) > 0) {
if (any(reported_anomalies$Missing)) {
missing_msg <- c("Some files are missing and will be ignored",
i = paste(
"Here is a summary of missing files:\n",
paste0(
utils::capture.output(print(
reported_anomalies |>
dplyr::filter(.data$Missing == TRUE) |>
dplyr::select(
-dplyr::all_of(c("Duplicated", "Missing"))
)
)),
collapse = "\n"
)
)
)
rlang::inform(missing_msg, class = "auto_mode_miss")
}
if (any(reported_anomalies$Duplicated)) {
dupl_msg <- c(paste("Duplicates found for some files"),
i = paste(
"In automatic mode duplicates are not preserved.",
"Use interactive mode for more accurate",
"file selection or provide appropriate file patterns",
"(see `?import_parallel_Vispa2Matrices`)"
),
i = paste(
"Here is a summary of duplicated files:\n",
paste0(utils::capture.output(
print(
reported_anomalies |>
dplyr::filter(.data$Duplicated == TRUE) |>
dplyr::select(
-dplyr::all_of(c("Duplicated", "Missing"))
)
)
), collapse = "\n")
)
)
rlang::inform(dupl_msg, class = "auto_mode_dupl")
}
}
# Add non-anomalies
if (nrow(files_found) > 0) {
files_found <- files_found |>
dplyr::select(
dplyr::all_of(c(proj_col, pool_col, "Files"))
) |>
tidyr::unnest(dplyr::all_of("Files")) |>
dplyr::rename(Files_chosen = "Files_found")
files_to_import <- files_to_import |>
dplyr::bind_rows(files_found) |>
dplyr::arrange(.data[[proj_col]], .data[[pool_col]])
}
files_to_import
}
# Internal function for parallel import of a single quantification
# type files.
# - type_df is files_to_import df filtered for a single quantification type
# (potentially contains more pools but all the same type)
.import_type <- function(type_df, import_matrix_args, progr, max_workers) {
wrapper <- function(path, import_matrix_args, progress) {
matrix <- rlang::exec(.import_single_matrix,
path = path,
!!!import_matrix_args
)
if (!is.null(progress)) {
progress()
}
return(matrix)
}
type_matrices <- .execute_map_job(
data_list = type_df$Files_chosen,
fun_to_apply = wrapper,
fun_args = list(import_matrix_args = import_matrix_args),
stop_on_error = FALSE,
progrs = progr,
max_workers = max_workers
)
correct <- if (is.null(type_matrices$err)) {
rep_len(TRUE, length(type_matrices$res))
} else {
purrr::map_lgl(type_matrices$err, ~ is.null(.x))
}
errors <- type_matrices$err
type_matrices <- type_matrices$res
sample_col_name <- if ("id_col_name" %in% names(import_matrix_args)) {
import_matrix_args[["id_col_name"]]
} else {
pcr_id_column()
}
samples_count <- purrr::map2_int(type_matrices, correct, ~ {
if (.y) {
.x |>
dplyr::distinct(.data[[sample_col_name]]) |>
dplyr::pull(.data[[sample_col_name]]) |>
length()
} else {
NA_integer_
}
})
distinct_is <- purrr::map2_int(type_matrices, correct, ~ {
if (.y) {
.x |>
dplyr::distinct(dplyr::across(
dplyr::all_of(mandatory_IS_vars())
)) |>
nrow()
} else {
NA_integer_
}
})
imported_files <- type_df |>
dplyr::mutate(
Imported = correct,
Number_of_samples = samples_count,
Distinct_is = distinct_is,
Errors = errors
)
type_matrices <- type_matrices[correct]
# Bind rows in single tibble for all files
if (purrr::is_empty(type_matrices)) {
return(list(matrix = NULL, imported_files = imported_files))
}
type_matrices <- purrr::reduce(type_matrices, ~ dplyr::bind_rows(.x, .y) |>
dplyr::distinct())
list(matrix = type_matrices, imported_files = imported_files)
}
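# Example of the error bookkeeping above (a toy sketch: `.execute_map_job`
# is assumed to return a list with `res` and `err` components, where `err`
# is NULL when no worker failed):
# type_matrices <- list(
#     res = list(tibble::tibble(a = 1), NULL),
#     err = list(NULL, simpleError("read failure"))
# )
# correct <- purrr::map_lgl(type_matrices$err, is.null)
# #> c(TRUE, FALSE) -- only the first matrix is kept and summarised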
# Internal function for importing all files for each quantification type.
#
# @param files_to_import The tibble containing the files to import
# @param workers Number of parallel workers
#
# @return A named list of tibbles
.parallel_import_merge <- function(files_to_import, workers,
import_matrix_args) {
split_by_quant <- files_to_import |>
dplyr::group_by(.data$Quantification_type) |>
dplyr::group_split()
quants <- purrr::map_chr(split_by_quant, ~ .x$Quantification_type[1])
split_by_quant <- purrr::set_names(split_by_quant, quants)
progrs <- if (rlang::is_installed("progressr")) {
progressr::progressor(steps = nrow(files_to_import))
} else {
NULL
}
imported <- purrr::map(
split_by_quant,
~ .import_type(
type_df = .x,
import_matrix_args = import_matrix_args,
progr = progrs, max_workers = workers
)
)
summary_files <- purrr::map(imported, ~ .x$imported_files) |>
purrr::reduce(dplyr::bind_rows)
imported_matrices <- purrr::map(imported, ~ .x$matrix)
list(matrix = imported_matrices, summary = summary_files)
}
#### ---- Internals for collision removal ----####
#---- USED IN : remove_collisions ----
# Used as a wrapper for basic correctness check on collisions input
# - Returns the mode (LIST or MULTI)
.collisions_check_input_matrices <- function(x, quant_cols) {
stopifnot(is.list(x) & !is.null(names(x)))
stopifnot(is.character(quant_cols) && all(!is.na(names(quant_cols))))
if (!all(names(quant_cols) %in% quantification_types())) {
rlang::abort(
.quantifications_names_err(
quant_cols[!names(quant_cols) %in% quantification_types()]
)
)
}
mode <- NULL
if (!is.data.frame(x)) {
#---- If input x is provided as list of data frames
if (!all(names(x) %in% quantification_types())) {
rlang::abort(.quantifications_names_err(
names(x)[!names(x) %in% quantification_types()]
))
}
## remove_collisions requires seqCount matrix, check if the list
## contains one
if ((!"seqCount" %in% names(x)) ||
nrow(x$seqCount) == 0) {
rlang::abort(.seqCount_df_err())
}
all_correct <- purrr::map2(x, names(x), function(m, quant) {
mand_cols <- .check_mandatory_vars(m)
cAmp_col <- .check_sample_col(m)
if (mand_cols & cAmp_col) {
return(NA_character_)
} else {
msgs <- c()
if (!mand_cols) {
msgs <- .missing_mand_vars()
}
if (!cAmp_col) {
msgs <- c(msgs, .missing_cAmp_sub_msg())
}
msgs <- paste0(quant, " - ", paste0(msgs, collapse = ";\n"))
return(msgs)
}
})
if (!all(is.na(all_correct))) {
message <- unlist(all_correct[!is.na(all_correct)])
names(message) <- NULL
rlang::abort(c("Matrices miss required info, aborting", message),
class = "coll_matrix_issues"
)
}
## Transform the list in a multi-quant matrix
mode <- "LIST"
quant_cols_lst <- as.list(quant_cols)
args <- append(list(x = x), quant_cols_lst)
x <- rlang::exec(comparison_matrix, !!!args)
rlang::env_poke(env = rlang::caller_env(), nm = "x", value = x)
} else {
#---- If input x is provided as data frame
if (.check_mandatory_vars(x) == FALSE) {
rlang::abort(.missing_mand_vars())
}
if (.check_sample_col(x) == FALSE) {
rlang::abort(.missing_cAmp_sub_msg())
}
if (!all(quant_cols %in% colnames(x))) {
rlang::abort(.missing_user_cols_error(
quant_cols[!quant_cols %in% colnames(x)]
))
}
if (!"seqCount" %in% names(quant_cols)) {
rlang::abort(.seqCount_col_err())
}
mode <- "MULTI"
}
return(mode)
}
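# Example of the LIST -> MULTI conversion performed above (a sketch;
# `matrices_list` is a hypothetical named list of per-quantification
# matrices and `comparison_matrix` is the exported ISAnalytics function
# that merges them into a single multi-quantification matrix):
# quant_cols <- c(seqCount = "seqCount", fragmentEstimate = "fragmentEstimate")
# args <- append(list(x = matrices_list), as.list(quant_cols))
# multi_quant <- rlang::exec(comparison_matrix, !!!args)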
# Used as a wrapper for basic correctness check on collisions input -
# for association file. Checks if all columns provided in input are present
# and in correct format and checks if required tags are present
# - Returns the selected and found tags if no errors are raised
.collisions_check_input_af <- function(
association_file,
date_col,
independent_sample_id) {
stopifnot(is.data.frame(association_file))
stopifnot(is.character(independent_sample_id) &&
!purrr::is_empty(independent_sample_id))
af_specs <- association_file_columns(TRUE)
## Check if input columns are present in the actual af
user_input_cols <- unique(c(independent_sample_id, date_col))
if (!all(user_input_cols %in%
colnames(association_file))) {
missing_cols <- user_input_cols[!user_input_cols %in%
colnames(association_file)]
rlang::abort(.missing_af_needed_cols(missing_cols))
}
## Check tags
required_tags <- list(
project_id = "char",
pool_id = "char",
pcr_replicate = c("int", "numeric"),
pcr_repl_id = "char"
)
req_tag_cols <- .check_required_cols(required_tags, af_specs, "error")
if (!all(req_tag_cols$names %in% colnames(association_file))) {
rlang::abort(.missing_af_needed_cols(req_tag_cols$names[
!req_tag_cols$names %in% colnames(association_file)
]))
}
## Check date_col
if (!lubridate::is.Date(association_file[[date_col]])) {
not_date_err <- c(paste0("'", date_col, "'", " is not a date vector"))
rlang::abort(not_date_err, class = "not_date_coll_err")
}
if (any(is.na(association_file[[date_col]]))) {
rlang::abort(.na_in_date_col())
}
return(req_tag_cols)
}
# Checks if the association file contains more information than the matrix.
#
# Used to notify the user when, for the projects contained in the examined
# matrix, the association file lists additional CompleteAmplificationIDs
# that were not included in the integration matrix (for example failed
# runs).
.check_same_info <- function(
association_file, df, req_tag_cols,
indep_sample_id) {
joined <- association_file |>
dplyr::left_join(df, by = pcr_id_column())
proj_col_name <- req_tag_cols |>
dplyr::filter(.data$tag == "project_id") |>
dplyr::pull(.data$names)
projects <- joined |>
dplyr::filter(!dplyr::if_all(
.cols = dplyr::all_of(mandatory_IS_vars()),
.fns = is.na
)) |>
dplyr::distinct(.data[[proj_col_name]]) |>
dplyr::pull(.data[[proj_col_name]])
wanted_tags <- c(
"subject", "tissue", "cell_marker", "tp_days"
)
wanted <- association_file_columns(TRUE)
wanted <- wanted |>
dplyr::filter(
.data$tag %in% wanted_tags,
.data$names %in% colnames(association_file)
)
wanted <- req_tag_cols |>
dplyr::bind_rows(wanted)
missing_in_df <- joined |>
dplyr::filter(
dplyr::if_all(
.cols = dplyr::all_of(mandatory_IS_vars()),
.fns = is.na
) & .data[[proj_col_name]] %in% projects
) |>
dplyr::distinct(dplyr::across(
dplyr::all_of(c(wanted$names, indep_sample_id))
))
reduced_af <- association_file |>
dplyr::filter(
.data[[proj_col_name]] %in% projects
)
list(miss = missing_in_df, reduced_af = reduced_af)
}
# Identifies independent samples and separates the joined_df in
# collisions and non-collisions
# @return A named list containing the split joined_df for collisions and
# non-collisions
.identify_independent_samples <- function(joined, indep_sample_id) {
temp <- joined |>
dplyr::group_by(dplyr::across(dplyr::all_of(mandatory_IS_vars()))) |>
dplyr::summarise(
N = dplyr::n_distinct(
!!!purrr::map(indep_sample_id, ~ rlang::sym(.x))
),
.groups = "drop"
) |>
dplyr::filter(.data$N > 1)
non_collisions <- joined |>
dplyr::anti_join(temp, by = mandatory_IS_vars())
collisions <- dplyr::semi_join(joined, temp, by = mandatory_IS_vars())
list(collisions = collisions, non_collisions = non_collisions)
}
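# Example of what counts as a collision (illustrative, assuming default
# mandatory IS vars and independent samples identified by "SubjectID"):
# the same IS coordinates appear under more than one independent sample.
# joined <- tibble::tibble(
#     chr = "1", integration_locus = 103, strand = "+",
#     SubjectID = c("SUBJ01", "SUBJ02")
# )
# # n_distinct(SubjectID) == 2 for this IS, so it ends up in $collisions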
# Internal for date discrimination in collision removal.
#
# It's the first of 4 steps in the algorithm for collision removal: it tries
# to find a single sample whose associated date is earlier than all the
# others. If the comparison is not possible, the analysis fails and the
# original input data is submitted to the next step.
#
# @param x - All metadata associated with a single collision event in
# a data frame format (does not contain mandatory IS vars)
# @param date_col - The name of the date column chosen for collision removal
# @param ind_sample_key - character vector containing the columns that identify
# independent samples
# @return A named list with:
# * $data: a data frame with the discriminated data, or NULL if the
# analysis failed
# * $check: a logical value indicating whether the analysis was successful
# (if FALSE, the next step needs to be performed)
.discriminate_by_date <- function(x, date_col, ind_sample_key) {
# Test for all dates equality
all_equal_dates <- length(unique(x[[date_col]])) == 1
if (all_equal_dates == TRUE) {
return(list(data = NULL, check = FALSE))
}
# If not all are equal sort them asc
x <- dplyr::arrange(x, .data[[date_col]])
# Test if first date is unique
dates <- x[[date_col]]
if (length(dates[dates %in% dates[1]]) > 1) {
return(list(data = NULL, check = FALSE))
}
winning_sample <- x[1, ind_sample_key]
# Filter the winning rows
x <- x |>
dplyr::semi_join(winning_sample, by = ind_sample_key)
return(list(data = x, check = TRUE))
}
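# Example (illustrative, assuming date column "SequencingDate" and
# independent sample key "SubjectID"):
# x <- tibble::tibble(
#     SubjectID = c("S1", "S2"),
#     SequencingDate = as.Date(c("2020-01-01", "2020-03-15"))
# )
# .discriminate_by_date(x, "SequencingDate", "SubjectID")
# #> $check is TRUE, $data keeps only the S1 row (earliest unique date)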
# Internal for replicate discrimination in collision removal.
#
# It's the second of 4 steps in the algorithm for collision removal:
# grouping by independent sample, it counts the number of
# rows (replicates) found for each group and orders groups from largest to
# smallest: if a single group has more rows than any other group, the
# integration is assigned to that sample, otherwise the analysis fails and
# the original input is submitted to the next step.
#
# @param x - All metadata associated with a single collision event in
# a data frame format (does not contain mandatory IS vars)
# @param repl_col - The name of the column containing the replicate number
# @param ind_sample_key - character vector containing the columns that identify
# independent samples
# @return A named list with:
# * $data: a data frame with the discriminated data, or NULL if the
# analysis failed
# * $check: a logical value indicating whether the analysis was successful
# (if FALSE, the next step needs to be performed)
.discriminate_by_replicate <- function(x, repl_col, ind_sample_key) {
temp <- x |>
dplyr::group_by(dplyr::across(dplyr::all_of(ind_sample_key))) |>
dplyr::summarise(N = dplyr::n(), .groups = "drop") |>
dplyr::arrange(dplyr::desc(.data$N))
if (length(temp$N) != 1 & !temp$N[1] > temp$N[2]) {
return(list(data = NULL, check = FALSE))
}
temp <- temp[1, ind_sample_key]
x <- x |>
dplyr::semi_join(temp, by = ind_sample_key)
return(list(data = x, check = TRUE))
}
# Internal for sequence count discrimination in collision removal.
#
# It's the third of 4 steps in the algorithm for collision removal:
# grouping by independent sample, it sums the value of
# the sequence count for each group, sorts the groups from highest to lowest
# cumulative value and then checks the ratio between the first element and the
# second: if the ratio is > `reads_ratio` the integration is assigned to
# the first group, otherwise the analysis fails.
#
# @param x - All metadata associated with a single collision event in
# a data frame format (does not contain mandatory IS vars)
# @param reads_ratio The value of the ratio between sequence count values to
# check
# @param seqCount_col The name of the sequence count column (support
# for multi quantification matrix)
# @param ind_sample_key - character vector containing the columns that identify
# independent samples
# @return A named list with:
# * $data: a tibble with the discriminated data, or NULL if the
# analysis failed
# * $check: a logical value indicating whether the analysis was successful
# (if FALSE, the next step needs to be performed)
.discriminate_by_seqCount <- function(
x,
reads_ratio,
seqCount_col,
ind_sample_key) {
temp <- x |>
dplyr::group_by(dplyr::across(dplyr::all_of(ind_sample_key))) |>
dplyr::summarise(sum = sum(.data[[seqCount_col]]), .groups = "drop") |>
dplyr::arrange(dplyr::desc(.data$sum))
ratio <- temp$sum[1] / temp$sum[2]
if (!ratio > reads_ratio) {
return(list(data = NULL, check = FALSE))
}
temp <- temp[1, ind_sample_key]
x <- x |>
dplyr::semi_join(temp, by = ind_sample_key)
return(list(data = x, check = TRUE))
}
# Internal function that performs the four-step check of collisions for
# a single integration.
#
# @param x - Represents the sub-table containing coordinates
# and metadata about a SINGLE collision event (same values for
# mandatory IS vars)
# @param date_col The date column to consider
# @param repl_col - The name of the column containing the replicate number
# @param reads_ratio The value of the ratio between sequence count values to
# check
# @param seqCount_col The name of the sequence count column (support
# for multi quantification matrix)
# @param ind_sample_key - character vector containing the columns that identify
# independent samples
# @return A list with:
# * $data: an updated tibble with processed collisions or NULL if no
# criterion was sufficient
# * $reassigned: 1 if the integration was successfully reassigned, 0 otherwise
# * $removed: 1 if the integration is removed entirely because no criterion
# was met, 0 otherwise
.four_step_check <- function(
x,
date_col,
repl_col,
reads_ratio,
seqCount_col,
ind_sample_key,
progress = NULL) {
current_data <- x |>
dplyr::select(-dplyr::all_of(mandatory_IS_vars()))
# Try to discriminate by date
result <- .discriminate_by_date(current_data, date_col, ind_sample_key)
if (result$check == TRUE) {
current_data <- result$data
coordinates <- x[seq_len(nrow(current_data)), mandatory_IS_vars()]
res <- dplyr::bind_cols(coordinates, current_data)
if (!is.null(progress)) {
progress()
}
return(list(data = res, reassigned = 1, removed = 0))
}
# If first check fails try to discriminate by replicate
result <- .discriminate_by_replicate(current_data, repl_col, ind_sample_key)
if (result$check == TRUE) {
current_data <- result$data
coordinates <- x[seq_len(nrow(current_data)), mandatory_IS_vars()]
res <- dplyr::bind_cols(coordinates, current_data)
if (!is.null(progress)) {
progress()
}
return(list(data = res, reassigned = 1, removed = 0))
}
# If second check fails try to discriminate by seqCount
result <- .discriminate_by_seqCount(
current_data, reads_ratio, seqCount_col,
ind_sample_key
)
if (result$check == TRUE) {
current_data <- result$data
coordinates <- x[seq_len(nrow(current_data)), mandatory_IS_vars()]
res <- dplyr::bind_cols(coordinates, current_data)
if (!is.null(progress)) {
progress()
}
return(list(data = res, reassigned = 1, removed = 0))
}
# If all checks fail, remove the integration from all subjects
if (!is.null(progress)) {
progress()
}
return(list(data = NULL, reassigned = 0, removed = 1))
}
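# Example of the cascade (illustrative toy, assuming default mandatory IS
# vars and hypothetical column names): dates and replicate counts tie, but
# the sequence count ratio of S1 vs S2 exceeds `reads_ratio`, so step 3
# resolves the collision.
# x <- tibble::tibble(
#     chr = "1", integration_locus = 103, strand = "+",
#     SubjectID = c("S1", "S2"),
#     SequencingDate = as.Date("2020-01-01"),
#     ReplicateNumber = 1,
#     seqCount = c(5000, 20)
# )
# .four_step_check(x,
#     date_col = "SequencingDate", repl_col = "ReplicateNumber",
#     reads_ratio = 10, seqCount_col = "seqCount",
#     ind_sample_key = "SubjectID"
# )
# #> $reassigned == 1, $data contains only the S1 row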
# Internal function to process collisions on multiple integrations,
# parallelized.
# @return A list containing the updated collisions, a numeric value
# representing the number of integrations removed and a numeric value
# representing the number of integrations reassigned
.process_collisions <- function(
collisions,
date_col,
repl_col,
reads_ratio,
seqCount_col,
ind_sample_key,
max_workers) {
# Split by IS
split_data <- collisions |>
dplyr::group_by(dplyr::across(dplyr::all_of(mandatory_IS_vars()))) |>
dplyr::group_split()
result <- .execute_map_job(
data_list = split_data,
fun_to_apply = .four_step_check,
fun_args = list(
date_col = date_col,
repl_col = repl_col,
reads_ratio = reads_ratio,
seqCount_col = seqCount_col,
ind_sample_key = ind_sample_key
),
stop_on_error = TRUE,
max_workers = max_workers
)
# For each element of result extract and bind the 3 components
processed_collisions_df <- purrr::map(result$res, ~ .x$data) |>
purrr::list_rbind()
removed_total <- purrr::map(result$res, ~ .x$removed) |>
purrr::reduce(sum)
reassigned_total <- purrr::map(result$res, ~ .x$reassigned) |>
purrr::reduce(sum)
# Return the summary
list(
coll = processed_collisions_df, removed = removed_total,
reassigned = reassigned_total
)
}
# Internal function to obtain a summary table after collision processing.
#
# @param before The joined table before collision removing
# (obtained via `.join_matrix_af`)
# @param after The final matrix obtained after collision processing
# @param association_file The association file
#' @importFrom rlang .data
# @keywords internal
#
# @return A tibble with a summary containing for each indep sample the number of
# integrations found before and after, the sum of the value of the sequence
# count for each sample before and after and the corresponding deltas.
.summary_table <- function(before, after, seqCount_col, ind_sample_id) {
after_matr <- after |>
dplyr::select(dplyr::all_of(c(colnames(after), ind_sample_id)))
n_int_after <- after_matr |>
dplyr::group_by(dplyr::across(dplyr::all_of(ind_sample_id))) |>
dplyr::tally(name = "count_integrations_after")
sumReads_after <- after_matr |>
dplyr::group_by(dplyr::across(dplyr::all_of(ind_sample_id))) |>
dplyr::summarise(
sumSeqReads_after = sum(.data[[seqCount_col]]),
.groups = "drop_last"
)
temp_aft <- n_int_after |> dplyr::left_join(sumReads_after,
by = ind_sample_id
)
before_matr <- before |>
dplyr::select(dplyr::all_of(c(colnames(after), ind_sample_id)))
n_int_before <- before_matr |>
dplyr::group_by(dplyr::across(dplyr::all_of(ind_sample_id))) |>
dplyr::tally(name = "count_integrations_before")
sumReads_before <- before_matr |>
dplyr::group_by(dplyr::across(dplyr::all_of(ind_sample_id))) |>
dplyr::summarise(
sumSeqReads_before = sum(.data[[seqCount_col]]),
.groups = "drop_last"
)
temp_bef <- n_int_before |> dplyr::left_join(sumReads_before,
by = ind_sample_id
)
summary <- temp_bef |>
dplyr::left_join(temp_aft, by = ind_sample_id) |>
dplyr::mutate(
delta_integrations =
.data$count_integrations_before -
.data$count_integrations_after,
delta_seqReads = .data$sumSeqReads_before - .data$sumSeqReads_after
)
summary
}
# Internal for obtaining summary info about the input sequence count matrix
# as is (no pre-processing).
# Reported info contains:
# - Number of distinct integration sites
# - Total for each quantification present (if multi-quant) or seqCount only
.summary_input <- function(input_df, quant_cols) {
names(quant_cols) <- NULL
# Distinct integration sites
n_IS <- input_df |>
dplyr::distinct(dplyr::across(dplyr::all_of(mandatory_IS_vars()))) |>
nrow()
quant_totals <- input_df |>
dplyr::select(dplyr::all_of(quant_cols)) |>
dplyr::summarise(
dplyr::across(
.cols = dplyr::all_of(quant_cols),
.fns = list(
sum = ~ sum(.x, na.rm = TRUE)
),
.names = "{.col}"
),
.groups = "drop"
) |>
as.list()
list(total_iss = n_IS, quant_totals = quant_totals)
}
.per_pool_stats <- function(joined, quant_cols, pool_col) {
## Joined is the matrix already joined with metadata
df_by_pool <- joined |>
dplyr::group_by(dplyr::across(dplyr::all_of(pool_col))) |>
dplyr::summarise(dplyr::across(
.cols = dplyr::all_of(quant_cols),
.fns = list(
sum = ~ sum(.x, na.rm = TRUE),
count = length,
describe = ~ psych::describe(.x)
)
))
df_cols <- purrr::map_lgl(df_by_pool, ~ is.data.frame(.x))
if (any(df_cols == TRUE)) {
df_cols <- df_cols[df_cols]
df_cols <- names(df_cols)
dfss <- purrr::map(df_cols, function(col) {
exp <- rlang::expr(`$`(df_by_pool, !!col))
df <- rlang::eval_tidy(exp)
df <- df |> dplyr::rename_with(.fn = ~ paste0(col, "_", .x))
df
}) |> purrr::set_names(df_cols)
for (dfc in df_cols) {
df_by_pool <- df_by_pool |>
dplyr::select(-dplyr::all_of(dfc)) |>
dplyr::bind_cols(dfss[[dfc]])
}
}
df_by_pool
}
.collisions_obtain_report_summaries <- function(
x,
association_file,
quant_cols, missing_ind,
pcr_col, pre_process,
collisions,
removed, reassigned,
joined, post_joined, pool_col,
final_matr,
replicate_n_col,
independent_sample_id,
seq_count_col) {
input_summary <- .summary_input(x, quant_cols)
missing_smpl <- if (!is.null(missing_ind)) {
x[missing_ind, ] |>
dplyr::group_by(dplyr::across(dplyr::all_of(pcr_col))) |>
dplyr::summarise(
n_IS = dplyr::n(),
dplyr::across(
dplyr::all_of(quant_cols),
.fns = ~ sum(.x, na.rm = TRUE),
.names = "{.col}_tot"
)
)
} else {
NULL
}
samples_info <- list(
MATRIX = unique(x[[pcr_col]]),
AF = unique(association_file[[pcr_col]])
)
pre_summary <- .summary_input(pre_process, quant_cols)
per_pool_stats_pre <- .per_pool_stats(joined, quant_cols, pool_col)
coll_info <- list(
coll_n = collisions |>
dplyr::distinct(
dplyr::across(dplyr::all_of(mandatory_IS_vars()))
) |>
nrow(),
removed = removed,
reassigned = reassigned
)
post_info <- .summary_input(final_matr, quant_cols)
post_per_pool_stats <- .per_pool_stats(
post_joined,
quant_cols,
pool_col
)
summary_tbl <- .summary_table(
before = joined, after = post_joined,
seqCount_col = seq_count_col,
ind_sample_id = independent_sample_id
)
return(
list(
input_info = input_summary,
missing_info = missing_smpl,
samples_info = samples_info,
pre_info = pre_summary,
pre_stats = per_pool_stats_pre,
coll_info = coll_info,
post_info = post_info,
post_stats = post_per_pool_stats,
summary_post = summary_tbl
)
)
}
.collisions_obtain_sharing_heatmaps <- function(
joined,
independent_sample_id,
post_joined,
report_path) {
sharing_heatmaps_pre <- sharing_heatmaps_post <- NULL
withCallingHandlers(
{
withRestarts(
{
sharing_pre <- is_sharing(joined,
group_key = independent_sample_id,
n_comp = 2,
is_count = FALSE,
minimal = FALSE,
include_self_comp = TRUE
)
sharing_post <- is_sharing(post_joined,
group_key = independent_sample_id,
n_comp = 2,
is_count = FALSE,
minimal = FALSE,
include_self_comp = TRUE
)
too_many_samples <- nrow(dplyr::distinct(
joined,
dplyr::across(
dplyr::all_of(independent_sample_id)
)
)) > 25
sharing_heatmaps_pre <- if (too_many_samples) {
sharing_heatmap(sharing_pre, interactive = FALSE)
} else {
sharing_heatmap(sharing_pre, interactive = TRUE)
}
sharing_heatmaps_post <- if (too_many_samples) {
sharing_heatmap(sharing_post, interactive = FALSE)
} else {
sharing_heatmap(sharing_post, interactive = TRUE)
}
if (too_many_samples) {
skip_sharing_msg <- c(
"Too many independent samples",
i = paste(
"To avoid issues with report size,",
"sharing heatmaps will be saved",
"as separate files in the same",
"folder"
)
)
rlang::inform(skip_sharing_msg)
fs::dir_create(report_path)
prefix_filename <- paste(lubridate::today(),
"ISAnalytics_collision_removal",
sep = "_"
)
for (map_type in names(sharing_heatmaps_pre)) {
filename <- paste0(paste(prefix_filename,
"pre-processing-sharing",
map_type,
sep = "_"
), ".pdf")
ggplot2::ggsave(
plot = sharing_heatmaps_pre[[map_type]],
filename = filename,
path = report_path,
width = 15, height = 15
)
}
for (map_type in names(sharing_heatmaps_post)) {
filename <- paste0(
paste(prefix_filename,
"post-processing-sharing",
map_type,
sep = "_"
),
".pdf"
)
ggplot2::ggsave(
plot = sharing_heatmaps_post[[map_type]],
filename = filename,
path = report_path,
width = 15, height = 15
)
}
sharing_heatmaps_pre <- NULL
sharing_heatmaps_post <- NULL
}
},
sharing_err = function(e) {
rlang::inform(c(
paste(
"Unable to compute sharing:",
conditionMessage(e)
),
i = "Skipping"
))
}
)
},
error = function(cnd) {
rest <- findRestart("sharing_err")
invokeRestart(rest, cnd)
}
)
return(
list(
sharing_pre = sharing_heatmaps_pre,
sharing_post = sharing_heatmaps_post
)
)
}
#### ---- Internals for aggregate functions ----####
#---- USED IN : aggregate_metadata ----
# Aggregates the association file based on the function table.
#' @importFrom rlang .data is_function is_formula
#' @importFrom tibble tibble
# @keywords internal
#
# @return A tibble
.aggregate_meta <- function(association_file, grouping_keys, function_tbl) {
## Discard columns that are not present in the af
function_tbl <- function_tbl |>
dplyr::filter(.data$Column %in% colnames(association_file))
## If no columns are left return
if (nrow(function_tbl) == 0) {
return(NULL)
}
function_tbl <- function_tbl |>
tidyr::nest(cols = dplyr::all_of("Column"))
apply_function <- function(Function, Args, Output_colname, cols) {
Function <- list(Function)
Args <- list(Args)
row <- tibble::tibble(Function, Args,
Output_colname,
cols = list(cols)
)
if (rlang::is_formula(row$Function[[1]])) {
res <- association_file |>
dplyr::group_by(dplyr::across(dplyr::all_of(grouping_keys))) |>
dplyr::summarise(
dplyr::across(
dplyr::all_of(row$cols[[1]]$Column),
.fns = row$Function[[1]],
.names = row$Output_colname
),
.groups = "drop"
)
return(res)
}
if (rlang::is_function(row$Function[[1]])) {
            # NULL must be tested first: is.na(NULL) has length zero and
            # would make the condition invalid
            args <- if (is.null(row$Args[[1]]) || all(is.na(row$Args[[1]]))) {
list()
} else {
row$Args[[1]]
}
res <- association_file |>
dplyr::group_by(dplyr::across(dplyr::all_of(grouping_keys))) |>
dplyr::summarise(
dplyr::across(
.cols = dplyr::all_of(row$cols[[1]]$Column),
.fns = \(x) row$Function[[1]](x, !!!args),
.names = row$Output_colname
),
.groups = "drop"
)
return(res)
}
return(NULL)
}
agg <- purrr::pmap(function_tbl, apply_function)
agg <- purrr::reduce(agg, ~ dplyr::full_join(.x, .y, by = grouping_keys))
return(agg)
}
#---- USED IN : aggregate_values_by_key ----
# Internal function that performs aggregation on values with a lambda
# operation.
# - x is a single matrix
.aggregate_lambda <- function(
x, af, key, value_cols, lambda, group,
join_af_by) {
cols_to_check <- c(group, key, value_cols, join_af_by)
joint <- x |>
dplyr::left_join(af, by = join_af_by)
if (any(!cols_to_check %in% colnames(joint))) {
missing <- cols_to_check[!cols_to_check %in% colnames(joint)]
rlang::abort(.missing_user_cols_error(missing))
}
cols <- c(colnames(x), key)
aggregated_matrices <- joint |>
dplyr::select(dplyr::all_of(cols)) |>
dplyr::group_by(dplyr::across(dplyr::all_of(c(group, key)))) |>
dplyr::summarise(dplyr::across(
.cols = dplyr::all_of(value_cols),
.fns = lambda
), .groups = "drop")
aggregated_matrices
}
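# Example (illustrative, with hypothetical input names): summing seqCount
# over PCR replicates, aggregating per subject and time point.
# .aggregate_lambda(
#     x = matrix_df, af = association_file,
#     key = c("SubjectID", "TimePoint"),
#     value_cols = "seqCount",
#     lambda = ~ sum(.x, na.rm = TRUE),
#     group = c(mandatory_IS_vars(), annotation_IS_vars()),
#     join_af_by = "CompleteAmplificationID"
# )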
#### ---- Internals for re-calibration functions ----####
#---- USED IN : compute_near_integrations ----
# Finds a unique maximum value in a vector of numbers.
#
# Unlike the standard `max` function, this function returns a single maximum
# value if and only if a maximum exists, more in details that means:
# * It ignores NA values by default
# * If the identified maximum value is not unique in the vector, it returns
# an empty vector instead
#
# @param x A numeric or integer vector
#' @importFrom purrr is_empty
#
# @return A single numeric value or an empty numeric vector
# @keywords internal
.find_unique_max <- function(x) {
if (any(is.na(x))) {
x <- x[!is.na(x)]
}
uniques <- unique(x)
if (purrr::is_empty(uniques)) {
return(uniques)
}
if (length(uniques) == 1) {
if (length(x[x == uniques]) > 1) {
return(numeric(0))
} else {
return(uniques)
}
}
max_val <- max(uniques)
if (length(x[x == max_val]) > 1) {
return(numeric(0))
}
max_val
}
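# Examples (illustrative):
# .find_unique_max(c(10, 50, NA)) # 50 -- NAs are ignored
# .find_unique_max(c(10, 50, 50)) # numeric(0) -- the maximum is not unique
# .find_unique_max(c(7, 7)) # numeric(0) -- a single value, but duplicated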
.update_rec_map <- function(rec_map, before, after) {
if (is.null(rec_map)) {
return()
}
rec_map$update(before, after)
}
# Internal function that implements the sliding window algorithm.
#
# **NOTE: this function is meant to be called on a SINGLE GROUP,
# meaning a subset of an integration matrix in which all rows
# share the same chr (and optionally same strand).**
#
#' @importFrom dplyr filter pull arrange
#' @importFrom purrr is_empty map set_names
#' @importFrom rlang as_function
#' @importFrom utils tail
#
# returns only recalibrated chunk
.sliding_window <- function(x,
threshold,
keep_criteria,
annotated,
num_cols,
max_val_col,
sample_col,
req_tags,
add_col_lambdas,
rec_map,
progress = NULL) {
locus_col <- req_tags |>
dplyr::filter(.data$tag == "locus") |>
dplyr::pull(.data$names)
## Order by integration_locus
x <- x |> dplyr::arrange(.data[[locus_col]])
index <- 1
repeat {
        # If index is at or past the last row in the data frame, return.
        # (index can move past nrow(x) when the previous window collapsed
        # duplicated rows at the end of the table)
        if (index >= nrow(x)) {
            if (index == nrow(x)) {
                .update_rec_map(rec_map,
                    before = x[index, req_tags$names],
                    after = x[index, req_tags$names]
                )
            }
            if (!is.null(progress)) {
                progress()
            }
            return(x)
        }
# Otherwise
## Compute interval for row
interval <- x[index, ][[locus_col]] + 1 + threshold
## Look ahead for every integration that falls in the interval
near <- numeric()
k <- index
repeat {
if (k == nrow(x)) {
break
}
k <- k + 1
if (x[k, ][[locus_col]] < interval) {
# Saves the indexes of the rows that are in the interval
near <- append(near, k)
} else {
break
}
}
window <- c(index, near)
row_to_keep <- index
if (!purrr::is_empty(near)) {
# If there are integrations in the interval (recalibration necessary)
## Change loci according to criteria
######## CRITERIA PROCESSING
if (keep_criteria == "max_value") {
to_check <- x[window, ][[max_val_col]]
max <- .find_unique_max(to_check)
if (!purrr::is_empty(max)) {
row_to_keep <- window[which(to_check == max)]
near <- window[!window == row_to_keep]
}
}
.update_rec_map(rec_map,
before = x[window, req_tags$names],
after = x[row_to_keep, req_tags$names]
)
# Change loci and other ids of near integrations
fields_to_change <- req_tags$names
if (annotated) {
fields_to_change <- c(fields_to_change, annotation_IS_vars())
}
x[near, fields_to_change] <- x[row_to_keep, fields_to_change]
## Aggregate same IDs
starting_rows <- nrow(x)
repeat {
t <- x[[sample_col]][window]
d <- unique(t[duplicated(t)])
if (purrr::is_empty(d)) {
break
}
dupl_indexes <- which(t == d[1])
values_sum <- colSums(x[window[dupl_indexes], num_cols],
na.rm = TRUE
)
x[window[dupl_indexes[1]], num_cols] <- as.list(values_sum)
### Aggregate additional cols
if (!is.null(add_col_lambdas) &&
!purrr::is_empty(add_col_lambdas)) {
agg_cols <- purrr::map(names(add_col_lambdas), ~ {
if (is.null(add_col_lambdas[[.x]])) {
return(x[window[dupl_indexes[1]], ][[.x]])
}
values <- x[window[dupl_indexes], ][[.x]]
return(rlang::as_function(
add_col_lambdas[[.x]]
)(values))
}) |>
purrr::set_names(names(add_col_lambdas))
x[window[dupl_indexes[1]], names(add_col_lambdas)] <- agg_cols
}
x <- x[-window[dupl_indexes[-1]], ]
to_drop <- seq(
from = length(window),
length.out = length(dupl_indexes[-1]),
by = -1
)
window <- window[-to_drop]
}
## Increment index
if (nrow(x) == starting_rows) {
index <- k
} else {
index <- utils::tail(window, n = 1) + 1
}
} else {
# If no integrations in interval (no recalibration necessary)
.update_rec_map(rec_map,
before = x[window, req_tags$names],
after = x[row_to_keep, req_tags$names]
)
index <- index + 1
}
}
if (!is.null(progress)) {
progress()
}
return(x)
}
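# Example of the window logic (a conceptual sketch, assuming threshold = 4
# and loci 100, 103 and 110 on the same chromosome):
# * locus 100 opens the interval [100, 105): locus 103 falls inside, so the
#   two IS are recalibrated to a single locus (per `keep_criteria`) and
#   their numeric columns are summed when they belong to the same sample
# * locus 110 is outside the interval and opens a new window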
#### ---- Internals for analysis functions ----####
#---- USED IN : threshold_filter ----
# @keywords internal
.check_threshold_param <- function(threshold) {
if (is.list(threshold)) {
# List must have names
if (is.null(names(threshold))) {
stop(.names_list_param_err())
}
all_nums <- purrr::map_lgl(threshold, function(t) {
is.numeric(t) || is.integer(t)
})
if (!all(all_nums)) {
stop(.threshold_err(), call. = FALSE)
}
} else {
if (!(is.numeric(threshold) || is.integer(threshold))) {
stop(.threshold_err(), call. = FALSE)
}
}
}
# @keywords internal
.check_comparators_param <- function(comparators) {
if (is.list(comparators)) {
# List must have names
if (is.null(names(comparators))) {
stop(.names_list_param_err())
}
chk <- purrr::map_lgl(comparators, function(comp) {
is.character(comp) &
all(comp %in% c("<", ">", "==", "!=", ">=", "<="))
})
if (!all(chk)) {
stop(.comparators_err(), call. = FALSE)
}
} else {
if (!(is.character(comparators) &
all(comparators %in% c("<", ">", "==", "!=", ">=", "<=")))) {
stop(.comparators_err(), call. = FALSE)
}
}
}
# @keywords internal
.check_cols_to_compare_param <- function(cols_to_compare) {
if (is.list(cols_to_compare)) {
# List must have names
if (is.null(names(cols_to_compare))) {
stop(.names_list_param_err())
}
chk <- purrr::map_lgl(cols_to_compare, function(col) {
is.character(col)
})
if (!all(chk)) {
stop(paste("`cols_to_compare` elements must be character"),
call. = FALSE
)
}
} else {
if (!is.character(cols_to_compare)) {
stop(paste("`cols_to_compare` must be character"), call. = FALSE)
}
}
}
# @keywords internal
.list_names_to_mod <- function(x, list_params) {
### list_params is a list of 1-3 elements, named after the parameters;
### each element is itself a list
names_equal_list <- purrr::map_lgl(list_params, function(p) {
all(names(p) %in% names(x))
})
if (!all(names_equal_list)) {
stop(.param_names_not_in_list_err())
}
    purrr::reduce(list_params, function(p1, p2) {
        if (!all(names(p1) == names(p2))) {
            stop(.param_names_not_equal_err())
        }
        # return p2 so that successive pairs of elements are compared
        p2
    })
return(names(list_params[[1]]))
}
# @keywords internal
.check_col_correct <- function(cols_to_compare, x, names_to_mod) {
if (is.list(cols_to_compare)) {
purrr::walk2(
cols_to_compare,
names(cols_to_compare),
function(set, name) {
df <- x[[name]]
if (!all(set %in% colnames(df))) {
rlang::abort(.missing_user_cols_list_error(set, name))
}
purrr::walk(set, function(col) {
expr <- rlang::expr(`$`(df, !!col))
if (!is.numeric(rlang::eval_tidy(expr)) &&
!is.integer(rlang::eval_tidy(expr))) {
rlang::abort(.non_num_user_cols_error(
rlang::eval_tidy(expr)
))
}
})
}
)
} else {
if (is.data.frame(x)) {
if (!all(cols_to_compare %in% colnames(x))) {
rlang::abort(.missing_user_cols_error(
cols_to_compare[!cols_to_compare %in% colnames(x)]
))
}
### Check they're numeric
all_numeric <- purrr::map_lgl(cols_to_compare, function(col) {
expr <- rlang::expr(`$`(x, !!col))
is.numeric(rlang::eval_tidy(expr)) ||
is.integer(rlang::eval_tidy(expr))
})
if (!all(all_numeric)) {
rlang::abort(
.non_num_user_cols_error(
cols_to_compare[!all_numeric]
)
)
}
} else {
purrr::walk(x[names_to_mod], function(ddf) {
if (!all(cols_to_compare %in% colnames(ddf))) {
rlang::abort(
.missing_user_cols_error(
cols_to_compare[!cols_to_compare %in% colnames(ddf)]
)
)
}
purrr::walk(cols_to_compare, function(col) {
expr <- rlang::expr(`$`(ddf, !!col))
if (!is.numeric(rlang::eval_tidy(expr)) &&
!is.integer(rlang::eval_tidy(expr))) {
rlang::abort(
.non_num_user_cols_error(rlang::eval_tidy(expr))
)
}
})
})
}
}
}
# @keywords internal
.check_length_list_params <- function(list_params) {
purrr::pwalk(list(
list_params$threshold, list_params$cols_to_compare,
list_params$comparators
), function(x, y, z) {
if (length(x) != length(y) || length(x) != length(z) ||
length(y) != length(z)) {
stop(.diff_leng_args())
}
})
}
# @keywords internal
.check_length_vector_params <- function(vec_params, list_params) {
### Check lengths between vector params first
vec_lengths <- purrr::map_dbl(vec_params, length)
max <- max(vec_lengths)
vec_lengths_ok <- purrr::map_lgl(vec_lengths, function(l) {
l == 1 || l == max
})
if (any(vec_lengths_ok == FALSE)) {
stop(.diff_leng_args(), call. = FALSE)
}
if (!is.null(list_params) && !purrr::is_empty(list_params)) {
list_lengths_ok <- purrr::map_lgl(list_params, function(p) {
l <- purrr::map_dbl(p, length)
all(l == max)
})
if (any(list_lengths_ok == FALSE)) {
stop(.diff_leng_args(), call. = FALSE)
}
}
}
# Threshold filter for data frames.
#
# @param x A data frame
# @param threshold Vector of threshold values
# @param cols_to_compare Vector of columns to compare
# @param comparators Vector of comparators
#
#' @importFrom purrr pmap_chr reduce
#' @importFrom rlang eval_tidy parse_expr .data
#
# @keywords internal
# @return A data frame
.tf_data_frame <- function(x, threshold, cols_to_compare, comparators) {
### Check all columns in input are present
if (is.list(threshold) || is.list(comparators) ||
is.list(cols_to_compare)) {
stop(.list_params_err())
}
.check_threshold_param(threshold)
.check_comparators_param(comparators)
.check_cols_to_compare_param(cols_to_compare)
.check_col_correct(cols_to_compare, x, NULL)
.check_length_vector_params(
list(threshold, cols_to_compare, comparators),
NULL
)
### Obtain single expression for all columns
conditions <- purrr::pmap_chr(
list(
cols_to_compare,
comparators,
threshold
),
function(q, c, t) {
temp <- paste(q, c, t)
paste0(".data$", temp)
}
)
single_predicate_string <- purrr::reduce(conditions, function(c1, c2) {
paste0(c1, ", ", c2)
})
single_predicate_string <- paste0(
"x |> dplyr::filter(",
single_predicate_string,
")"
)
filtered <- rlang::eval_tidy(rlang::parse_expr(single_predicate_string))
return(filtered)
}
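# Example of the predicate built above (illustrative, with hypothetical
# column names): with cols_to_compare = c("seqCount", "fragmentEstimate"),
# comparators = c(">=", ">") and threshold = c(20, 3), the string becomes
# "x |> dplyr::filter(.data$seqCount >= 20, .data$fragmentEstimate > 3)"
# which is then parsed and evaluated.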
# Threshold filter for lists.
#
# @param x The list of data frames
# @param threshold Vector or named list for threshold values
# @param cols_to_compare Vector or named list for columns to
# compare
# @param comparators Vector or named list for comparators
#
#' @importFrom purrr map_lgl is_empty map pmap_chr reduce map2
#' @importFrom rlang .data parse_expr eval_tidy
# @keywords internal
#
# @return A list of data frames
.tf_list <- function(x, threshold, cols_to_compare, comparators) {
### Check list contains data frames
contain_df <- purrr::map_lgl(x, function(el) {
is.data.frame(el)
})
if (!all(contain_df)) {
stop(paste("Elements of the list must be data frames"))
}
### Check the nature of the parameters (list or vector)
list_has_names <- !is.null(names(x))
params <- list(
threshold = threshold, comparators = comparators,
cols_to_compare = cols_to_compare
)
param_are_lists <- purrr::map_lgl(params, is.list)
### With unnamed lists only vector parameters allowed
if (any(param_are_lists) & list_has_names == FALSE) {
stop(.unnamed_list_err())
}
### Check single params
.check_threshold_param(threshold)
.check_cols_to_compare_param(cols_to_compare)
.check_comparators_param(comparators)
list_params <- params[param_are_lists]
names_to_mod <- if (!purrr::is_empty(list_params)) {
.list_names_to_mod(x, list_params)
} else {
names(x)
}
if (all(param_are_lists)) {
.check_length_list_params(params)
}
if (any(param_are_lists == FALSE)) {
.check_length_vector_params(params[!param_are_lists], list_params)
}
.check_col_correct(cols_to_compare, x, names_to_mod)
### Filter each
if (!list_has_names) {
filtered <- purrr::map(x, function(df) {
conditions <- purrr::pmap_chr(
list(
cols_to_compare,
comparators,
threshold
),
function(q, c, t) {
temp <- paste(q, c, t)
paste0(".data$", temp)
}
)
single_predicate_string <- purrr::reduce(
conditions,
function(c1, c2) {
paste0(c1, ", ", c2)
}
)
single_predicate_string <- paste0(
"df |> dplyr::filter(",
single_predicate_string,
")"
)
rlang::eval_tidy(
rlang::parse_expr(single_predicate_string)
)
})
return(filtered)
}
filtered <- purrr::map2(x, names(x), function(df, nm) {
if (!nm %in% names_to_mod) {
df
} else {
col_set <- if (is.list(cols_to_compare)) {
cols_to_compare[[nm]]
} else {
cols_to_compare
}
comp <- if (is.list(comparators)) {
comparators[[nm]]
} else {
comparators
}
thr <- if (is.list(threshold)) {
threshold[[nm]]
} else {
threshold
}
conditions <- purrr::pmap_chr(
list(
col_set,
comp,
thr
),
function(q, c, t) {
temp <- paste(q, c, t)
paste0(".data$", temp)
}
)
single_predicate_string <- purrr::reduce(
conditions,
function(c1, c2) {
paste0(c1, ", ", c2)
}
)
single_predicate_string <- paste0(
"df |> dplyr::filter(",
single_predicate_string,
")"
)
rlang::eval_tidy(
rlang::parse_expr(single_predicate_string)
)
}
})
return(filtered)
}
#---- USED IN : top_targeted_genes ----
# Internal to count the number of distinct integration sites for each gene
# - x: input df, must include mandatory vars and annotation vars
# - include_chr: should the count take into account that some genes span
# different chromosomes? If TRUE, rows with the same
# (gene_sym, gene_strand) but a different chr are kept separated
# - include_gene_strand: same, but for gene strand
#' @importFrom rlang sym
.count_distinct_is_per_gene <- function(
x, include_chr,
include_gene_strand,
gene_sym_col,
gene_strand_col,
chr_col,
mand_vars_to_check) {
present_mand_vars <- mand_vars_to_check |>
dplyr::filter(.data$names %in% colnames(x))
group_key <- c(gene_sym_col)
if (include_gene_strand) {
group_key <- c(group_key, gene_strand_col)
}
if (include_chr) {
group_key <- c(group_key, chr_col)
present_mand_vars <- present_mand_vars |>
dplyr::filter(!.data$names %in% chr_col)
}
is_vars <- present_mand_vars$names
count_by_gene <- x |>
dplyr::group_by(dplyr::across(dplyr::all_of(group_key))) |>
dplyr::summarise(n_IS = dplyr::n_distinct(
dplyr::across(dplyr::all_of(is_vars))
), .groups = "drop")
count_by_gene
}
#---- USED IN : CIS_grubbs, CIS_grubbs_overtime ----
# Param check for CIS functions
.cis_param_check <- function(
x, genomic_annotation_file,
grubbs_flanking_gene_bp,
threshold_alpha,
return_missing_as_df) {
## Check x has the correct structure
stopifnot(is.data.frame(x))
check_res <- list()
## Check dyn vars for required tags
check_res$req_mand_vars <- .check_required_cols(c("chromosome", "locus"),
vars_df = mandatory_IS_vars(TRUE),
duplicate_politic = "error"
)
check_res$chrom_col <- check_res$req_mand_vars |>
dplyr::filter(.data$tag == "chromosome") |>
dplyr::pull(.data$names)
check_res$locus_col <- check_res$req_mand_vars |>
dplyr::filter(.data$tag == "locus") |>
dplyr::pull(.data$names)
check_res$strand_col <- if ("is_strand" %in% mandatory_IS_vars(TRUE)$tag) {
col <- mandatory_IS_vars(TRUE) |>
dplyr::filter(.data$tag == "is_strand") |>
dplyr::pull(.data$names)
if (col %in% colnames(x)) {
col
} else {
NULL
}
} else {
NULL
}
check_res$req_annot_col <- .check_required_cols(
list(gene_symbol = "char", gene_strand = "char"),
vars_df = annotation_IS_vars(TRUE),
"error"
)
check_res$gene_symbol_col <- check_res$req_annot_col |>
dplyr::filter(.data$tag == "gene_symbol") |>
dplyr::pull(.data$names)
check_res$gene_strand_col <- check_res$req_annot_col |>
dplyr::filter(.data$tag == "gene_strand") |>
dplyr::pull(.data$names)
check_res$cols_required <- c(
check_res$chrom_col, check_res$locus_col,
check_res$gene_symbol_col,
check_res$gene_strand_col
)
if (!all(check_res$cols_required %in% colnames(x))) {
rlang::abort(.missing_user_cols_error(
check_res$cols_required[!check_res$cols_required %in% colnames(x)]
))
}
# Check other parameters
stopifnot(is.data.frame(genomic_annotation_file) ||
is.character(genomic_annotation_file))
if (is.character(genomic_annotation_file) &&
!genomic_annotation_file %in% c("hg19", "mm9")) {
err_msg <- c("Genomic annotation file unknown",
x = paste(
"Since ISAnalytics 1.5.4, if provided as",
"character vector, `genomic_annotation_file`",
"parameter must be one of 'hg19' or 'mm9'"
),
i = paste(
"For using other genome reference files",
"import them in the R environment and pass",
"them to the function"
)
)
rlang::abort(err_msg, class = "genomic_file_char")
}
if (is.character(genomic_annotation_file)) {
gen_file <- paste0("refGenes_", genomic_annotation_file)
utils::data(list = gen_file, envir = rlang::current_env())
check_res$refgenes <- rlang::eval_tidy(rlang::sym(gen_file))
} else {
# Check annotation file format
refgenes <- genomic_annotation_file
if (!all(refGene_table_cols() %in% colnames(refgenes))) {
rlang::abort(.non_standard_annotation_structure())
}
check_res$refgenes <- tibble::as_tibble(refgenes) |>
dplyr::mutate(chrom = stringr::str_replace_all(
.data$chrom,
"chr", ""
))
}
stopifnot(is.numeric(grubbs_flanking_gene_bp) ||
is.integer(grubbs_flanking_gene_bp))
check_res$grubbs_flanking_gene_bp <- grubbs_flanking_gene_bp[1]
stopifnot(is.numeric(threshold_alpha))
check_res$threshold_alpha <- threshold_alpha[1]
stopifnot(is.logical(return_missing_as_df))
check_res$return_missing_as_df <- return_missing_as_df[1]
return(check_res)
}
# Join with refgenes
.cis_join_ref <- function(x, res_checks) {
result <- list()
# Join input with refgenes - inner join only
result$joint_ref <- x |>
dplyr::inner_join(res_checks$refgenes |>
dplyr::select(
dplyr::all_of(
c("name2", "chrom", "strand", "average_TxLen")
)
), by = setNames(
object = c("chrom", "strand", "name2"),
nm = c(
res_checks$chrom_col,
res_checks$gene_strand_col,
res_checks$gene_symbol_col
)
))
# Retrieve eventual missing genes
missing_genes <- x |>
dplyr::anti_join(result$joint_ref,
by = c(
res_checks$gene_symbol_col,
res_checks$gene_strand_col, res_checks$chrom_col
)
)
missing_is_tot <- nrow(missing_genes)
missing_genes <- missing_genes |>
dplyr::distinct(dplyr::across(dplyr::all_of(
c(
res_checks$gene_symbol_col, res_checks$gene_strand_col,
res_checks$chrom_col
)
)))
if (nrow(missing_genes) > 0 &&
getOption("ISAnalytics.verbose", TRUE) == TRUE) {
warn_miss <- c("Warning: missing genes in refgenes table",
i = paste(
paste(
"A total of", nrow(missing_genes),
"genes",
"were found in the input data but not",
"in the refgene table. This may be caused by",
"a mismatch in the annotation phase of",
"the matrix. Here is a summary: "
),
paste0(utils::capture.output({
print(missing_genes, n = Inf)
}), collapse = "\n"),
sep = "\n"
),
i = paste(
"NOTE: missing genes will be removed from",
"the final output! Review results carefully"
),
i = paste(
"A total of", missing_is_tot,
"IS will be removed because of missing genes (",
round((missing_is_tot / nrow(x)) * 100, 2), "% of",
"total IS in input)"
)
)
rlang::warn(warn_miss, class = "warn_miss_genes")
}
if (res_checks$return_missing_as_df) {
result$missing_genes <- missing_genes
result$missing_is <- list(
absolute = missing_is_tot,
perc = round(
(missing_is_tot / nrow(x)) * 100, 2
)
)
}
return(result)
}
# Internal computation of CIS_grubbs test
.cis_grubb_calc <- function(
x,
grubbs_flanking_gene_bp,
threshold_alpha,
gene_symbol_col,
gene_strand_col,
chr_col,
locus_col,
strand_col) {
## -- Grouping by gene
df_by_gene <- if (requireNamespace("psych", quietly = TRUE)) {
x |>
dplyr::group_by(
dplyr::across(
dplyr::all_of(c(gene_symbol_col, gene_strand_col, chr_col))
)
) |>
dplyr::summarise(
n = dplyr::n(),
mean = mean(.data[[locus_col]]),
sd = stats::sd(.data[[locus_col]], na.rm = TRUE),
median = stats::median(.data[[locus_col]]),
trimmed = mean(.data[[locus_col]], trim = .1),
mad = stats::mad(.data[[locus_col]]),
min = min(.data[[locus_col]]),
max = max(.data[[locus_col]]),
range = max(.data[[locus_col]]) - min(.data[[locus_col]]),
skew = psych::skew(.data[[locus_col]]),
kurtosis = psych::kurtosi(.data[[locus_col]]),
n_IS_perGene = dplyr::n_distinct(
.data[[locus_col]]
),
min_bp_integration_locus =
min(.data[[locus_col]]),
max_bp_integration_locus =
max(.data[[locus_col]]),
IS_span_bp = (max(.data[[locus_col]]) -
min(.data[[locus_col]])),
avg_bp_integration_locus =
mean(.data[[locus_col]]),
median_bp_integration_locus =
stats::median(.data[[locus_col]]),
# NOTE: dplyr::if_else() evaluates both branches eagerly and
# would error on a NULL strand_col, hence the plain conditional
distinct_orientations = if (is.null(strand_col)) {
NA_integer_
} else {
dplyr::n_distinct(.data[[strand_col]])
},
average_TxLen = .data$average_TxLen[1],
.groups = "drop"
)
} else {
x |>
dplyr::group_by(
dplyr::across(
dplyr::all_of(c(gene_symbol_col, gene_strand_col, chr_col))
)
) |>
dplyr::summarise(
n = dplyr::n(),
mean = mean(.data[[locus_col]]),
sd = stats::sd(.data[[locus_col]], na.rm = TRUE),
median = stats::median(.data[[locus_col]]),
trimmed = mean(.data[[locus_col]], trim = .1),
mad = stats::mad(.data[[locus_col]]),
min = min(.data[[locus_col]]),
max = max(.data[[locus_col]]),
range = max(.data[[locus_col]]) - min(.data[[locus_col]]),
n_IS_perGene = dplyr::n_distinct(
.data[[locus_col]]
),
min_bp_integration_locus =
min(.data[[locus_col]]),
max_bp_integration_locus =
max(.data[[locus_col]]),
IS_span_bp = (max(.data[[locus_col]]) -
min(.data[[locus_col]])),
avg_bp_integration_locus =
mean(.data[[locus_col]]),
median_bp_integration_locus =
stats::median(.data[[locus_col]]),
# NOTE: dplyr::if_else() evaluates both branches eagerly and
# would error on a NULL strand_col, hence the plain conditional
distinct_orientations = if (is.null(strand_col)) {
NA_integer_
} else {
dplyr::n_distinct(.data[[strand_col]])
},
average_TxLen = .data$average_TxLen[1],
.groups = "drop"
)
}
n_elements <- nrow(df_by_gene)
### Grubbs test
### --- Gene Frequency
df_bygene_withannotation <- df_by_gene |>
dplyr::mutate(
raw_gene_integration_frequency =
.data$n_IS_perGene / .data$average_TxLen,
integration_frequency_withtolerance = (.data$n_IS_perGene /
(.data$average_TxLen + grubbs_flanking_gene_bp)) * 1000,
minus_log2_integration_freq_withtolerance =
-log(x = .data$integration_frequency_withtolerance, base = 2)
)
### --- z score
z_mlif <- function(x) {
sqrt((n_elements * (n_elements - 2) * x^2) /
(((n_elements - 1)^2) - (n_elements * x^2)))
}
df_bygene_withannotation <- df_bygene_withannotation |>
dplyr::mutate(
zscore_minus_log2_int_freq_tolerance =
scale(.data$minus_log2_integration_freq_withtolerance)[, 1],
neg_zscore_minus_log2_int_freq_tolerance =
-.data$zscore_minus_log2_int_freq_tolerance,
t_z_mlif = z_mlif(
.data$neg_zscore_minus_log2_int_freq_tolerance
)
)
### --- tdist
t_dist_2t <- function(x, deg) {
return((1 - stats::pt(x, deg)) * 2)
}
df_bygene_withannotation <- df_bygene_withannotation |>
dplyr::mutate(
tdist2t = t_dist_2t(.data$t_z_mlif, n_elements - 2),
tdist_pt = stats::pt(
q = .data$t_z_mlif,
df = n_elements - 2
),
tdist_bonferroni_default = ifelse(
.data$tdist2t * n_elements > 1, 1,
.data$tdist2t * n_elements
),
tdist_bonferroni = stats::p.adjust(
.data$tdist2t,
method = "bonferroni",
n = length(.data$tdist2t)
),
tdist_fdr = stats::p.adjust(
.data$tdist2t,
method = "fdr",
n = length(.data$tdist2t)
),
tdist_benjamini = stats::p.adjust(
.data$tdist2t,
method = "BY",
n = length(.data$tdist2t)
)
)
df_bygene_withannotation <- df_bygene_withannotation |>
dplyr::mutate(
tdist_positive_and_corrected =
ifelse(
(.data$tdist_bonferroni_default < threshold_alpha &
.data$neg_zscore_minus_log2_int_freq_tolerance > 0),
.data$tdist_bonferroni_default,
NA
),
tdist_positive = ifelse(
(.data$tdist2t < threshold_alpha &
.data$neg_zscore_minus_log2_int_freq_tolerance > 0),
.data$tdist2t,
NA
)
)
EM_correction_N <- sum(
!is.na(df_bygene_withannotation$tdist_positive)
)
df_bygene_withannotation <- df_bygene_withannotation |>
dplyr::mutate(
tdist_positive_and_correctedEM =
ifelse(
(.data$tdist2t * EM_correction_N <
threshold_alpha &
.data$neg_zscore_minus_log2_int_freq_tolerance > 0),
.data$tdist2t * EM_correction_N,
NA
)
)
return(df_bygene_withannotation)
}
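# Minimal numeric sketch of the conversion used in `.cis_grubb_calc()`:
# a z score is mapped to a Grubbs-style t statistic with
# t = sqrt(n * (n - 2) * z^2 / ((n - 1)^2 - n * z^2)) (see `z_mlif()`)
# and then to a two-tailed p-value on a t distribution with n - 2
# degrees of freedom (see `t_dist_2t()`). Values are hypothetical.
if (FALSE) {
    n <- 50 # number of genes
    z <- 3 # z score of one gene
    t_stat <- sqrt((n * (n - 2) * z^2) / (((n - 1)^2) - (n * z^2)))
    p_two_tailed <- (1 - stats::pt(t_stat, df = n - 2)) * 2
    # Bonferroni correction capped at 1, as in `tdist_bonferroni_default`
    min(p_two_tailed * n, 1)
}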
#---- USED IN : CIS_volcano_plot ----
#' @importFrom rlang arg_match abort inform current_env
#' @importFrom utils read.delim
.load_onco_ts_genes <- function(
onco_db_file,
tumor_suppressors_db_file,
species) {
onco_db <- if (onco_db_file == "proto_oncogenes") {
utils::data("proto_oncogenes", envir = rlang::current_env())
rlang::current_env()$proto_oncogenes
} else {
if (!file.exists(onco_db_file)) {
onco_db_not_found <- c("`onco_db_file` was not found",
x = "Did you provide the correct path?"
)
rlang::abort(onco_db_not_found)
}
utils::read.delim(
file = onco_db_file, header = TRUE,
fill = TRUE, sep = "\t",
check.names = FALSE,
na.strings = c("NONE", "NA", "NULL", "NaN", "")
)
}
tumsup_db <- if (tumor_suppressors_db_file == "tumor_suppressors") {
utils::data("tumor_suppressors", envir = rlang::current_env())
rlang::current_env()$tumor_suppressors
} else {
if (!file.exists(tumor_suppressors_db_file)) {
tum_db_not_found <- c("`tumor_suppressors_db_file` was not found",
x = "Did you provide the correct path?"
)
rlang::abort(tum_db_not_found)
}
utils::read.delim(
file = tumor_suppressors_db_file,
header = TRUE,
fill = TRUE, sep = "\t",
check.names = FALSE,
na.strings = c("NONE", "NA", "NULL", "NaN", "")
)
}
specie <- rlang::arg_match(species, values = c("human", "mouse", "all"))
specie_name <- switch(specie,
"human" = "Homo sapiens (Human)",
"mouse" = "Mus musculus (Mouse)",
"all" = c(
"Homo sapiens (Human)",
"Mus musculus (Mouse)"
)
)
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
load_onco_msg <- c(
"Loading annotated genes - species selected: ",
paste(c(specie_name), collapse = ", ")
)
rlang::inform(load_onco_msg)
}
# Filter and merge
onco_df <- .filter_db(onco_db, specie_name, "OncoGene")
tumsup_df <- .filter_db(tumsup_db, specie_name, "TumorSuppressor")
oncots_df_to_use <- .merge_onco_tumsup(onco_df, tumsup_df)
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
rlang::inform(c(
"Loading annotated genes - done"
))
}
return(oncots_df_to_use)
}
#' @importFrom rlang .data
.filter_db <- function(onco_db, species, output_col_name) {
filtered_db <- onco_db |>
dplyr::filter(.data$Organism %in% species) |>
dplyr::filter(.data$Status == "reviewed" &
!is.na(.data$`Gene names`)) |>
dplyr::select(dplyr::all_of("Gene names")) |>
dplyr::distinct()
filtered_db <- filtered_db |>
dplyr::mutate(`Gene names` = stringr::str_split(
.data$`Gene names`, " "
)) |>
tidyr::unnest("Gene names") |>
dplyr::mutate({{ output_col_name }} := .data$`Gene names`) |>
dplyr::rename("GeneName" = "Gene names")
return(filtered_db)
}
#' @importFrom rlang .data
.merge_onco_tumsup <- function(onco_df, tum_df) {
onco_tumsup <- onco_df |>
dplyr::full_join(tum_df, by = "GeneName") |>
dplyr::mutate(Onco1_TS2 = ifelse(
!is.na(.data$OncoGene) & !is.na(.data$TumorSuppressor),
yes = 3,
no = ifelse(!is.na(.data$OncoGene),
yes = 1,
no = ifelse(!is.na(.data$TumorSuppressor),
yes = 2,
no = NA
)
)
)) |>
dplyr::select(-c("OncoGene", "TumorSuppressor")) |>
dplyr::distinct()
return(onco_tumsup)
}
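# Toy sketch of the `Onco1_TS2` encoding produced above: 1 = oncogene
# only, 2 = tumor suppressor only, 3 = both. Gene names are hypothetical.
if (FALSE) {
    onco <- tibble::tibble(GeneName = c("G1", "G3"), OncoGene = c("G1", "G3"))
    tumsup <- tibble::tibble(
        GeneName = c("G2", "G3"),
        TumorSuppressor = c("G2", "G3")
    )
    .merge_onco_tumsup(onco, tumsup)
    #> GeneName Onco1_TS2
    #> G1       1
    #> G3       3
    #> G2       2
}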
.expand_cis_df <- function(
cis_grubbs_df,
gene_sym_col,
onco_db_file,
tumor_suppressors_db_file,
species,
known_onco,
suspicious_genes) {
## Load onco and ts
oncots_to_use <- .load_onco_ts_genes(
onco_db_file,
tumor_suppressors_db_file,
species
)
## Join all dfs by gene
cis_grubbs_df <- cis_grubbs_df |>
dplyr::left_join(oncots_to_use, by = gene_sym_col) |>
dplyr::left_join(known_onco, by = gene_sym_col) |>
dplyr::left_join(suspicious_genes, by = gene_sym_col)
## Add info
cis_grubbs_df <- cis_grubbs_df |>
dplyr::mutate(
KnownGeneClass = ifelse(
is.na(.data$Onco1_TS2),
yes = "Other",
no = ifelse(.data$Onco1_TS2 == 1,
yes = "OncoGene",
no = "TumSuppressor"
)
),
CriticalForInsMut = !is.na(.data$KnownClonalExpansion)
)
return(cis_grubbs_df)
}
#---- USED IN : outliers_by_pool_fragments ----
.outlier_pool_frag_base_checks <- function(
metadata,
key,
outlier_p_value_threshold,
normality_test,
normality_p_value_threshold,
transform_log2,
min_samples_per_pool,
per_pool_test,
pool_col, pcr_id_col) {
stopifnot(is.data.frame(metadata))
stopifnot(is.character(key))
if (!all(key %in% colnames(metadata))) {
rlang::abort(
.missing_user_cols_error(key[!key %in%
colnames(metadata)]),
class = "missing_cols_key"
)
}
stopifnot(is.numeric(outlier_p_value_threshold))
stopifnot(is.logical(normality_test))
if (normality_test) {
stopifnot(is.numeric(normality_p_value_threshold))
}
stopifnot(is.logical(transform_log2))
stopifnot(is.logical(per_pool_test))
if (per_pool_test) {
stopifnot(is.character(pool_col))
stopifnot(is.numeric(min_samples_per_pool) &&
length(min_samples_per_pool) == 1)
if (!all(pool_col %in% colnames(metadata))) {
rlang::abort(
.missing_user_cols_error(
pool_col[!pool_col %in% colnames(metadata)]
),
class = "missing_cols_pool"
)
}
}
if (!pcr_id_col %in% colnames(metadata)) {
rlang::abort(.missing_user_cols_error(pcr_id_col))
}
}
.outlier_test_verify_logiop <- function(key, flag_logic, symbol_name) {
if (length(key) > 1) {
## Verify logic
stopifnot(is.character(flag_logic))
if (length(flag_logic) > length(key) - 1) {
flag_logic_new <- flag_logic[seq_len(length(key) - 1)]
rlang::env_poke(
env = rlang::caller_env(), nm = symbol_name,
value = flag_logic_new
)
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
flag_msg <- c(
paste0(
"'", symbol_name,
"' has more elements than expected"
),
i = paste(
"The vector will be trimmed to consider",
"the first", length(key) - 1, "elements",
"only."
)
)
rlang::inform(flag_msg, class = "flag_logic_long")
}
} else if (length(flag_logic) != length(key) - 1 &&
length(flag_logic) != 1) {
flag_logic_new <- flag_logic[1]
rlang::env_poke(
env = rlang::caller_env(), nm = symbol_name,
value = flag_logic_new
)
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
flag_logic_err <- c(
paste0(
"'", symbol_name, "' has an incorrect",
" number of elements"
),
paste(
"You should provide 1 or",
length(key) - 1,
"logical operators"
),
i = "Only the first parameter will be considered"
)
rlang::inform(flag_logic_err,
class = "flag_logic_short"
)
}
} else {
flag_logic_new <- flag_logic
}
flag_logic_new <- toupper(flag_logic_new)
rlang::env_poke(
env = rlang::caller_env(), nm = symbol_name,
value = flag_logic_new
)
if (!all(flag_logic_new %in% flag_logics())) {
unknown_logi_op_err <- c(paste(
"Unknown or unsupported logical operators:",
paste0(flag_logic_new[!flag_logic_new %in% flag_logics()],
collapse = ", "
)
))
rlang::abort(unknown_logi_op_err, class = "unsupp_logi_op")
}
}
}
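# Minimal sketch of the `rlang::env_poke()` pattern used above: the
# helper rewrites a variable in the caller's environment by name, so
# the caller sees the sanitized value without an explicit reassignment.
# `fix_flag()` and `demo()` are hypothetical.
if (FALSE) {
    fix_flag <- function() {
        rlang::env_poke(
            env = rlang::caller_env(), nm = "flag_logic",
            value = "AND"
        )
    }
    demo <- function() {
        flag_logic <- c("and", "or") # value to be sanitized
        fix_flag()
        flag_logic # now "AND"
    }
    demo()
}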
# Actual computation of statistical test on pre-filtered metadata (no NA values)
#' @importFrom rlang sym
.pool_frag_calc <- function(
meta,
key,
by_pool,
normality_test,
normality_threshold,
pool_col,
min_samples_per_pool,
log2,
pcr_id_col) {
log2_removed_report <- NULL
if (log2) {
## discard values <= 0 and report removed
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
rlang::inform("Log2 transformation, removing values <= 0")
}
old_meta <- meta
meta <- meta |>
dplyr::filter(dplyr::if_all(
.cols = dplyr::all_of(key),
.fns = ~ .x > 0
))
log2_removed_report <- old_meta |>
dplyr::anti_join(meta, by = pcr_id_col) |>
dplyr::distinct(dplyr::across(dplyr::all_of(c(
pool_col, pcr_id_col, key
))))
}
if (by_pool) {
# Group by pool
pool_sample_count <- meta |>
dplyr::group_by(dplyr::across(dplyr::all_of(pool_col))) |>
dplyr::summarise(N = dplyr::n(), .groups = "drop") |>
dplyr::filter(.data$N >= min_samples_per_pool)
pools_to_process <- meta |>
dplyr::semi_join(pool_sample_count, by = pool_col)
split <- pools_to_process |>
dplyr::group_by(dplyr::across(dplyr::all_of(pool_col))) |>
dplyr::group_split()
test_res <- purrr::map(
split,
~ .process_pool_frag(
.x, key, normality_test,
normality_threshold, log2
)
)
test_res <- purrr::list_rbind(test_res) |>
dplyr::mutate(processed = TRUE)
# Groups not processed because not enough samples
non_proc <- meta |>
dplyr::anti_join(pool_sample_count, by = pool_col) |>
dplyr::mutate(processed = FALSE)
final <- test_res |>
dplyr::bind_rows(non_proc)
# Rows not processed because log2 requested and value <= 0
if (log2) {
log2_removed <- old_meta |>
dplyr::anti_join(
meta,
by = pcr_id_col
) |>
dplyr::mutate(processed = FALSE)
final <- final |>
dplyr::bind_rows(log2_removed)
}
return(list(
metadata = final,
removed_zeros = log2_removed_report,
non_proc_samples = non_proc
))
} else {
test_res <- .process_pool_frag(
chunk = meta,
key = key,
normality_test = normality_test,
normality_threshold = normality_threshold,
log2 = log2
)
final <- test_res |>
dplyr::mutate(processed = TRUE)
# Rows not processed because log2 requested and value <= 0
if (log2) {
log2_removed <- old_meta |>
dplyr::filter(dplyr::if_any(
.cols = dplyr::all_of(key),
.fns = ~ .x <= 0
)) |>
dplyr::mutate(processed = FALSE)
final <- final |>
dplyr::bind_rows(log2_removed)
}
return(list(
metadata = final,
removed_zeros = log2_removed_report
))
}
}
# Computes the actual calculations for each key (either on a pool chunk
# or on the entire df)
#' @importFrom purrr map reduce
#' @importFrom dplyr bind_cols
.process_pool_frag <- function(
chunk, key, normality_test,
normality_threshold,
log2) {
res <- purrr::map(key, ~ .tests_pool_frag(
x = chunk[[.x]],
suffix = .x,
normality_test = normality_test,
normality_threshold = normality_threshold,
log2 = log2
))
res <- purrr::list_cbind(res)
res <- chunk |>
dplyr::bind_cols(res)
res
}
# Computation on single vectors (columns)
#' @importFrom stats dt shapiro.test
#' @importFrom tibble tibble
.tests_pool_frag <- function(
x,
suffix, normality_test,
normality_threshold,
log2) {
if (log2) {
x <- log2(x)
}
norm <- NA
res <- list(NA_real_, NA_real_, NA_real_)
z_statistic <- function() {
zscore <- scale(x)
zscore <- zscore[, 1]
count <- length(x)
tstudent <- sqrt(
((count * (count - 2) * (zscore)^2) /
((count - 1)^2 - count * (zscore)^2))
)
tdist <- stats::dt(tstudent, df = count - 2)
return(list(zscore = zscore, tstudent = tstudent, tdist = tdist))
}
if (normality_test) {
withCallingHandlers(
{
withRestarts(
{
shapiro_test <- stats::shapiro.test(x)
norm <- shapiro_test$p.value >= normality_threshold
if (norm) {
res <- z_statistic()
}
},
test_err = function() {
rlang::inform(
c("Unable to perform normality test, skipping")
)
}
)
},
error = function(cnd) {
rlang::inform(cnd$message)
invokeRestart("test_err")
}
)
} else {
res <- z_statistic()
}
results <- if (log2) {
res_colnames <- c("log2", "normality", "zscore", "tstudent", "tdist")
res_colnames <- paste(res_colnames, suffix, sep = "_")
res <- setNames(res, res_colnames[seq(3, 5)])
res <- append(setNames(list(x, norm), res_colnames[c(1, 2)]), res)
tibble::as_tibble(res)
} else {
res_colnames <- c("normality", "zscore", "tstudent", "tdist")
res_colnames <- paste(res_colnames, suffix, sep = "_")
res <- setNames(res, res_colnames[seq(2, 4)])
res <- append(setNames(list(norm), res_colnames[1]), res)
tibble::as_tibble(res)
}
return(results)
}
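# Numeric sketch of the per-column statistics computed above on
# hypothetical fragment counts: `scale()` gives z scores, which are
# converted to t statistics and evaluated on the t density, exactly as
# in `z_statistic()`.
if (FALSE) {
    x <- c(120000, 98000, 110000, 4000, 105000) # one suspiciously low value
    z <- scale(x)[, 1]
    n <- length(x)
    t_stat <- sqrt((n * (n - 2) * z^2) / ((n - 1)^2 - n * z^2))
    t_dens <- stats::dt(t_stat, df = n - 2)
    # A small density together with a negative z score marks a candidate
    # outlier (see `.flag_cond_outliers_pool_frag()` below)
    tibble::tibble(x, z, t_dens)
}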
# Flag logic to use on a single key component
# returns a logical vector
#' @importFrom purrr pmap_lgl
.flag_cond_outliers_pool_frag <- function(
proc,
tdist,
zscore,
outlier_threshold) {
# basic flag condition is tdist < out.thresh AND zscore < 0
purrr::pmap_lgl(list(proc, tdist, zscore), function(p, t, z) {
# isFALSE() also guards against a possible NA in `proc`
if (isFALSE(p)) {
return(FALSE)
}
if (is.na(t) || is.na(z)) {
return(FALSE)
}
return(t < outlier_threshold && z < 0)
})
}
# Obtains and applies the global logic on multiple columns, based on the
# single-key logic
#' @importFrom tibble tibble
.apply_flag_logic <- function(..., logic) {
all_flags <- rlang::list2(...)
partial <- NULL
index <- 1
for (op in logic) {
switch(op,
"AND" = {
if (is.null(partial)) {
partial <- all_flags[[1]] & all_flags[[2]]
index <- 3
} else {
partial <- partial & all_flags[[index]]
index <- index + 1
}
},
"OR" = {
if (is.null(partial)) {
partial <- all_flags[[1]] | all_flags[[2]]
index <- 3
} else {
partial <- partial | all_flags[[index]]
index <- index + 1
}
},
"XOR" = {
if (is.null(partial)) {
partial <- xor(all_flags[[1]], all_flags[[2]])
index <- 3
} else {
partial <- xor(partial, all_flags[[index]])
index <- index + 1
}
},
"NAND" = {
if (is.null(partial)) {
partial <- !(all_flags[[1]] & all_flags[[2]])
index <- 3
} else {
partial <- !(partial & all_flags[[index]])
index <- index + 1
}
},
"NOR" = {
if (is.null(partial)) {
partial <- !(all_flags[[1]] | all_flags[[2]])
index <- 3
} else {
partial <- !(partial | all_flags[[index]])
index <- index + 1
}
},
"XNOR" = {
if (is.null(partial)) {
partial <- !xor(all_flags[[1]], all_flags[[2]])
index <- 3
} else {
partial <- !xor(partial, all_flags[[index]])
index <- index + 1
}
}
)
}
return(partial)
}
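# Quick sketch of how `.apply_flag_logic()` folds flag vectors left to
# right: with three flags and logic c("AND", "OR") the result is
# (flag1 & flag2) | flag3. Inputs are hypothetical.
if (FALSE) {
    .apply_flag_logic(
        c(TRUE, TRUE, FALSE),
        c(TRUE, FALSE, FALSE),
        c(FALSE, TRUE, FALSE),
        logic = c("AND", "OR")
    )
    #> TRUE TRUE FALSE
}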
.validate_outlier_output_format <- function(out, meta_rows, pcr_id_col) {
format_err <- c("Wrong outlier test result format",
x = paste(
"Test results should follow the format",
"described in the documentation"
),
i = "See `?outlier_filter`"
)
if (any(!c("to_remove", pcr_id_col) %in% colnames(out))) {
rlang::abort(format_err, class = "outlier_format_err")
}
if (!all(meta_rows %in% out[[pcr_id_col]])) {
rlang::abort(format_err, class = "outlier_format_err")
}
}
#---- USED IN : HSC_population_size_estimate ----
# Matches the vector of celltypes to tissues to form a list of groups to
# process
.match_celltypes_tissues <- function(cell_type, tissue) {
if (length(cell_type) > 1 || length(tissue) > 1) {
if (length(cell_type) == 1) {
cell_type <- rep_len(cell_type, length(tissue))
} else if (length(tissue) == 1) {
tissue <- rep_len(tissue, length(cell_type))
} else if (length(cell_type) != length(tissue)) {
err_length <- c(
"Cell types and tissue vectors have mismatching lengths",
i = "See ?HSC_population_size_estimate details"
)
rlang::abort(err_length, class = "pop_size_err_length")
}
}
groups_to_proc <- purrr::map2(cell_type, tissue, ~ c(.x, .y))
duplicated_ind <- duplicated(groups_to_proc)
groups_to_proc <- groups_to_proc[!duplicated_ind]
return(groups_to_proc)
}
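# Sketch of the recycling behavior above: a single cell type is repeated
# to match multiple tissues and duplicated pairs are dropped. Values are
# hypothetical.
if (FALSE) {
    .match_celltypes_tissues("MYELOID", c("PB", "BM", "PB"))
    #> list(c("MYELOID", "PB"), c("MYELOID", "BM"))
}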
# Converts a group slice df to a captures matrix - input df contains only
# timepoints relative to same patient, tissue and cell type
.convert_to_captures_matrix <- function(df, quant_cols,
tissue_col,
timepoint_column,
annotation_cols) {
captures_matrix <- df |>
dplyr::mutate(
!!timepoint_column := as.numeric(.data[[timepoint_column]])
) |>
dplyr::mutate(bin = 1) |>
dplyr::select(-dplyr::all_of(quant_cols)) |>
tidyr::pivot_wider(
names_from = dplyr::all_of(c(
"CellType",
tissue_col,
timepoint_column
)),
values_from = dplyr::all_of("bin"),
names_sort = TRUE,
values_fill = 0
) |>
dplyr::select(-dplyr::all_of(c(
mandatory_IS_vars(),
annotation_cols
))) |>
as.matrix()
return(captures_matrix)
}
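# Standalone sketch of the captures matrix idea, simplified to a single
# IS id column instead of the package's mandatory IS variables: each IS
# becomes a row and each CellType_Tissue_TimePoint combination a 0/1
# capture column. Data are hypothetical.
if (FALSE) {
    iss <- tibble::tibble(
        is_id = c("is1", "is1", "is2"),
        CellType = "MYELOID",
        Tissue = "PB",
        TimePoint = c(1, 3, 3)
    )
    iss |>
        dplyr::mutate(bin = 1) |>
        tidyr::pivot_wider(
            names_from = dplyr::all_of(c("CellType", "Tissue", "TimePoint")),
            values_from = dplyr::all_of("bin"),
            names_sort = TRUE,
            values_fill = 0
        )
    #> is_id MYELOID_PB_1 MYELOID_PB_3
    #> is1   1            1
    #> is2   0            1
}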
# Calculates population estimates (all)
#' @importFrom purrr map_lgl detect_index
#' @importFrom tidyr pivot_wider
.estimate_pop <- function(
df,
seqCount_column,
fragmentEstimate_column,
timepoint_column,
annotation_cols,
stable_timepoints,
subject,
tissue_col) {
quant_cols <- if (is.null(fragmentEstimate_column)) {
seqCount_column
} else {
c(seqCount_column, fragmentEstimate_column)
}
logger <- list()
# --- STABLE TIMEPOINTS?
found_stable <- purrr::map_lgl(
stable_timepoints,
~ .x %in%
as.numeric(df[[timepoint_column]])
)
first_stable_index <- if (length(found_stable) > 0) {
purrr::detect_index(found_stable, ~ .x == TRUE)
} else {
0
}
stable_tps <- first_stable_index > 0
# --- OBTAIN MATRIX (ALL TPs)
matrix_desc <- .convert_to_captures_matrix(
df,
quant_cols = quant_cols,
tissue_col = tissue_col,
timepoint_column = timepoint_column,
annotation_cols = annotation_cols
)
# --- OBTAIN MATRIX (STABLE TPs)
patient_slice_stable <- if (first_stable_index > 0) {
first_stable <- stable_timepoints[first_stable_index]
.convert_to_captures_matrix(
df |>
dplyr::filter(as.numeric(.data[[timepoint_column]]) >=
first_stable),
quant_cols = quant_cols,
tissue_col = tissue_col,
timepoint_column = timepoint_column,
annotation_cols = annotation_cols
)
} else {
NULL
}
# --- ESTIMATES
timecaptures <- ncol(matrix_desc)
cols_estimate_mcm <- c("Model", "abundance", "stderr")
## models0
### Estimate abundance without bias correction or heterogeneity
#### For all tps
models0_all <- .closed_m0_est(
matrix_desc = matrix_desc,
timecaptures = timecaptures,
cols_estimate_mcm = cols_estimate_mcm,
subject = subject, stable = FALSE
)
if (is.null(models0_all) || nrow(models0_all) == 0) {
logger <- append(logger, list(
glue::glue_collapse(
c(
glue::glue("{subject}, {df$CellType[1]}, "),
glue::glue("{df[[tissue_col]][1]}"),
" - Models0 estimates tp 'All' is empty"
)
)
))
}
#### For the slice (first stable - last tp)
models0_stable <- if (!is.null(patient_slice_stable) &&
ncol(patient_slice_stable) > 1) {
.closed_m0_est(
matrix_desc = patient_slice_stable,
timecaptures = ncol(patient_slice_stable),
cols_estimate_mcm = cols_estimate_mcm,
subject = subject, stable = TRUE
)
} else {
NULL
}
if (is.null(models0_stable) || nrow(models0_stable) == 0) {
logger <- append(logger, list(
glue::glue_collapse(
c(
glue::glue("{subject}, {df$CellType[1]}, "),
glue::glue("{df[[tissue_col]][1]}"),
" - Models0 estimates tp 'Stable' is empty"
)
)
))
}
## BC models
### Estimate abundance with bias correction
models_bc_all <- .closed_mthchaobc_est(
matrix_desc = matrix_desc,
timecaptures = timecaptures,
cols_estimate_mcm = cols_estimate_mcm,
subject = subject, stable = FALSE
)
if (is.null(models_bc_all) || nrow(models_bc_all) == 0) {
logger <- append(logger, list(
glue::glue_collapse(
c(
glue::glue("{subject}, {df$CellType[1]}, "),
glue::glue("{df[[tissue_col]][1]}"),
" - ModelsBC estimates tp 'All' is empty"
)
)
))
}
models_bc_stable <- if (!is.null(patient_slice_stable) &&
ncol(patient_slice_stable) > 1) {
.closed_mthchaobc_est(
matrix_desc = patient_slice_stable,
timecaptures = ncol(patient_slice_stable),
cols_estimate_mcm = cols_estimate_mcm,
subject = subject, stable = TRUE
)
} else {
NULL
}
if (is.null(models_bc_stable) || nrow(models_bc_stable) == 0) {
logger <- append(logger, list(
glue::glue_collapse(
c(
glue::glue("{subject}, {df$CellType[1]}, "),
glue::glue("{df[[tissue_col]][1]}"),
" - ModelsBC estimates tp 'Stable' is empty"
)
)
))
}
## Consecutive timepoints
### Model m0 bc
estimate_consecutive_m0 <- if (ncol(matrix_desc) > 1) {
.consecutive_m0bc_est(
matrix_desc = matrix_desc,
cols_estimate_mcm = cols_estimate_mcm,
subject = subject
)
} else {
NULL
}
if (is.null(estimate_consecutive_m0) ||
nrow(estimate_consecutive_m0) == 0) {
logger <- append(logger, list(
glue::glue_collapse(
c(
glue::glue("{subject}, {df$CellType[1]}, "),
glue::glue("{df[[tissue_col]][1]}"),
" - Models0 estimates tp 'Consecutive' is empty"
)
)
))
}
### Model Mth
estimate_consecutive_mth <- if (stable_tps && ncol(matrix_desc) > 2) {
# - Note: pass the whole matrix, not only stable slice
.consecutive_mth_est(
matrix_desc = matrix_desc,
cols_estimate_mcm = cols_estimate_mcm,
subject = subject
)
} else {
NULL
}
if (is.null(estimate_consecutive_mth) ||
nrow(estimate_consecutive_mth) == 0) {
logger <- append(logger, list(
glue::glue_collapse(
c(
glue::glue("{subject}, {df$CellType[1]}, "),
glue::glue("{df[[tissue_col]][1]}"),
" - ModelsMth estimates tp 'Consecutive' is empty"
)
)
))
}
results <- models0_all |>
dplyr::bind_rows(models0_stable) |>
dplyr::bind_rows(models_bc_all) |>
dplyr::bind_rows(models_bc_stable) |>
dplyr::bind_rows(estimate_consecutive_m0) |>
dplyr::bind_rows(estimate_consecutive_mth)
return(list(est = results, logger = logger))
}
#' @importFrom stringr str_split
#' @importFrom tibble as_tibble add_column
#' @importFrom dplyr select all_of
.closed_m0_est <- function(
matrix_desc, timecaptures, cols_estimate_mcm,
subject, stable) {
### Estimate abundance without bias correction or heterogeneity
models0 <- Rcapture::closedp.0(
matrix_desc,
dfreq = FALSE,
dtype = "hist",
t = timecaptures,
neg = FALSE
)
splitted_col_nms1 <- (stringr::str_split(
colnames(matrix_desc)[1],
pattern = "_"
))[[1]]
splitted_col_nmsn <- (stringr::str_split(
colnames(matrix_desc)[ncol(matrix_desc)],
pattern = "_"
))[[1]]
tp_string <- if (stable) {
"Stable"
} else {
"All"
}
estimate_results <- tibble::as_tibble(models0$results,
rownames = "Model"
) |>
dplyr::select(dplyr::all_of(cols_estimate_mcm)) |>
tibble::add_column(
SubjectID = subject,
Timepoints = tp_string,
CellType = splitted_col_nms1[1],
Tissue = splitted_col_nms1[2],
TimePoint_from = as.numeric(splitted_col_nms1[3]),
TimePoint_to = as.numeric(splitted_col_nmsn[3]),
Timepoints_included = timecaptures,
ModelType = "ClosedPopulation",
ModelSetUp = "models0"
)
estimate_results
}
#' @importFrom stringr str_split
#' @importFrom tibble as_tibble add_column
#' @importFrom dplyr select all_of
.closed_mthchaobc_est <- function(
matrix_desc, timecaptures, cols_estimate_mcm,
subject, stable) {
mthchaobc_m0 <- Rcapture::closedp.bc(
matrix_desc,
dfreq = FALSE,
dtype = "hist",
t0 = timecaptures,
m = c("M0")
)
mthchaobc_m0 <- tibble::as_tibble(mthchaobc_m0$results,
rownames = "Model"
)
mthchaobc_mt <- Rcapture::closedp.bc(
matrix_desc,
dfreq = FALSE,
dtype = "hist",
t0 = timecaptures,
m = c("Mt")
)
mthchaobc_mt <- tibble::as_tibble(mthchaobc_mt$results,
rownames = "Model"
)
mthchaobc_mth <- if (timecaptures > 2) {
tmp <- Rcapture::closedp.bc(
matrix_desc,
dfreq = FALSE,
dtype = "hist",
t0 = timecaptures,
m = c("Mth")
)
tibble::as_tibble(tmp$results,
rownames = "Model"
)
} else {
NULL
}
splitted_col_nms1 <- (stringr::str_split(
colnames(matrix_desc)[1],
pattern = "_"
))[[1]]
splitted_col_nmsn <- (stringr::str_split(
colnames(matrix_desc)[ncol(matrix_desc)],
pattern = "_"
))[[1]]
tp_string <- if (stable) {
"Stable"
} else {
"All"
}
estimate_results_chaobc <- mthchaobc_m0 |>
dplyr::bind_rows(mthchaobc_mt)
if (!is.null(mthchaobc_mth)) {
estimate_results_chaobc <- estimate_results_chaobc |>
dplyr::bind_rows(mthchaobc_mth)
}
estimate_results_chaobc <- estimate_results_chaobc |>
dplyr::select(dplyr::all_of(cols_estimate_mcm)) |>
tibble::add_column(
SubjectID = subject,
Timepoints = tp_string,
CellType = splitted_col_nms1[1],
Tissue = splitted_col_nms1[2],
TimePoint_from = as.numeric(splitted_col_nms1[3]),
TimePoint_to = as.numeric(splitted_col_nmsn[3]),
Timepoints_included = timecaptures,
ModelType = "ClosedPopulation",
ModelSetUp = "mthchaobc"
)
estimate_results_chaobc
}
#' @importFrom stringr str_split
#' @importFrom tibble as_tibble add_column
#' @importFrom dplyr select all_of
.consecutive_m0bc_est <- function(matrix_desc, cols_estimate_mcm, subject) {
## Consecutive timepoints
indexes <- seq(from = 1, to = ncol(matrix_desc) - 1, by = 1)
purrr::map2(
indexes,
indexes + 1,
function(t1, t2) {
sub_matrix <- matrix_desc[, seq(from = t1, to = t2, by = 1)]
splitted_col_nms1 <- (stringr::str_split(
colnames(sub_matrix)[1],
pattern = "_"
))[[1]]
splitted_col_nmsn <- (stringr::str_split(
colnames(sub_matrix)[ncol(sub_matrix)],
pattern = "_"
))[[1]]
patient_trend_M0 <- Rcapture::closedp.bc(
sub_matrix,
dfreq = FALSE,
dtype = "hist",
t0 = 2,
m = c("M0")
)
results <- tibble::as_tibble(patient_trend_M0$results,
rownames = "Model"
) |>
dplyr::select(dplyr::all_of(cols_estimate_mcm)) |>
tibble::add_column(
SubjectID = subject,
Timepoints = "Consecutive",
CellType = splitted_col_nms1[1],
Tissue = splitted_col_nms1[2],
TimePoint_from = as.numeric(splitted_col_nms1[3]),
TimePoint_to = as.numeric(splitted_col_nmsn[3]),
Timepoints_included = 2,
ModelType = "ClosedPopulation",
ModelSetUp = "models0BC"
)
results
}
) |>
purrr::list_rbind()
}
#' @importFrom stringr str_split
#' @importFrom tibble as_tibble add_column
#' @importFrom dplyr select all_of
.consecutive_mth_est <- function(matrix_desc, cols_estimate_mcm, subject) {
indexes_s <- seq(from = 1, to = ncol(matrix_desc) - 2, by = 1)
purrr::map2(
indexes_s,
indexes_s + 2,
function(t1, t2) {
sub_matrix <- matrix_desc[, seq(from = t1, to = t2, by = 1)]
splitted_col_nms1 <- (stringr::str_split(
colnames(sub_matrix)[1],
pattern = "_"
))[[1]]
splitted_col_nmsn <- (stringr::str_split(
colnames(sub_matrix)[ncol(sub_matrix)],
pattern = "_"
))[[1]]
patient_trend_Mth <- Rcapture::closedp.bc(
sub_matrix,
dfreq = FALSE,
dtype = "hist",
t0 = 3,
m = c("Mth"),
h = "Chao"
)
result <- tibble::as_tibble(patient_trend_Mth$results,
rownames = "Model"
) |>
dplyr::select(dplyr::all_of(cols_estimate_mcm)) |>
tibble::add_column(
SubjectID = subject,
Timepoints = "Consecutive",
CellType = splitted_col_nms1[1],
Tissue = splitted_col_nms1[2],
TimePoint_from = as.numeric(splitted_col_nms1[3]),
TimePoint_to = as.numeric(splitted_col_nmsn[3]),
Timepoints_included = 3,
ModelType = "ClosedPopulation",
ModelSetUp = "modelMTHBC"
)
result
}
) |>
purrr::list_rbind()
}
# Re-aggregates a patient IS df and applies filters
# - df contains ONLY one patient
.re_agg_and_filter <- function(
df,
metadata,
fragmentEstimate_column,
seqCount_column,
tissue_col,
timepoint_column,
aggregation_key,
seqCount_threshold,
fragmentEstimate_threshold,
groups_to_proc,
annotation_cols,
subj_col) {
# --- RE-AGGREGATION - by cell type, tissue and time point
val_cols <- if (!is.null(fragmentEstimate_column)) {
c(seqCount_column, fragmentEstimate_column)
} else {
seqCount_column
}
re_agg <- aggregate_values_by_key(
x = df,
association_file = metadata,
value_cols = val_cols,
key = c("CellType", tissue_col, timepoint_column),
join_af_by = aggregation_key
)
re_agg <- re_agg |>
dplyr::filter(!is.na(.data$CellType))
new_seqCount_column <- colnames(re_agg)[stringr::str_detect(
colnames(re_agg),
seqCount_column
)]
new_fragmentEstimate_column <- if (!is.null(fragmentEstimate_column)) {
colnames(re_agg)[stringr::str_detect(
colnames(re_agg),
fragmentEstimate_column
)]
} else {
NULL
}
re_agg <- re_agg |>
dplyr::rename(!!seqCount_column := dplyr::all_of(new_seqCount_column))
if (!is.null(new_fragmentEstimate_column)) {
re_agg <- re_agg |>
dplyr::rename(!!fragmentEstimate_column := dplyr::all_of(
new_fragmentEstimate_column
))
}
### Keep only IS whose sequence count sum is greater than or equal to
### seqCount_threshold (combined in AND with the fragment estimate
### filter, when present)
per_int_sums_low <- if (is.null(fragmentEstimate_column)) {
re_agg |>
dplyr::group_by(dplyr::across(
dplyr::all_of(mandatory_IS_vars())
)) |>
dplyr::summarise(
sum_sc = sum(.data[[seqCount_column]]),
.groups = "drop"
) |>
dplyr::filter(.data$sum_sc >= seqCount_threshold)
} else {
re_agg |>
dplyr::group_by(dplyr::across(
dplyr::all_of(mandatory_IS_vars())
)) |>
dplyr::summarise(
sum_sc = sum(.data[[seqCount_column]]),
sum_fe = sum(.data[[fragmentEstimate_column]]),
.groups = "drop"
) |>
dplyr::filter(
.data$sum_sc >= seqCount_threshold,
.data$sum_fe >= fragmentEstimate_threshold |
.data$sum_fe == 0
)
}
re_agg <- re_agg |>
dplyr::semi_join(per_int_sums_low, by = mandatory_IS_vars())
### Keep rows whose cell type and tissue are among the groups to
### process and whose time point is greater than zero
filter_predicate <- purrr::map(groups_to_proc, ~ {
string_expr <- glue::glue_collapse(
c(
glue::glue("(stringr::str_to_upper(.data$CellType) == '{.x[1]}'"),
glue::glue("stringr::str_to_upper(.data[[tissue_col]]) == '{.x[2]}')")
),
sep = " & "
)
string_expr
}) |>
purrr::reduce(~ glue::glue("{.x} | {.y}"))
patient_slice <- re_agg |>
dplyr::filter(
rlang::eval_tidy(rlang::parse_expr(filter_predicate)),
as.numeric(.data[[timepoint_column]]) > 0
)
return(patient_slice)
}
## Internal to call on each slice (a slice typically corresponding to 1 patient)
.re_agg_and_estimate <- function(
df,
metadata,
fragmentEstimate_column,
seqCount_column,
tissue_col,
timepoint_column,
aggregation_key,
seqCount_threshold,
fragmentEstimate_threshold,
groups_to_proc,
annotation_cols,
subj_col,
stable_timepoints,
progress = NULL) {
# --- RE-AGGREGATION - by cell type, tissue and time point
patient_slice <- .re_agg_and_filter(
df,
metadata,
fragmentEstimate_column,
seqCount_column,
tissue_col,
timepoint_column,
aggregation_key,
seqCount_threshold,
fragmentEstimate_threshold,
groups_to_proc,
annotation_cols,
subj_col
)
# --- Split by group to process
groups_dfs <- patient_slice |>
dplyr::group_by(dplyr::across(dplyr::all_of(
c("CellType", tissue_col)
))) |>
dplyr::group_split()
# --- Filter out groups with fewer than 2 time points and log it
logger <- NULL
for (i in seq_along(groups_dfs)) {
group_i <- groups_dfs[[i]]
if (length(unique(group_i[[timepoint_column]])) < 2) {
logger <- append(logger, list(
glue::glue_collapse(
c(
glue::glue("{df[[subj_col]][1]}, {group_i$CellType[1]}, "),
glue::glue("{group_i[[tissue_col]][1]}"),
" - Has less than 2 time points, won't be processed"
)
)
))
# Use a list(NULL) placeholder: `[[i]] <- NULL` would shrink the
# list while iterating over it
groups_dfs[i] <- list(NULL)
}
}
groups_dfs <- purrr::discard(groups_dfs, is.null)
# --- Calculate estimates
estimates_per_group <- purrr::map(
groups_dfs, ~ .estimate_pop(
df = .x,
seqCount_column = seqCount_column,
fragmentEstimate_column = fragmentEstimate_column,
timepoint_column = timepoint_column,
annotation_cols = annotation_cols,
subject = df[[subj_col]][1],
stable_timepoints = stable_timepoints,
tissue_col = tissue_col
)
)
estimates_dfs <- purrr::map(estimates_per_group, ~ .x$est) |>
purrr::list_rbind()
logs <- append(logger, purrr::map(estimates_per_group, ~ .x$logger) |>
purrr::list_flatten())
if (!is.null(progress)) {
progress()
}
return(list(est = estimates_dfs, log = logs))
}
#---- USED IN : is_sharing ----
## Internal to find the absolute number of shared IS between an
## arbitrary number of groups
## Dots are group names in the sharing df
#' @importFrom rlang sym
.find_in_common <- function(..., lookup_tbl, keep_genomic_coord) {
groups <- rlang::list2(...)
filt <- lookup_tbl |>
dplyr::filter(.data$group_id %in% groups)
common <- purrr::reduce(filt$is, ~ .x |>
dplyr::inner_join(.y,
by = mandatory_IS_vars()
))
if (!keep_genomic_coord) {
return(
tibble::tibble_row(shared = nrow(common))
)
}
return(
tibble::tibble_row(
shared = nrow(common),
is_coord = list(common)
)
)
}
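# Sketch of the shared-IS computation above: the group IS tables are
# intersected pairwise with `purrr::reduce()` + `inner_join()`. Toy
# coordinates with a simplified key instead of `mandatory_IS_vars()`.
if (FALSE) {
    g1 <- tibble::tibble(chr = c("1", "2"), pos = c(100, 200))
    g2 <- tibble::tibble(chr = c("1", "3"), pos = c(100, 300))
    g3 <- tibble::tibble(chr = "1", pos = 100)
    common <- purrr::reduce(
        list(g1, g2, g3),
        ~ dplyr::inner_join(.x, .y, by = c("chr", "pos"))
    )
    nrow(common) # 1 -> the only IS present in all three groups
}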
## Internal, to use on each row of the combinations df.
## Expands the row with all its permutations, keeping the same absolute
## number of shared IS and counts if present
## - ... : row passed as a list
## - g_names: names of the groups (g1, g2...)
## - counts: are counts present? TRUE/FALSE
.sh_row_permut <- function(..., g_names, counts) {
og_row <- rlang::list2(...)
coord <- "is_coord" %in% names(og_row)
truth_tbls <- "truth_tbl_venn" %in% names(og_row)
ids <- unlist(og_row[g_names])
if (length(unique(ids)) == 1) {
return(
tibble::tibble(!!!og_row)
)
}
# If elements are different
shared_is <- og_row$shared
if (counts) {
count_union <- og_row$count_union
}
perm <- gtools::permutations(
n = length(g_names),
r = length(g_names),
v = g_names,
set = TRUE,
repeats.allowed = FALSE
)
colnames(perm) <- g_names
perm <- tibble::as_tibble(perm)
perm <- perm |>
dplyr::mutate(shared = shared_is)
if (counts) {
for (g in g_names) {
count_col <- paste0("count_", g)
perm <- perm |>
dplyr::mutate(!!count_col := paste0("count_", .data[[g]]))
}
perm <- perm |>
dplyr::mutate(count_union = count_union)
}
sub_with_val <- function(val) {
unlist(purrr::map(val, ~ og_row[[.x]]))
}
for (g in g_names) {
perm <- perm |>
dplyr::mutate(!!g := sub_with_val(.data[[g]]))
if (counts) {
count_col <- paste0("count_", g)
perm <- perm |>
dplyr::mutate(!!count_col := sub_with_val(.data[[count_col]]))
}
}
if (coord) {
perm <- perm |>
dplyr::mutate(is_coord = og_row$is_coord)
}
if (truth_tbls) {
perm <- perm |>
dplyr::mutate(truth_tbl_venn = og_row$truth_tbl_venn)
}
return(perm)
}
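# Sketch of the permutation expansion: `gtools::permutations()` yields
# every ordering of the group columns, so a minimal combination row
# (g1 = "A", g2 = "B") is expanded to both (A, B) and (B, A) carrying
# the same `shared` value.
if (FALSE) {
    gtools::permutations(n = 2, r = 2, v = c("g1", "g2"), set = TRUE)
    #>      [,1] [,2]
    #> [1,] "g1" "g2"
    #> [2,] "g2" "g1"
}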
# Counts the number of integrations in the union of all groups of a row
.count_group_union <- function(..., col_groups, lookup_tbl) {
dots <- rlang::list2(...)
row <- tibble::as_tibble(dots)
# unlist to obtain a plain character vector for %in%
groups_in_row <- unlist(row[1, col_groups])
sub_lookup <- lookup_tbl |>
dplyr::filter(.data$group_id %in% groups_in_row)
count_union <- nrow(purrr::reduce(sub_lookup$is, dplyr::union))
row <- row |>
dplyr::mutate(count_union = count_union)
return(row)
}
# Obtains lookup table for groups
.sh_obtain_lookup <- function(key, df) {
temp <- df |>
dplyr::select(dplyr::all_of(c(key, mandatory_IS_vars()))) |>
dplyr::group_by(dplyr::across(dplyr::all_of(key))) |>
dplyr::distinct(dplyr::across(dplyr::all_of(mandatory_IS_vars())),
.keep_all = TRUE
) |>
tidyr::unite(col = "group_id", dplyr::all_of(key)) |>
tidyr::nest(.by = "group_id", .key = "is")
return(temp)
}
# Obtains truth table for venn diagrams. To apply to each row with pmap.
# - groups : g1, g2, g3...
# - lookup : df
.sh_truth_tbl_venn <- function(..., lookup, groups) {
row <- rlang::list2(...)
labels <- unlist(row[groups])
retrieved <- lookup |>
dplyr::filter(.data$group_id %in% labels)
retrieved <- retrieved |>
tidyr::unnest(dplyr::all_of("is")) |>
tidyr::unite(
col = "int_id",
dplyr::all_of(mandatory_IS_vars())
) |>
dplyr::mutate(observed = TRUE)
truth_tbl <- retrieved |>
tidyr::pivot_wider(
names_from = "group_id",
values_from = "observed",
values_fill = FALSE
)
return(truth_tbl)
}
.single_row_sharing <- function(row, lookup,
keep_genomic_coord,
is_counts,
add_is_count,
rel_sharing,
venn,
minimal,
progress) {
common_iss <- .find_in_common(!!!row,
lookup_tbl = lookup,
keep_genomic_coord = keep_genomic_coord
)
row <- row |>
dplyr::bind_cols(common_iss)
group_cols <- colnames(row)[!colnames(row) %in% c(
"shared",
"is_coord"
)]
if (add_is_count || rel_sharing) {
## Add counts -groups
for (col in group_cols) {
count_col_name <- paste0("count_", col)
counts <- (is_counts |>
dplyr::filter(.data$group_id == row[[col]]))$count
row <- row |>
dplyr::mutate(
!!count_col_name := counts
)
}
## Add counts -union
row <- .count_group_union(!!!row,
col_groups = group_cols,
lookup_tbl = lookup
)
}
if (venn) {
venn_tbl <- .sh_truth_tbl_venn(!!!row,
lookup = lookup,
groups = group_cols
)
row <- row |>
dplyr::mutate(
truth_tbl_venn = list(venn_tbl)
)
}
if (!minimal) {
row <- .sh_row_permut(!!!row,
g_names = group_cols,
counts = add_is_count || rel_sharing
)
}
if (rel_sharing) {
# for groups
for (col in group_cols) {
rel_col_name <- paste0("on_", col)
count_col_name <- paste0("count_", col)
row <- row |>
dplyr::mutate(!!rel_col_name := (.data$shared /
.data[[count_col_name]]) * 100)
}
# for union
row <- row |>
dplyr::mutate(on_union = (.data$shared /
.data$count_union) * 100)
}
if (!is.null(progress)) {
progress()
}
return(row)
}
## Computes sharing table for single input df and single key
## -key: names of the columns to group by
## -minimal: true or false, if true computes ONLY combinations and not
## all permutations
## -n_comp: number of comparisons (2-way sharing, 3-way sharing...).
## Should be less than or equal to the distinct number of groups
## -is_count: keep the counts in the table?
## -rel_sharing: compute relative sharing? (on g_i & on_union)
## -include_self_comp: include rows with the same group for each comparison?
## Useful for heatmaps. Sharing for these rows is always 100%
.sharing_singledf_single_key <- function(
df, key, minimal, n_comp,
is_count, rel_sharing,
include_self_comp,
keep_genomic_coord,
venn) {
lookup <- .sh_obtain_lookup(key, df)
# Check n and k
if (nrow(lookup) < n_comp) {
if (nrow(lookup) >= 2) {
n_comp <- nrow(lookup)
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
warn_msg <- c("Number of requested comparisons too big",
i = paste(
"The number of requested comparisons",
"is greater than the number of groups.",
"Reducing comparisons to the biggest value",
"allowed"
)
)
rlang::inform(warn_msg)
}
} else if (!include_self_comp) {
if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
warn_msg <- c("Number of requested comparisons too big",
i = paste(
"The number of requested comparisons",
"is greater than the number of groups.",
"There is only 1 group available in the input",
"data frame, nothing to compare"
)
)
rlang::inform(warn_msg)
}
return(NULL)
}
}
group_comb <- if (nrow(lookup) == 1) {
gtools::combinations(
n = length(lookup$group_id),
r = n_comp,
v = lookup$group_id,
set = TRUE,
repeats.allowed = TRUE
)
} else {
gtools::combinations(
n = length(lookup$group_id),
r = n_comp,
v = lookup$group_id,
set = TRUE,
repeats.allowed = FALSE
)
}
cols <- paste0("g", seq_len(ncol(group_comb)))
colnames(group_comb) <- cols
group_comb <- tibble::as_tibble(group_comb)
is_counts <- lookup |>
dplyr::mutate(count = purrr::map_int(.data$is, ~ nrow(.x))) |>
dplyr::select(-dplyr::all_of("is"))
sharing_df <- .execute_map_job(
data_list = group_comb |>
dplyr::group_by(
dplyr::across(dplyr::everything())
) |>
dplyr::group_split(),
fun_to_apply = .single_row_sharing,
fun_args = list(
lookup = lookup,
keep_genomic_coord = keep_genomic_coord,
is_counts = is_counts,
add_is_count = is_count,
rel_sharing = rel_sharing,
venn = venn,
minimal = minimal
),
stop_on_error = TRUE
)$res |>
purrr::list_rbind()
if (include_self_comp && nrow(lookup) >= 2) {
## Calculate groups with equal components
self_rows <- purrr::map2(
is_counts$group_id, is_counts$count,
function(x, y) {
row_ls <- as.list(setNames(rep_len(
x,
length.out = n_comp
), nm = cols))
row <- tibble::as_tibble(row_ls) |>
dplyr::mutate(shared = y)
if (keep_genomic_coord) {
row <- row |>
dplyr::mutate(
is_coord =
lookup |>
dplyr::filter(.data$group_id == x) |>
dplyr::pull("is")
)
}
return(row)
}
) |>
purrr::list_rbind()
# Add counts if requested
if (is_count) {
first_group <- colnames(self_rows)[1]
self_rows <- self_rows |>
dplyr::left_join(
is_counts |>
dplyr::rename(
!!first_group := dplyr::all_of("group_id")
),
by = first_group
)
for (col in cols) {
self_rows <- self_rows |>
dplyr::mutate(
!!paste0("count_", col) := .data$count
)
}
self_rows <- self_rows |>
dplyr::mutate(
count_union = .data$count
) |>
dplyr::select(-dplyr::all_of("count"))
}
if (rel_sharing) {
for (col in cols) {
self_rows <- self_rows |>
dplyr::mutate(
!!paste0("on_", col) := 100.0
)
}
# on_union is constant, set it once outside the loop
self_rows <- self_rows |>
dplyr::mutate(
on_union = 100.0
)
}
if (venn) {
venn_tbls <- purrr::pmap(
self_rows, .sh_truth_tbl_venn,
lookup = lookup,
groups = cols
)
self_rows <- self_rows |>
dplyr::mutate(
truth_tbl_venn = venn_tbls
)
}
sharing_df <- self_rows |>
dplyr::bind_rows(sharing_df)
}
column_order <- c(
cols, "shared", paste0("count_", cols),
"count_union", paste0("on_", cols),
"on_union", "is_coord", "truth_tbl_venn"
)
column_order <- column_order[column_order %in% colnames(sharing_df)]
sharing_df <- sharing_df |>
dplyr::select(dplyr::all_of(column_order))
sharing_df
}
## Computes sharing table for single input df and multiple keys
## -keys: names of the columns to group by (a named list)
## -minimal: true or false, if true computes ONLY combinations and not
## all permutations
## -is_count: keep the counts in the table?
## -rel_sharing: compute relative sharing? (on g_i & on_union)
.sharing_singledf_mult_key <- function(
df, keys,
minimal, is_count,
rel_sharing, keep_genomic_coord,
venn) {
cols <- names(keys)
## Obtain lookup table for each key
lookup <- purrr::map(keys, ~ .sh_obtain_lookup(.x, df))
group_labels <- purrr::map(lookup, ~ .x$group_id)
unique_keys <- names(keys[!duplicated(keys)])
lookup <- purrr::list_rbind(lookup[unique_keys])
## Obtain combinations
combin <- tidyr::expand_grid(!!!group_labels)
is_counts <- lookup |>
dplyr::mutate(count = purrr::map_int(.data$is, ~ nrow(.x))) |>
dplyr::select(-dplyr::all_of("is"))
sharing_df <- .execute_map_job(
data_list = combin |>
dplyr::group_by(
dplyr::across(dplyr::everything())
) |>
dplyr::group_split(),
fun_to_apply = .single_row_sharing,
fun_args = list(
lookup = lookup,
keep_genomic_coord = keep_genomic_coord,
is_counts = is_counts,
add_is_count = is_count,
rel_sharing = rel_sharing,
venn = venn,
minimal = minimal
),
stop_on_error = TRUE
)$res |>
purrr::list_rbind()
column_order <- c(
cols, "shared", paste0("count_", cols),
"count_union", paste0("on_", cols),
"on_union", "is_coord", "truth_tbl_venn"
)
column_order <- column_order[column_order %in% colnames(sharing_df)]
sharing_df <- sharing_df |>
dplyr::select(dplyr::all_of(column_order))
sharing_df
}
## Computes sharing table for mult input dfs and single key
.sharing_multdf_single_key <- function(
dfs, key, minimal,
is_count, rel_sharing, keep_genomic_coord, venn) {
cols <- if (is.null(names(dfs))) {
g_n <- paste0("g", seq_len(length(dfs)))
names(dfs) <- g_n
g_n
} else {
names(dfs)
}
lookups <- purrr::map2(
dfs, seq_along(dfs),
~ .sh_obtain_lookup(key, .x) |>
dplyr::mutate(group_id = paste0(
.data$group_id, "-", .y
))
)
group_labels <- purrr::map(lookups, ~ .x$group_id)
lookup <- purrr::list_rbind(lookups)
## Obtain combinations
combin <- tidyr::expand_grid(!!!group_labels)
is_counts <- purrr::map(lookups, ~ {
.x |>
dplyr::mutate(count = purrr::map_int(.data$is, nrow)) |>
dplyr::select(-dplyr::all_of("is"))
}) |>
purrr::list_rbind()
sharing_df <- .execute_map_job(
data_list = combin |>
dplyr::group_by(
dplyr::across(dplyr::everything())
) |>
dplyr::group_split(),
fun_to_apply = .single_row_sharing,
fun_args = list(
lookup = lookup,
keep_genomic_coord = keep_genomic_coord,
is_counts = is_counts,
add_is_count = is_count,
rel_sharing = rel_sharing,
venn = venn,
minimal = minimal
),
stop_on_error = TRUE
)$res |>
purrr::list_rbind()
column_order <- c(
cols, "shared", paste0("count_", cols),
"count_union", paste0("on_", cols),
"on_union", "is_coord", "truth_tbl_venn"
)
column_order <- column_order[column_order %in% colnames(sharing_df)]
sharing_df <- sharing_df |>
dplyr::select(dplyr::all_of(column_order))
sharing_df
}
## Computes sharing table for mult input dfs and mult key
.sharing_multdf_mult_key <- function(
dfs, keys, minimal,
is_count, rel_sharing, keep_genomic_coord, venn) {
cols <- names(keys)
lookups <- purrr::map2(dfs, keys, ~ .sh_obtain_lookup(.y, .x)) |>
purrr::set_names(cols)
lookups <- purrr::map2(lookups, seq_along(lookups), ~ .x |>
dplyr::mutate(group_id = paste0(
.data$group_id, "-", .y
)))
group_labels <- purrr::map(lookups, ~ .x$group_id)
lookup <- purrr::list_rbind(lookups)
## Obtain combinations
combin <- tidyr::expand_grid(!!!group_labels)
is_counts <- purrr::map(lookups, ~ {
.x |>
dplyr::mutate(count = purrr::map_int(.data$is, nrow)) |>
dplyr::select(-dplyr::all_of("is"))
}) |>
purrr::list_rbind()
sharing_df <- .execute_map_job(
data_list = combin |>
dplyr::group_by(
dplyr::across(dplyr::everything())
) |>
dplyr::group_split(),
fun_to_apply = .single_row_sharing,
fun_args = list(
lookup = lookup,
keep_genomic_coord = keep_genomic_coord,
is_counts = is_counts,
add_is_count = is_count,
rel_sharing = rel_sharing,
venn = venn,
minimal = minimal
),
stop_on_error = TRUE
)$res |>
purrr::list_rbind()
column_order <- c(
cols, "shared", paste0("count_", cols),
"count_union", paste0("on_", cols),
"on_union", "is_coord", "truth_tbl_venn"
)
column_order <- column_order[column_order %in% colnames(sharing_df)]
sharing_df <- sharing_df |>
dplyr::select(dplyr::all_of(column_order))
sharing_df
}
#---- USED IN : iss_source ----
.assign_iss_by_tp <- function(df, timepoint_column) {
is_vars <- if (.is_annotated(df)) {
c(mandatory_IS_vars(), annotation_IS_vars())
} else {
mandatory_IS_vars()
}
return(df |>
dplyr::mutate(!!timepoint_column := as.numeric(
.data[[timepoint_column]]
)) |>
dplyr::group_by(dplyr::across(dplyr::all_of(is_vars))) |>
dplyr::group_modify(~ {
if (nrow(.x) == 1) {
return(.x)
}
min_tp <- min(.x[[timepoint_column]])
return(.x |>
dplyr::filter(.data[[timepoint_column]] == min_tp))
}) |>
dplyr::ungroup())
}
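# Sketch of the "assign each IS to its earliest time point" rule above,
# with a simplified key instead of `mandatory_IS_vars()`: only rows at
# the minimum time point of each IS survive. Data are hypothetical.
if (FALSE) {
    df <- tibble::tibble(
        chr = c("1", "1", "2"),
        pos = c(100, 100, 200),
        TimePoint = c("03", "06", "06")
    )
    df |>
        dplyr::mutate(TimePoint = as.numeric(TimePoint)) |>
        dplyr::group_by(chr, pos) |>
        dplyr::filter(TimePoint == min(TimePoint)) |>
        dplyr::ungroup()
    #> chr pos TimePoint
    #> 1   100 3
    #> 2   200 6
}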
.sharing_for_source <- function(
ref,
sel,
ref_key,
sel_key,
tp_col,
subj_col) {
if (!is.data.frame(ref)) {
### Workflow by subject
common_names <- if (!identical(names(ref), names(sel))) {
intersect(names(ref), names(sel))
} else {
names(ref)
}
if (length(common_names) == 0) {
no_common_err <- c("No common subjects",
x = paste(
"No common subjects between",
"reference and selection"
),
i = paste(
"In reference:",
paste0(names(ref), collapse = ", "),
"\n",
"In selection: ",
paste0(names(sel), collapse = ", ")
)
)
rlang::abort(no_common_err)
}
if (getOption("ISAnalytics.verbose", TRUE) == TRUE &&
(length(common_names) < length(names(ref)) ||
length(common_names) < length(names(sel)))) {
all_names <- union(names(ref), names(sel))
common_warn_msg <- c("Mismatch in subjects found",
i = paste(
"Some subjects were excluded from",
"computations because they were",
"absent from reference or from",
"selection"
),
paste(
"Excluded: ",
paste0(
all_names[!all_names %in%
common_names],
collapse = ", "
)
)
)
rlang::inform(common_warn_msg)
}
FUN <- function(
subj,
ref, sel, ref_key, sel_key,
tp_col, progress) {
ref_df <- ref[[subj]]
sel_df <- sel[[subj]]
ref_key_min <- ref_key[ref_key != tp_col]
ref_split <- ref_df |>
dplyr::group_by(dplyr::across(dplyr::all_of(ref_key_min))) |>
dplyr::group_split()
quiet_sharing <- purrr::quietly(is_sharing)
calculate_sharing <- function(.x) {
assigned_ref <- .assign_iss_by_tp(.x,
timepoint_column = tp_col
)
sh <- (quiet_sharing(assigned_ref,
sel_df,
group_keys = list(
g1 = ref_key,
g2 = sel_key
),
keep_genomic_coord = TRUE
))$result
sh |>
tidyr::separate(
col = "g1",
into = paste0("g1_", c(ref_key, "source_table")),
remove = FALSE,
convert = TRUE
) |>
tidyr::separate(
col = "g2",
into = paste0("g2_", c(sel_key, "source_table")),
remove = FALSE,
convert = TRUE
) |>
dplyr::select(-dplyr::ends_with("source_table"))
}
sharing <- purrr::map(ref_split, calculate_sharing) |>
purrr::list_rbind()
if (!is.null(progress)) {
progress()
}
sharing
}
shared <- .execute_map_job(
data_list = common_names,
fun_to_apply = FUN,
fun_args = list(
ref = ref,
sel = sel,
ref_key = ref_key,
sel_key = sel_key,
tp_col = tp_col
),
stop_on_error = TRUE
)$res |>
purrr::set_names(common_names)
return(shared)
}
ref_key_min <- ref_key[ref_key != tp_col]
ref_split <- ref |>
dplyr::group_by(dplyr::across(dplyr::all_of(ref_key_min))) |>
dplyr::group_split()
quiet_sharing <- purrr::quietly(is_sharing)
sharing <- purrr::map(ref_split, ~ {
assigned_ref <- .assign_iss_by_tp(.x,
timepoint_column = tp_col
)
(quiet_sharing(assigned_ref,
sel,
group_keys = list(
g1 = ref_key,
g2 = sel_key
),
keep_genomic_coord = TRUE
))$result
}) |>
purrr::list_rbind()
sharing |>
tidyr::separate(
col = "g1",
into = paste0("g1_", c(ref_key, "source_table")),
remove = FALSE,
convert = TRUE
) |>
tidyr::separate(
col = "g2",
into = paste0("g2_", c(sel_key, "source_table")),
remove = FALSE,
convert = TRUE
) |>
dplyr::select(-dplyr::ends_with("source_table"))
}