#' Subset a Binary Matrix By Alteration Frequency Threshold
#'
#' @param gene_binary A data frame with a row for each sample and column for each
#' alteration. Data frame must have a `sample_id` column and columns for each alteration
#' with values of 0, 1 or NA.
#' @param t Threshold value between 0 and 1 to subset by. Default is 10% (.1).
#' @param other_vars One or more column names (quoted or unquoted) in data to be retained
#' in resulting data frame. Default is NULL.
#' @param by Variable used to subset the data. Default is NULL.
#' @return a data frame with a `sample_id` column and columns for
#' alterations over the given prevalence threshold of `t`.
#'
#' @export
#'
#' @examples
#' samples <- unique(gnomeR::mutations$sampleId)
#' gene_binary <- create_gene_binary(
#' samples = samples, mutation = mutations, cna = cna,
#' mut_type = "somatic_only",
#' include_silent = FALSE,
#' specify_panel = "impact"
#' )
#'gene_binary %>%
#' subset_by_frequency()
#'
subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL, by = NULL) {
# Checks ------------------------------------------------------------------
# check threshold `t` argument
if(!(is.numeric(t) & (t >= 0 & t <= 1))) {
cli::cli_abort("{.field t} must be a number between 0 and 1")
}
if (!is.data.frame(gene_binary)) {
cli::cli_abort("{.code gene_binary} must be a data.frame")
}
.check_required_cols(gene_binary, "sample_id")
# Other Vars - Capture Other Columns to Retain -----------------------------------
other_vars <-
.select_to_varnames({{ other_vars }},
data = gene_binary,
arg_name = "other_vars"
)
# Define by variable
by <-
.select_to_varnames({{ by }},
data = gene_binary,
arg_name = "by", select_single = TRUE
)
# Check if 'by' is in 'other_vars' (only if both are non-NULL)
if (!is.null(by) && !is.null(other_vars) && by %in% other_vars) {
cli::cli_abort("{.code other_vars} cannot overlap with {.code by}.")
}
# data frame of only alterations
alt_only <- select(gene_binary, -"sample_id", -any_of(other_vars))
# Remove all NA columns ----------------------------------------------
all_na_alt <- apply(alt_only, 2, function(x) {
sum(is.na(x)) == nrow(alt_only)
})
all_non_na_alt <- names(all_na_alt[!all_na_alt])
alt_only <- select(alt_only, all_of(all_non_na_alt))
# Check Numeric Class -----------------------------------------------------
if (is.null(by)) {
.abort_if_not_numeric(alt_only)
}
else {
.abort_if_not_numeric(select(alt_only, -any_of(by)))
}
# Calc Frequency ----------------------------------------------------------
if(is.null(by)){
counts <- apply(alt_only, 2, function(x) {sum(x, na.rm = TRUE)})
num_non_na <- apply(alt_only, 2, function(x) sum(!is.na(x)))
alt_freq <- counts/num_non_na
alts_over_thresh <- names(sort(alt_freq[alt_freq >= t], decreasing = TRUE))
subset_binary <- select(gene_binary, "sample_id",
any_of(other_vars),
all_of(alts_over_thresh))
return(subset_binary)
}
else{
alt_data <-
alt_only |>
group_by(across(all_of(by))) |>
summarise(across(everything(),
list(sum = ~ sum(., na.rm = TRUE),
total = ~ sum(!is.na(.), na.rm = T))))
alt_group_data <-
alt_data |>
pivot_longer(-any_of(by),
names_to = c("gene")) |>
separate("gene", into = c("gene", "measure"), sep = "_") |>
pivot_wider(names_from = "measure",
values_from = "value") |>
mutate(propo = .data$sum/.data$total) |>
arrange(desc(.data$propo))
alts_over_thresh_grp <-
alt_group_data |>
filter(.data$propo > t) |>
pull("gene") |>
unique()
subset_binary <- select(gene_binary, "sample_id",
any_of(by),
any_of(other_vars),
all_of(alts_over_thresh_grp))
return(subset_binary)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.