# differential_discovery.R
# This file contains functions relevant to performing differential discovery
# analyses (differential abundance analysis and differential expression analysis)
# on tof_tbl objects containing high-dimensional cytometry data.
# diffcyt ----------------------------------------------------------------------
#' Differential Abundance Analysis (DAA) with diffcyt
#'
#' This function performs differential abundance analysis on the cell clusters
#' contained within a `tof_tbl` using one of three
#' methods implemented in the \href{https://www.bioconductor.org/packages/release/bioc/html/diffcyt.html}{diffcyt}
#' package for differential discovery analysis in high-dimensional cytometry
#' data.
#'
#' The three methods are based on generalized linear mixed models ("glmm"),
#' \href{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2796818/}{edgeR} ("edgeR"), and
#' \href{https://genomebiology.biomedcentral.com/articles/10.1186/gb-2014-15-2-r29}{voom} ("voom").
#' While both the "glmm" and "voom" methods can model both fixed effects and random
#' effects, the "edgeR" method can only model fixed effects.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param sample_col An unquoted column name indicating which column in `tof_tibble`
#' represents the id of the sample from which each cell was collected. `sample_col`
#' should serve as a unique identifier for each sample collected during data acquisition -
#' all cells with the same value for `sample_col` will be treated as a part of the same
#' observational unit.
#'
#' @param cluster_col An unquoted column name indicating which column in `tof_tibble`
#' stores the cluster ids of the cluster to which each cell belongs.
#' Cluster labels can be produced via any method the user chooses - including manual gating,
#' any of the functions in the `tof_cluster_*` function family, or any other method.
#'
#' @param fixed_effect_cols Unquoted column names representing which columns in
#' `tof_tibble` should be used to model fixed effects during the differential
#' abundance analysis. Generally speaking, fixed effects represent the
#' comparisons of biological interest (often the variables manipulated during
#' experiments), such as treated vs. non-treated, before-treatment vs. after-treatment,
#' or healthy vs. non-healthy.
#'
#' @param random_effect_cols Optional. Unquoted column names representing which columns in
#' `tof_tibble` should be used to model random effects during the differential
#' abundance analysis. Generally speaking, random effects should represent variables
#' that a researcher wants to control/account for, but that are not necessarily
#' of biological interest. Example random effect variables might include batch id,
#' patient id (in a paired design), or patient age.
#'
#' Note that without multiple samples at each level of each of the
#' random effect variables, it can be easy to overfit mixed models. For most high-dimensional cytometry
#' experiments, 2 or fewer (and often 0) random effect variables are appropriate.
#'
#' @param diffcyt_method A string indicating which diffcyt method should be used for the
#' differential abundance analysis. Valid methods include "glmm" (the default),
#' "edgeR", and "voom".
#'
#' @param include_observation_level_random_effects A boolean value indicating
#' if "observation-level random effects" (OLREs) should be included as random effect
#' terms in a "glmm" differential abundance model. For details about what OLREs are, see
#' \href{https://www.nature.com/articles/s42003-019-0415-5}{the diffcyt paper}. Only the
#' "glmm" method can model observation-level random effects, and all other values will ignore
#' this argument (and throw a warning if it is set to TRUE).
#' Defaults to FALSE.
#'
#' @param min_cells An integer value used to filter clusters out of the differential
#' abundance analysis. Clusters are not included in the differential abundance testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 3.
#'
#' @param alpha A numeric value between 0 and 1 indicating which significance
#' level should be applied to multiple-comparison adjusted p-values during the
#' differential abundance analysis. Defaults to 0.05.
#'
#' @param min_samples An integer value used to filter clusters out of the differential
#' abundance analysis. Clusters are not included in the differential abundance testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 5.
#'
#' @param ... Optional additional arguments to pass to the under-the-hood diffcyt
#' function being used to perform the differential abundance analysis. See
#' \code{\link[diffcyt]{testDA_GLMM}}, \code{\link[diffcyt]{testDA_edgeR}}, and
#' \code{\link[diffcyt]{testDA_voom}} for details.
#'
#' @return A nested tibble with two columns: `tested_effect` and `daa_results`.
#'
#' The first column, `tested_effect`
#' is a character vector indicating which term in the differential abundance model
#' was used for significance testing. The values in this row are obtained
#' by pasting together the column names for each fixed effect variable and each
#' of its values. For example, a fixed effect column named `fixed_effect` with
#' levels "a", "b", and "c" have two terms in `tested_effect`: "fixed_effectb" and
#' "fixed_effectc" (note that level "a" of fixed_effect is set as the reference
#' level during dummy coding). These values correspond to the terms in the
#' differential abundance model that represent the difference in cluster abundances
#' between samples with fixed_effect = "b" and fixed_effect = "a" and between
#' samples with fixed_effect = "c" and fixed_effect = "a", respectively. In addition,
#' the first row in `tested_effect` will always represent the "omnibus"
#' test, or the test that there were significant differences between \emph{any} levels of
#' \emph{any} fixed effect variable in the model.
#'
#' The second column, `daa_results` is a list of tibbles in which each entry gives
#' the differential abundance results for each tested_effect. Within each entry
#' of `daa_results`, you will find several columns including the following:
#' * `p_val`, the p-value associated with each
#' tested effect in each input cluster
#' * `p_adj`, the multiple-comparison
#' adjusted p-value (using the \code{\link[stats]{p.adjust}} function)
#' * Other values associated with the underlying method used to perform the
#' differential abundance analysis (such as the log-fold change of cluster
#' abundance between the levels being compared). For details, see
#' \code{\link[edgeR]{glmFit}}, \code{\link[limma]{voom}}, \code{\link[limma]{topTable}},
#' and \code{\link[diffcyt]{testDA_GLMM}}.
#'
#' @family differential abundance analysis functions
#'
#' @export
#'
#' @importFrom rlang arg_match
#' @importFrom tidyselect eval_select
#' @importFrom tidyr unite
#' @importFrom purrr map
#' @importFrom rlang arg_match
#'
#' @examples
#' # For differential discovery examples, please see the package vignettes
#' NULL
#'
tof_analyze_abundance_diffcyt <-
function(
tof_tibble,
sample_col,
cluster_col,
fixed_effect_cols,
random_effect_cols,
diffcyt_method = c("glmm", "edgeR", "voom"),
include_observation_level_random_effects = FALSE,
min_cells = 3,
min_samples = 5,
alpha = 0.05,
...) {
# check to see if the diffcyt package is installed
has_diffcyt <- requireNamespace(package = "diffcyt")
if (!has_diffcyt) {
stop(
"This function requires the {diffcyt} package. Install it with this code:\n
if (!requireNamespace(\"BiocManager\", quietly = TRUE))
install.packages(\"BiocManager\")
BiocManager::install(\"diffcyt\")"
)
}
# check method argument
diffcyt_method <- rlang::arg_match(diffcyt_method)
# edgeR can't model random effects, so we throw an error for the user
# if they are included
if (diffcyt_method == "edgeR" & !missing(random_effect_cols)) {
stop(
"edgeR can't model random effects. Trying using another method or
model everything as a fixed effect."
)
}
# Only the "glmm" method supports observation-level random effects, so
# provide a warning if include_observation_level_random_effects = TRUE
# for any other method.
if (include_observation_level_random_effects == TRUE & diffcyt_method != "glmm") {
message(
"Note: Only the \"glmm\" method can use observation-level random effects.
Setting include_observation_level_random_effects to FALSE.\n"
)
include_observation_level_random_effects <- FALSE
}
# a hack-y approach for dealing with the diffcyt software - we can pick 2 random
# columns corresponding to high-dimensional cytometry measurements to fill the SummarizedExperiment
# that diffcyt requires later, but because in DAA these measurements are not used,
# it doesn't matter that we ignore all the other protein measurements.
#
# This will make the implementation faster for DAA as well because fewer values will
# need to by copied into the SummarizedExperiment data structure.
marker_colnames <-
tof_tibble |>
dplyr::select(dplyr::where(tof_is_numeric)) |>
colnames()
marker_colnames <- marker_colnames[c(1, 2)]
# remove all columns from `tof_tibble` that aren't relevant
tof_tibble <-
tof_tibble |>
dplyr::select(
{{ sample_col }},
dplyr::any_of(marker_colnames),
{{ cluster_col }},
{{ fixed_effect_cols }},
{{ random_effect_cols }}
)
diffcyt_args <-
prepare_diffcyt_args(
tof_tibble = tof_tibble,
sample_col = {{ sample_col }},
cluster_col = {{ cluster_col }},
marker_cols = dplyr::any_of(marker_colnames),
fixed_effect_cols = {{ fixed_effect_cols }},
random_effect_cols = {{ random_effect_cols }},
diffcyt_method = diffcyt_method,
include_observation_level_random_effects =
include_observation_level_random_effects
)
# find counts of each cluster in all samples
cell_counts <- diffcyt::calcCounts(diffcyt_args$data_diff)
# perform difference abundance testing
if (diffcyt_method == "glmm") {
# if glmms are being used,
result_tibble <-
diffcyt_args$contrast_matrix_tibble |>
dplyr::transmute(
tested_effect = .data$contrast_names,
daa_results =
map(
.x = .data$contrast_matrices,
.f = function(x) {
diffcyt::testDA_GLMM(
d_counts = cell_counts,
formula = diffcyt_args$my_formula,
contrast = x,
min_cells = min_cells,
min_samples = min_samples,
...
) |>
diffcyt::topTable(all = TRUE, show_all_cols = TRUE) |>
dplyr::as_tibble() |>
dplyr::arrange(.data$p_adj) |>
dplyr::mutate(
significant = dplyr::if_else(.data$p_adj < alpha, "*", "")
) |>
dplyr::rename("{{cluster_col}}" := "cluster_id")
}
)
)
} else if (diffcyt_method == "voom") {
# if limma/voom is being used,
# We unite all random effect columns and treat them as
# a single block ID. Note this occurs in the help file and that
# it is not recommended to use more than 1 random effect variable with
# the voom method.
if (length(diffcyt_args$random_effect_colnames) != 0) {
# if there are random effects, combine them into a single block_id
block_id <-
diffcyt_args$experiment_info |>
tidyr::unite(
col = "block_id",
dplyr::any_of(diffcyt_args$random_effect_colnames)
) |>
dplyr::pull(.data$block_id) |>
as.factor()
} else {
# otherwise, don't include a block_id
block_id <- NULL
}
result_tibble <-
suppressWarnings(suppressMessages(
diffcyt_args$contrast_matrix_tibble |>
dplyr::transmute(
tested_effect = .data$contrast_names,
daa_results =
purrr::map(
.x = .data$contrast_matrices,
.f = function(x) {
diffcyt::testDA_voom(
d_counts = cell_counts,
design = diffcyt_args$my_design,
contrast = x,
block_id = block_id,
min_cells = min_cells,
min_samples = min_samples,
...
) |>
diffcyt::topTable(all = TRUE, show_all_cols = TRUE) |>
dplyr::as_tibble() |>
dplyr::arrange(.data$p_adj) |>
dplyr::mutate(
significant = dplyr::if_else(.data$p_adj < alpha, "*", "")
) |>
dplyr::select(
"{{cluster_col}}" := "cluster_id",
"p_val",
"p_adj",
"significant",
dplyr::everything()
)
}
)
)
))
} else {
result_tibble <-
diffcyt_args$contrast_matrix_tibble |>
dplyr::transmute(
tested_effect = .data$contrast_names,
daa_results =
purrr::map(
.x = .data$contrast_matrices,
.f = function(x) {
diffcyt::testDA_edgeR(
d_counts = cell_counts,
design = diffcyt_args$my_design,
contrast = x,
min_cells = min_cells,
min_samples = min_samples,
...
) |>
diffcyt::topTable(all = TRUE, show_all_cols = TRUE) |>
dplyr::as_tibble() |>
dplyr::arrange(.data$p_adj) |>
dplyr::mutate(
significant = dplyr::if_else(.data$p_adj < alpha, "*", "")
) |>
dplyr::select(
"{{cluster_col}}" := "cluster_id",
"p_val",
"p_adj",
"significant",
dplyr::everything()
)
}
)
)
}
# remove the omnibus test information (and unnest the results tibble)
# if there are only 2 levels to the fixed effects being tested (because
# this means the omnibus test and the individual effect will be identical)
if (nrow(result_tibble) == 2) {
result_tibble <-
result_tibble |>
dplyr::filter(.data$tested_effect != "omnibus") |>
tidyr::unnest(cols = "daa_results")
}
attr(result_tibble, "daa_method") <- paste0("diffcyt_", diffcyt_method)
return(result_tibble)
}
#' Differential Expression Analysis (DEA) with diffcyt
#'
#' This function performs differential expression analysis on the cell clusters
#' contained within a `tof_tbl` using one of two
#' methods implemented in the \href{https://www.bioconductor.org/packages/release/bioc/html/diffcyt.html}{diffcyt}
#' package for differential discovery analysis in high-dimensional cytometry
#' data.
#'
#' The two methods are based on linear mixed models ("lmm") and
#' \href{https://academic.oup.com/nar/article/43/7/e47/2414268}{limma} ("limma").
#' Both the "lmm" and "limma" methods can model both fixed effects and random
#' effects.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param sample_col An unquoted column name indicating which column in `tof_tibble`
#' represents the id of the sample from which each cell was collected. `sample_col`
#' should serve as a unique identifier for each sample collected during data acquisition -
#' all cells with the same value for `sample_col` will be treated as a part of the same
#' observational unit.
#'
#' @param cluster_col An unquoted column name indicating which column in `tof_tibble`
#' stores the cluster ids of the cluster to which each cell belongs.
#' Cluster labels can be produced via any method the user chooses - including manual gating,
#' any of the functions in the `tof_cluster_*` function family, or any other method.
#'
#' @param marker_cols Unquoted column names representing which columns in `tof_tibble`
#' (i.e. which high-dimensional cytometry protein measurements) should be tested for differential expression between
#' levels of the `fixed_effect_cols`. Defaults to all numeric (integer or double) columns.
#' Supports tidyselect helpers.
#'
#' @param fixed_effect_cols Unquoted column names representing which columns in
#' `tof_tibble` should be used to model fixed effects during the differential
#' expression analysis. Generally speaking, fixed effects represent the
#' comparisons of biological interest (often the the variables manipulated during
#' experiments), such as treated vs. non-treated, before-treatment vs. after-treatment,
#' or healthy vs. non-healthy.
#'
#' @param random_effect_cols Unquoted column names representing which columns in
#' `tof_tibble` should be used to model random effects during the differential
#' expression analysis. Generally speaking, random effects represent variables
#' that a researcher wants to control/account for, but that are not necessarily
#' of biological interest. Example random effect variables might include batch id,
#' patient id (in a paired design), or patient age.
#'
#' Note that without many samples at each level of each of the
#' random effect variables, it can be easy to overfit mixed models. For most high-dimensional cytometry
#' experiments, 2 or fewer (and often 0) random effect variables are appropriate.
#'
#' @param diffcyt_method A string indicating which diffcyt method should be used for the
#' differential expression analysis. Valid methods include "lmm" (the default)
#' and "limma".
#'
#' @param include_observation_level_random_effects A boolean value indicating
#' if "observation-level random effects" (OLREs) should be included as random effect
#' terms in a "lmm" differential expression model. For details about what OLREs are, see
#' \href{https://www.nature.com/articles/s42003-019-0415-5}{the diffcyt paper}.
#' Defaults to FALSE.
#'
#' @param min_cells An integer value used to filter clusters out of the differential
#' expression analysis. Clusters are not included in the differential expression testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 3.
#'
#' @param min_samples An integer value used to filter clusters out of the differential
#' expression analysis. Clusters are not included in the differential expression testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 5.
#'
#' @param alpha A numeric value between 0 and 1 indicating which significance
#' level should be applied to multiple-comparison adjusted p-values during the
#' differential abundance analysis. Defaults to 0.05.
#'
#' @param ... Optional additional arguments to pass to the under-the-hood diffcyt
#' function being used to perform the differential expression analysis. See
#' \code{\link[diffcyt]{testDS_LMM}} and \code{\link[diffcyt]{testDS_limma}}
#' for details.
#'
#' @return A nested tibble with two columns: `tested_effect` and `dea_results`.
#'
#' The first column, `tested_effect`
#' is a character vector indicating which term in the differential expression model
#' was used for significance testing. The values in this row are obtained
#' by pasting together the column names for each fixed effect variable and each
#' of its values. For example, a fixed effect column named fixed_effect with
#' levels "a", "b", and "c" have two terms in `tested_effect`: "fixed_effectb" and
#' "fixed_effectc" (note that level "a" of fixed_effect is set as the reference
#' level during dummy coding). These values correspond to the terms in the
#' differential expression model that represent the difference in cluster median
#' expression values of each marker between samples with fixed_effect = "b" and
#' fixed_effect = "a" and between samples with fixed_effect = "c" and
#' fixed_effect = "a", respectively. In addition,
#' note that the first row in `tested_effect` will always represent the "omnibus"
#' test, or the test that there are significant differences between \emph{any} levels of
#' \emph{any} fixed effect variable in the model.
#'
#' The second column, `dea_results` is a list of tibbles in which each entry gives
#' the differential expression results for each tested_effect. Within each entry
#' of `dea_results`, you will find `p_val`, the p-value associated with each
#' tested effect in each input cluster/marker pair; `p_adj`, the multiple-comparison
#' adjusted p-value (using the \code{\link[stats]{p.adjust}} function), and
#' other values associated with the underlying method used to perform the
#' differential expression analysis (such as the log-fold change of clusters' median
#' marker expression values between the conditions being compared). Each tibble in `dea_results`
#' will also have two columns representing the cluster and marker corresponding to the
#' p-value in each row.
#'
#' @family differential expression analysis functions
#'
#' @export
#'
#' @importFrom purrr pluck
#' @importFrom tidyr unite
#'
#' @examples
#' # For differential discovery examples, please see the package vignettes
#' NULL
#'
tof_analyze_expression_diffcyt <-
function(
tof_tibble,
sample_col,
cluster_col,
marker_cols = where(tof_is_numeric),
fixed_effect_cols,
random_effect_cols,
diffcyt_method = c("lmm", "limma"),
include_observation_level_random_effects = FALSE,
min_cells = 3,
min_samples = 5,
alpha = 0.05,
...) {
# check to see if the diffcyt package is installed
has_diffcyt <- requireNamespace(package = "diffcyt")
if (!has_diffcyt) {
stop(
"This function requires the {diffcyt} package. Install it with this code:\n
if (!requireNamespace(\"BiocManager\", quietly = TRUE)){
install.packages(\"BiocManager\")
BiocManager::install(\"diffcyt\")
}"
)
}
# check diffcyt_method argument
diffcyt_method <- match.arg(diffcyt_method, choices = c("lmm", "limma"))
# Only the "lmm" method supports observation-level random effects, so
# provide a warning if include_observation_level_random_effects = TRUE
# for any other method.
if (include_observation_level_random_effects == TRUE & diffcyt_method != "lmm") {
message(
"Note: Only the \"lmm\" method can use observation-level random effects.
Setting include_observation_level_random_effects to FALSE.\n"
)
}
# remove all columns from `tof_tibble` that aren't relevant
tof_tibble <-
tof_tibble |>
dplyr::select(
{{ sample_col }},
{{ cluster_col }},
{{ marker_cols }},
{{ fixed_effect_cols }},
{{ random_effect_cols }}
)
diffcyt_args <-
prepare_diffcyt_args(
tof_tibble = tof_tibble,
sample_col = {{ sample_col }},
cluster_col = {{ cluster_col }},
marker_cols = {{ marker_cols }},
fixed_effect_cols = {{ fixed_effect_cols }},
random_effect_cols = {{ random_effect_cols }},
diffcyt_method = diffcyt_method,
include_observation_level_random_effects =
include_observation_level_random_effects
)
# find cluster counts and cluster medians
cell_counts <- diffcyt::calcCounts(diffcyt_args$data_diff)
cell_medians <- diffcyt::calcMedians(diffcyt_args$data_diff)
# Perform the differential expression analysis
my_contrast <-
diffcyt_args$contrast_matrix_tibble |>
dplyr::pull(.data$contrast_matrices) |>
purrr::pluck(1)
if (diffcyt_method == "lmm") {
# if lmm's are being used,
result_tibble <-
suppressWarnings(suppressMessages(
diffcyt_args$contrast_matrix_tibble |>
dplyr::transmute(
tested_effect = .data$contrast_names,
dea_results =
purrr::map(
.x = .data$contrast_matrices,
.f = function(x) {
diffcyt::testDS_LMM(
d_counts = cell_counts,
d_medians = cell_medians,
formula = diffcyt_args$my_formula,
contrast = x,
markers_to_test = rep(TRUE, nrow(diffcyt_args$marker_info)),
min_cells = min_cells,
min_samples = min_samples,
...
) |>
diffcyt::topTable(all = TRUE, show_all_cols = TRUE) |>
dplyr::as_tibble() |>
dplyr::mutate(
significant = dplyr::if_else(.data$p_adj < alpha, "*", ""),
cluster_id = as.character(.data$cluster_id),
marker_id = as.character(.data$marker_id)
) |>
dplyr::rename(
marker = .data$marker_id,
"{{cluster_col}}" := "cluster_id"
) |>
dplyr::select(
{{ cluster_col }},
"marker",
"p_val",
"p_adj",
"significant",
dplyr::everything()
)
}
)
)
))
} else if (diffcyt_method == "limma") {
# if limma is being used,
if (length(diffcyt_args$random_effect_colnames) != 0) {
# if there are random effects, combine them into a single block_id
block_id <-
diffcyt_args$experiment_info |>
tidyr::unite(
col = "block_id",
dplyr::any_of(diffcyt_args$random_effect_colnames)
) |>
dplyr::pull(.data$block_id) |>
as.factor()
} else {
# otherwise, don't include a block_id
block_id <- NULL
}
result_tibble <-
suppressWarnings(suppressMessages(
diffcyt_args$contrast_matrix_tibble |>
dplyr::transmute(
tested_effect = .data$contrast_names,
dea_results =
purrr::map(
.x = .data$contrast_matrices,
.f = function(x) {
diffcyt::testDS_limma(
d_counts = cell_counts,
d_medians = cell_medians,
design = diffcyt_args$my_design,
contrast = x,
block_id = block_id,
min_cells = min_cells,
min_samples = min_samples,
markers_to_test = rep(TRUE, nrow(diffcyt_args$marker_info)),
...
) |>
diffcyt::topTable(all = TRUE, show_all_cols = TRUE) |>
dplyr::as_tibble() |>
select(-.data$ID) |>
dplyr::mutate(
significant = dplyr::if_else(.data$p_adj < alpha, "*", ""),
cluster_id = as.character(.data$cluster_id),
marker_id = as.character(.data$marker_id)
) |>
dplyr::rename(
marker = .data$marker_id,
"{{cluster_col}}" := "cluster_id"
) |>
dplyr::select(
{{ cluster_col }},
"marker",
"p_val",
"p_adj",
"significant",
dplyr::everything()
)
}
)
)
))
}
# remove the omnibus test information (and unnest the results tibble)
# if there are only 2 levels to the fixed effects being tested (because
# this means the omnibus test and the individual effect will be identical)
if (nrow(result_tibble) == 2) {
result_tibble <-
result_tibble |>
dplyr::filter(.data$tested_effect != "omnibus") |>
tidyr::unnest(cols = "dea_results")
}
attr(result_tibble, "dea_method") <- paste0("diffcyt_", diffcyt_method)
return(result_tibble)
}
# GLMs and GLMMs ---------------------------------------------------------------
#' Differential Abundance Analysis (DAA) with generalized linear mixed-models (GLMMs)
#'
#' This function performs differential abundance analysis on the cell clusters
#' contained within a `tof_tbl` using generalized linear mixed-models. Users
#' specify which columns represent sample, cluster, fixed effect, and random effect
#' information, and a (mixed) binomial regression model is fit using either
#' \code{\link[lme4]{glmer}} or \code{\link[stats]{glm}}.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param sample_col An unquoted column name indicating which column in `tof_tibble`
#' represents the id of the sample from which each cell was collected. `sample_col`
#' should serve as a unique identifier for each sample collected during data acquisition -
#' all cells with the same value for `sample_col` will be treated as a part of the same
#' observational unit.
#'
#' @param cluster_col An unquoted column name indicating which column in `tof_tibble`
#' stores the cluster ids of the cluster to which each cell belongs.
#' Cluster labels can be produced via any method the user chooses - including manual gating,
#' any of the functions in the `tof_cluster_*` function family, or any other method.
#'
#' @param fixed_effect_cols Unquoted column names representing which columns in
#' `tof_tibble` should be used to model fixed effects during the differential
#' abundance analysis. Supports tidyselect helpers.
#'
#' Generally speaking, fixed effects should represent the
#' comparisons of biological interest (often the the variables manipulated during
#' experiments), such as treated vs. non-treated, before-treatment vs. after-treatment,
#' or healthy vs. non-healthy.
#'
#' @param random_effect_cols Unquoted column names representing which columns in
#' `tof_tibble` should be used to model random effects during the differential
#' abundance analysis. Supports tidyselection.
#'
#' Generally speaking, random effects should represent variables
#' that a researcher wants to control/account for, but that are not necessarily
#' of biological interest. Example random effect variables might include batch id,
#' patient id (in a paired design), or patient age.
#'
#' Note that without many samples at each level of each of the
#' random effect variables, it can be easy to overfit mixed models. For most high-dimensional cytometry
#' experiments, 2 or fewer (and often 0) random effect variables are appropriate.
#'
#' @param min_cells An integer value used to filter clusters out of the differential
#' abundance analysis. Clusters are not included in the differential abundance testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 3.
#'
#' @param min_samples An integer value used to filter clusters out of the differential
#' abundance analysis. Clusters are not included in the differential abundance testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 5.
#'
#' @param alpha A numeric value between 0 and 1 indicating which significance
#' level should be applied to multiple-comparison adjusted p-values during the
#' differential abundance analysis. Defaults to 0.05.
#'
#' @return A nested tibble with two columns: `tested_effect` and `daa_results`.
#'
#' The first column, `tested_effect`,
#' is a character vector indicating which term in the differential abundance model
#' was used for significance testing. The values in this row are obtained
#' by pasting together the column names for each fixed effect variable and each
#' of its values. For example, a fixed effect column named fixed_effect with
#' levels "a", "b", and "c" have two terms in `tested_effect`: "fixed_effectb" and
#' "fixed_effectc" (note that level "a" of fixed_effect is set as the reference
#' level during dummy coding). These values correspond to the terms in the
#' differential abundance model that represent the difference in cluster abundances
#' between samples with fixed_effect = "b" and fixed_effect = "a" and between
#' samples with fixed_effect = "c" and fixed_effect = "a", respectively. In addition,
#' note that the first row in `tested_effect` will always represent the "omnibus"
#' test, or the test that there were significant differences between any levels of
#' any fixed effect variable in the model.
#'
#' The second column, `daa_results`, is a list of tibbles in which each entry gives
#' the differential abundance results for each tested_effect. Within each entry
#' of `daa_results`, you will find `p_value`, the p-value associated with each
#' tested effect in each input cluster; `p_adj`, the multiple-comparison
#' adjusted p-value (using the \code{\link[stats]{p.adjust}} function), and
#' other values associated with the underlying method used to perform the
#' differential abundance analysis (such as the log-fold change of cluster
#' abundance between the levels being compared).
#'
#' @family differential abundance analysis functions
#'
#' @export
#'
#' @importFrom rlang enquo
#' @importFrom tidyselect eval_select
#' @importFrom tidyr pivot_longer
#' @importFrom tidyr nest
#' @importFrom stringr str_c
#' @importFrom stats as.formula
#' @importFrom purrr map
#'
#' @examples
#' # For differential discovery examples, please see the package vignettes
#' NULL
#'
tof_analyze_abundance_glmm <-
function(
tof_tibble,
sample_col,
cluster_col,
fixed_effect_cols,
random_effect_cols,
min_cells = 3,
min_samples = 5,
alpha = 0.05) {
# extract sample column as a character vector
# will return an empty character vector if the argument is missing
sample_colname <-
tidyselect::eval_select(
expr = rlang::enquo(sample_col),
data = tof_tibble
) |>
names()
# extract fixed effect columns as a character vector
# will return an empty character vector if the argument is missing
fixed_effect_colnames <-
tidyselect::eval_select(
expr = rlang::enquo(fixed_effect_cols),
data = tof_tibble
) |>
names()
if (length(fixed_effect_colnames) == 0) {
stop("Fixed effects must be specified. Did you forget to set the `fixed_effect_cols` argument?")
}
# extract random effect columns as a character vector
# will return an empty character vector if the argument is missing
random_effect_colnames <-
tidyselect::eval_select(
expr = rlang::enquo(random_effect_cols),
data = tof_tibble
) |>
names()
# count cells in all samples
my_sep <- "_______"
cell_counts <-
tof_tibble |>
tidyr::unite(
col = "metadata",
c({{ sample_col }}, dplyr::any_of(c(fixed_effect_colnames, random_effect_colnames))),
sep = my_sep
) |>
dplyr::mutate(
dplyr::across(
c("metadata", {{ cluster_col }}),
.f = as.factor
),
) |>
dplyr::count(.data$metadata, {{ cluster_col }}, name = "num_cells", .drop = FALSE) |>
tidyr::separate(
col = "metadata",
into = c(sample_colname, fixed_effect_colnames, random_effect_colnames),
sep = my_sep
) |>
dplyr::mutate("{{cluster_col}}" := as.character({{ cluster_col }})) |>
dplyr::group_by({{ sample_col }}) |>
dplyr::mutate(
total_cells = sum(.data$num_cells),
prop = .data$num_cells / .data$total_cells
) |>
dplyr::ungroup()
# find the clusters that don't have over the threshold of minimum cells
# in over the threshold of minimum samples
clusters_to_remove <-
cell_counts |>
dplyr::count(
{{ sample_col }},
{{ cluster_col }},
wt = .data$num_cells,
.drop = FALSE
) |>
dplyr::mutate(has_over_min_cells = .data$n > min_cells) |>
dplyr::count({{ cluster_col }}, .data$has_over_min_cells) |>
dplyr::filter(.data$has_over_min_cells) |>
dplyr::filter(.data$n < min_samples) |>
dplyr::pull({{ cluster_col }})
cell_counts <-
cell_counts |>
dplyr::filter(!({{ cluster_col }} %in% clusters_to_remove))
# nest the count data so we can fit one model per cluster
fit_data <-
cell_counts |>
dplyr::group_by({{ cluster_col }}) |>
tidyr::nest() |>
dplyr::ungroup()
# specify if there are random effects
if (length(random_effect_colnames) == 0) {
has_random_effects <- FALSE
} else {
has_random_effects <- TRUE
}
# construct formula for each model
if (has_random_effects) {
formula_string <-
stringr::str_c(
"prop ~ ",
stringr::str_c(fixed_effect_colnames, sep = "+", collapse = " + "),
"+",
stringr::str_c(paste0("(1 | ", random_effect_colnames, ")"), sep = "+")
)
} else {
formula_string <-
stringr::str_c(
"prop ~ ",
stringr::str_c(fixed_effect_colnames, sep = "+", collapse = " + "),
sep = ""
)
}
formula <- stats::as.formula(formula_string)
# fit one model per cluster
fit_data <-
suppressMessages(suppressWarnings(
fit_data |>
dplyr::mutate(
results =
purrr::map(
.x = data,
.f = fit_da_model,
formula = formula,
has_random_effects = has_random_effects
),
results = purrr::map(.x = .data$results, .f = tidy_lmer_test_glmm)
)
))
fit_data <-
fit_data |>
dplyr::select(-"data") |>
tidyr::unnest(cols = "results") |>
dplyr::filter(.data$term != "(Intercept)", !is.na(.data$p.value)) |>
dplyr::mutate(p_adj = stats::p.adjust(.data$p.value, method = "fdr")) |>
dplyr::arrange(.data$p_adj) |>
dplyr::mutate(
significant = dplyr::if_else(.data$p_adj < alpha, "*", ""),
mean_fc = exp(.data$estimate)
) |>
dplyr::rename(
tested_effect = "term",
p_val = "p.value",
f_statistic = "statistic"
)
# if (has_random_effects) {
# fit_data <-
# fit_data |>
# dplyr::select(-.data$group, -.data$effect)
# }
fit_data <-
fit_data |>
dplyr::rename_with(stringr::str_replace_all, pattern = "(?<=.)\\.", replacement = "_") |>
dplyr::select(
{{ cluster_col }},
"p_val",
"p_adj",
"significant",
tidyselect::everything()
) |>
tidyr::nest(daa_results = c(-"tested_effect"))
# if result tibble only has 1 row (only 2 levels of fixed_effect_cols), \
# unnest it (which is more intuitive)
if (nrow(fit_data) == 1) {
fit_data <-
tidyr::unnest(fit_data, cols = "daa_results")
}
attr(fit_data, which = "daa_method") <- "glmm"
return(fit_data)
}
#' Differential Expression Analysis (DEA) with linear mixed-models (LMMs)
#'
#' This function performs differential expression analysis on the cell clusters
#' contained within a `tof_tbl` using linear mixed-models. Users
#' specify which columns represent sample, cluster, marker, fixed effect, and random effect
#' information, and a (mixed) linear regression model is fit using either
#' \code{\link[lmerTest]{lmer}} or \code{\link[stats]{glm}}.
#'
#' Specifically, one linear model is fit for each cluster/marker pair. For each cluster/marker
#' pair, a user-supplied measurement of central tendency (`central_tendency_function`), such
#' as mean or median, is calculated across all cells in the cluster on a sample-by-sample
#' basis. Then, this central tendency value is used as the dependent variable in a
#' linear model with `fixed_effect_cols` as fixed effects predictors and `random_effect_cols`
#' as random effects predictors. Once all models (one per each cluster/marker pair) are fit,
#' p-values for each coefficient in each model are multiple-comparisons adjusted using the
#' \code{\link[stats]{p.adjust}} function.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param sample_col An unquoted column name indicating which column in `tof_tibble`
#' represents the id of the sample from which each cell was collected. `sample_col`
#' should serve as a unique identifier for each sample collected during data acquisition -
#' all cells with the same value for `sample_col` will be treated as a part of the same
#' observational unit.
#'
#' @param cluster_col An unquoted column name indicating which column in `tof_tibble`
#' stores the cluster ids of the cluster to which each cell belongs.
#' Cluster labels can be produced via any method the user chooses - including manual gating,
#' any of the functions in the `tof_cluster_*` function family, or any other method.
#'
#' @param marker_cols Unquoted column names representing which columns in `tof_tibble`
#' (i.e. which high-dimensional cytometry protein measurements) should be included in the differential
#' discovery analysis. Defaults to all numeric (integer or double) columns.
#' Supports tidyselection.
#'
#' @param fixed_effect_cols Unquoted column names representing which columns in
#' `tof_tibble` should be used to model fixed effects during the differential
#' expression analysis. Supports tidyselection.
#'
#' Generally speaking, fixed effects should represent the
#' comparisons of biological interest (often the the variables manipulated during
#' experiments), such as treated vs. non-treated, before-treatment vs. after-treatment,
#' or healthy vs. non-healthy.
#'
#' @param random_effect_cols Optional. Unquoted column names representing which columns in
#' `tof_tibble` should be used to model random effects during the differential
#' expression analysis. Supports tidyselection.
#'
#' Generally speaking, random effects should represent variables
#' that a researcher wants to control/account for, but that are not necessarily
#' of biological interest. Example random effect variables might include batch id,
#' patient id (in a paired design), or patient age. Most analyses will not include random effects.
#'
#' @param central_tendency_function The function that will be used to calculate
#' the measurement of central tendency for each cluster/marker pair (to be used
#' as the dependent variable in the linear model). Defaults to \code{\link[stats]{median}}.
#'
#' @param min_cells An integer value used to filter clusters out of the differential
#' expression analysis. Clusters are not included in the differential expression testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 3.
#'
#' @param min_samples An integer value used to filter clusters out of the differential
#' expression analysis. Clusters are not included in the differential expression testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 5.
#'
#' @param alpha A numeric value between 0 and 1 indicating which significance
#' level should be applied to multiple-comparison adjusted p-values during the
#' differential abundance analysis. Defaults to 0.05.
#'
#' @return A nested tibble with two columns: `tested_effect` and `dea_results`.
#'
#' The first column, `tested_effect`
#' is a character vector indicating which term in the differential expression model
#' was used for significance testing. The values in this row are obtained
#' by pasting together the column names for each fixed effect variable and each
#' of its values. For example, a fixed effect column named fixed_effect with
#' levels "a", "b", and "c" have two terms in `tested_effect`: "fixed_effectb" and
#' "fixed_effectc" (note that level "a" of fixed_effect is set as the reference
#' level during dummy coding). These values correspond to the terms in the
#' differential expression model that represent the difference in cluster median
#' expression values of each marker between samples with fixed_effect = "b" and
#' fixed_effect = "a" and between samples with fixed_effect = "c" and
#' fixed_effect = "a", respectively. In addition,
#' note that the first row in `tested_effect` will always represent the "omnibus"
#' test, or the test that there were significant differences between any levels of
#' any fixed effect variable in the model.
#'
#' The second column, `dea_results` is a list of tibbles in which each entry gives
#' the differential expression results for each tested_effect. Within each entry
#' of `daa_results`, you will find `p_val`, the p-value associated with each
#' tested effect in each input cluster/marker pair; `p_adj`, the multiple-comparison
#' adjusted p-value (using the \code{\link[stats]{p.adjust}} function), and
#' other values associated with the underlying method used to perform the
#' differential expression analysis (such as the log-fold change of clusters' median
#' marker expression values between the levels being compared).
#'
#' @export
#'
#' @importFrom rlang enquo
#' @importFrom rlang check_installed
#' @importFrom rlang is_installed
#'
#' @importFrom tidyselect eval_select
#' @importFrom tidyr pivot_longer
#' @importFrom tidyr nest
#'
#' @importFrom stringr str_c
#'
#' @importFrom stats as.formula
#'
#' @importFrom purrr map
#'
#' @family differential expression analysis functions
#'
#' @examples
#' # For differential discovery examples, please see the package vignettes
#' NULL
#'
tof_analyze_expression_lmm <-
function(
tof_tibble,
sample_col,
cluster_col,
marker_cols = where(tof_is_numeric),
fixed_effect_cols,
random_effect_cols,
central_tendency_function = median,
min_cells = 3,
min_samples = 5,
alpha = 0.05) {
# extract sample column as a character vector
# will return an empty character vector if the argument is missing
sample_colname <-
tidyselect::eval_select(
expr = rlang::enquo(sample_col),
data = tof_tibble
) |>
names()
# extract cluster column as a character vector
# will return an empty character vector if the argument is missing
cluster_colname <-
tidyselect::eval_select(
expr = rlang::enquo(cluster_col),
data = tof_tibble
) |>
names()
# extract fixed effect columns as a character vector
# will return an empty character vector if the argument is missing
fixed_effect_colnames <-
tidyselect::eval_select(
expr = rlang::enquo(fixed_effect_cols),
data = tof_tibble
) |>
names()
# extract random effect columns as a character vector
# will return an empty character vector if the argument is missing
random_effect_colnames <-
tidyselect::eval_select(
expr = rlang::enquo(random_effect_cols),
data = tof_tibble
) |>
names()
# count cells in all samples
my_sep <- "_______"
cell_counts <-
tof_tibble |>
tidyr::unite(
col = "metadata",
c({{ sample_col }}, dplyr::any_of(c(fixed_effect_colnames, random_effect_colnames))),
sep = my_sep
) |>
dplyr::mutate(
dplyr::across(
c("metadata", {{ cluster_col }}),
.f = as.factor
),
) |>
dplyr::count(.data$metadata, {{ cluster_col }}, name = "num_cells", .drop = FALSE) |>
tidyr::separate(
col = "metadata",
into = c(sample_colname, fixed_effect_colnames, random_effect_colnames),
sep = my_sep
) |>
dplyr::mutate("{{cluster_col}}" := as.character({{ cluster_col }})) |>
dplyr::group_by({{ sample_col }}) |>
dplyr::mutate(
total_cells = sum(.data$num_cells),
prop = .data$num_cells / .data$total_cells
) |>
dplyr::ungroup()
# find the clusters that don't have over the threshold of minimum cells
# in over the threshold of minimum samples
clusters_to_remove <-
cell_counts |>
dplyr::mutate(has_over_min_cells = .data$num_cells > min_cells) |>
dplyr::count({{ cluster_col }}, .data$has_over_min_cells) |>
dplyr::filter(.data$has_over_min_cells) |>
dplyr::filter(.data$n < min_samples) |>
dplyr::pull({{ cluster_col }})
## find the clusters that only occur in one of the fixed_effect_cols
fixed_clusters_to_remove <-
cell_counts |>
dplyr::filter(.data$num_cells > 0) |>
dplyr::distinct({{ cluster_col }}, {{ fixed_effect_cols }}) |>
dplyr::count({{ cluster_col }}) |>
dplyr::filter(n == 1) |>
dplyr::pull({{ cluster_col }})
clusters_to_remove <-
unique(c(clusters_to_remove, fixed_clusters_to_remove))
## find the clusters that only occur in one of the random_effect_cols
if (length(random_effect_colnames) != 0) {
random_clusters_to_remove <-
cell_counts |>
dplyr::distinct({{ cluster_col }}, {{ random_effect_cols }}) |>
dplyr::count({{ cluster_col }}) |>
dplyr::filter(n == 1) |>
dplyr::pull({{ cluster_col }})
clusters_to_remove <-
unique(c(clusters_to_remove, random_clusters_to_remove))
}
# find the median for each cluster in each sample
expression_data <-
tof_tibble |>
dplyr::select(
{{ cluster_col }},
{{ sample_col }},
tidyselect::any_of(c(fixed_effect_colnames, random_effect_colnames)),
{{ marker_cols }}
) |>
# remove clusters that don't fit the minimum criteria
dplyr::filter(!({{ cluster_col }} %in% clusters_to_remove)) |>
dplyr::group_by(
{{ sample_col }},
{{ cluster_col }},
dplyr::across(tidyselect::any_of(c(fixed_effect_colnames, random_effect_colnames)))
) |>
dplyr::summarize(
dplyr::across(
tidyselect::everything(),
.fns = central_tendency_function
)
) |>
dplyr::ungroup()
# nest the expression data so we can fit one model per cluster per channel
fit_data <-
expression_data |>
tidyr::pivot_longer(
cols = {{ marker_cols }},
names_to = "marker",
values_to = "expression"
) |>
dplyr::group_by({{ cluster_col }}, .data$marker) |>
tidyr::nest() |>
dplyr::ungroup()
# specify if there are random effects
if (length(random_effect_colnames) == 0) {
has_random_effects <- FALSE
} else {
has_random_effects <- TRUE
}
# construct formula for each model
if (has_random_effects) {
formula_string <-
stringr::str_c(
"expression ~ ",
stringr::str_c(fixed_effect_colnames, sep = "+", collapse = " + "),
"+",
stringr::str_c(paste0("(1 | ", random_effect_colnames, ")"), sep = "+")
)
} else {
formula_string <-
stringr::str_c(
"expression ~ ",
stringr::str_c(fixed_effect_colnames, sep = "+", collapse = " + "),
sep = ""
)
}
formula <- stats::as.formula(formula_string)
# fit one model per cluster
fit_data <-
suppressWarnings(suppressMessages(
fit_data |>
dplyr::mutate(
results =
purrr::map(
.x = data,
.f = fit_de_model,
formula = formula,
has_random_effects = has_random_effects
),
results = purrr::map(.x = .data$results, .f = tidy_lmer_test)
) |>
dplyr::select(-"data") |>
tidyr::unnest(cols = "results")
))
intercepts <-
fit_data |>
dplyr::filter(.data$term == "(Intercept)", !is.na(.data$p.value)) |>
dplyr::rename(baseline_expression = "estimate") |>
dplyr::select({{ cluster_col }}, "marker", "baseline_expression")
fit_data <-
fit_data |>
dplyr::filter(.data$term != "(Intercept)", !is.na(.data$p.value)) |>
dplyr::mutate(p_adj = stats::p.adjust(.data$p.value, method = "fdr")) |>
dplyr::arrange(.data$p_adj) |>
dplyr::mutate(
significant = dplyr::if_else(.data$p_adj < alpha, "*", ""),
mean_diff = .data$estimate
) |>
dplyr::rename(
tested_effect = "term",
p_val = "p.value",
f_statistic = "statistic"
) |>
dplyr::rename_with(stringr::str_replace_all, pattern = "(?<=.)\\.", replacement = "_") |>
dplyr::left_join(
intercepts,
by = c("marker", cluster_colname)
) |>
dplyr::mutate(
mean_fc = (.data$baseline_expression + .data$mean_diff) / .data$baseline_expression
) |>
dplyr::select(-"baseline_expression") |>
dplyr::select(
{{ cluster_col }},
"marker",
"p_val",
"p_adj",
"significant",
tidyselect::everything()
) |>
dplyr::select(-"estimate")
fit_data <-
fit_data |>
dplyr::rename(std_error = "Std_ Error") |> # TO DO: Check
tidyr::nest(dea_results = c(-"tested_effect"))
# if result tibble only has 1 row (only 2 levels of fixed_effect_cols), \
# unnest it (which is more intuitive)
if (nrow(fit_data) == 1) {
fit_data <-
tidyr::unnest(fit_data, cols = "dea_results")
}
attr(fit_data, which = "dea_method") <- "lmm"
return(fit_data)
}
# t-tests ----------------------------------------------------------------------
#' Differential Abundance Analysis (DAA) with t-tests
#'
#' This function performs differential abundance analysis on the cell clusters
#' contained within a `tof_tbl` using simple t-tests. Users
#' specify which columns represent sample, cluster, and effect
#' information, and either a paired or unpaired t-test (one per cluster) is used to detect significant
#' differences between sample types.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param cluster_col An unquoted column name indicating which column in `tof_tibble`
#' stores the cluster ids of the cluster to which each cell belongs.
#' Cluster labels can be produced via any method the user chooses - including manual gating,
#' any of the functions in the `tof_cluster_*` function family, or any other method.
#'
#' @param effect_col Unquoted column name representing which column in
#' `tof_tibble` should be used to break samples into groups for the t-test. Should
#' only have 2 unique values.
#'
#' @param group_cols Unquoted names of the columns other than `effect_col`
#' that should be used to group cells into independent observations. Fills a similar role
#' to `sample_col` in other `tof_analyze_abundance_*` functions. For example, if an experiment involves
#' analyzing samples taken from multiple patients at two timepoints (with `effect_col = timepoint`),
#' then group_cols should be the name of the column representing patient IDs.
#'
#' @param test_type A string indicating whether the t-test should be "unpaired"
#' (the default) or "paired".
#'
#' @param min_cells An integer value used to filter clusters out of the differential
#' abundance analysis. Clusters are not included in the differential abundance testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 3.
#'
#' @param min_samples An integer value used to filter clusters out of the differential
#' abundance analysis. Clusters are not included in the differential abundance testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 5.
#'
#' @param alpha A numeric value between 0 and 1 indicating which significance
#' level should be applied to multiple-comparison adjusted p-values during the
#' differential abundance analysis. Defaults to 0.05.
#'
#' @param quiet A boolean value indicating whether warnings should be printed.
#' Defaults to `TRUE`.
#'
#' @return A tibble with 7 columns:
#'
#' \describe{
#' \item{\strong{\{cluster_col\}}}{The name/ID of the cluster being tested.
#' Each entry in this column will match a unique value in the input
#' \{cluster_col\}.}
#' \item{\strong{t}}{The t-statistic computed for each cluster.}
#' \item{\strong{df}}{The degrees of freedom used for the t-test for each cluster.}
#' \item{\strong{p_val}}{The (unadjusted) p-value for the t-test for each cluster.}
#' \item{\strong{p_adj}}{The \code{\link[stats]{p.adjust}}-adjusted p-value for the t-test for each cluster.}
#' \item{\strong{significant}}{A character vector that will be "*" for clusters
#' for which p_adj < alpha and "" otherwise.}
#' \item{\strong{mean_diff}}{For an unpaired t-test, the difference between the average
#' proportions of each cluster in the two levels of `effect_col`. For a paired
#' t-test, the average difference between the proportions of each cluster in
#' the two levels of `effect_col` within a given patient.}
#' \item{\strong{mean_fc}}{For an unpaired t-test, the ratio between the average
#' proportions of each cluster in the two levels of `effect_col`. For a paired
#' t-test, the average ratio between the proportions of each cluster in
#' the two levels of `effect_col` within a given patient. 0.001 is added to
#' the denominator of the ratio to avoid divide-by-zero errors.}
#' }
#'
#' The "levels" attribute of the result indicates the order in which the different
#' levels of the `effect_col` were considered. The `mean_diff` value for each
#' row of the output is computed by subtracting the second level from the first level,
#' and the `mean_fc` value for each row is computed by dividing the first level
#' by the second level.
#'
#' @family differential abundance analysis functions
#'
#' @export
#'
#' @importFrom tidyr pivot_wider
#' @importFrom tidyr nest
#' @importFrom purrr map_if
#' @importFrom purrr map_dbl
#' @importFrom purrr map
#' @importFrom purrr map2_dbl
#' @importFrom stats p.adjust
#'
#' @examples
#' # For differential discovery examples, please see the package vignettes
#' NULL
#'
tof_analyze_abundance_ttest <-
function(
tof_tibble,
cluster_col,
effect_col,
group_cols,
test_type = c("unpaired", "paired"),
min_cells = 3,
min_samples = 5,
alpha = 0.05,
quiet = FALSE) {
# check the test_type argument
test_type <- rlang::arg_match(test_type, values = c("unpaired", "paired"))
# check for the number of unique levels in effect_col
if (missing(group_cols)) {
stop("The `group_cols` argument must be specified.")
}
# check for the number of unique levels in effect_col
if (missing(effect_col)) {
stop("The `effect_col` argument must be specified.")
}
effect_levels <-
tof_tibble |>
dplyr::pull({{ effect_col }}) |>
unique()
if (length(effect_levels) != 2L) {
stop("`effect_col` must have 2 distinct levels`.")
}
# count the cells in each cluster within each sample and effect_col level
count_df <-
tof_tibble |>
dplyr::mutate("{{cluster_col}}" := as.factor({{ cluster_col }})) |>
dplyr::count(across({{ group_cols }}), {{ effect_col }}, {{ cluster_col }}, .drop = FALSE) |>
dplyr::group_by(across({{ group_cols }}), {{ effect_col }}) |>
dplyr::mutate(prop = .data$n / sum(.data$n)) |>
dplyr::ungroup()
# find the clusters that should be removed due to not having enough cells in
# enough samples
clusters_to_keep <-
count_df |>
dplyr::filter(.data$n > min_cells) |>
dplyr::count({{ cluster_col }}) |>
dplyr::filter(.data$n > min_samples) |>
dplyr::pull({{ cluster_col }})
count_df <-
count_df |>
dplyr::filter({{ cluster_col }} %in% clusters_to_keep)
# perform the t-tests
if (test_type == "paired") {
t_df <-
count_df |>
dplyr::select({{ group_cols }}, {{ effect_col }}, {{ cluster_col }}, "prop") |>
tidyr::pivot_wider(
names_from = {{ effect_col }},
values_from = "prop"
) |>
tidyr::nest(data = -{{ cluster_col }}) |>
dplyr::mutate(
t_test =
purrr::map_if(
.x = data,
.p = ~ nrow(.x) > 1,
.f = ~ stats::t.test(.x[[effect_levels[[1]]]], .x[[effect_levels[[2]]]], paired = TRUE),
.else = ~ return(list(statistic = NA_real_, parameter = NA_real_, p.value = NA_real_))
),
t = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$statistic),
df = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$parameter),
p_val = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$p.value),
p_adj = stats::p.adjust(.data$p_val, "fdr"),
significant = dplyr::if_else(.data$p_adj < alpha, "*", ""),
mean_diff =
purrr::map_dbl(
.x = data,
.f = ~ mean(.x[[effect_levels[[1]]]] - .x[[effect_levels[[2]]]])
),
mean_fc =
# note the correction factor to avoid divide-by-zero errors in calculating the fold-change
purrr::map_dbl(
.x = data,
.f = ~ mean(.x[[effect_levels[[1]]]] / (.x[[effect_levels[[2]]]] + 0.001))
)
) |>
dplyr::select(-"t_test", -"data")
} else {
# extract cluster_col names as a string
cluster_colname <-
count_df |>
dplyr::select({{ cluster_col }}) |>
colnames()
effect_1_tibble <-
count_df |>
dplyr::filter({{ effect_col }} == effect_levels[[1]]) |>
tidyr::nest(data = -{{ cluster_col }}) |>
dplyr::transmute(
{{ cluster_col }},
effect_1_vector = purrr::map(.x = data, .f = ~ dplyr::pull(.x, .data$prop))
)
effect_2_tibble <-
count_df |>
dplyr::filter({{ effect_col }} == effect_levels[[2]]) |>
tidyr::nest(data = -{{ cluster_col }}) |>
dplyr::transmute(
{{ cluster_col }},
effect_2_vector = purrr::map(.x = data, .f = ~ dplyr::pull(.x, .data$prop))
)
t_df <-
dplyr::left_join(effect_1_tibble, effect_2_tibble, by = cluster_colname) |>
dplyr::mutate(
enough_samples =
purrr::map2_lgl(
.x = .data$effect_1_vector,
.y = .data$effect_2_vector,
.f = ~ length(.x) > 1 & length(.y) > 1
),
t_test =
purrr::pmap(
.l =
list(.data$enough_samples, .data$effect_1_vector, .data$effect_2_vector),
.f = tof_ttest
),
t = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$statistic),
df = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$parameter),
p_val = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$p.value),
p_adj = stats::p.adjust(.data$p_val, "fdr"),
significant = dplyr::if_else(.data$p_adj < alpha, "*", ""),
mean_diff =
purrr::map2_dbl(
.x = .data$effect_1_vector,
.y = .data$effect_2_vector,
.f = ~ mean(.x, na.rm = TRUE) - mean(.y, na.rm = TRUE),
),
mean_fc =
purrr::map2_dbl(
.x = .data$effect_1_vector,
.y = .data$effect_2_vector,
.f = ~ mean(.x, na.rm = TRUE) / mean(.y, na.rm = TRUE),
)
) |>
dplyr::select(
-"t_test",
-"effect_1_vector",
-"effect_2_vector",
-"enough_samples"
) |>
dplyr::arrange(.data$p_adj)
}
# convert cluster column back to a character vector for consistency with
# other tidytof functions and arrange columns into standard order
t_df <-
dplyr::mutate(t_df, "{{cluster_col}}" := as.character({{ cluster_col }})) |>
dplyr::select(
{{ cluster_col }},
"p_val",
"p_adj",
"significant",
tidyselect::everything()
)
if (!quiet) {
if (any(is.na(t_df$t))) {
warning("Some conditions did not have at least 2 replicates. Some NA values returned as a result.\nBe sure to check the `group_cols` argument.")
}
}
# save the level in which the attributes were analyzed in case the user
# wants to interpret the mean difference or fold change values.
attr(t_df, "levels") <- effect_levels
attr(t_df, which = "daa_method") <- paste0("t_", test_type)
return(t_df)
}
#' Differential Expression Analysis (DEA) with t-tests
#'
#' This function performs differential expression analysis on the cell clusters
#' contained within a `tof_tbl` using simple t-tests. Specifically, either an unpaired
#' or paired t-test will compare samples' marker expression distributions (between two conditions)
#' within each cluster using a user-specified summary function (i.e. mean or median).
#' One t-test is conducted per cluster/marker pair and significant differences
#' between sample types are detected after multiple-hypothesis correction.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param cluster_col An unquoted column name indicating which column in `tof_tibble`
#' stores the cluster ids of the cluster to which each cell belongs.
#' Cluster labels can be produced via any method the user chooses - including manual gating,
#' any of the functions in the `tof_cluster_*` function family, or any other method.
#'
#' @param effect_col Unquoted column name representing which column in
#' `tof_tibble` should be used to break samples into groups for the t-test. Should
#' only have 2 unique values.
#'
#' @param marker_cols Unquoted column names representing which columns in `tof_tibble`
#' (i.e. which high-dimensional cytometry protein measurements) should be tested for differential expression between
#' levels of the `effect_col`. Defaults to all numeric (integer or double) columns.
#' Supports tidyselect helpers.
#'
#' @param group_cols Unquoted names of the columns other than `effect_col`
#' that should be used to group cells into independent observations. Fills a similar role
#' to `sample_col` in other `tof_analyze_abundance_*` functions. For example, if an experiment involves
#' analyzing samples taken from multiple patients at two timepoints (with `effect_col = timepoint`),
#' then group_cols should be the name of the column representing patient IDs.
#'
#' @param test_type A string indicating whether the t-test should be "unpaired"
#' (the default) or "paired".
#'
#' @param summary_function The vector-valued function that should be used to
#' summarize the distribution of each marker in each cluster (within each sample, as
#' grouped by `group_cols`). Defaults to `mean`.
#'
#' @param min_cells An integer value used to filter clusters out of the differential
#' abundance analysis. Clusters are not included in the differential abundance testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 3.
#'
#' @param min_samples An integer value used to filter clusters out of the differential
#' abundance analysis. Clusters are not included in the differential abundance testing
#' if they do not have at least `min_cells` in at least `min_samples` samples.
#' Defaults to 5.
#'
#' @param alpha A numeric value between 0 and 1 indicating which significance
#' level should be applied to multiple-comparison adjusted p-values during the
#' differential abundance analysis. Defaults to 0.05.
#'
#' @param quiet A boolean value indicating whether warnings should be printed.
#' Defaults to `TRUE`.
#'
#' @return A tibble with 7 columns:
#'
#' \describe{
#' \item{\strong{\{cluster_col\}}}{The name/ID of the cluster in the cluster/marker pair
#' being tested. Each entry in this column will match a unique value in the input
#' \{cluster_col\}.}
#' \item{\strong{marker}}{The name of the marker in the cluster/marker pair being tested.}
#' \item{\strong{t}}{The t-statistic computed for each cluster.}
#' \item{\strong{df}}{The degrees of freedom used for the t-test for each cluster.}
#' \item{\strong{p_val}}{The (unadjusted) p-value for the t-test for each cluster.}
#' \item{\strong{p_adj}}{The \code{\link[stats]{p.adjust}}-adjusted p-value for the t-test for each cluster.}
#' \item{\strong{significant}}{A character vector that will be "*" for clusters
#' for which p_adj < alpha and "" otherwise.}
#' \item{\strong{mean_diff}}{For an unpaired t-test, the difference between the average
#' proportions of each cluster in the two levels of `effect_col`. For a paired
#' t-test, the average difference between the proportions of each cluster in
#' the two levels of `effect_col` within a given patient.}
#' \item{\strong{mean_fc}}{For an unpaired t-test, the ratio between the average
#' proportions of each cluster in the two levels of `effect_col`. For a paired
#' t-test, the average ratio between the proportions of each cluster in
#' the two levels of `effect_col` within a given patient. 0.001 is added to
#' the denominator of the ratio to avoid divide-by-zero errors.}
#' }
#'
#' The "levels" attribute of the result indicates the order in which the different
#' levels of the `effect_col` were considered. The `mean_diff` value for each
#' row of the output is computed subtracting the second level from the first level,
#' and the `mean_fc` value for each row is computed by dividing the first level
#' by the second level.
#'
#' @family differential expression analysis functions
#'
#' @export
#'
#' @importFrom rlang arg_match
#' @importFrom tidyr pivot_wider
#' @importFrom tidyr pivot_longer
#' @importFrom tidyr nest
#' @importFrom purrr map_if
#' @importFrom purrr map_dbl
#' @importFrom purrr map
#' @importFrom purrr map2_dbl
#' @importFrom purrr map2_lgl
#' @importFrom purrr pmap
#' @importFrom stats p.adjust
#' @importFrom stats t.test
#'
#' @examples
#' # For differential discovery examples, please see the package vignettes
#' NULL
#'
tof_analyze_expression_ttest <-
function(
tof_tibble,
cluster_col,
marker_cols = where(tof_is_numeric),
effect_col,
group_cols,
test_type = c("unpaired", "paired"),
summary_function = mean,
min_cells = 3,
min_samples = 5,
alpha = 0.05,
quiet = FALSE) {
# check the test_type argument
test_type <- rlang::arg_match(test_type, values = c("unpaired", "paired"))
# check for the number of unique levels in effect_col
if (missing(effect_col)) {
stop("The `effect_col` argument must be specified.")
}
# check for the number of unique levels in effect_col
if (missing(group_cols)) {
stop("The `group_cols` argument must be specified.")
}
effect_levels <-
tof_tibble |>
dplyr::pull({{ effect_col }}) |>
unique()
if (length(effect_levels) != 2L) {
stop("`effect_col` must have 2 distinct levels`.")
}
# extract marker columns for dea
marker_colnames <-
tof_tibble |>
dplyr::select({{ marker_cols }}) |>
colnames()
# summarize marker expression in each cluster within each sample and effect_col level
expression_df <-
tof_tibble |>
dplyr::mutate("{{cluster_col}}" := as.factor({{ cluster_col }})) |>
dplyr::group_by(across({{ group_cols }}), {{ effect_col }}, {{ cluster_col }}) |>
dplyr::summarize(
dplyr::across(
dplyr::any_of(marker_colnames),
summary_function
)
) |>
dplyr::ungroup()
# find the clusters that should be removed due to not having enough cells in
# enough samples
count_df <-
tof_tibble |>
dplyr::mutate("{{cluster_col}}" := as.factor({{ cluster_col }})) |>
dplyr::count(across({{ group_cols }}), {{ effect_col }}, {{ cluster_col }}, .drop = FALSE) |>
dplyr::group_by(across({{ group_cols }}), {{ effect_col }}) |>
dplyr::ungroup()
clusters_to_keep <-
count_df |>
dplyr::filter(n > min_cells) |>
dplyr::count({{ cluster_col }}) |>
dplyr::filter(n > min_samples) |>
dplyr::pull({{ cluster_col }})
expression_df <-
expression_df |>
dplyr::filter({{ cluster_col }} %in% clusters_to_keep)
# throw an error if you didn't keep any clusters
if (nrow(expression_df) == 0) {
stop("No clusters were retained. Try changing `min_cells` and `min_samples`.")
}
# perform the t-tests
if (test_type == "paired") {
t_df <-
expression_df |>
tidyr::pivot_longer(
cols = dplyr::any_of(marker_colnames),
names_to = "marker",
values_to = "summary"
) |>
tidyr::nest(data = c(-"marker", -{{ cluster_col }})) |>
dplyr::mutate(
data =
purrr::map(
data,
~ tidyr::pivot_wider(.x, names_from = {{ effect_col }}, values_from = summary)
)
) |>
dplyr::mutate(
t_test =
purrr::map_if(
.x = data,
.p = ~ nrow(.x) > 1,
.f = ~
stats::t.test(
x = .x[[effect_levels[[1]]]],
y = .x[[effect_levels[[2]]]],
paired = TRUE
),
.else = ~ return(list(statistic = NA_real_, parameter = NA_real_, p.value = NA_real_))
),
t = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$statistic),
df = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$parameter),
p_val = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$p.value),
p_adj = stats::p.adjust(.data$p_val, "fdr"),
significant = dplyr::if_else(.data$p_adj < alpha, "*", ""),
mean_diff =
purrr::map_dbl(
.x = data,
.f = ~ mean(.x[[effect_levels[[1]]]] - .x[[effect_levels[[2]]]], na.rm = TRUE)
),
mean_fc =
purrr::map_dbl(
.x = data,
.f = ~ mean(.x[[effect_levels[[1]]]] / (.x[[effect_levels[[2]]]] + 0.001), na.rm = TRUE)
)
) |>
dplyr::select(-"t_test", -"data") |>
dplyr::select(
{{ cluster_col }},
"marker",
"p_val",
"p_adj",
"significant",
tidyselect::everything()
) |>
dplyr::arrange(.data$p_adj)
} else {
t_df <-
expression_df |>
tidyr::pivot_longer(
cols = dplyr::any_of(marker_colnames),
names_to = "marker",
values_to = "summary"
) |>
tidyr::nest(data = c(-{{ effect_col }}, -{{ cluster_col }}, -"marker")) |>
tidyr::pivot_wider(names_from = {{ effect_col }}, values_from = data) |>
dplyr::mutate(
enough_samples =
purrr::map2_lgl(
.x = .data[[effect_levels[[1]]]],
.y = .data[[effect_levels[[2]]]],
.f = ~ if (!is.null(.x) & !is.null(.y)) {
return(nrow(.x) > 1 & nrow(.y) > 1)
} else {
return(FALSE)
}
),
t_test =
purrr::pmap(
.l = list(.data$enough_samples, .data[[effect_levels[[1]]]], .data[[effect_levels[[2]]]]),
.f = function(x, y, z) tof_ttest(x, y$summary, z$summary)
),
t = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$statistic),
df = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$parameter),
p_val = purrr::map_dbl(.x = .data$t_test, .f = ~ .x$p.value),
p_adj = stats::p.adjust(.data$p_val, "fdr"),
significant = dplyr::if_else(.data$p_adj < alpha, "*", ""),
mean_diff =
purrr::map2_dbl(
.x = .data[[effect_levels[[1]]]],
.y = .data[[effect_levels[[2]]]],
.f = ~ if (!is.null(.x) & !is.null(.y)) {
return(mean(.x$summary) - mean(.y$summary))
} else {
return(NA_real_)
}
),
mean_fc =
# note the correction factor to avoid divide-by-zero errors in calculating the fold-change
purrr::map2_dbl(
.x = .data[[effect_levels[[1]]]],
.y = .data[[effect_levels[[2]]]],
.f = ~ if (!is.null(.x) & !is.null(.y)) {
return(mean(.x$summary) / (mean(.y$summary) + 0.001))
} else {
return(NA_real_)
}
)
) |>
dplyr::select(
-"t_test",
-dplyr::any_of(effect_levels),
-"enough_samples"
) |>
dplyr::select(
{{ cluster_col }},
"marker",
"p_val",
"p_adj",
"significant",
tidyselect::everything()
) |>
dplyr::arrange(.data$p_adj)
}
# convert cluster column back to a character vector for consistency with
# other tidytof functions
t_df <- dplyr::mutate(t_df, "{{cluster_col}}" := as.character({{ cluster_col }}))
if (!quiet) {
if (any(is.na(t_df$t))) {
warning("Some conditions did not have at least 2 replicates. Some NA values returned as a result.\nBe sure to check the `group_cols` argument.")
}
}
# save the level in which the attributes were analyzed in case the user
# wants to interpret the mean difference or fold change values.
attr(t_df, "levels") <- effect_levels
attr(t_df, which = "dea_method") <- paste0("t_", test_type)
return(t_df)
}
# wrapper functions ------------------------------------------------------------
#' Perform Differential Abundance Analysis (DAA) on high-dimensional cytometry data
#'
#' This function performs differential abundance analysis on the cell clusters
#' contained within a `tof_tbl` using one of three methods
#' ("diffcyt", "glmm", and "ttest"). It wraps the members of the `tof_analyze_abundance_*`
#' function family: \code{\link{tof_analyze_abundance_diffcyt}},
#' \code{\link{tof_analyze_abundance_glmm}}, and \code{\link{tof_analyze_abundance_ttest}}.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param method A string indicating which statistical method should be used. Valid
#' values include "diffcyt", "glmm", and "ttest".
#'
#' @param ... Additional arguments to pass onto the `tof_analyze_abundance_*`
#' function family member corresponding to the chosen method.
#'
#' @return A tibble or nested tibble containing the differential abundance results
#' from the chosen method. See \code{\link{tof_analyze_abundance_diffcyt}},
#' \code{\link{tof_analyze_abundance_glmm}}, and \code{\link{tof_analyze_abundance_ttest}} for details.
#'
#' @family differential abundance analysis functions
#'
#' @export
#'
#' @importFrom rlang arg_match
#'
#' @examples
#' # For differential discovery examples, please see the package vignettes
#' NULL
#'
tof_analyze_abundance <-
function(
tof_tibble,
method = c("diffcyt", "glmm", "ttest"),
...) {
# check method argument
method <-
rlang::arg_match(arg = method, values = c("diffcyt", "glmm", "ttest"))
if (method == "diffcyt") {
result <-
tof_tibble |>
tof_analyze_abundance_diffcyt(...)
} else if (method == "glmm") {
result <-
tof_tibble |>
tof_analyze_abundance_glmm(...)
} else {
result <-
tof_tibble |>
tof_analyze_abundance_ttest(...)
}
# return result
return(result)
}
#' Perform Differential Expression Analysis (DEA) on high-dimensional cytometry data
#'
#' This function performs differential expression analysis on the cell clusters
#' contained within a `tof_tbl` using one of three methods
#' ("diffcyt", "glmm", and "ttest"). It wraps the members of the `tof_analyze_expression_*`
#' function family: \code{\link{tof_analyze_expression_diffcyt}},
#' \code{\link{tof_analyze_expression_lmm}}, and \code{\link{tof_analyze_expression_ttest}}.
#'
#' @param tof_tibble A `tof_tbl` or a `tibble`.
#'
#' @param method A string indicating which statistical method should be used. Valid
#' values include "diffcyt", "lmm", and "ttest".
#'
#' @param ... Additional arguments to pass onto the `tof_analyze_expression_*`
#' function family member corresponding to the chosen method.
#'
#' @return A tibble or nested tibble containing the differential abundance results
#' from the chosen method. See \code{\link{tof_analyze_expression_diffcyt}},
#' \code{\link{tof_analyze_expression_lmm}}, and \code{\link{tof_analyze_expression_ttest}} for details.
#'
#' @family differential expression analysis functions
#'
#' @export
#'
#' @importFrom rlang arg_match
#'
#' @examples
#' # For differential discovery examples, please see the package vignettes
#' NULL
#'
tof_analyze_expression <-
function(
tof_tibble,
method = c("diffcyt", "glmm", "ttest"),
...) {
# check method argument
method <-
rlang::arg_match(arg = method, values = c("diffcyt", "glmm", "ttest"))
if (method == "diffcyt") {
result <-
tof_tibble |>
tof_analyze_expression_diffcyt(...)
} else if (method == "lmm") {
result <-
tof_tibble |>
tof_analyze_expression_lmm(...)
} else {
result <-
tof_tibble |>
tof_analyze_expression_ttest(...)
}
# return result
return(result)
}
# aliases for backwards compatibility ------------------------------------------
tof_daa_diffcyt <- tof_analyze_abundance_diffcyt
tof_daa_glmm <- tof_analyze_abundance_glmm
tof_daa_ttest <- tof_analyze_abundance_ttest
tof_daa <- tof_analyze_abundance
tof_dea_diffcyt <- tof_analyze_expression_diffcyt
tof_dea_lmm <- tof_analyze_expression_lmm
tof_dea_ttest <- tof_analyze_expression_ttest
tof_dea <- tof_analyze_expression
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.