# R/diffcyt_workflow.R

#' Tree versions of diffcyt functions
#'
#' A collection of functions from the \code{\link[diffcyt]{diffcyt}} package
#' have been generalized to work with data provided in a tree structure. The
#' tree represents increasingly coarse clustering of the cells, and the leaves
#' are the clusters from an initial, high-resolution clustering generated by
#' \code{diffcyt}. Note that \code{diffcyt} represents data using
#' \code{SummarizedExperiment} objects with cells in rows and features in
#' columns.
#'
#' The data object is assumed to contain a factor \code{marker_class} in the
#' column meta-data (see \code{\link[diffcyt]{prepareData}}), which indicates
#' the protein marker class for each column of data (\code{"type"},
#' \code{"state"}, or \code{"none"}).
#'
#' Variables \code{id_type_markers} and \code{id_state_markers} are saved in
#' the \code{metadata} slot of the output object. These can be used to
#' identify the 'cell type' and 'cell state' markers in the list of
#' \code{assays} in the output \code{TreeSummarizedExperiment} object,
#' which is useful in later steps of the 'diffcyt' pipeline.
#'
#' \itemize{
#' \item
#' \code{buildTree} applies hierarchical clustering to build a tree starting
#' from the high-resolution clustering created by
#' \code{\link[diffcyt]{generateClusters}}. The function calculates the median
#' abundance of each 'cell type' marker (see \code{id_type_markers}) in each
#' cluster, and uses these medians to further aggregate the initial clusters
#' using hierarchical clustering.
#' \item
#' \code{calcTreeCounts} calculates the number of cells per cluster-sample
#' combination (referred to as cluster cell 'counts', 'abundances', or
#' 'frequencies'). This is a tree version of \code{\link[diffcyt]{calcCounts}}.
#' \item
#' \code{calcMediansByTreeMarker} calculates the median value for each
#' cluster-marker combination. A cluster is represented by a node on the tree.
#' This is a tree version of \code{\link[diffcyt]{calcMediansByClusterMarker}}.
#' \item
#' \code{calcTreeMedians} calculates the median expression for each
#' cluster-sample-marker combination. This is a tree version of
#' \code{\link[diffcyt]{calcMedians}}.
#' }
#'
#' @author Ruizhu Huang
#' @name diffcyt_workflow
#'
#' @param d_se A \code{SummarizedExperiment} object, with cells as rows and
#'     features as columns. This should be the output
#'     from \code{\link[diffcyt]{generateClusters}}. The \code{colData} is
#'     assumed to contain a factor named \code{marker_class}.
#' @param dist_method The distance measure to be used. This must be one of
#'     "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski".
#'     Any unambiguous substring can be given. Please refer to \code{method} in
#'     \code{\link[stats]{dist}} for more information.
#' @param hclust_method The agglomeration method to be used. This should be (an
#'     unambiguous abbreviation of) one of "ward.D", "ward.D2", "single",
#'     "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC)
#'     or "centroid" (= UPGMC). Please refer to \code{method} in
#'     \code{\link[stats]{hclust}} for more information.
#' @param tree A \code{phylo} object from \code{\link{buildTree}}.
#' @param message A logical scalar indicating whether progress messages
#'     should be printed.
#'
#' @returns
#' \itemize{
#' \item For \code{buildTree}, a \code{phylo} object representing the
#' hierarchical clustering of the initial high-resolution clusters.
#' \item For \code{calcTreeCounts}, a \code{TreeSummarizedExperiment} object,
#' with clusters (nodes of the tree) in rows, samples in columns and
#' abundances (counts) in an \code{assay}.
#' \item For \code{calcMediansByTreeMarker}, a \code{TreeSummarizedExperiment}
#' object with clusters (nodes of the tree) in rows and markers in columns. The
#' marker expression values are in the \code{assay}.
#' \item For \code{calcTreeMedians}, a \code{TreeSummarizedExperiment} object
#' with the median marker expression for each cluster (node of the tree) and
#' each sample. Each node is represented as a separate assay, with the number
#' of columns equal to the number of samples.
#' The \code{metadata} slot contains variables \code{id_type_markers} and
#' \code{id_state_markers}.
#' }
#'
#' @examples
#' ## For a complete workflow example demonstrating each step in the 'diffcyt'
#' ## pipeline, please see the diffcyt vignette.
#' suppressPackageStartupMessages({
#'     library(diffcyt)
#'     library(TreeSummarizedExperiment)
#' })
#'
#' ## Helper function to create random data (one sample)
#' d_random <- function(n = 20000, mean = 0, sd = 1, ncol = 20, cofactor = 5) {
#'     d <- sinh(matrix(rnorm(n, mean, sd), ncol = ncol)) * cofactor
#'     colnames(d) <- paste0("marker", sprintf("%02d", seq_len(ncol)))
#'     d
#' }
#'
#' ## Create random data (without differential signal)
#' set.seed(123)
#' d_input <- list(
#'     sample1 = d_random(), sample2 = d_random(),
#'     sample3 = d_random(), sample4 = d_random()
#' )
#'
#' experiment_info <- data.frame(
#'     sample_id = factor(paste0("sample", seq_len(4))),
#'     group_id = factor(c("group1", "group1", "group2", "group2"))
#' )
#'
#' marker_info <- data.frame(
#'     channel_name = paste0("channel", sprintf("%03d", seq_len(20))),
#'     marker_name = paste0("marker", sprintf("%02d", seq_len(20))),
#'     marker_class = factor(c(rep("type", 10), rep("state", 10)),
#'                           levels = c("type", "state", "none"))
#' )
#'
#' # Prepare data
#' d_se <- diffcyt::prepareData(d_input, experiment_info, marker_info)
#'
#' # Transform data
#' d_se <- diffcyt::transformData(d_se)
#'
#' # Generate clusters
#' d_se <- diffcyt::generateClusters(d_se)
#'
#' # Build a tree
#' tr <- buildTree(d_se)
#'
#' ## Calculate abundances for nodes in each sample
#' d_counts_tree <- calcTreeCounts(d_se = d_se, tree = tr)
#'
#' ## Calculate medians (by cluster and marker)
#' d_medians_by_cluster_marker <-
#'     calcMediansByTreeMarker(d_se = d_se, tree = tr)
#'
#' ## Calculate medians (by cluster, sample and marker)
#' d_medians_tree <- calcTreeMedians(d_se = d_se, tree = tr)
NULL
# fionarhuang/treeclimbR documentation built on Jan. 1, 2025, 9:02 p.m.