# upsample.R
# This file contains functions relevant to performing upsampling
# on tof_tbl objects containing CyTOF data.
# tof_upsample_distance --------------------------------------------------------
#' Upsample cells into the closest cluster in a reference dataset
#'
#' This function performs distance-based upsampling on CyTOF data
#' by sorting single cells (passed into the function as `tof_tibble`) into
#' their most phenotypically similar cell subpopulation in a reference dataset
#' (passed into the function as `reference_tibble`). It does so by calculating
#' the distance (either mahalanobis, cosine, or pearson) between each cell in
#' `tof_tibble` and the centroid of each cluster in `reference_tibble`, then
#' sorting cells into the cluster corresponding to their closest centroid.
#'
#' @param tof_tibble A `tibble` or `tof_tbl` containing cells to be upsampled
#' into their nearest reference subpopulation.
#'
#' @param reference_tibble A `tibble` or `tof_tibble` containing cells that have
#' already been clustered or manually gated into subpopulations.
#'
#' @param reference_cluster_col An unquoted column name indicating which column in
#' `reference_tibble` contains the subpopulation label (or cluster id) for
#' each cell in `reference_tibble`.
#'
#' @param upsample_cols Unquoted column names indicating which columns in `tof_tibble` to
#' use in computing the distances used for upsampling. Defaults to all numeric columns
#' in `tof_tibble`. Supports tidyselect helpers.
#'
#' @param parallel_cols Optional. Unquoted column names indicating which columns in `tof_tibble` to
#' use for breaking up the data in order to parallelize the upsampling using
#' `foreach` on a `doParallel` backend.
#' Supports tidyselect helpers.
#'
#' @param distance_function A string indicating which distance function should
#' be used to perform the upsampling. Options are "mahalanobis" (the default),
#' "cosine", and "pearson".
#'
#' @param num_cores An integer indicating the number of CPU cores used to parallelize
#' the classification. Defaults to 1 (a single core).
#'
#' @param return_distances A boolean value indicating whether or not the returned
#' result should include only one column, the cluster ids corresponding to each row
#' of `tof_tibble` (return_distances = FALSE, the default), or if the returned
#' result should include additional columns representing the distance between each
#' row of `tof_tibble` and each of the reference subpopulation centroids
#' (return_distances = TRUE).
#'
#' @return If `return_distances = FALSE`, a tibble with one column named
#' `.upsample_cluster`, a character vector of length `nrow(tof_tibble)`
#' indicating the id of the reference cluster to which each cell
#' (i.e. each row) in `tof_tibble` was assigned.
#'
#' If `return_distances = TRUE`, a tibble with `nrow(tof_tibble)` rows and num_clusters + 1
#' columns, where num_clusters is the number of clusters in `reference_tibble`.
#' Each row represents a cell from `tof_tibble`, and num_clusters
#' of the columns represent the distance between the cell and each of the reference
#' subpopulations' cluster centroids. The final column represents the cluster id of
#' the reference subpopulation with the minimum distance to the cell represented
#' by that row.
#'
#' @export
#'
#' @examples
#' # simulate single-cell data (and reference data with clusters to upsample
#' # into
#' sim_data <-
#' dplyr::tibble(
#' cd45 = rnorm(n = 1000),
#' cd38 = rnorm(n = 1000),
#' cd34 = rnorm(n = 1000),
#' cd19 = rnorm(n = 1000)
#' )
#'
#' reference_data <-
#' dplyr::tibble(
#' cd45 = rnorm(n = 200),
#' cd38 = rnorm(n = 200),
#' cd34 = rnorm(n = 200),
#' cd19 = rnorm(n = 200),
#' cluster_id = c(rep("a", times = 100), rep("b", times = 100))
#' )
#'
#' # upsample using mahalanobis distance
#' tof_upsample_distance(
#' tof_tibble = sim_data,
#' reference_tibble = reference_data,
#' reference_cluster_col = cluster_id
#' )
#'
#' # upsample using cosine distance
#' tof_upsample_distance(
#' tof_tibble = sim_data,
#' reference_tibble = reference_data,
#' reference_cluster_col = cluster_id,
#' distance_function = "cosine"
#' )
#'
tof_upsample_distance <-
function(
tof_tibble,
reference_tibble,
reference_cluster_col,
upsample_cols = where(tof_is_numeric),
parallel_cols,
distance_function = c("mahalanobis", "cosine", "pearson"),
num_cores = 1L,
return_distances = FALSE) {
# if computed on 1 core
if (missing(parallel_cols)) {
result <-
tof_cluster_ddpr(
tof_tibble = tof_tibble,
healthy_tibble = reference_tibble,
healthy_label_col = {{ reference_cluster_col }},
cluster_cols = {{ upsample_cols }},
distance_function = distance_function,
num_cores = num_cores,
return_distances = return_distances,
verbose = FALSE
)
# if computed in parallel
} else {
result <-
tof_cluster_ddpr(
tof_tibble = tof_tibble,
healthy_tibble = reference_tibble,
healthy_label_col = {{ reference_cluster_col }},
cluster_cols = {{ upsample_cols }},
parallel_cols = {{ parallel_cols }},
distance_function = distance_function,
num_cores = num_cores,
return_distances = return_distances,
verbose = FALSE
)
}
result_colnames <- colnames(result)
cluster_colname <- result_colnames[grepl("_cluster$", result_colnames)]
upsample_clusters <- result[[cluster_colname]]
new_result <- result[, !(result_colnames %in% cluster_colname)]
new_result$`.upsample_cluster` <- upsample_clusters
result <- new_result
return(result)
}
# tof_upsample_neighbor --------------------------------------------------------
#' Upsample cells into the cluster of their nearest neighbor a reference dataset
#'
#' This function performs upsampling on CyTOF data
#' by sorting single cells (passed into the function as `tof_tibble`) into
#' their most phenotypically similar cell subpopulation in a reference dataset
#' (passed into the function as `reference_tibble`). It does so by finding
#' each cell in `tof_tibble`'s nearest neighbor in `reference_tibble` and assigning
#' it to the cluster to which its nearest neighbor belongs. The nearest neighbor
#' calculation can be performed with either euclidean or cosine distance.
#'
#' @param tof_tibble A `tibble` or `tof_tbl` containing cells to be upsampled
#' into their nearest reference subpopulation.
#'
#' @param reference_tibble A `tibble` or `tof_tibble` containing cells that have
#' already been clustered or manually gated into subpopulations.
#'
#' @param reference_cluster_col An unquoted column name indicating which column in
#' `reference_tibble` contains the subpopulation label (or cluster id) for
#' each cell in `reference_tibble`.
#'
#' @param upsample_cols Unquoted column names indicating which columns in `tof_tibble` to
#' use in computing the distances used for upsampling. Defaults to all numeric columns
#' in `tof_tibble`. Supports tidyselect helpers.
#'
#' @param distance_function A string indicating which distance function should
#' be used to perform the upsampling. Options are "euclidean" (the default) and
#' "cosine".
#'
#' @param num_neighbors An integer indicating how many neighbors should be used
#' in the nearest neighbor calculation. Clusters are assigned based on majority
#' vote.
#'
#' @return A tibble with one column named
#' `.upsample_cluster`, a character vector of length `nrow(tof_tibble)`
#' indicating the id of the reference cluster to which each cell
#' (i.e. each row) in `tof_tibble` was assigned.
#'
#' @export
#'
#' @importFrom dplyr select
#' @importFrom dplyr pull
#' @importFrom dplyr tibble
#'
#' @examples
#'
#' # simulate single-cell data (and reference data with clusters to upsample
#' # into
#' sim_data <-
#' dplyr::tibble(
#' cd45 = rnorm(n = 1000),
#' cd38 = rnorm(n = 1000),
#' cd34 = rnorm(n = 1000),
#' cd19 = rnorm(n = 1000)
#' )
#'
#' reference_data <-
#' dplyr::tibble(
#' cd45 = rnorm(n = 200),
#' cd38 = rnorm(n = 200),
#' cd34 = rnorm(n = 200),
#' cd19 = rnorm(n = 200),
#' cluster_id = c(rep("a", times = 100), rep("b", times = 100))
#' )
#'
#' # upsample using euclidean distance
#' tof_upsample_neighbor(
#' tof_tibble = sim_data,
#' reference_tibble = reference_data,
#' reference_cluster_col = cluster_id
#' )
#'
#' # upsample using cosine distance
#' tof_upsample_neighbor(
#' tof_tibble = sim_data,
#' reference_tibble = reference_data,
#' reference_cluster_col = cluster_id,
#' distance_function = "cosine"
#' )
#'
tof_upsample_neighbor <-
function(
tof_tibble,
reference_tibble,
reference_cluster_col,
upsample_cols = where(tof_is_numeric),
num_neighbors = 1L,
distance_function = c("euclidean", "cosine", "l2", "ip")) {
distance_function <- rlang::arg_match(distance_function)
query_matrix <-
tof_tibble |>
dplyr::select({{ upsample_cols }}) |>
as.matrix()
nn_result <-
reference_tibble |>
dplyr::select({{ upsample_cols }}) |>
tof_find_knn(
k = num_neighbors,
distance_function = distance_function,
.query = query_matrix
)
nn_ids <- nn_result$neighbor_ids
clusters <- dplyr::pull(reference_tibble, {{ reference_cluster_col }})
if (num_neighbors == 1L) {
upsampled_clusters <- clusters[nn_ids]
} else {
upsampled_clusters <-
purrr::map_chr(
.x = seq_len(nrow(nn_ids)),
.f = \(x) {
clusters[nn_ids[x, ]] |>
table() |>
as.data.frame() |>
dplyr::as_tibble() |>
dplyr::slice_max(order_by = .data$Freq, n = 1L, with_ties = FALSE) |>
dplyr::pull(.data$Var1) |>
as.character()
}
)
}
result <-
dplyr::tibble(.upsample_cluster = upsampled_clusters)
return(result)
}
# tof_upsample -----------------------------------------------------------------
#' Upsample cells into the closest cluster in a reference dataset
#'
#' This function performs distance-based upsampling on CyTOF data
#' by sorting single cells (passed into the function as `tof_tibble`) into
#' their most phenotypically similar cell subpopulation in a reference dataset
#' (passed into the function as `reference_tibble`). It does so by calculating
#' the distance (either mahalanobis, cosine, or pearson) between each cell in
#' `tof_tibble` and the centroid of each cluster in `reference_tibble`, then
#' sorting cells into the cluster corresponding to their closest centroid.
#'
#' @param tof_tibble A `tibble` or `tof_tbl` containing cells to be upsampled
#' into their nearest reference subpopulation.
#'
#' @param reference_tibble A `tibble` or `tof_tibble` containing cells that have
#' already been clustered or manually gated into subpopulations.
#'
#' @param reference_cluster_col An unquoted column name indicating which column in
#' `reference_tibble` contains the subpopulation label (or cluster id) for
#' each cell in `reference_tibble`.
#'
#' @param upsample_cols Unquoted column names indicating which columns in `tof_tibble` to
#' use in computing the distances used for upsampling. Defaults to all numeric columns
#' in `tof_tibble`. Supports tidyselect helpers.
#'
#' @param ... Additional arguments to pass to the `tof_upsample_*`
#' function family member corresponding to the chosen method.
#'
#' @param augment A boolean value indicating if the output should column-bind the
#' cluster ids of each cell as a new column in `tof_tibble` (TRUE, the default) or if
#' a single-column tibble including only the cluster ids should be returned (FALSE).
#'
#' @param method A string indicating which clustering methods should be used. Valid
#' values include "distance" (default) and "neighbor".
#'
#' @return A `tof_tbl` or `tibble` If augment = FALSE, it will have a single column encoding
#' the upsampled cluster ids for each cell in `tof_tibble`.
#' If augment = TRUE, it will have
#' ncol(tof_tibble) + 1 columns: each of the (unaltered) columns in `tof_tibble`
#' plus an additional column encoding the cluster ids.
#'
#' @export
#'
#' @importFrom dplyr bind_cols
#'
#' @examples
#' # simulate single-cell data (and reference data with clusters to upsample
#' # into
#' sim_data <-
#' dplyr::tibble(
#' cd45 = rnorm(n = 1000),
#' cd38 = rnorm(n = 1000),
#' cd34 = rnorm(n = 1000),
#' cd19 = rnorm(n = 1000)
#' )
#' reference_data <-
#' dplyr::tibble(
#' cd45 = rnorm(n = 200),
#' cd38 = rnorm(n = 200),
#' cd34 = rnorm(n = 200),
#' cd19 = rnorm(n = 200),
#' cluster_id = c(rep("a", times = 100), rep("b", times = 100))
#' )
#'
#' # upsample using distance to cluster centroids
#' tof_upsample(
#' tof_tibble = sim_data,
#' reference_tibble = reference_data,
#' reference_cluster_col = cluster_id,
#' method = "distance"
#' )
#'
#' # upsample using distance to nearest neighbor
#' tof_upsample(
#' tof_tibble = sim_data,
#' reference_tibble = reference_data,
#' reference_cluster_col = cluster_id,
#' method = "neighbor"
#' )
#'
tof_upsample <-
function(
tof_tibble,
reference_tibble,
reference_cluster_col,
upsample_cols = where(tof_is_numeric),
...,
augment = TRUE,
method = c("distance", "neighbor")) {
method <- rlang::arg_match(arg = method, values = c("distance", "neighbor"))
if (method == "distance") {
result <-
tof_upsample_distance(
tof_tibble = tof_tibble,
reference_tibble = reference_tibble,
reference_cluster_col = {{ reference_cluster_col }},
upsample_cols = {{ upsample_cols }},
...
)
} else if (method == "neighbor") {
result <-
tof_upsample_neighbor(
tof_tibble = tof_tibble,
reference_tibble = reference_tibble,
reference_cluster_col = {{ reference_cluster_col }},
upsample_cols = {{ upsample_cols }},
...
)
} else {
stop("Not a valid method.")
}
if (augment) {
result <-
dplyr::bind_cols(tof_tibble, result)
}
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.