#' Cluster cells based on CDR3 sequences
#'
#' @param input Single cell object or data.frame containing V(D)J data. If a
#' data.frame is provided, the cell barcodes should be stored as row names.
#' @param data_col meta.data column containing sequences to use for calculating
#' Levenshtein distance.
#' @param chain Chain to use for clustering sequences. Cells with more than one
#' of the provided chain will be excluded from the analysis. If NULL, sequences
#' for cells with multiple chains will be concatenated.
#' @param method Method to use for clustering, possible values include:
#'
#' - 'louvain', multi-level optimization of modality implemented with
#' [igraph::cluster_louvain()]
#' - 'leiden', the Leiden clustering algorithm implemented with
#' [igraph::cluster_leiden()]
#'
#' @param resolution Resolution (coarseness) of clusters
#' @param k Number of neighbors for k-nearest neighbors algorithm
#' @param dist_method Method to use for computing distance between sequences.
#' If NULL, distance is calculated for amino acid sequences using the BLOSUM62
#' substitution matrix and levenshtein distance is calculated for nucleotide
#' sequences. Other possible values include:
#'
#' - 'levenshtein'
#' - 'hamming'
#' - The name of a substitution matrix available from the Biostrings package,
#' e.g. 'BLOSUM62'
#'
#' @param run_umap Should the Uniform Manifold Approximation and Projection
#' (UMAP) dimensional reduction method be performed. This will add UMAP
#' coordinates to the meta.data.
#' @param chain_col meta.data column containing chains for each cell.
#' @param prefix Prefix to add to graph name
#' @param return_df Return results as a data.frame. If FALSE, results will be
#' added to the input object.
#' @param sep Separator used for storing per cell V(D)J data
#' @param ... Additional parameters to pass to clustering method
#' @return Single cell object or data.frame with clustering results
#' @seealso [plot_motifs()]
#'
#' @examples
#' # Cluster cells based on CDR3 amino acid sequences and plot results
#' res <- cluster_sequences(
#' vdj_sce,
#' data_col = "cdr3",
#' chain = "IGK",
#' resolution = c(0.5, 1)
#' )
#'
#' plot_scatter(
#' res,
#' x = "cdr3_UMAP_1",
#' y = "cdr3_UMAP_2",
#' data_col = "cdr3_cluster_0.5"
#' )
#'
#' @export
cluster_sequences <- function(input, data_col = "cdr3", chain = NULL,
method = "louvain", resolution = 0.5, k = 10,
dist_method = NULL, run_umap = TRUE,
chain_col = global$chain_col,
prefix = paste0(data_col, "_"),
return_df = FALSE, sep = global$sep, ...) {
# Check for installed packages
.check_packages(c("dbscan", "igraph (>= 1.3.0)"))
.check_packages("Biostrings", db = "Bioconductor")
# Check that columns are present in object
.check_obj_cols(
input, data_col, chain = chain, chain_col = chain_col
)
# Check input classes
.check_args()
# Check input values
mets <- c("louvain", "leiden")
if (!method %in% mets) cli::cli_abort("`method` must be {.or {mets}}")
# Fetch data
vdj <- .fetch_seqs(
input,
seq_col = data_col,
chain = chain,
chain_col = chain_col,
sep = sep
)
vdj <- dplyr::distinct(vdj, !!!syms(c(global$cell_col, data_col)))
seqs <- vdj[[data_col]]
seqs <- unique(sort(seqs))
# Calculate distance
if (is.null(dist_method)) {
typ <- .detect_seq_type(seqs, n = 100)
dist_method <- ifelse(typ == "aa", "BLOSUM62", "levenshtein")
}
dist_args <- list(x = seqs)
if (!dist_method %in% c("hamming", "levenshtein")) {
dist_args$substitutionMatrix <- dist_method
dist_method <- "substitutionMatrix"
}
dist_args$method <- dist_method
dist_mat <- .lift(Biostrings::stringDist)(dist_args)
# Calculate UMAP
if (run_umap) {
.check_packages("uwot")
umap_res <- uwot::umap(dist_mat)
colnames(umap_res) <- paste0(prefix, c("UMAP_1", "UMAP_2"))
umap_res <- tibble::as_tibble(umap_res)
}
# Run KNN
knn_res <- dbscan::kNN(dist_mat, k = k)
make_adj_df <- function(mat) {
res <- tibble::as_tibble(mat, rownames = "Var1")
res <- tidyr::pivot_longer(res, -"Var1", values_to = "Var2")
res
}
adj_df <- make_adj_df(knn_res$id)
adj_df <- dplyr::mutate(
adj_df,
Var1 = seqs[as.integer(.data$Var1)],
Var2 = seqs[.data$Var2]
)
# Create adjacency graph
adj_grph <- dplyr::select(adj_df, all_of(c("Var1", "Var2")))
adj_grph <- igraph::graph_from_data_frame(adj_grph, directed = FALSE)
# Clustering parameters
clst_args <- list(graph = adj_grph, ...)
# Louvain clustering
if (identical(method, "louvain")) {
clst_method <- igraph::cluster_louvain
rsln_arg <- "resolution"
# Leiden clustering
} else if (identical(method, "leiden")) {
clst_method <- igraph::cluster_leiden
rsln_arg <- "resolution_parameter"
clst_args$objective_function <- clst_args$objective_function %||%
"modularity"
}
# Cluster sequences
resolution <- purrr::set_names(
resolution,
paste0(prefix, "cluster_", resolution)
)
res <- purrr::imap_dfc(resolution, ~ {
clst_args[rsln_arg] <- .x
clsts <- .lift(clst_method)(clst_args)
clsts <- igraph::membership(clsts)
tibble(!!sym(.y) := as.character(clsts))
})
# Combine cluster and UMAP results, add sequences
if (run_umap) res <- dplyr::bind_cols(umap_res, res)
res <- dplyr::mutate(res, !!sym(data_col) := seqs)
# Join by CDR3 to match clusters with cell IDs
res <- dplyr::left_join(vdj, res, by = data_col)
res <- dplyr::select(res, -all_of(data_col))
# Format results
meta <- .get_meta(input)
res <- dplyr::left_join(meta, res, by = global$cell_col)
if (return_df) input <- meta
res <- .add_meta(input, meta = res)
res
}
#' Plot CDR3 sequence motifs
#'
#' @param input Single cell object or data.frame containing V(D)J data. If a
#' data.frame is provided, the cell barcodes should be stored as row names.
#' @param data_col meta.data column containing sequences to use for plotting.
#' @param cluster_col meta.data column containing cluster IDs to use for
#' grouping cells.
#' @param chain Chain to use for plotting sequences. Cells with more than one
#' of the provided chain will be excluded from the analysis.
#' @param chain_col meta.data column containing chains for each cell.
#' @param width Integer specifying how many residues to include, sequences
#' longer than width will be get trimmed based on the align_end argument,
#' sequences shorter than width will get removed. If a fraction is provided,
#' the width cutoff is set based on percent rank, i.e. a value of 0.75 would
#' select a width where at least 75% of sequences are longer than the cutoff.
#' @param align_end End to use for aligning sequences, specify '5' or '3' to
#' align sequences at the 5' or 3' end when plotting.
#' @param quiet If `TRUE` messages will not be displayed
#' @param sep Separator used for storing per cell V(D)J data
#'
#'
#' @param plot_colors Character vector containing colors for plotting
#' @param plot_lvls Character vector containing levels for ordering
#' @param panel_nrow The number of rows to use for arranging plot panels
#' @param panel_scales Should scales for plot panels be fixed or free. This
#' passes a scales specification to `ggplot2::facet_wrap()`, can be 'fixed', 'free',
#' 'free_x', or 'free_y'. 'fixed' will cause panels to share the same scales.
#' Use this when separate bar graphs are created for each cell cluster.
#' @param n_label Location on plot where n label should be added, this can be
#' one of the following:
#'
#' - 'corner', display the total number of cells plotted in the top right
#' corner, the position of the label can be modified by passing `x` and `y`
#' specifications with the `label_params` argument
#' - 'none', do not display the number of cells plotted
#'
#' @param label_params Named list providing additional parameters to modify
#' n label aesthetics, e.g. list(size = 4, color = "red")
#' @param ... Additional parameters to pass to [ggseqlogo::geom_logo()]
#' @return ggplot object
#' @seealso [cluster_sequences()]
#'
#' @examples
#' # Cluster cells based on CDR3 amino acid sequences and plot sequence motifs
#' res <- cluster_sequences(
#' vdj_sce,
#' data_col = "cdr3"
#' )
#'
#' plot_motifs(
#' res,
#' data_col = "cdr3",
#' cluster_col = "cdr3_cluster_0.5",
#' chain = "IGK"
#' )
#'
#' @export
plot_motifs <- function(input, data_col = global$cdr3_col, cluster_col = NULL,
chain, chain_col = global$chain_col, width = 0.75,
align_end = "5", quiet = FALSE, sep = global$sep,
plot_colors = NULL,
plot_lvls = names(plot_colors),
panel_nrow = NULL,
panel_scales = "free", n_label = "corner",
label_params = list(), ...) {
# Check installed packages
.check_packages("ggseqlogo")
# Check that columns are present in object
.check_obj_cols(
input, data_col, cluster_col, chain = chain, chain_col = chain_col
)
# Check input classes
.check_args()
# Check input values
if (width <= 0) cli::cli_abort("`width` must be >0")
# Fetch sequences
seqs <- .fetch_seqs(
input,
chain = chain,
seq_col = data_col,
chain_col = chain_col,
sep = sep
)
vdj_cols <- c(global$cell_col, data_col, cluster_col)
seqs <- dplyr::select(seqs, all_of(vdj_cols))
n_seqs <- nrow(seqs)
# Trim/filter sequences
trim_fn <- function(x) .trim_seq(x[[data_col]], width, align_end)
if (!is.null(cluster_col)) {
seqs <- .set_lvls(seqs, cluster_col, plot_lvls)
seqs <- split(seqs, seqs[[cluster_col]])
seqs <- purrr::map(seqs, trim_fn)
seqs <- purrr::discard(seqs, is.null)
plt_n_seqs <- sum(purrr::map_dbl(seqs, length))
} else {
seqs <- trim_fn(seqs)
plt_n_seqs <- length(seqs)
}
# Check number of removed sequences
if (plt_n_seqs == 0) {
cli::cli_abort(
"There are no sequences longer than {width} residues,
try selecting a shorter cutoff for `width`"
)
}
if (plt_n_seqs < n_seqs && !quiet) {
pct <- 1 - (plt_n_seqs / n_seqs)
pct <- round(pct * 100, 1)
cli::cli_alert_info(
"{n_seqs - plt_n_seqs} sequences ({pct}%) are shorter
than `width` and were removed",
wrap = TRUE
)
}
# Set n label data
# If cluster_col is provided, seqs will be named list with a vector for each
# cluster
if (!is.list(seqs)) seqs <- list(seq = seqs)
n_lab_dat <- tibble::tibble(
seq_group = names(seqs),
.n = purrr::map_dbl(seqs, length)
)
# Create logos
res <- ggplot2::ggplot() +
ggseqlogo::geom_logo(seqs, ...) +
djvdj_theme()
if (!all(n_label == "none")) {
res <- .add_n_label(
res, n_lab_dat,
n_label = n_label,
crnr_col = "seq_group",
n_fn = sum,
lab_args = label_params
)
}
if (!is.null(cluster_col)) {
res <- res +
ggplot2::facet_wrap(~ seq_group, scales = panel_scales, nrow = panel_nrow)
}
if (!is.null(plot_colors)) {
res <- withCallingHandlers(
expr = res + ggplot2::scale_fill_manual(values = plot_colors),
message = function(msg) {
if (!grepl("Scale for.+fill.+is already present", msg)) {
cli::cli_alert(msg)
}
tryInvokeRestart("muffleMessage")
}
)
}
res
}
#' Fetch chain sequences
#'
#' @param input Single cell object or data.frame containing V(D)J data. If a
#' data.frame is provided, the cell barcodes should be stored as row names.
#' @param seq_col meta.data column containing sequences
#' @param chain Chain to use for clustering sequences. Cells with more
#' than one of the provided chain will be excluded from the analysis.
#' @param chain_col meta.data column containing chains for each cell.
#' @param sep Separator used for storing per cell V(D)J data
#' @noRd
.fetch_seqs <- function(input, seq_col, chain, chain_col, sep = global$sep) {
per_chain <- !is.null(chain)
res <- fetch_vdj(
input,
data_cols = c(seq_col, chain_col),
unnest = FALSE,
filter_cells = TRUE,
per_chain = per_chain,
clonotype_col = seq_col,
sep = sep
)
if (!per_chain) {
if (is.null(sep)) {
cli::cli_abort("`sep` must be provided when `per_chain` is`FALSE`")
}
res <- dplyr::mutate(
res,
!!sym(seq_col) := stringr::str_remove_all(!!sym(seq_col), sep)
)
return(res)
}
res <- .filter_chains(
res,
data_cols = seq_col,
chain = chain,
chain_col = chain_col,
col_names = "{.col}",
allow_dups = FALSE
)
# Remove chain_col since it does not get filtered by .filter_chains() and
# will be included in the results as a list-col
res <- dplyr::select(res, -all_of(chain_col))
res <- tidyr::unnest(res, cols = all_of(seq_col))
res <- dplyr::filter(res, !is.na(!!sym(seq_col)))
res
}
#' Trim and filter sequences
#'
#' @param seqs Vector of sequences
#' @param width Integer specifying how many residues to include, sequences
#' longer than width will be get trimmed based on the end argument, sequences
#' shorter than width will get removed. If a fraction is provided, the width
#' cutoff is set based on percent rank, i.e. a value of 0.75 would select a
#' width where at least 75% of sequences are longer than the cutoff.
#' @param end End to use for aligning sequences, specify '5' or '3' to
#' align sequences at the 5' or 3' end.
#' @noRd
.trim_seq <- function(seqs, width = 0.75, end = "5") {
lens <- nchar(seqs)
if (width < 1) {
width <- 1 - width
pct <- dplyr::cume_dist(lens)
width <- min(lens[pct > width])
}
res <- seqs[lens >= width]
if (purrr::is_empty(res)) {
cli::cli_warn(
"There are no sequences present that are >{width} residues long"
)
return(NULL)
}
s <- ifelse(end == "5", "right", "left")
res <- stringr::str_trunc(res, width, side = s, ellipsis = "")
res
}
#' Detect sequence type
#'
#' @param seqs Vector of sequences
#' @param n Number of sequences to check
#' @noRd
.detect_seq_type <- function(seqs, n = 100) {
dna <- Biostrings::DNA_BASES
rna <- Biostrings::RNA_BASES
aa <- Biostrings::AA_STANDARD
# Take first n sequences and split into residues
seqs <- stats::na.omit(seqs)
seqs <- utils::head(seqs, n)
nts <- purrr::map(seqs, strsplit, "")
nts <- unlist(nts)
# Check intersection with known characters
int <- intersect(nts, c(dna, rna, aa))
if (length(int) == 0) cli::cli_abort("Could not determine sequence type")
# Check if any characters overlap with aa and not dna, rna
int <- setdiff(intersect(nts, aa), c(dna, rna))
res <- dplyr::case_when(
length(int) > 0 ~ "aa",
"U" %in% nts ~ "rna",
TRUE ~ "dna"
)
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.