#' Productive sequences
#'
#' \code{productiveSeq} Remove unproductive CDR3 sequences from a list of data frames.
#'
#' @param study_table A tibble consisting antigen receptor sequencing
#' data imported by the LymphoSeq function readImmunoSeq. "junction_aa", "duplicate_count", and
#' "duplicate_frequency" are required columns.
#' @param aggregate Indicates whether the values of "duplicate_count", "duplicate_frequency",
#' and "esimatedNumberGenomes" should be aggregated by amino acid or junction
#' sequence. Acceptable values are "junction_aa" or "junction". If "junction_aa"
#' is selected, then resulting data frame will have columns corresponding to
#' junction_aa, duplicate_count, and duplicate_frequency.
#' If "junction" is selected then all columns in the
#' original list will be present in the outputted list. The difference in
#' output is due to the fact that the same amino acid CDR3 sequence may be
#' encoded by multiple unique junction sequences with differing V, D, and J
#' genes.
#' @param prevalence A Boolean value indicating if a new column should be added
#' to each of the data frames giving the prevalence of each CDR3 amino acid
#' sequence in 55 healthy donor peripheral blood samples. TRUE means the column
#' is added and FALSE means it is not. Values range from 0 to 100\% where
#' 100\% means the sequence appeared in the blood of all 55 individuals.
#' @return Returns a list of data frames of productive amino acid sequences with
#' recomputed values for "duplicate_count", "duplicate_frequency".
#' A productive sequences is defined as a sequences
#' that is in frame and does not have an early stop codon.
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq2")
#' stable <- readImmunoSeq(path = file_path)
#' atable <- productiveSeq(study_table = stable,
#' aggregate = "junction_aa",
#' prevalence = TRUE)
#' @export
#' @import tidyverse dtplyr
#'
productiveSeq <- function(study_table, aggregate = "junction_aa", prevalence = FALSE) {
if (aggregate == "junction" & prevalence) {
stop("In order to add prevalence to your list of data frames, aggregate must be equal 'junction_aa'.",
call. = FALSE)
}
nsample <- study_table %>%
dplyr::pull(repertoire_id) %>%
base::unique() %>%
base::length()
progress_bar <- progress::progress_bar$new(format = "Subsetting productive sequences [:bar] :percent eta: :eta",
total = nsample, clear = FALSE, width = 60)
progress_bar$tick(0)
agg_table <- study_table %>%
dplyr::group_by(repertoire_id) %>%
dplyr::group_split() %>%
purrr::map(~ aggreateSeq(.x, aggregate, prevalence, progress_bar)) %>%
dplyr::bind_rows()
return(agg_table)
}
#' Group productive sequences by repertoire
#'
#' @param progress_bar Progress progress bar
#' @inheritParams productiveSeq
aggreateSeq <- function(study_table, aggregate, prevalence, progress_bar) {
progress_bar$tick()
if (aggregate == "junction") {
study_table <- study_table %>%
dplyr::filter(reading_frame == "in-frame") %>%
dplyr::mutate(vdj_comb_call = stringr::str_glue("{v_call};{j_call};{d_call}"),
vdj_comb_family = stringr::str_glue("{v_family};{j_family};{d_family}")) %>%
dtplyr::lazy_dt()
study_table <- study_table %>%
dplyr::group_by(junction, vdj_comb_call) %>%
dplyr::mutate(vdj_comb_count = sum(duplicate_count)) %>%
dplyr::ungroup() %>%
dplyr::group_by(junction) %>%
dplyr::arrange(desc(duplicate_count), desc(vdj_comb_count)) %>%
dplyr::summarize(repertoire_id = dplyr::first(repertoire_id),
junction_aa = dplyr::first(junction_aa),
duplicate_count = base::sum(duplicate_count),
v_call = dplyr::first(v_call),
reading_frame = dplyr::first(reading_frame),
j_call = dplyr::first(j_call),
d_call = dplyr::first(d_call),
v_family = dplyr::first(v_family),
d_family = dplyr::first(d_family),
j_family = dplyr::first(j_family)) %>%
dplyr::ungroup() %>%
dplyr::mutate(duplicate_frequency = duplicate_count / base::sum(duplicate_count)) %>%
dplyr::select(repertoire_id, junction, junction_aa, v_call, d_call, j_call, v_family,
d_family, j_family, reading_frame, duplicate_count, duplicate_frequency) %>%
dplyr::as_tibble()
} else if (aggregate == "junction_aa") {
study_table <- study_table %>%
dplyr::filter(reading_frame == "in-frame") %>%
dplyr::mutate(vdj_comb_call = stringr::str_glue("{v_call};{j_call};{d_call}"),
vdj_comb_family = stringr::str_glue("{v_family};{j_family};{d_family}")) %>%
dtplyr::lazy_dt()
study_table <- study_table %>%
dplyr::group_by(junction_aa, vdj_comb_call) %>%
dplyr::mutate(vdj_comb_count = base::sum(duplicate_count)) %>%
dplyr::ungroup() %>%
dplyr::group_by(junction_aa) %>%
dplyr::arrange(desc(duplicate_count), desc(vdj_comb_count)) %>%
dplyr::summarize(repertoire_id = dplyr::first(repertoire_id),
duplicate_count = base::sum(duplicate_count),
v_call = dplyr::first(v_call),
reading_frame = dplyr::first(reading_frame),
j_call = dplyr::first(j_call),
d_call = dplyr::first(d_call),
v_family = dplyr::first(v_family),
d_family = dplyr::first(d_family),
j_family = dplyr::first(j_family)) %>%
dplyr::ungroup() %>%
dplyr::mutate(duplicate_frequency = duplicate_count / base::sum(duplicate_count)) %>%
dplyr::select(repertoire_id, junction_aa, v_call, d_call, j_call, v_family, d_family,
j_family, reading_frame, duplicate_count, duplicate_frequency) %>%
dplyr::as_tibble()
}
if (prevalence) {
prev_table <- LymphoSeq2::prevalenceTRB
study_table <- dplyr::left_join(study_table, prev_table, by=c("junction_aa" = "aminoAcid")) %>%
dplyr::mutate(prevalence = tidyr::replace_na(prevalence, 0))
}
return(study_table)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.