#' Search for a sequence
#'
#' Search for one or more amino acid or junction CDR3 sequences in a study
#' tibble.
#'
#' @param study_table A tibble generated by the LymphoSeq2 functions
#' [readImmunoSeq()] or [productiveSeq()]. "junction_aa" or "junction",
#' "duplicate_frequency", and "duplicate_count" are required columns.
#' @param sequence A character vector of one ore more amino acid or junction
#' CDR3 sequences to search.
#' @param seq_type A character vector specifying the type of sequences to be
#' searched. Available options are "junction_aa" or "junction".
#' @param edit_distance An integer giving the minimum edit distance that the
#' sequence must be less than or equal to. See details below.
#' @param match A string indicating the type of sequence matching to perform.
#' Acceptable values are `"global"` and `"partial"`. See details below.
#' @details An exact partial match means the searched sequence is contained
#' within target sequence. An exact global match means the searched sequence is
#' identical to the target sequence.
#'
#' Edit distance is a way of quantifying how dissimilar two sequences
#' are to one another by counting the minimum number of operations required to
#' transform one sequence into the other. For example, an edit distance of 0
#' means the sequences are identical and an edit distance of 1 indicates that
#' the sequences different by a single amino acid or junction.
#' @return Returns the rows for every instance in the list of data frames where
#' the searched sequence(s) appeared.
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing",
#' package = "LymphoSeq2")
#' study_table <- LymphoSeq2::readImmunoSeq(path = file_path, threads = 1)
#' study_table <- LymphoSeq2::topSeqs(study_table, top = 100)
#' aa1 <- "CASSPVSNEQFF"
#' aa2 <- "CASSQEVPPYQAFF"
#' LymphoSeq2::searchSeq(
#' study_table = study_table,
#' sequence = aa1,
#' seq_type = "junction_aa",
#' edit_distance = 0,
#' match = "global"
#' )
#' LymphoSeq2::searchSeq(
#' study_table = study_table,
#' sequence = c(aa1, aa2),
#' seq_type = "junction_aa",
#' edit_distance = 0,
#' match = "global"
#' )
#' LymphoSeq2::searchSeq(
#' study_table = study_table,
#' sequence = aa1,
#' seq_type = "junction_aa",
#' edit_distance = 1,
#' match = "global"
#' )
#' nt <- "CTGATTCTGGAGTCCGCCAGCACCAACCAGACATCTATGTACCTCTGTGCCAGCAGTCCGGTAAGCAATGAGCAGTTCTTCGGGCCA"
#' LymphoSeq2::searchSeq(
#' study_table = study_table,
#' sequence = nt,
#' seq_type = "junction",
#' edit_distance = 3,
#' match = "global"
#' )
#' LymphoSeq2::searchSeq(
#' study_table = study_table,
#' sequence = "CASSPVS",
#' seq_type = "junction_aa",
#' edit_distance = 0,
#' match = "global"
#' )
#' LymphoSeq2::searchSeq(
#' study_table = study_table,
#' sequence = nt,
#' seq_type = "junction",
#' edit_distance = 0,
#' match = "global"
#' )
#' @export
searchSeq <- function(study_table,
sequence,
seq_type = "junction",
edit_distance = 0,
match = "global") {
query_list <- study_table |>
dplyr::filter(!is.na(!!base::as.symbol(seq_type))) |>
dplyr::pull(!!base::as.symbol(seq_type))
search_tables <- sequence |>
purrr::map(~ findSeq(.x, query_list, edit_distance, seq_type, match)) |>
dplyr::bind_rows()
search_tables <- dplyr::left_join(study_table, search_tables,
by = stats::setNames(nm = seq_type)) |>
dplyr::filter(!is.na(edit_distance))
return(search_tables)
}
#' Find sequences of interest
#'
#' @describeIn searchSeq Find all sequences below edit distance threshold from
#' query list.
#' @param query_list List of productive CDR3 nucleotide or amino acid sequences
#' @return Tibble of sequences that differ from the input sequence by the
#' edit distance threshold provided
findSeq <- function(sequence, query_list, edit_distance, seq_type, match) {
if (match == "global") {
partial <- FALSE
} else if (match == "partial") {
partial <- TRUE
}
edist <- utils::adist(sequence, query_list, partial = partial)
match_list <- query_list[(edist <= edit_distance)]
edist_list <- edist[(edist <= edit_distance)]
sequence_table <- tibble::tibble(
c1 = match_list,
c2 = edist_list,
c3 = sequence,
.name_repair = ~ c(seq_type, "edit_distance", "searchSequence")
) |>
dplyr::filter(!is.na(!!base::as.symbol(seq_type))) |>
dplyr::distinct()
return(sequence_table)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.