Nothing
#' Search for a sequence
#'
#' Search for one or more amino acid or nucleotide CDR3 sequences in a list of
#' data frames.
#'
#' @param list A list of data frames generated by the LymphoSeq functions readImmunoSeq
#' or productiveSeq. "aminoAcid" or "nucleotide", "frequencyCount", and
#' "count" are required columns.
#' @param sequence A character vector of one ore more amino acid or nucleotide
#' CDR3 sequences to search.
#' @param type A character vector specifying the type of sequence(s) to be
#' searched. Available options are "aminoAcid" or "nucleotide".
#' @param match A character vector specifying whether an exact partial or exact global
#' match of the searched sequence(s) is desired. Available options are
#' "partial" and "global".
#' @param editDistance An integer giving the minimum edit distance that the
#' sequence must be less than or equal to. See details below.
#' @details An exact partial match means the searched sequence is contained within
#' target sequence. An exact global match means the searched sequence is identical to
#' the target sequence.
#'
#' Edit distance is a way of quantifying how dissimilar two sequences
#' are to one another by counting the minimum number of operations required to
#' transform one sequence into the other. For example, an edit distance of 0
#' means the sequences are identical and an edit distance of 1 indicates that
#' the sequences different by a single amino acid or nucleotide.
#' @return Returns the rows for every instance in the list of data frames where
#' the searched sequence(s) appeared.
#' @examples
#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
#'
#' file.list <- readImmunoSeq(path = file.path)
#'
#' aa1 <- "CASSPVSNEQFF"
#'
#' aa2 <- "CASSQEVPPYQAFF"
#'
#' searchSeq(list = file.list, sequence = aa1, type = "aminoAcid",
#' match = "global", editDistance = 0)
#'
#' searchSeq(list = file.list, sequence = c(aa1, aa2),
#' type = "aminoAcid", match = "global", editDistance = 0)
#'
#' searchSeq(list = file.list, sequence = aa1, type = "aminoAcid", editDistance = 1)
#'
#' nt <- "CTGATTCTGGAGTCCGCCAGCACCAACCAGACATCTATGTACCTCTGTGCCAGCAGTCCGGTAAGCAATGAGCAGTTCTTCGGGCCA"
#'
#' searchSeq(list = file.list, sequence = nt, type = "nucleotide", editDistance = 3)
#'
#' searchSeq(list = file.list, sequence = "CASSPVS", type = "aminoAcid",
#' match = "partial", editDistance = 0)
#'
#' searchSeq(list = file.list, sequence = nt, type = "nucleotide", editDistance = 0)
#' @export
#' @importFrom plyr llply
#' @importFrom utils adist
searchSeq <- function(list, sequence, type = "aminoAcid", match = "global", editDistance = 0) {
if (editDistance >= 1) {
merged.results <- data.frame()
i <- 1
for (i in 1:length(list)) {
file <- list[[i]]
ed <- adist(sequence, file[, type])
rownames(ed) <- sequence
colnames(ed) <- file[, type]
ed.index <- which(ed <= editDistance, arr.ind = TRUE)
if (nrow(ed.index) >= 1) {
ed.subset <- as.integer()
j <- 1
for (j in 1:nrow(ed.index)) {
ed.j <- ed[ed.index[j, "row"], ed.index[j, "col"]]
ed.subset <- c(ed.subset, ed.j)
}
results <- data.frame(searchSequnece = rownames(ed)[ed.index[, 1]],
foundSequnece = colnames(ed)[ed.index[, 2]],
editDistance = ed.subset)
results <- results[!duplicated(results), ]
search <- plyr::llply(list[i], function(x)
x[which(x[, type] %in% results$foundSequnece), ])
found <- NULL
k <- 1
for (k in 1:length(search)) {
if (nrow(search[[k]]) != 0) {
search[[k]] <- cbind(rep(names(search)[k], nrow(search[[k]])),
search[[k]])
colnames(search[[k]])[1] <- "sample"
found <- rbind(found, search[[k]])
}
}
names(found)[which(names(found) == type)] <- "foundSequnece"
merged.search <- merge(results, found)
merged.results <- rbind(merged.results, merged.search)
merged.results <- merged.results[c("sample", setdiff(names(merged.results), "sample"))]
}
}
if (nrow(merged.results) >= 1) {
merged.results$foundSequnece = as.character(merged.results$foundSequnece)
merged.results$searchSequnece = as.character(merged.results$searchSequnece)
merged.results = merged.results[order(merged.results$frequencyCount, decreasing = TRUE),]
rownames(merged.results) = NULL
return(merged.results)
} else {
cat("No sequences found.")
}
} else {
if (match == "global") {
search <- plyr::llply(list, function(x)
x[which(x[, type] %in% sequence), ])
}
if (match == "partial") {
search <- plyr::llply(list, function(x)
x[grep(paste(sequence, collapse = "|"), x[, type]), ])
}
found <- NULL
l <- 1
for (l in 1:length(search)) {
if (nrow(search[[l]]) != 0) {
search[[l]] <- cbind(rep(names(search)[l], nrow(search[[l]])), search[[l]])
colnames(search[[l]])[1] <- "sample"
found <- rbind(found, search[[l]])
}
}
if (is.null(found)) {
message("No sequences found.")
} else {
found = found[order(found$frequencyCount, decreasing = TRUE),]
rownames(found) = NULL
return(found)
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.