#' @title
#' Identify Terminal Inverted Repeat Matches
#'
#' @description
#' Searches a \code{\link[Biostrings:XStringSet-class]{DNAStringSet}}
#' for potential TIRs based on sequence similarity.
#'
#' @param tirSeq
#' A \code{\link[Biostrings:DNAString-class]{DNAString}}
#' object to be searched for.
#'
#' @param Genome
#' A \code{\link[Biostrings:XStringSet-class]{DNAStringSet}}
#' object containing the
#' \code{\link[Biostrings:DNAString-class]{DNAString}}
#' objects to be searched.
#'
#' @param mismatch
#' The allowable mismatch between \code{tirSeq} and a
#' given slice of \code{Genome}. Includes indels.
#'
#' @param strand
#' The directionality of the search string ("+" or "-"). Note
#' that this does affect the search for tirSeqs, if you wish
#' to search the reverse strand you should use the reverse
#' complement of your sequence.
#'
#' @param tsdLength
#' Integer referring to the length of the flanking TSD region.
#'
#' @param fixed
#' Logical that will be passed to the `fixed` argument of
#' \code{\link[Biostrings]{matchPattern}}. Determines the behaviour of IUPAC
#' ambiguity codes when searching for TIR sequences.
#'
#' @details
#' Called by \code{\link{packSearch}}. Used by
#' \code{\link{packSearch}} as an initial filtering stage.
#' \code{\link[Biostrings]{matchPattern}} from Biostrings
#' is used for pattern matching. It is recommended to use
#' the general pipeline function \code{\link{packSearch}}
#' for identification of potential pack elements, however
#' each stage may be called individually.
#'
#' @return
#' A dataframe, \code{tirMatches}, containing identified
#' matches. The dataframe is in the format generated by
#' \code{\link{packSearch}}.
#'
#' @seealso
#' \code{\link[Biostrings:XStringSet-class]{DNAStringSet}},
#' \code{\link{packSearch}}, \code{\link[Biostrings]{matchPattern}},
#' \code{\link[Biostrings:DNAString-class]{DNAString}}
#'
#' @examples
#' data(arabidopsisThalianaRefseq)
#'
#' forwardMatches <- identifyTirMatches(
#' Biostrings::DNAString("CACTACAA"),
#' arabidopsisThalianaRefseq,
#' tsdLength = 3,
#' strand = "+"
#' )
#'
#' @author
#' Jack Gisby
#'
#' @export
identifyTirMatches <- function(tirSeq, Genome, mismatch = 0, strand = "*",
tsdLength, fixed=TRUE) {
if (strand != "-" & strand != "+") {
stop("Argument 'strand' must be specified as '-' or '+'")
}
tirMatches <- initialisePackMatches()
# get tir matches
for (i in seq_len(length(Genome))) {
matches <- Biostrings::matchPattern(
tirSeq,
Genome[[i]],
max.mismatch = mismatch,
with.indels = TRUE,
fixed = fixed
)
if (length(matches) > 0) {
tirMatches <- rbind(tirMatches, data.frame(
seqnames = names(Genome)[i],
start = GenomicRanges::start(matches),
end = GenomicRanges::start(matches) +
GenomicRanges::width(matches) - 1,
width = GenomicRanges::width(matches),
strand = strand
))
}
}
# remove matches whose TSD sequences do not exist (index range)
if (strand == "-") {
removeMatch <- mapply(function(end, seqnames, tsdLength, Genome) {
seq <- Genome[names(Genome) == seqnames][[1]]
return((end + tsdLength) > length(seq))
},
tirMatches$end,
tirMatches$seqnames,
MoreArgs = list(tsdLength, Genome)
)
tirMatches <- tirMatches[removeMatch == FALSE, ]
} else if (strand == "+") {
tirMatches <- tirMatches[tirMatches$start > tsdLength, ]
}
return(tirMatches)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.