Nothing
#' @title
#' Cluster Transposons with VSEARCH
#'
#' @description
#' Cluster potential pack-TYPE elements by sequence
#' similarity. Resulting groups may be aligned with
#' \code{\link{packAlign}}, or the clusters may be
#' analysed with \code{\link{tirClust}}
#'
#' @param packMatches
#' A dataframe of potential Pack-TYPE transposable elements,
#' in the format given by \code{\link{packSearch}}. This
#' dataframe is in the format produced by coercing a
#' \code{link[GenomicRanges:GRanges-class]{GRanges}}
#' object to a dataframe: \code{data.frame(GRanges)}.
#' Will be saved as a FASTA file for VSEARCH.
#'
#' @param Genome
#' A DNAStringSet object containing sequences referred to
#' in \code{packMatches} (the object originally used to
#' predict the transposons \code{\link{packSearch}}).
#'
#' @param identity
#' The sequence identity of two transposable elements in
#' \code{packMatches} required to be grouped into a cluster.
#'
#' @param threads
#' The number of threads to be used by VSEARCH.
#'
#' @param identityDefinition
#' The pairwise identity definition used by VSEARCH.
#' Defaults to 2, the standard VSEARCH definition.
#'
#' @param strand
#' The strand direction (+, - or *) to be clustered.
#'
#' @param saveFolder
#' The folder to save output files (uc, blast6out, FASTA)
#'
#' @param vSearchPath
#' When the package is run on windows systems, the
#' location of the VSEARCH executable file must be
#' given; this should be left as default on
#' Linux/MacOS systems.
#'
#' @param maxWildcards
#' The maximal allowable proportion of wildcards in the
#' sequence of each match (defaults to \code{0.05}).
#'
#' @return
#' Saves cluster information, including a \code{uc} and
#' \code{blast6out} file, to the specified location. Returns
#' the given \code{packMatches} dataframe with an additional
#' column, \code{cluster}, containing cluster IDs.
#'
#' @note
#' In order to cluster sequences using VSEARCH, the
#' executable file must first be installed.
#'
#' @seealso
#' \code{\link{tirClust}}, \code{\link{packAlign}},
#' \code{\link{readBlast}}, \code{\link{readUc}},
#' \code{\link{filterWildcards}}, \code{\link{packSearch}}
#'
#' @references
#' VSEARCH may be downloaded from
#' \url{https://github.com/torognes/vsearch}. See
#' \url{https://www.ncbi.nlm.nih.gov/pubmed/27781170}
#' for further information.
#'
#' @examples
#' data(arabidopsisThalianaRefseq)
#' data(packMatches)
#'
#' # packClust run on a Linux/MacOS system
#' \dontrun{
#' packClust(packMatches, Genome)
#' }
#'
#' # packClust run on a Windows system
#' \dontrun{
#' packClust(packMatches, Genome,
#' vSearchPath = "path/to/vsearch/vsearch.exe")
#' }
#'
#' @author
#' Jack Gisby
#'
#' @export
packClust <- function(packMatches, Genome, identity = 0.6, threads = 1,
identityDefinition = 2, maxWildcards = 0.05,
strand = "both", saveFolder = NULL,
vSearchPath = "vsearch") {
if (is.null(saveFolder)) {
message("Results will be saved in the working directory: ", getwd())
saveFolder <- getwd()
}
strand <- clustTest(saveFolder, threads, identity, strand, vSearchPath,
identityDefinition, type = "packClust")
packMatches <- filterWildcards(packMatches, Genome,
maxWildcards = maxWildcards)
packMatchesFile <- file.path(saveFolder, "packMatches.fasta")
ID <- as.integer(rownames(packMatches))
packMatches$ID <- ID
packMatches <- packMatches[order(-packMatches$width), ]
packMatchesSet <- getPackSeqs(packMatches, Genome, output = "DNAStringSet")
names(packMatchesSet) <- as.character(rownames(packMatches))
Biostrings::writeXStringSet(packMatchesSet, packMatchesFile)
system2(command = vSearchPath, args = paste0(
"--cluster_smallmem ", packMatchesFile, " \ ", "--id ", identity,
" \ ", "--strand ", strand, " \ ", "--iddef ", identityDefinition,
" \ ", "--threads ", threads, " \ ", "--qmask none \ ",
"--log ", file.path(saveFolder, "packMatches.log"), " \ ",
"--uc ", file.path(saveFolder, "packMatches.uc"), " \ ",
"--blast6out ", file.path(saveFolder, "packMatches.blast6out"), " \ ",
"--sizeout --minseqlength 10")
)
vSearchClusts <- readUc(file = file.path(saveFolder, "packMatches.uc"))
vSearchClusts <- vSearchClusts[vSearchClusts$type != "C", ]
packMatches$strand <- mapply(function(strand) {
if (strand == "*") { return("+")
} else {return(strand)}
},
strand = as.character(vSearchClusts$strand)
)
packMatches$cluster <- vSearchClusts$cluster
rownames(packMatches) <- packMatches$ID
packMatches <- packMatches[order(packMatches$ID), ]
return(subset(packMatches, select = -c(ID)))
}
clustTest <- function(saveFolder, threads, identity, strand, vSearchPath,
identityDefinition, type) {
if (parallel::detectCores() < threads) {
stop("There are not ", threads, " cores available")
}
if (identity > 1 | identity < 0) {
stop("Identity must be of type integer or double,
and have a value between 0 and 1")
}
if (type != "packAlign") {
if (strand == "+") {
strand <- "plus"
} else if (strand == "*") {
strand <- "both"
} else if (strand != "both" & strand != "plus") {
message("Argument 'strand' must be specified as 'plus' or 'both'")
}
}
if (!is.double(identityDefinition)) {
stop("Argument 'identityDefinition' must
be an integer between 0 and 4.")
} else if (identityDefinition > 4 | identityDefinition < 0) {
stop("Argument 'identityDefinition' must
be an integer between 0 and 4.")
}
if (system2(vSearchPath, "--v") != 0) {
message("VSEARCH cannot be found at", vSearchPath, ". Please ensure
that the VSEARCH executable file is installed at the correct
location and that the full file path is specified.")
}
if (type == "packClust") {
return(strand)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.