#' @title aadir2orthofinder
#' @name aadir2orthofinder
#' @description This function calculates (conditional-)reciprocal best hit
#' (CRBHit) pairs for all possible comparison including self
#' comparison from a directory of AA fasta files.
#' Sequence searches are performed with \bold{last}
#' \emph{Kiełbasa, SM et al. (2011)} [default]
#' or with \bold{mmseqs2}
#' \emph{Steinegger, M and Soeding, J (2017)}
#' or with \bold{diamond}
#' \emph{Buchfink, B et al. (2021)}.
#' @param dir directory containing AA fasta files [mandatory]
#' @param file_ending define file ending to consider [default: *]
#' @param searchtool specify sequence search algorithm last, mmseqs2 or diamond
#' [default: last]
#' @param lastpath specify the PATH to the last binaries
#' [default: /extdata/last-1542/bin/]
#' @param lastD last option D: query letters per random alignment
#' [default: 1e6]
#' @param mmseqs2path specify the PATH to the mmseqs2 binaries
#' [default: NULL]
#' @param mmseqs2sensitivity specify the sensitivity option of mmseqs2
#' [default: 5.7]
#' @param diamondpath specify the PATH to the diamond binaries
#' [default: NULL]
#' @param diamondsensitivity specify the sensitivity option of diamond
#' [default: --sensitive]
#' @param diamondmaxtargetseqs specify the maximum number of target sequences
#' per query option of diamond
#' [default: -k0]
#' @param outpath specify the output PATH [default: /tmp]
#' @param crbh specify if conditional-reciprocal hit pairs should be retained
#' as secondary hits [default: TRUE]
#' @param keepSingleDirection specify if single direction secondary hit pairs
#' should be retained [default: FALSE]
#' @param eval evalue [default: 1e-3]
#' @param qcov query coverage [default: 0.0]
#' @param tcov target coverage [default: 0.0]
#' @param pident percent identity [default: 0.0]
#' @param alnlen alignment length [default: 0.0]
#' @param rost1999 specify if hit pairs should be filter by equation 2 of
#' Rost 1999 [default: FALSE]
#' @param filter specify additional custom filters as list to be applied on
#' hit pairs [default: NULL]
#' @param fit.type specify if mean or median should be used for fitting
#' [default: mean]
#' @param fit.varweight factor for fitting function to consider neighborhood
#' [default: 0.1]
#' @param fit.min specify minimum neighborhood alignment length
#' [default: 5]
#' @param threads number of parallel threads [default: 1]
#' @param remove specify if last result files should be removed [default: TRUE]
#' @return List of three (crbh=FALSE)\cr
#' @importFrom Biostrings writeXStringSet
#' @importFrom graphics legend par points
#' @importFrom stats median splinefun
#' @importFrom utils read.table combn
#' @importFrom tidyr %>%
#' @seealso \code{\link[CRBHits]{aafile2rbh}}
#' @references Aubry S, Kelly S et al. (2014) Deep Evolutionary Comparison of
#' Gene Expression Identifies Parallel Recruitment of Trans-Factors in Two
#' Independent Origins of C4 Photosynthesis. \emph{PLOS Genetics},
#' \bold{10(6)} e1004365.
#' @references Kiełbasa, SM et al. (2011) Adaptive seeds tame genomic sequence
#' comparison. \emph{Genome research}, \bold{21(3)}, 487-493.
#' @references Rost B. (1999). Twilight zone of protein sequence alignments.
#' \emph{Protein Engineering}, \bold{12(2)}, 85-94.
#' @examples
#' ## compile last-1542 within CRBHits
#' CRBHits::make_last()
#' @export aadir2orthofinder
#' @author Kristian K Ullrich
aadir2orthofinder <- function(dir,
file_ending="*",
searchtool="last",
lastpath=paste0(find.package("CRBHits"),
"/extdata/last-1542/bin/"),
lastD=1e6,
mmseqs2path=NULL,
mmseqs2sensitivity=5.7,
diamondpath=NULL,
diamondsensitivity="--sensitive",
diamondmaxtargetseqs="-k0",
outpath="/tmp",
crbh=TRUE,
keepSingleDirection=FALSE,
eval=1e-3,
qcov=0.0,
tcov=0.0,
pident=0.0,
alnlen=0.0,
rost1999=FALSE,
filter=NULL,
fit.type="mean",
fit.varweight=0.1,
fit.min=5,
threads=1,
remove=TRUE
){
if(searchtool=="last"){
if(!dir.exists(lastpath)){
stop("Error: last PATH does not exist. Please specify correct
PATH and/or look into package installation prerequisites.
Try to use make_last() function.")
}
if(!file.exists(paste0(lastpath, "lastdb"))){
stop("Error: lastdb binary does not exist. Please specify
correct PATH and/or look into package installation
prerequisites. Try to use make_last() function.")
}
if(!file.exists(paste0(lastpath, "lastal"))){
stop("Error: lastal binary does not exist. Please specify
correct PATH and/or look into package installation
prerequisites. Try to use make_last() function.")
}
}
if(searchtool=="mmseqs2"){
if(!dir.exists(mmseqs2path)){
stop("Error: mmseqs2 PATH does not exist. Please specify
correct PATH and/or look into package installation
prerequisites.")
}
if(!file.exists(paste0(mmseqs2path, "mmseqs"))){
stop("Error: mmseqs2 binary does not exist. Please specify
correct PATH and/or look into package installation
prerequisites.")
}
}
if(searchtool=="diamond"){
if(!dir.exists(diamondpath)){
stop("Error: diamond PATH does not exist. Please specify
correct PATH and/or look into package installation
prerequisites.")
}
if(!file.exists(paste0(diamondpath, "diamond"))){
stop("Error: diamond binary does not exist. Please specify
correct PATH and/or look into package installation
prerequisites.")
}
}
aa_files <- list.files(dir, file_ending)
aa_species <- seq_along(aa_files)-1
aa_species_files <- paste0("Species", aa_species, ".fa")
# write SpeciesIDs.txt
sink(file.path(outpath, "SpeciesIDs.txt"))
cat(
apply(cbind(aa_species, aa_files),1,
function(x){paste0(x[1],": ",x[2])}), sep="\n"
)
sink(NULL)
# extract sequence names, create aa_species_files
# and perform selfblast
aa_sequence_names <- data.frame()
for(i in seq_along(aa_files)){
tmp_aa <- Biostrings::readAAStringSet(
file.path(dir, aa_files[i]))
tmp_aa_names <- paste0(
aa_species[i], "_", seq_along(tmp_aa)-1)
aa_sequence_names <- rbind(aa_sequence_names,
data.frame(tmp_aa_names, stringr::word(names(tmp_aa))))
names(tmp_aa) <- tmp_aa_names
Biostrings::writeXStringSet(tmp_aa,
file.path(outpath, aa_species_files[i]))
tmp_aa_rbh <- aafile2rbh(
aafile1 = file.path(outpath, aa_species_files[i]),
aafile2 = file.path(outpath, aa_species_files[i]),
searchtool=searchtool,
lastpath=lastpath,
lastD=lastD,
mmseqs2path=mmseqs2path,
mmseqs2sensitivity=mmseqs2sensitivity,
diamondpath=diamondpath,
diamondsensitivity=diamondsensitivity,
diamondmaxtargetseqs=diamondmaxtargetseqs,
outpath=outpath,
crbh=crbh,
keepSingleDirection=keepSingleDirection,
eval=eval,
qcov=qcov,
tcov=tcov,
pident=pident,
alnlen=alnlen,
rost1999=rost1999,
filter=filter,
plotCurve=FALSE,
fit.type=fit.type,
fit.varweight=fit.varweight,
fit.min=fit.min,
threads=threads,
remove=remove)
write.table(tmp_aa_rbh$crbh1[,1:12], sep="\t", quote=FALSE,
col.names=FALSE, row.names=FALSE,
file=file.path(outpath, paste0("Blast", aa_species[i], "_",
aa_species[i], ".txt")))
}
# write SequenceIDs.txt
sink(file.path(outpath, "SequenceIDs.txt"))
cat(
apply(aa_sequence_names,1,function(x){paste0(x[1],": ",x[2])}), sep="\n"
)
sink(NULL)
# combn
to_calc <- t(combn(aa_species, 2))
apply(to_calc,1,function(x){
tmp_aa_rbh <- aafile2rbh(
aafile1 = file.path(outpath, aa_species_files[x[1]+1]),
aafile2 = file.path(outpath, aa_species_files[x[2]+1]),
searchtool=searchtool,
lastpath=lastpath,
lastD=lastD,
mmseqs2path=mmseqs2path,
mmseqs2sensitivity=mmseqs2sensitivity,
diamondpath=diamondpath,
diamondsensitivity=diamondsensitivity,
diamondmaxtargetseqs=diamondmaxtargetseqs,
outpath=outpath,
crbh=crbh,
keepSingleDirection=keepSingleDirection,
eval=eval,
qcov=qcov,
tcov=tcov,
pident=pident,
alnlen=alnlen,
rost1999=rost1999,
filter=filter,
plotCurve=FALSE,
fit.type=fit.type,
fit.varweight=fit.varweight,
fit.min=fit.min,
threads=threads,
remove=remove)
write.table(tmp_aa_rbh$crbh1[,1:12], sep="\t", quote=FALSE,
col.names=FALSE, row.names=FALSE,
file=file.path(outpath, paste0("Blast", x[1], "_", x[2], ".txt")))
write.table(tmp_aa_rbh$crbh2[,1:12], sep="\t", quote=FALSE,
col.names=FALSE, row.names=FALSE,
file=file.path(outpath, paste0("Blast", x[2], "_", x[1], ".txt")))
})
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.