#' Aligning sequences against references using bowtie
#'
#' Aligns sequences in a PAC list object against specified references generating
#' summarized report files in the destination folder.
#'
#' Given a PAC object this function will extract the read sequences and align
#' them against single or multiple fasta reference file(s). Summarized output
#' will be saved as .Rdata files in the destination folder. In its default
#' setting the function will only report hit or no hit in up to 3 mismatches,
#' but this can easily be changed between 0-3 mismatches, to include alignment
#' coordinates and reference name extractions. Information from the .Rdata files
#' (Full_reanno_mis0/1/2/3.Rdata) can be extracted and added to a PAC object
#' using \code{make_reanno} and \code{add_reanno} functions. For increased
#' compatibility across platforms, the function provides both R internal bowtie
#' parsing (thorugh the Rbowtie package), as well as external parsing to a
#' locally installed version of bowtie.
#'
#'
#' @family PAC reannotation
#'
#' @seealso \url{http://bowtie-bio.sourceforge.net/index.shtml} for information
#' about Bowtie and for Rbowtie:
#' \url{https://www.bioconductor.org/packages/release/bioc/html/Rbowtie.html}.
#' \url{https://github.com/Danis102} for updates on the current package.
#'
#' @param PAC PAC-list object containing an Anno data.frame with sequences as
#' row names.
#'
#' @param type Character indicating if mapping should be performed by calling
#' the internal bowtie function of the Rbowtie package (type="internal"), or
#' if bowtie should be called externally through a system call for a locally
#' installed bowtie (type="external").
#'
#' @param mismatches Integer indicating the maximum number of mismatches allowed
#' in the alignments. The function currently supports no more than 3
#' mismatches (botwie max).
#'
#' @param output_path Character indicating the path to the destination folder
#' for the files generated by the function.
#'
#' @param ref_paths List of file paths (character) indicating the full paths to
#' each fasta reference file. It is important to carefully name each reference path.
#' The names will appear as they are named in the final annotation table.
#' Thus, if \emph{ref_paths=list(tRNA="<path_tRNA_ref>",
#' miRNA="<path_piRNA_ref>")} mapping to these fasta references will appear as
#' "tRNA" and "miRNA", respectively. Note: All reference fasta files must have
#' bowtie indexes using \code{Rbowtie::bowtie_build}.
#'
#' @param parse_internal One character string specifying the additional options
#' parsed to the bowtie function in the Rbowtie package when type="internal".
#' The string follows similar rules as '...' parsing. See
#' \code{Rbowtie::bowtie} for details on the format. As default:
#' parse_internal= "a=TRUE, f=TRUE".
#'
#' @param parse_external One character string specifying the additional options
#' parsed to externally installed bowtie when type="external". The string
#' follows similar rules as '...' parsing. More information on the formatting
#' use: \code{system("bowtie --manual", intern=TRUE, ignore.stderr=FALSE)}. If
#' this command fails you probably do not have bowtie correctly installed. As
#' default parse_external= "-a -f".
#'
#' @param import Character or a list. If \code{import="genome"} mapping is done
#' against a reference genome and genomic coordinates are acquired. If
#' import="biotype", mapping is done against a specialized fasta reference
#' (e.g. Ensembl_ncrna, pirBase etc), where genomic coordinates is not
#' required because classification will be performed on a match-or-no-match
#' basis. A list of exactly 3 objects, named "coord", "report" and "reduce"
#' can also be provided. This list will be parsed to the
#' \code{\link{import_reanno}} function. When import="genome", the list
#' \code{import=list(coord=TRUE, report="full", reduce=NULL)} is automatically
#' parsed, while when import="biotype" the list parsed is
#' \code{import=list(coord=FALSE, report="full", reduce=NULL)}. Performance
#' increases by setting coord=FALSE. See \code{\link{import_reanno}} for more
#' information on how to set \code{report} and \code{reduce} for increased
#' performance when extremely large and repetitive references are used, such
#' as pirBase and repeatMasker.
#'
#' @param threads Integer indicating the number of parallel processes to be
#' used.
#'
#' @param keep_temp Logical whether or not bowtie output files temporarily stored
#' in the output path should be deleted. Note, this option is only used for
#' troubleshooting. The bowtie output files are named as the reference files
#' and are overwritten in each mismatch cycle. Thus, for safe saving of
#' mismatch 0 bowtie output make sure that \code{mismatches=0}. If not, the
#' mismatch 1 cycle will overwrite the bowtie files.
#'
#' @param override Logical whether or not the function should prompt you for a
#' question if there are files in output_path. As default, override=FALSE will
#' prevent deleting large files by accident, but requires an interactive R
#' session. Setting override=TRUE may solve non-interactive problems.
#'
#' @return Will primarily generate .Rdata files in the destination folder
#' (\code{output_path}) containing summarized information about the reference
#' alignments. One file is generated for every mismatch specified in
#' \emph{mismatches}. The \code{\link{make_reanno}} function can then be used
#' to extract and generate annotation tables for a PAC list object. Large
#' temporary bowtie input and output files will also be generated in the
#' destination folder, but are removed unless \code{temp_remove=FALSE}.
#'
#' @examples
#'
#' #########################################################
#' ##### Simple example for reference mapping
#' ##### Please see manual for ?simply_reanno an ?add_reanno for more advanced
#' ##### mapping
#' closeAllConnections()
#' #########################################################
#' ##### Create an reanno object
#'
#' ### First, if you haven't already generated Bowtie indexes for the included
#' # fasta references you need to do so. If you are having problem see the small
#' # RNA guide (vignette) for more info.
#'
#' ## tRNA:
#' trna_file <- system.file("extdata/trna", "tRNA.fa",
#' package = "seqpac", mustWork = TRUE)
#' trna_dir <- dirname(trna_file)
#'
#' if(!sum(stringr::str_count(list.files(trna_dir), ".ebwt")) ==6){
#' Rbowtie::bowtie_build(trna_file,
#' outdir=trna_dir,
#' prefix="tRNA", force=TRUE)
#' }
#' ## rRNA:
#' rrna_file <- system.file("extdata/rrna", "rRNA.fa",
#' package = "seqpac", mustWork = TRUE)
#' rrna_dir <- dirname(rrna_file)
#'
#' if(!sum(stringr::str_count(list.files(rrna_dir), ".ebwt")) ==6){
#' Rbowtie::bowtie_build(rrna_file,
#' outdir=rrna_dir,
#' prefix="rRNA", force=TRUE)
#' }
#'
#'
#' ## Then load a PAC-object and remove previous mapping from anno:
#' load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata",
#' package = "seqpac", mustWork = TRUE))
#' anno(pac) <- anno(pac)[,1, drop = FALSE]
#'
#' ref_paths <- list(trna= trna_file, rrna= rrna_file)
#'
#'
#' ## You may add an output path of your choice, but here we use a temp folder:
#' output <- paste0(tempdir(),"/seqpac/test")
#'
#'
#' ## Then map the PAC-object against the fasta references.
#' # Warning: if you use your own data, you may want to use override=FALSE, to avoid
#' # deleting previous mapping by mistake.
#'
#' map_reanno(pac, ref_paths=ref_paths, output_path=output,
#' type="internal", mismatches=0, import="biotype",
#' threads=2, keep_temp=FALSE, override=TRUE)
#'
#' ## Then import and generate a reanno-object of the temporary bowtie-files
#' reanno_biotype <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#'
#'
#' ## Now make some search terms against reference names to create shorter names
#' # Theses can be used to create factors in downstream analysis
#' # One search hit (regular expressions) gives one new short name
#'
#' bio_search <- list(
#' rrna=c("5S", "5.8S", "12S", "16S", "18S", "28S", "pre_S45"),
#' trna =c("_tRNA", "mt:tRNA"))
#'
#'
#' ## You can merge directly with your PAC-object by adding your original
#' # PAC-object, that you used with map_reanno, to merge_pac option.
#'
#' pac <- add_reanno(reanno_biotype, bio_search=bio_search,
#' type="biotype", bio_perfect=FALSE,
#' mismatches = 0, merge_pac=pac)
#'
#' table(anno(pac)$mis0_bio)
#'
#' ############################################################################
#' ## Similarly, you can use Bowtie indexed genome fasta references
#' ## But don't forget to use import="genome" for coordinate import
#' #
#' # ref_paths <- list(genome="<path_to_bowtie_indexed_fasta>")
#' # output_genome <- "<your_path_to_output_folder>"
#'
#' ## We can actually map against several genomes simultaneously:
#' # (to illustrate the principle we run the mycoplasma genome twice)
#'
#' mycoplasma_file <- system.file("extdata/mycoplasma_genome",
#' "mycoplasma.fa",
#' package = "seqpac", mustWork = TRUE)
#' mycoplasma_dir<- gsub("mycoplasma.fa", "", mycoplasma_file)
#'
#' ## Don't forget to create bowtie indexes
#' if(!sum(stringr::str_count(list.files(mycoplasma_dir), ".ebwt")) ==6){
#' Rbowtie::bowtie_build(mycoplasma_file,
#' outdir=mycoplasma_dir,
#' prefix="mycoplasma", force=TRUE)
#' }
#'
#' ref_paths <- list(genome1=mycoplasma_file, genome2=mycoplasma_file)
#' map_reanno(PAC=pac, ref_paths=ref_paths, output_path=output,
#' type="internal", mismatches=0, import="genome",
#' threads=2, keep_temp=TRUE, override=TRUE)
#' reanno_genome <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#'
#' table(overview(reanno_genome)$Any) # Low Mycoplasma contamination
#'
#' ############################################################################
#'
#' @importFrom methods is
#' @export
#
map_reanno <- function(PAC, type="internal", output_path, ref_paths,
mismatches=3, threads=1, parse_external= "-a -f",
parse_internal = "a=TRUE, f=TRUE",
import="genome", keep_temp=FALSE, override=FALSE){
if(isS4(PAC)){
tp <- "S4"
PAC <- as(PAC, "list")
}else{
tp <- "S3"
}
## setup
stopifnot(PAC_check(PAC))
mis_lst <- as.list(0:mismatches)
names(mis_lst) <- paste0("mis", 0:mismatches)
ref_paths <- lapply(ref_paths, function(x){gsub("\\.fa$", "", x)})
seq_fst <- Biostrings::DNAStringSet(rownames(PAC$Anno))
names(seq_fst) <- paste(seq_fst)
if(length(import)==1){
if(import=="genome"){
import <- list(coord=TRUE, report="full", reduce=NULL)
}else{
if(import=="biotype"){
import <- list(coord=FALSE, report="full", reduce=NULL)
}else{
stop("\nPlease, provide correct import options.",
"\nChose between 'genome', 'biotype' or list of options.",
"\nSee ?map_reanno and ?import_reanno for more details.")
}
}
}
## Look for files and folders in output path
drs <- list.dirs(output_path, full.names = FALSE, recursive = FALSE)
fls <- list.files(output_path, recursive = FALSE)
if(length(fls[!fls %in% drs])>0){
cat("\n")
if(override==FALSE){
warning("\n There are files in the output folder:\n ",
output_path, "\n Is it ok to delete them? (y/n)",
immediate.=TRUE, call.=FALSE)
response <- readline()
}
if(override==TRUE){
warning("\n There are files in the output folder:\n ",
output_path, "\noverride=TRUE, files will be deleted!",
immediate.=TRUE, call.=FALSE)
response <- "y"
}
if(!response %in% c("y", "Y")){
stop("Please move or delete the files in the output folder.")
}
}
if(!dir.exists(output_path)){
dir.create(output_path, showWarnings=FALSE, recursive = TRUE)
}else{
fls_full <- list.files(output_path, recursive = FALSE, full.names=TRUE)
file.remove(fls_full[!fls %in% drs])
}
## Save first input
Biostrings::writeXStringSet(seq_fst,
filepath=file.path(output_path, "anno_mis0.fa"),
format="fasta")
## Run bowtie over each reference
if(type=="internal"){
vrs <- utils::capture.output(Rbowtie::bowtie_version())
vrs <- gsub(" |bowtie version ", " ", basename(vrs[[1]]))
vrs <- gsub(" ", "", basename(vrs[[1]]))
vrs <- gsub("\"", "", basename(vrs[[1]]))
cat("\nR internal mapping using the Rbowtie package was specified.\n")
cat(paste0("This package uses bowtie version ", vrs, ".\n"))
cat("If you need a newer version, please install Bowtie manually\n")
cat("outside R and then use option type='external'.\n")
}
if(type=="external"){
vrs <- stringr::str_split(
basename(utils::capture.output(system2("bowtie --version",
stdout=TRUE))[[1]]), "version")
cat("\nR external mapping depends on correct installation of bowtie.")
cat("\nIf there are problems using external bowtie, try type='internal'.")
cat(paste0("\nAn external bowtie installation was found using version",
vrs[[1]][2]))
}
for(i in seq.int(length(mis_lst))){
cat(paste0("\n\n******************************************************"))
cat(paste0("\n|--- Mismatch ", mis_lst[[i]],
" started at ", format(Sys.time(), "%X")))
cat("\n|--- Bowtie mapping:")
# OBS! file.path doesn't work alone, must have paste to create file names
mis_fl_nam<- paste0("anno_mis",mis_lst[[i]], ".fa")
input_file <- file.path(output_path, mis_fl_nam)
for (j in seq.int(length(ref_paths))){
cat("\n |--->", paste0(names(ref_paths)[j], "..."))
out_fl_nam <- paste0(names(ref_paths)[j], ".out")
output_file <- file.path(output_path, out_fl_nam)
# Fix windows path problem
if(grepl("windows", .Platform$OS.type)){
output_file <- gsub("\\\\", "/", output_file)
input_file <- gsub("\\\\", "/", input_file)
}
## Internal bowtie
if(type=="internal"){
cat("\n\n")
err_ms <- NULL
bwt_exp <- paste0("Rbowtie::bowtie(sequences=input_file,
index=ref_paths[[j]], ",
paste0(parse_internal, ",
v=", mis_lst[[i]], ", p=", threads),
", type ='single', outfile='", output_file,
"', force = TRUE, strict = TRUE)")
#First try with all threads, if error reduce to 1.
err_ms<- try(eval(parse(text=bwt_exp)))
if(is(err_ms,"try-error")){
threads_red <- 1
bwt_exp <- paste0("Rbowtie::bowtie(sequences=input_file,
index=ref_paths[[j]], ",
paste0(parse_internal, ",
v=", mis_lst[[i]], ", p=", threads_red),
", type ='single', outfile='", output_file,
"', force = TRUE, strict = TRUE)")
eval(parse(text=bwt_exp))
}
}
## External bowtie
if(type=="external"){
cat("\n\n")
bwt_exp <- paste0("bowtie ", parse_external,
" -v ", mis_lst[[i]], " -p ", threads, " ",
ref_paths[[j]], " ", input_file, " ", output_file)
system2(bwt_exp, stdout=TRUE)
}
}# for j loop ends (references)
reanno <- import_reanno(bowtie_path=output_path, threads=threads,
coord=import$coord, report=import$report,
reduce=import$reduce)
## Remove No_hits
rm_no_hits <- unlist(lapply(reanno, function(x){x[1,1]=="No_hits"}))
reanno <- reanno[!rm_no_hits]
## Compare with previous input to generate new input
suffix <- paste0("mis", mis_lst[[i]])
anno_path <- list.files(output_path,
pattern = paste0("anno_", suffix, ".fa"),
full.names=TRUE)
reanno_df <- data.table::rbindlist(reanno, fill=FALSE)
reanno_seqs <- unique(reanno_df$.id)
anno <- Biostrings::readDNAStringSet(anno_path)
new_input <- anno[!paste0(anno) %in% reanno_seqs,, drop=FALSE]
new_suffix <- paste0("mis", mis_lst[[i]]+1)
## Write files
reanno_fl_nam <- paste0("Full_reanno_", suffix, ".Rdata")
save(reanno, file= file.path(output_path, reanno_fl_nam))
sav_fl_nam <- paste0("anno_", new_suffix, ".fa")
Biostrings::writeXStringSet(new_input,
filepath=file.path(output_path, sav_fl_nam),
format="fasta")
if(any(rm_no_hits)){cat(paste0("\n\n|--- All reference but ",
names(rm_no_hits)[rm_no_hits],
" generated hits"))
}else{cat(paste0("\n|--- All reference generated hits"))}
cat(paste0("\n|--- Mismatch ", mis_lst[[i]], " finished -----|"))
} # for i loop ends (mismatches)
cat(paste0("\n\n******************************************************"))
cat(paste0("\nCleaning up ... "))
if(keep_temp==FALSE){
fls_temp <- list.files(output_path, full.names=TRUE,
recursive = FALSE, pattern=".out$")
file.remove(fls_temp)}
cat(paste0("\nReanno mapping finished at: ", format(Sys.time(), "%X")))
cat(paste0("\nOutput files are saved in:\n ", paste0(output_path)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.