#' Makes annotation from R imported reannotation mapping
#'
#' \code{make_reanno} makes a reannotation (reanno) object.
#'
#' Given the path to the R reannotation files (reanno_mis0/1/2/3.Rdata)
#' generated by \code{\link{map_reanno}}, this function will summarize the
#' reannotation files into one output that matches the order of sequences in a
#' PAC object.
#'
#' @family PAC reannotation
#'
#' @seealso
#' \url{http://bowtie-bio.sourceforge.net/index.shtml} for information about
#' Bowtie \url{https://github.com/Danis102} for updates on the current
#' package.
#'
#' @param reanno_path Path to a directory where reannotation .Rdata files can be
#' found.
#' @param PAC PAC-list object containing an Anno data.frame with sequences as
#' row names.
#' @param mis_fasta_check Logical TRUE/FALSE if checking against anno_misX.fa
#' should be done. The anno_misX.fa is the fasta file generated by the
#' map_reanno function after completing the last mismatch cycle. This file
#' contains all PAC sequences that failed to receive an alignment in any of
#' fasta references and after all mismatch cycles. This file should be
#' availble in the same folder as the output files generated in the
#' map_reanno function (=reanno_path).
#'
#' @param output Character indicating output format. If type="S4" (default),
#' then the reanno object is returned as an S4 object. If type="list", the
#' reanno object will be returned as a list of tables (tibbles).
#'
#' @return Contains two slots/objects: 1. Overview: A table with a summary of
#' the mapping. 2. Full_anno: Lists of tables holding the full imports per
#' mismatch cycle (mis0, mis1 etc) and reference (mi0-ref1, mis0-ref2,
#' mis1-ref1, mis1-ref2 etc). All table rows are ordered according to the
#' order of the PAC originally used for the mapping. If \emph{mis_fasta_check}
#' is specified, the function will look for a \emph{anno_misX.fa} (X = the
#' file with the highest number) previously generated by the reannotation
#' workflow. This file is used to double check so that no sequences are
#' missing before and after reannotation.
#'
#' @examples
#'
#' #########################################################
#' ##### Simple example for reference mapping
#' ##### Please see manual for ?simply_reanno an ?add_reanno for more advanced
#' ##### mapping
#' closeAllConnections()
#' #########################################################
#' ##### Create an reanno object
#'
#' ### First, if you haven't already generated Bowtie indexes for the included
#' # fasta references you need to do so. If you are having problem see the small
#' # RNA guide (vignette) for more info.
#'
#' ## tRNA:
#' trna_file <- system.file("extdata/trna", "tRNA.fa",
#' package = "seqpac", mustWork = TRUE)
#' trna_dir <- dirname(trna_file)
#'
#' if(!sum(stringr::str_count(list.files(trna_dir), ".ebwt")) ==6){
#' Rbowtie::bowtie_build(trna_file,
#' outdir=trna_dir,
#' prefix="tRNA", force=TRUE)
#' }
#' ## rRNA:
#' rrna_file <- system.file("extdata/rrna", "rRNA.fa",
#' package = "seqpac", mustWork = TRUE)
#' rrna_dir <- dirname(rrna_file)
#'
#' if(!sum(stringr::str_count(list.files(rrna_dir), ".ebwt")) ==6){
#' Rbowtie::bowtie_build(rrna_file,
#' outdir=rrna_dir,
#' prefix="rRNA", force=TRUE)
#' }
#'
#'
#' ## Then load a PAC-object and remove previous mapping from anno:
#' load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata",
#' package = "seqpac", mustWork = TRUE))
#' anno(pac) <- anno(pac)[,1, drop = FALSE]
#'
#' ref_paths <- list(trna= trna_file, rrna= rrna_file)
#'
#'
#' ## You may add an output path of your choice, but here we use a temp folder:
#' output <- paste0(tempdir(),"/seqpac/test")
#'
#'
#' ## Then map the PAC-object against the fasta references.
#' # Warning: if you use your own data, you may want to use override=FALSE, to avoid
#' # deleting previous mapping by mistake.
#'
#' map_reanno(pac, ref_paths=ref_paths, output_path=output,
#' type="internal", mismatches=0, import="biotype",
#' threads=2, keep_temp=FALSE, override=TRUE)
#'
#' ## Then import and generate a reanno-object of the temporary bowtie-files
#' reanno_biotype <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#'
#'
#' ## Now make some search terms against reference names to create shorter names
#' # Theses can be used to create factors in downstream analysis
#' # One search hit (regular expressions) gives one new short name
#'
#' bio_search <- list(
#' rrna=c("5S", "5.8S", "12S", "16S", "18S", "28S", "pre_S45"),
#' trna =c("_tRNA", "mt:tRNA"))
#'
#'
#' ## You can merge directly with your PAC-object by adding your original
#' # PAC-object, that you used with map_reanno, to merge_pac option.
#'
#' pac <- add_reanno(reanno_biotype, bio_search=bio_search,
#' type="biotype", bio_perfect=FALSE,
#' mismatches = 0, merge_pac=pac)
#'
#' table(anno(pac)$mis0_bio)
#'
#' ############################################################################
#' ## Similarly, you can use Bowtie indexed genome fasta references
#' ## But don't forget to use import="genome" for coordinate import
#' #
#' # ref_paths <- list(genome="<path_to_bowtie_indexed_fasta>")
#' # output_genome <- "<your_path_to_output_folder>"
#'
#' ## We can actually map against several genomes simultaneously:
#' # (to illustrate the principle we run the mycoplasma genome twice)
#'
#' mycoplasma_file <- system.file("extdata/mycoplasma_genome",
#' "mycoplasma.fa",
#' package = "seqpac", mustWork = TRUE)
#' mycoplasma_dir<- gsub("mycoplasma.fa", "", mycoplasma_file)
#'
#' ## Don't forget to create bowtie indexes
#' if(!sum(stringr::str_count(list.files(mycoplasma_dir), ".ebwt")) ==6){
#' Rbowtie::bowtie_build(mycoplasma_file,
#' outdir=mycoplasma_dir,
#' prefix="mycoplasma", force=TRUE)
#' }
#'
#' ref_paths <- list(genome1=mycoplasma_file, genome2=mycoplasma_file)
#' map_reanno(PAC=pac, ref_paths=ref_paths, output_path=output,
#' type="internal", mismatches=0, import="genome",
#' threads=2, keep_temp=TRUE, override=TRUE)
#' reanno_genome <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#'
#' table(overview(reanno_genome)$Any) # Low Mycoplasma contamination
#'
#' ############################################################################
#'
#' @export
make_reanno <- function(reanno_path, PAC, mis_fasta_check=FALSE, output="S4"){
if(isS4(PAC)){
tp <- "S4"
PAC <- as(PAC, "list")
}else{
tp <- "S3"
}
reanno <- NULL
files <- list.files(reanno_path,
pattern=paste0(
"Full_reanno_mis0|",
"Full_reanno_mis1|",
"Full_reanno_mis2|",
"Full_reanno_mis3"),
full.names = TRUE)
seqs <- (seq(seq.int(length(files))))-1
reanno_lst <- list(NA)
for(i in seq.int(length(files))){
load(files[i])
reanno_lst[[i]] <- reanno
names(reanno_lst)[i] <- paste0("mis", seqs[i])
}
# Identify mismatch cycles with no hits in any reference
test_miss <- unlist(lapply(reanno_lst, length))
complete_nam <- names(reanno_lst)[which(test_miss == max(test_miss))][1]
complete_nam <- names(reanno_lst[[complete_nam]])
cat("\n\nReorganizing and matching reannotation files with PAC ...\n")
PAC_seq <- rownames(PAC$Anno)
reanno_lst_match <- lapply(reanno_lst, function(x){
## Fix if all references has no hits in
if(length(x)==0){
chr_na<- as.character(NA)
na_tibb <- tibble::tibble(seq=PAC_seq, mis_n=chr_na,
mis_where=chr_na, ref_hits=chr_na)
match_lst <- as.list(rep(NA, length(complete_nam)))
match_lst <- lapply(match_lst, function(x){na_tibb})
names(match_lst) <- complete_nam
}else{
match_lst <- lapply(x, function(y){
y$.id <- as.character(y$.id)
y$ref_hits <- as.character(y$ref_hits)
anno_match <- y[match(PAC_seq, y$.id),, drop=FALSE]
anno_match$.id[is.na(anno_match$.id)] <- PAC_seq[is.na(anno_match$.id)]
stopifnot(identical(PAC_seq, anno_match$.id))
names(anno_match)[names(anno_match)==".id"] <- "seq"
return(anno_match)
})
}
return(match_lst)
})
## Check and fix missing references
NA_check <- unlist(lapply(reanno_lst_match, function(x){
identical(names(reanno_lst_match[[1]]), names(x))
}))
if(any(!NA_check)){
NA_which <- lapply(reanno_lst, function(x){
which(!names(reanno_lst[[1]]) %in% names(x))
})
warning("Missing references in Reanno file(s):\n",
paste(basename(files)[!NA_check], collapse="\n "),
"\nMissing ref(s): ",
paste(names(reanno_lst[[1]])[unlist(NA_which)], collapse=" "),
"\nProbable reason: No sequences mapped to reference(s).")
for(j in seq.int(length(NA_which))){
if(length(NA_which[[j]]) >0){
empt <- reanno_lst_match[[1]][[1]]
empt[,2:4] <- NA
stopifnot(!any(!is.na(empt[,2:4])))
for(g in seq.int(length(NA_which[[j]]))){
ps <- length(reanno_lst_match[[j]]) + g
reanno_lst_match[[j]][[ps]] <- empt
names(reanno_lst_match[[j]])[ps] <- names(
reanno_lst_match[[1]])[NA_which[[j]]][g]
}
reanno_lst_match[[j]] <- reanno_lst_match[[j]][match(
names(reanno_lst_match[[1]]), names(reanno_lst_match[[j]]))]
}
}
}
## Generate overview file
cat("\nGenerating the overview file ...\n")
stopifnot(any(do.call("c", lapply(reanno_lst_match, function(t){
identical(names(reanno_lst_match[[1]]), names(t))
}))))
stopifnot(any(do.call("c", lapply(reanno_lst_match, function(t){
do.call("c", lapply(t, function(g){
identical(reanno_lst_match[[1]][[1]]$seq, g$seq)
}))
}))))
bio_cat <- length(reanno_lst_match[[1]])
df_fin <- matrix(NA, ncol=bio_cat, nrow=length(PAC_seq))
colnames(df_fin) <- names(reanno_lst_match[[1]])
df_fin <- tibble::as_tibble(df_fin)
for (bio in seq.int(bio_cat)){
df <- do.call("cbind", lapply(reanno_lst_match, function(x){
return(x[[bio]]$mis_n)
}))
vect <- apply(df, 1, function(x){
return(gsub("NA", "", paste(x, collapse="")))
})
df_fin[, bio] <- vect
}
df_fin[df_fin == ""] <- "_"
vect_mis <- do.call("paste", as.list(df_fin))
df_fin$Any <- ifelse(vect_mis == paste0(
rep("_", times=bio_cat), collapse=" ") , "No_anno", "Hit")
df_fin$Mis0 <- ifelse(grepl("mis0", vect_mis) , "Hit", "No_hit")
df_fin <- dplyr::bind_cols(tibble::tibble(seq=PAC_seq), df_fin)
## Check leftover fasta file
if(mis_fasta_check==TRUE){
cat("\nChecking the last anno_mis fasta file.\n")
anno_mis_fls <- list.files(reanno_path, pattern = "anno_mis\\d.fa")
outfile_fls <- list.files(reanno_path, pattern = "Full_reanno_mis\\d.Rdata")
ns_fa <- max(as.integer(gsub("anno_mis|.fa", "", anno_mis_fls)))
ns_rdata <- max(as.integer(gsub("Full_reanno_mis|.Rdata", "", outfile_fls)))
file_nam <- paste0("anno_mis", ns_fa, ".fa")
if(!ns_fa == ns_rdata+1){
stop(
"\nThe last anno_mis fasta ('leftover') file, named ",
file_nam,
", was not found in reanno path.",
"\nIf it was deleted, set mis_fasta_check=FALSE.")
}
noAnno_fasta <- Biostrings::readDNAStringSet(paste0(reanno_path,"/",
file_nam))
logi_no_anno <- df_fin$Any=="No_anno"
logi_olap <- df_fin$seq[df_fin$Any=="No_anno"] %in% gsub("NO_Annotation_",
"",
names(noAnno_fasta))
if(length(noAnno_fasta) + sum(logi_no_anno)==0){
cat("Congratulation, all sequences recieved an annotation.\n")
perc<-100
}else{
perc <- round(sum(logi_olap) / sum(logi_no_anno)*100, digits=2)
cat("Of the ", sum(logi_no_anno),
"missing sequences in the reannotation\n")
cat(paste0("files ", perc, "% were found in ", file_nam, ".\n"))
}
if(!perc==100){
warning(" Not all missing annotations were found in ",
file_nam,
".\n This indicates that something has gone wrong in the",
"\n reannotation workflow.\n")
}else{
cat("Passed fasta check!\n")
}
}
## S3
if(output=="list"){
rn <- list(Overview=df_fin, Full_anno=reanno_lst_match)
class(rn) <- c("list", "reanno_S3")
}
## S4
if(output=="S4"){
rn <- reanno(Overview=df_fin, Full_anno=reanno_lst_match)
}
return(rn)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.