#'Generates biotype annotation from reannotation object
#'
#'\code{add_reanno} adds biotypes from reannotation to PAC.
#'
#'Given a reanno object generated by \code{\link{make_reanno}} this function
#'will either extract genome coordinates or classify biotypes using a list of
#'search terms (see details). Information will be compiled into a data frame with
#'the same order of sequences as in the original PAC master file.
#'
#'@family PAC reannotation
#'
#'@seealso \url{http://bowtie-bio.sourceforge.net/index.shtml} for information
#' about Bowtie and \url{https://github.com/Danis102} for updates on the
#' current package.
#'
#'@param reanno A reannotation list object generated by
#' \code{\link{make_reanno}}, with an Overview data.frame and a Full_anno list.
#'
#'@param mismatches Integer indicating the number of mismatches that should be
#' reported. Can never have a higher number than was originally specified with
#' the \code{\link{map_reanno}} function. While default=0, reporting the
#' maximum number is recommended. Note that the mismatch information is only
#' added to the report. Further classification can be generated with the
#' \code{\link{simplify_reanno}} function.
#'
#'@param type Character indicating what type of mapping that has been performed. If
#' type="genome", the reanno object is expected to have been mapped against a
#' genome or larger fasta references like chromosomes. This may include
#' coordinates as controlled by \code{\link{map_reanno}} and the
#' \code{\link{import_reanno}} functions. See \code{genome_max} on how to
#' control how coordinates are reported. If coordinates are missing, only hit or
#' no hit will be reported. If type="biotype", then coordinates are not
#' expected. Reference hits will instead be classified according to the biotype
#' search terms in \code{bio_search}.
#'
#'@param bio_search List of character vectors indicating search terms targeting
#' feature names in the original reference fasta files. These search terms will
#' be parsed to \code{\link{grepl}} as regular expressions. List names must
#' match names of the references in the reanno object. Classifications will be
#' reported with reference name + search term (e.g. "Ensembl_trna").
#'
#'@param bio_perfect Logical whether the function should allow that not all hits
#' against the references must have a unique bio_search term. If perfect=FALSE
#' (default) all references hits that was not caught by a search term will be
#' annotated as reference + other (e.g. "Ensembl_other"). In case perfect=TRUE,
#' the function will throw an error if the search terms do not catch all reference
#' hits. Can be used to ensure that all hits receive a classification.
#'
#'@param genome_max Integer or character indicating the number of maximum
#' coordinates to be reported when type="genome". If the number of hits
#' exceeds \code{genome_max} it will be indicated in the classification and
#' only the first hits up to \code{max_hits} will be reported. This is useful to
#' handling sequences that multimap. If genome_max="all", all coordinates will
#' be reported which may dramatically affect performance. (default=10)
#'
#'@param merge_pac PAC object. If a PAC object is provided in merge_pac, then
#' the function will automatically merge the Anno table of the PAC object with
#' the newly generated annotations. As default, merge_pac=NULL will return
#' a tibble data.frame instead.
#'
#'@return Data frame with mismatch and coordinate or biotype information (search
#' term hits) that can directly be added to an Anno data.frame of a PAC object,
#' given that the PAC object was used in the reannotation workflow. Annotations
#' can be further simplified using the \code{\link{simplify_reanno}} function.
#'
#' @examples
#'
#' #########################################################
#' ##### Example for reference mapping and classification
#' closeAllConnections()
#' #########################################################
#' ##### Create an reanno object
#'
#' ### First, if you haven't already generated Bowtie indexes for the included
#' # fasta references you need to do so. If you are having problem see the small
#' # RNA guide (vignette) for more info.
#'
#' ## tRNA:
#' trna_file <- system.file("extdata/trna", "tRNA.fa",
#' package = "seqpac", mustWork = TRUE)
#' trna_dir <- dirname(trna_file)
#'
#' if(!sum(stringr::str_count(list.files(trna_dir), ".ebwt")) ==6){
#' Rbowtie::bowtie_build(trna_file,
#' outdir=trna_dir,
#' prefix="tRNA", force=TRUE)
#' }
#' ## rRNA:
#' rrna_file <- system.file("extdata/rrna", "rRNA.fa",
#' package = "seqpac", mustWork = TRUE)
#' rrna_dir <- dirname(rrna_file)
#'
#' if(!sum(stringr::str_count(list.files(rrna_dir), ".ebwt")) ==6){
#' Rbowtie::bowtie_build(rrna_file,
#' outdir=rrna_dir,
#' prefix="rRNA", force=TRUE)
#' }
#'
#'
#' ## Then load a PAC-object and remove previous mapping from anno:
#' load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata",
#' package = "seqpac", mustWork = TRUE))
#' anno(pac) <- anno(pac)[,1, drop = FALSE]
#'
#' ref_paths <- list(trna= trna_file, rrna= rrna_file)
#'
#'
#' ## You may add an output path of your choice, but here we use a temp folder:
#' output <- paste0(tempdir(),"/seqpac/test")
#'
#'
#' ## Then map the PAC-object against the fasta references.
#' # Warning: if you use your own data, you may want to use override=FALSE, to avoid
#' # deleting previous mapping by mistake.
#'
#' map_reanno(pac, ref_paths=ref_paths, output_path=output,
#' type="internal", mismatches=0, import="biotype",
#' threads=2, keep_temp=FALSE, override=TRUE)
#'
#' ## Then import and generate a reanno-object of the temporary bowtie-files
#' reanno_biotype <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#'
#'
#' ## Now make some search terms against reference names to create shorter names
#' # Theses can be used to create factors in downstream analysis
#' # One search hit (regular expressions) gives one new short name
#'
#' bio_search <- list(
#' rrna=c("5S", "5.8S", "12S", "16S", "18S", "28S", "pre_S45"),
#' trna =c("_tRNA", "mt:tRNA"))
#'
#'
#' ## You can merge directly with your PAC-object by adding your original
#' # PAC-object, that you used with map_reanno, to merge_pac option.
#'
#' pac <- add_reanno(reanno_biotype, bio_search=bio_search,
#' type="biotype", bio_perfect=FALSE,
#' mismatches = 0, merge_pac=pac)
#'
#' table(anno(pac)$mis0_bio)
#'
#' ############################################################################
#' ## Similarly, you can use Bowtie indexed genome fasta references
#' ## But don't forget to use import="genome" for coordinate import
#' #
#' # ref_paths <- list(genome="<path_to_bowtie_indexed_fasta>")
#' # output_genome <- "<your_path_to_output_folder>"
#' #
#' ## We can actually map against several genomes simultaneously:
#' # (to illustrate the principle we run the mycoplasma genome twice)
#' # ref_paths <- list(genome1=mycoplasma_file, genome2=mycoplasma_file)
#' # ### Run map_reanno
#' # map_reanno(PAC=pac, ref_paths=ref_paths, output_path=output,
#' # type="internal", mismatches=0, import="genome",
#' # threads=2, keep_temp=TRUE, override=TRUE)
#' # reanno_genome <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
#' ############################################################################
#' #### The trick to succeed with bio_perfect=TRUE
#'
#' ## Run add_reanno with bio_perfect="FALSE" (look where "Other=XX" occurs)
#' # Use the bio
#' #' ## Add a search terms that catches the other rrna
#'
#' anno <- add_reanno(reanno_biotype, bio_search=bio_search,
#' type="biotype", bio_perfect=FALSE, mismatches = 0)
#'
#' ## Find sequences that has been classified as other
#' other_seqs <- anno[grepl("other", anno$mis0),]$seq
#' tab <- full(reanno_biotype)$mis0$trna
#' tab[tab$seq %in% other_seqs,] #No other hit in trna
#'
#' tab <- full(reanno_biotype)$mis0$rrna
#' tab[tab$seq %in% other_seqs,] #A few in rrna
#'
#'
#' ## Add a search terms that catches the other rrna
#' bio_search <- list(
#' rrna=c("5S", "5.8S", "12S", "16S",
#' "18S", "28S", "pre_S45", "Other"),
#' trna =c("_tRNA", "mt:tRNA"))
#'
#' anno <- add_reanno(reanno_biotype, bio_search=bio_search,
#' type="biotype", bio_perfect=FALSE, mismatches = 0)
#'
#' table(anno$mis0)
#'
#' ## Repeat search until no "Other" appear when running add_reanno,
#' ## then run bio_perfect=TRUE:
#'
#' anno <- add_reanno(reanno_biotype, bio_search=bio_search,
#' type="biotype", bio_perfect=TRUE, mismatches = 0)
#'
#'
#'
#'
#' @export
add_reanno <- function(reanno, mismatches=0, type="genome", bio_search,
bio_perfect=FALSE, genome_max=10, merge_pac=NULL){
if(isS4(reanno)){
tp <- "S4"
reanno <- as(reanno, "list")
}else{
tp <- "S3"
}
## General setup ###############################
stopifnot(any(do.call("c", lapply(reanno$Full_anno, function(x){
do.call("c", lapply(x, function(y){
identical(reanno$Overview$seq, y$seq)}))
}))))
seq_nam <- reanno$Overview$seq
cat(paste0("\nSequences in total: ", length(seq_nam)))
fin_lst <- list(NA)
if(length(reanno[[2]])-1 < mismatches){
cat("\n")
stop("\nToo many mismatches were specified.",
"\nPlease specify mismatches to same number of mismatches used",
"\nwhen the reanno object was generated using map_reanno.")
}
## Extract genome ###############################
if(type=="genome"){
cat("\nExtracting genome(s) ...")
## Loop mismatch
for (i in seq.int(mismatches+1)){
full_lst <- reanno$Full_anno[[i]]
gen_vec_lst <- list(NA)
## Loop biotype
for (a in seq.int(length(full_lst))){
gen_nams <- names(full_lst)
gen_len <- sum(!is.na(full_lst[[a]]$ref_hits))
prc <- round(gen_len/length(seq_nam)*100, digits=2)
cat_nam <- paste0(
gen_nams[a], ":", paste0(
rep(" ", times= max(nchar(gen_nams)+1)-nchar(gen_nams[a])),
collapse=""))
cat_len <- paste0(gen_len, paste0(
rep(" ", times= 8-nchar(gen_len)),
collapse=""))
cat_hits <- paste0("Ref_hits= ", cat_len, " (", prc, "%)")
cat(paste0("\n\tmis", i-1, "\t", cat_nam,"\t", cat_hits))
if(genome_max=="all"){
gen_vec_lst[[a]] <- full_lst[[a]]$ref_hits
}else{
ref_hits <- full_lst[[a]]$ref_hits
loc <- stringr::str_locate_all(ref_hits, "\\-\\||\\+\\|")
loc <- lapply(loc, function(x){
if(nrow(x)>=genome_max){
incl <- x[genome_max,1]
}else{incl <- "all"}
return(incl)
})
loc <- unlist(loc, use.names = FALSE)
vect <- NULL
vect[loc == "all"] <- ref_hits[loc == "all"]
vect[!loc == "all"] <- paste0("Warning>",
genome_max, "|",
substr(ref_hits[!loc == "all"],
start=1,
stop=as.integer(
loc[!loc == "all"])))
stopifnot(identical(is.na(vect), is.na(ref_hits)))
gen_vec_lst[[a]] <- vect
}
names(gen_vec_lst)[a] <- names(full_lst)[a]
}
fin_lst[[i]] <- gen_vec_lst
names(fin_lst)[i] <- names(reanno$Full_anno)[i]
}
## Finish up
cat("\n\nCompiling everything into one table ...")
gen_nam <- names(fin_lst[[1]])
shrt <- lapply(as.list(gen_nam), function(x){
df <- data.frame(matrix(NA, nrow=length(seq_nam),
ncol=length(names(fin_lst))))
colnames(df) <- paste(names(fin_lst), x, sep="_")
for(z in seq.int(length(fin_lst))){
df[,z] <- fin_lst[[names(fin_lst)[z]]][[x]]
}
return(df)
})
fin <- as.data.frame(do.call("cbind", shrt), stringsAsFactors =FALSE)
rownames(fin) <- seq_nam
}
## Extract biotypes ###############################
if(type=="biotype"){
bio_nam <- names(bio_search)
fin_strand_lst <- list(NA)
cat("\nExtracting biotype(s) ...")
## Loop mismatch
for (i in seq.int(mismatches+1)){
full_lst <- reanno$Full_anno[[i]]
bio_vec_lst <- list(NA)
strand_lst <- list(NA)
## Loop biotype
for (a in seq.int(length(bio_search))){
extracted <- full_lst[bio_nam[a]]
## First annotate sense antisense
sns <- grepl(":sense\\||:sense$", extracted[[1]]$ref_hits)
antisns <- grepl(":antisense\\||:antisense$", extracted[[1]]$ref_hits)
strand_lst[[a]] <- paste0(ifelse(sns, "s", ""),
ifelse(antisns, "a", ""))
## Now classify with search terms
if(!length(extracted) == 1){
stop("\nThe names in bio_search did not match",
"\nthe names in the reanno tables.",
"\nPlease modify your reference input names in bio_search.")}
if(length(bio_search[[a]])==1){
bio_df <- do.call("cbind",
lapply(as.list(bio_search[[a]]), function(x){
return(
ifelse(grepl(x, extracted[[1]]$ref_hits),
paste0(bio_nam[a]), "<NA>"))
}))
}
if(length(bio_search[[a]])>1){
bio_df <- do.call("cbind",
lapply(as.list(bio_search[[a]]), function(x){
return(ifelse(grepl(x, extracted[[1]]$ref_hits),
paste0(bio_nam[a], "_", x, "|"),
"<NA>"))}
))
}
bio_vec <- gsub("<NA>", "", apply(bio_df, 1, function(x){
return(paste(x, collapse=""))}
))
bio_hits <- as.numeric(grepl(".", bio_vec))
`%>%` <- dplyr::`%>%`
over_hits <- as.numeric(
grepl(names(reanno$Full_anno)[i],
reanno$Overview %>% dplyr::pull(bio_nam[a])))
## Check bio_perfect
if(bio_perfect == TRUE){
if(!identical(bio_hits, over_hits)){
cat("\n")
stop("\n Your search terms did not catch all",
"\n instances in Overview column: ",
bio_nam[a],
"\n Please, modify bio_search or set bio_perfect=FALSE.",
call. = FALSE)
}
}
## Fix missing hits
if(bio_perfect == FALSE){
if(any(paste0(over_hits, bio_hits) == "01")){
stop("\nThere were more bio_search hits than there were references",
"\nhits in the Overview table. This should not happen.",
"\nPlease, rerun make_reanno to generate an Overview table",
"\nthat matches the Full annotation tables.")
}
bio_vec[paste0(over_hits, bio_hits) == "10"] <- paste0(bio_nam[a],
"_other")
}
## Print some stats and fix classifications
cat_nam <- paste0(
bio_nam[a], ":", paste0(
rep(" ", times= max(nchar(bio_nam)+1)-nchar(bio_nam[a])),
collapse=""))
cat_over <- paste0(
sum(over_hits), paste0(
rep(" ", times= 8-nchar(sum(over_hits))),
collapse=""))
cat_bio <- paste0(
sum(bio_hits), paste0(
rep(" ", times= 8-nchar(sum(bio_hits))),
collapse=""))
cat_diff <- paste0(paste0(
sum(over_hits)-sum(bio_hits)), paste0(
rep(" ", times= 8-nchar(paste0(
sum(over_hits)-sum(bio_hits)))), collapse=""))
cat(paste0("\n\tmis", i-1, "\t", cat_nam,"\tRef_hits=",
cat_over, "\t|Search=", cat_bio, "\t|Other=", cat_diff))
bio_vec_lst[[a]] <- gsub("\\|$", "", bio_vec)
names(bio_vec_lst)[a] <- names(bio_search)[a]
}
fin_lst[[i]] <- bio_vec_lst
names(fin_lst)[i] <- names(reanno$Full_anno)[i]
fin_strand_lst[[i]] <- strand_lst
}
## Finish up
cat("\nCompiling everything into one table ...")
shrt <- lapply(fin_lst, function(x){do.call("cbind", x)})
fin <- lapply(shrt, function(x){apply(x, 1, function(y){
y[y==""] <- "<NA>"
bio_comb <- paste(y, collapse=";")
bio_comb <- gsub(";<NA>|<NA>;", "", bio_comb)
bio_comb <- gsub("<NA>", "_", bio_comb)
bio_comb <- gsub("\\?|\\^|\\$", "_", bio_comb)
return(bio_comb)
})})
fin <- as.data.frame(do.call("cbind", fin), stringsAsFactors =FALSE)
rownames(fin) <- seq_nam
logi_strand <- apply(do.call("cbind", lapply(fin_strand_lst, function(x){
do.call("cbind", x)})), 1 , function(z){
paste(z, collapse="")}
)
fin_strand <- paste(ifelse(grepl("s", logi_strand), "+",""),
ifelse(grepl("a", logi_strand), "-",""), sep="|")
fin <- cbind(data.frame(strand=fin_strand, stringsAsFactors =FALSE), fin)
}
## Return table
reanno <- cbind(reanno$Overview, fin)
# Fix unique column names
if(type=="biotype"){
col_fix <- "bio"
colsrch <- c("Mis0_bio", paste0("Mis0_bio",seq.int(100)))
}
if(type=="genome"){
col_fix <- "genome"
colsrch <- c("Mis0_genome", paste0("Mis0_genome", seq.int(100)))
}
# Merge PAC
if(!is.null(merge_pac) && !is.logical(merge_pac)){
if(isS4(merge_pac)){
tp <- "S4"
merge_pac <- as(merge_pac, "list")
}else{
tp <- "S3"
}
if(!identical(rownames(reanno), rownames(merge_pac$Anno))){
stop("\nReanno sequence (row) names do not match PAC sequence names.",
"\nDid you use another PAC object as input for map_reanno?")
}
cat("\nMerging with PAC ...")
reanno <- reanno[,!colnames(reanno) == "seq", drop=FALSE]
anno <- merge_pac$Anno
col_hits <- colnames(anno) %in% colsrch
if(any(col_hits)){
num <- as.numeric(gsub("_bio$|_genome$|^Mis0", "",
colnames(anno)[col_hits]))
if(is.na(num)){
col_fix <- paste0(col_fix, 2)
}else{
col_fix <- paste0(col_fix, num+1)}
}
colnames(reanno) <- paste0(colnames(reanno), "_", col_fix)
colnames(reanno) <- gsub("genome_genome", "genome", colnames(reanno))
colnames(reanno) <- gsub("bio_bio", "bio", colnames(reanno))
merge_pac$Anno <- cbind(merge_pac$Anno, reanno, stringsAsFactors = FALSE)
PAC_check(merge_pac)
if(tp=="S4"){
return(as.PAC(merge_pac))
}else{
return(merge_pac)
}
}else{
return(tibble::as_tibble(reanno))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.