#' Interval classification.
#'
#' \code{map_rangetype} Classifies sequences based on interval mapping against a
#' reference.
#'
#' Given a PAC_map object (\code{\link{PAC_mapper}}) and an interval list this
#' function will attempt to classify mapped sequences based on where these
#' sequences starts and ends in reference. This function can for example be used
#' for 5' and 3' tRNA classification.
#'
#' @family PAC analysis
#'
#' @seealso \url{https://github.com/Danis102} for updates on the current
#' package.
#'
#' @param map PAC_map (generated by \code{\link{PAC_mapper}}) or a Reanno object
#' (generated by \code{\link{map_reanno}}), containing mapping coordinates
#' against a reference fasta file.
#'
#' @param intervals A named list with integer intervals.
#'
#' @param type Character indicating what type of intervals that is provided.
#'
#' If type="nucleotides", then the interval list is given as ranges of
#' nucleotide positions. For example, if interval=list(start=1:3, end=1:3) the
#' function will classify sequences starting within the first three
#' nucleotides of the reference as 'type_start_nuc' and sequences ending in
#' within the last three nucleotides of the reference as 'type_end_nuc'.
#'
#' If type="percent", then intervals needs to be provided as percent ranges.
#' For example, if intervals=list(start=1:5, mid=45:50, end=95:100) then the
#' function will classify sequences starting within the 5% first nucleotides
#' in the references as 'type_start_per', and sequences ending within the 5%
#' last nucleotides of the references as "type_end_per". It will also,
#' classify sequences starting within 45-50% of the references as
#' "type_mid_start_per" and sequences ending within 45-50% of the references
#' as 'type_mid_end_per'.
#'
#' If type="ss", then intervals is obtained from an ss file, obtained for
#' example from tRNAscan-SE (\url{http://lowelab.ucsc.edu/tRNAscan-SE/}) or at
#' GtRNAdb \url{http://gtrnadb.ucsc.edu/}.
#'
#' Importantly - the intervals list is name sensitive. If type="nuclotides",
#' intervals can only contain two intervals named 'start' and 'end', while if
#' type="percent" then intervals needs to contain three intervals named
#' 'start', 'mid' and 'end'.
#'
#' Hint, for classifying 5' and 3' half tsRNA you need to run the function
#' twice. First, classify each sequence as 5'-start or 3'-end tsRNA using
#' type="nucleotides", and then rerun the the map object using type="percent"
#' specifying the 'mid' region as the half interval.
#'
#' @param ss File path to ss file (character), readLines vector of ss file
#' (character) or ss list. If character, the function will attempt to read a
#' file from the path given in the character string. If this fails, the
#' function assumes that the ss file has already been read using
#' \code{readLines}, and will attempt to split that character vector into a
#' list of unique sequences by splitting at the empty lines. Empty line
#' normally delimits each sequence entry in the ss file. Such a list can also
#' be parsed directly to the function, making it easy to change for example
#' sequence names using \code{\link{lapply}} prior to running the function.
#'
#' @param N_include Logical whether or not N "wild card" nucleotides should be
#' counted in the terminals. This conveniently controls the N_up and N_down
#' arguments in the \code{\link{PAC_mapper}} function. If N_include=FALSE
#' (default), start and end of tRNA will be measured from the first and last
#' canonical nucleotides (A, T, C, G). Thus, if fragments align to an
#' NNN-terminal, it will receive a negitve value. If N_include=TRUE, N
#' wild-cards will be treated as any other nucleotide.
#'
#'
#' @param min_loop_width Integer setting the minimum number of nucleotides for a
#' loop. Only applicable when type="ss". Loops in ss-files are defined by ">"
#' followed by x number of "." ending with "<". For example:\cr
#' \code{ATCGGTGGTTCAGTGGTAGAATGCTCGCCTCGCGGGCGGCCCGGGTTCGATTCCCGGCCGATG}\cr
#' \code{>>>>>..>>>>.......<<<<.>>>>>...<<<<<....>>>>>.......<<<<<<<<<<<}\cr
#' Here are three possible loops: "AGTGGTA", "CTC", "TTCGATT". If
#' min_loop_width=3, the middle loop (">...<"="CTC") will be classified as a
#' loop. If min_loop_width=4 (default), the middle loop will not be classified
#' as a loop because it is too short.
#'
#' @return Map list object containing reference sequence (Ref_seq) as
#' Biostrings::DNAStringSet and the new classifications embedded with the
#' alignments (Alignments) in a dataframe.
#'
#' @examples
#'
#' ###########################################################
#' ### test the map_rangetype function
#' # More complicated examples can be found in the vignette.
#' ##----------------------------------------
#'
#' # First create an annotation blank PAC with group means
#' load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata",
#' package = "seqpac", mustWork = TRUE))
#' anno(pac) <- anno(pac)[,1, drop=FALSE]
#' pac_trna <- PAC_summary(pac, norm = "cpm", type = "means",
#' pheno_target=list("stage"), merge_pac = TRUE)
#'
#' # Then re-annotate only tRNA using the PAC_mapper function
#' ref <- system.file("extdata/trna", "tRNA.fa",
#' package = "seqpac", mustWork = TRUE)
#' map_object <- PAC_mapper(pac_trna, ref=ref, N_up = "NNN", N_down = "NNN",
#' mismatches=0, threads=2, report_string=TRUE,
#' override=TRUE)
#'
#' ## Coverage plot of tRNA using PAC_covplot
#'
#' # Single tRNA targeting a summary table
#' PAC_covplot(pac_trna, map=map_object,
#' summary_target= list("cpmMeans_stage"),
#' map_target="tRNA-Ala-AGC-1-1")
#'
#' ## Classify range types with map_rangetype (see vignette for examples
#' # on how to use ss-files for detailed tRNA loop structure).
#'
#' # Classify fragments using percent intervals
#' map_object <- map_rangetype(map_object,
#' intervals = list(start = 1:5, mid = 45:55, end = 95:100))
#'
## Explore the map-object with range classifications
#' names(map_object)
#' map_object[[1]]
#'
#' @export
map_rangetype <- function(map, type="percent", ss=NULL, min_loop_width=4,
intervals=list(start=1:5, mid=45:55, end=95:100),
N_include=FALSE){
## Compensate for N up and down setup
if(N_include==FALSE){
N_counts <- lapply(map, function(x){
Ns <- unlist(strsplit(paste0(x[[1]]), "A|T|C|G"))
Ns <- Ns[c(1, length(Ns))]
if(!identical(stringr::str_count(Ns, "N"), nchar(Ns))){
stop("\nYou have other wild-cards than N in your terminals.",
"\nThis function does not allow other nucleotides than ",
"N, A, T, C and G.")
}
nchar(Ns)
})
}else{
N_counts <- NULL
}
## Interval mapping
if(type %in% c("nucleotides", "percent")){
range_types <- map
for(i in seq.int(length(map))){
ref_len <- nchar(paste0(map[[i]]$Ref_seq))
align <- map[[i]]$Alignments
## Compensate for N_include
if(!is.null(N_counts[[i]])){
if(!align$Mismatch[1] == "no_hits"){
ref_len <- ref_len - sum(unlist(N_counts[[i]]))
align$Align_start <- align$Align_start - N_counts[[i]][1]
align$Align_end <- align$Align_end - N_counts[[i]][2]
# substring(
# align$Align_string, 1, N_counts[[i]][1]) <- paste(
# rep("*", times=N_counts[[i]][1]), collapse="")
# substring(
# align$Align_string,
# (nchar(align$Align_string) - N_counts[[i]][2] + 1)) <- paste(
# rep("*", times=N_counts[[i]][2]), collapse="")
}
}
## Percent
if(type=="percent"){
if(align$Mismatch[1] == "no_hits"){
align$type_start_per <- "no_hits"
align$type_mid_start_per <- "no_hits"
align$type_mid_end_per <- "no_hits"
align$type_end_per <- "no_hits"
}else{
inter_seq <- lapply(intervals, function(y){
inter <- round(ref_len*(y*0.01), digits=0)
return(seq(min(inter), max(inter)))
})
align$type_start_per <- ifelse(
align$Align_start %in% inter_seq$start, "TRUE", "FALSE")
align$type_mid_start_per <- ifelse(
align$Align_start %in% inter_seq$mid, "TRUE", "FALSE")
align$type_mid_end_per <- ifelse(
align$Align_end %in% inter_seq$mid, "TRUE", "FALSE")
align$type_end_per <- ifelse(
align$Align_end %in% inter_seq$end, "TRUE", "FALSE")
}
}
## Nucleotides
if(type=="nucleotides"){
if(align$Mismatch[1] == "no_hits"){
align$type_start_nuc <- "no_hits"
align$type_end_nuc <- "no_hits"
}else{
inter_seq <- list(start=intervals$start,
end= seq(ref_len-max(intervals$end),
(ref_len-min(intervals$end))+1))
align$type_start_nuc <- ifelse(
align$Align_start %in% inter_seq$start, "TRUE", "FALSE")
align$type_end_nuc <- ifelse(
align$Align_end %in% inter_seq$end, "TRUE", "FALSE")
}
}
range_types[[i]]$Alignments <- align
}
}
## loop mapping
if(type=="ss"){
## Try to catch file path
ss_anno <- tryCatch(readLines(ss), error=function(e){ss})
## First make a split function if not already a list
if(!methods::is(ss_anno, "list")){
logi_breaks <- ss_anno == ""
breaks <- 1
fact <- NULL
for (i in seq.int(length(logi_breaks))){
if(logi_breaks[i] == TRUE){
breaks <- breaks+1
}
fact <- c(fact, breaks)
}
ss_lst <- split(ss_anno, as.factor(fact))
}
ss_lst <- ss_lst[!unlist(lapply(ss_lst, function(x){
length(x) <1}))] # Remove list objects
ss_lst <- lapply(ss_lst, function(x){x <- x[!x == ""]}) # Remove empty lines
## Extract data from ss
fin_ss <- lapply(ss_lst, function(x){
seqs <- x[grepl("\\<Seq: ", x)]
seqs <- gsub("Seq: ", "", seqs)
seqs <- gsub("t", "T", seqs) # Fix lower case letters from tRNA-scan-SE
seqs <- gsub("g", "G", seqs)
seqs <- gsub("a", "A", seqs)
seqs <- gsub("c", "C", seqs)
str <- x[grepl("\\<Str: ", x)]
str <- gsub("Str: ", "", str)
tpe <- x[grepl("\\<Type: ", x)]
tpe <- gsub("Type: ", "", tpe)
isodec_ss <- do.call("rbind", strsplit(tpe, "\t| at"))[,1]
isoacc_ss <- gsub("Anticodon: ",
"",
do.call("rbind", strsplit(tpe, "\t| at"))[,2])
fin_ss <- data.frame(Ref_seqs=as.character(seqs),
ss_anno=as.character(str),
isodec_ss=as.character(isodec_ss),
isoacc_ss=as.character(isoacc_ss),
stringsAsFactors=FALSE)
return(fin_ss)
})
fin_ss_df <- do.call("rbind", fin_ss)
names(fin_ss) <- fin_ss_df$Ref_seq
## Test if map is mature tRNA
logi_map_mature <- any(!unlist(lapply(map, function(x){
grepl("CCA\\>", x$Ref_seq)
})))
logi_ss_mature <- any(!unlist(lapply(fin_ss, function(x){
grepl("CCA\\>", x$Ref_seqs)
})))
if(logi_map_mature == FALSE && logi_ss_mature == TRUE){
cat("\nIt seems the map object, but not the ss object, contains only",
"\nmature tRNA (CCA in the 3'end). Will attempt to add CCA to ss",
"\nobject and then match map with ss.\n")
fin_ss_df$Ref_seqs <- as.character(paste0(fin_ss_df$Ref_seqs, "CCA"))
fin_ss_df$ss_anno <- as.character(paste0(fin_ss_df$ss_anno, "..."))
}
## Test if all references are included
original_refs <- unlist(lapply(map, function(x){
gsub("N", "", paste0(x$Ref_seq))
}))
logi_all <- original_refs %in% fin_ss_df$Ref_seqs
match_original <- match(fin_ss_df$Ref_seqs, original_refs)
# Remove the missing references
if(any(!logi_all)){
cat("\n")
warning(
"\nThe ss object does not contain all references in the map object.",
"\nFor best result provide a matched ss file, generated by for",
"\nexample tRNAscan-SE. (http://lowelab.ucsc.edu/tRNAscan-SE/)",
call. = FALSE, immediate.=TRUE)
cat("\nThe following reference(s) will therefore be",
"\nremoved from the results:\n")
print(names(map)[!logi_all])
map <- map[match_original]
N_counts <- N_counts[match_original]
}
# Match the ss
original_refs <- unlist(lapply(map, function(x){
gsub("N", "", paste0(x$Ref_seq))
}))
match_ss <- match(original_refs, fin_ss_df$Ref_seqs)
fin_ss_df <- fin_ss_df[match_ss,, drop=FALSE]
# Compensate for N_up and N_down
stopifnot(identical(fin_ss_df$Ref_seqs,
paste0(unlist(lapply(map, function(x){
gsub("N", "", paste0(x$Ref_seq))
})))))
stopifnot(identical(names(N_counts), names(map)))
range_types <- map
for(i in seq.int(nrow(fin_ss_df))){
if(!is.null(N_counts[[i]])){
fin_ss_df$Ref_seqs[i] <- paste0(
paste(rep("N", times=N_counts[[i]][1]), collapse=""),
fin_ss_df$Ref_seqs[i])
fin_ss_df$Ref_seqs[i] <- paste0(
fin_ss_df$Ref_seqs[i],
paste(rep("N", times=N_counts[[i]][1]), collapse=""))
fin_ss_df$ss_anno[i] <- paste0(
paste(rep(".", times=N_counts[[i]][1]), collapse=""),
fin_ss_df$ss_anno[i])
fin_ss_df$ss_anno[i] <- paste0(
fin_ss_df$ss_anno[i],
paste(rep(".", times=N_counts[[i]][1]), collapse=""))
if(!map[[i]][[2]]$Align_string[1] == "no_hits"){
substring(map[[i]][[2]]$Align_string, 1, N_counts[[i]][1]) <- paste(
rep("*", times=N_counts[[i]][1]), collapse="")
substring(
map[[i]][[2]]$Align_string,
nchar(map[[i]][[2]]$Align_string) - N_counts[[i]][2]+1,
nchar(map[[i]][[2]]$Align_string)) <- paste(
rep("*", times=N_counts[[i]][2]), collapse="")
}
}
## Identify different loops map references using ss
trgt <- fin_ss_df[i, ,drop=FALSE]
stopifnot(nchar(trgt$ss_anno) == nchar(trgt$Ref_seqs))
## Check if there are special case character
splt_char <- strsplit(paste0(trgt$ss_anno), "")[[1]]
logi_char <- splt_char %in% c(">", ".", "<")
if(any(!logi_char)){
warning("Located special character in ss loop annoation (character",
"\nelse than: '>' '.' '<'). Will attempt to translate that ",
"\ncharacter to '.'")
splt_char[!logi_char] <- "."
trgt$ss_anno <- as.character(paste0(splt_char, collapse=""))
}
## Generate loop intervals
# Find ">" with only "." or "X" before "<" ./X must occur at least
# 5 times. Explanation: str_locate_all looks for < and > but they
# should not be counted.
code_split <- as.data.frame(
stringr::str_locate_all(as.character(trgt$ss_anno),
paste0(">[\\.X]{", min_loop_width,",}<"))[[1]])
# Start and ends needs to be adjusted; +1 at starts and -1 at ends
code_split$start <- code_split$start +1
code_split$end <- code_split$end -1
## Merge with map
algn <- map[[i]][[2]]
algn$n_ssloop <- nrow(code_split)
code_split$loop_name <- rownames(code_split)
code_split$range <- paste(code_split$start, code_split$end, sep=":")
df <- t(data.frame(matrix(code_split$range,
nrow = nrow(code_split) ,
ncol =nrow(algn))))
colnames(df) <- paste0("range_loop", code_split$loop_name)
rownames(df) <- rownames(algn)
algn_loop <- cbind(algn, df)
## Generate fragment classification
ncl <- ncol(algn_loop)
for (k in seq.int(nrow(code_split))){
algn_loop[,ncl+k] <- algn$Align_start %in% seq(code_split$start[k],
code_split$end[k])
colnames(algn_loop)[ncl+k] <- paste0("type_start_loop", k)
}
ncl <- ncol(algn_loop)
for (k in seq.int(nrow(code_split))){
algn_loop[,ncl+k] <- algn$Align_end %in% seq(code_split$start[k],
code_split$end[k])
colnames(algn_loop)[ncl+k] <- paste0("type_end_loop", k)
}
algn_loop$decoder <- trgt$isodec_ss
algn_loop$acceptor <- trgt$isoacc_ss
range_types[[i]]$Alignments <- algn_loop
range_types[[i]]$Loop <- c(paste0("Loops are located between '>>..<<'"),
paste0(map[[i]]$Ref_seq),
unique(paste0(trgt$ss_anno)))
}
}
range_types <- range_types[!is.null(range_types)]
return(range_types)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.