#' tsRNA analysis of PAC object
#'
#' Analyzing and plotting tsRNAs between groups.
#'
#' Given a PAC and a PAC_map object generated by \code{\link{PAC_mapper}} this
#' function will attempt to analyze and plot tsRNA over two pheno_target groups,
#' using two tsRNA classifications (anno_target_1 and annot_target_2).
#'
#' @family PAC analysis
#'
#' @seealso \url{https://github.com/Danis102} for updates on the current
#' package.
#'
#' @param PAC PAC-list object. The Anno object needs to contain at least two
#' tsRNA classification columns, typically isotype (e.g. LysCTT, GlyGCC) and
#' rangetype (5'tsRNA, 5'tsRNA-halves, i'-tsRNA, 3'tsRNA, 3'tsRNA-halves).
#' These can be generated by first mapping sequences to tRNA reference using
#' \code{\link{PAC_mapper}} and extract isotypes from tRNA names, and then
#' obtain rangetype classifications using \code{\link{map_rangetype}}.
#'
#' @param norm Character indicating what type of data in PAC to be used as
#' input. If norm="raw" the raw counts in Counts will be used. Given any other
#' character string, the function will search for the string as a name on a
#' dataframe stored in the PAC$norm list-folder (created for example by
#' PAC_rpm), (default="rpm")
#'
#' @param pheno_target List with: 1st object being a character vector of the
#' target column name in Pheno, and the 2nd object being a character vector
#' specifying the names of two target groups in the target column (1st
#' object). Note, that this function will only work using two groups from the
#' pheno_target column, like this: pheno_target=list(<Pheno_column_name>,
#' c(<group_1>, <group_2>)). If there are more groups than two in the target
#' column, unspecified groups will not be analyzed.
#'
#' @param anno_target_1 List with: 1st object being a character of the target
#' column name in Anno, 2nd object being a character vector of the target
#' classifications in the 1st target Anno column. The anno_target_1 is
#' typically used for isotype classification in tsRNA analysis (example:
#' LysCTT, GlyGGC etc). Specifying only the column name, will automatically
#' include all classifications in the anno_target column.
#'
#' @param anno_target_2 Same as anno_target_2 but specifying the the second
#' classification typically used for 3'-5' classification in tsRNA analysis
#' (example: 5'tsRNA, 5'tsRNA-halves, i'-tsRNA, 3'tsRNA, 3'tsRNA-halves).
#' Note, that anno_target_2[[2]] is order sensitive, meaning that the order
#' will be preserved in the graphs.
#'
#' @param filter Integer specifying the minimum coverage value (specified by
#' norm) for a sequence to be included in the analysis. If filter=10 and
#' norm="rpm", only sequences that have at least 10 rpm in 100% of samples
#' will be included. (default=NULL).
#'
#' @param top Integer, specifying the number of the most highly expressed
#' classifications in anno_target_1 that should be reported in the graphs.
#'
#' @param join Logical, whether separate expression tables of anno_target_1 and
#' anno_target_2 should be generated for each group in pheno_target (FALSE),
#' or if pheno_target groups should be joined (TRUE).
#'
#' @param log2fc Logical, whether log2 fold change point-bars (independent
#' pheno_target groups) or errorbars (paired pheno_target groups) should be
#' reported.
#'
#' @param paired Logical, whether pheno_target is given as paired samples (e.g.
#' before and after treatment of the same patient). Only works with
#' \code{paired_IDs}.
#'
#' @param paired_IDs Character, specifying the name of a column in Pheno
#' containing the paired sample names. For example, if
#' \emph{pheno_target=list("treatment", c("before", "after"))}, and
#' \emph{paired=TRUE}, then \emph{paired_IDs="patient"}, must be a column in
#' Pheno containing the same patient ID reported twice for each patient
#' (before and after).
#'
#' @param ymax_1 Integer that sets the maximum y for all mean plots (all plots
#' gets the same y-axes). If ymax_1=NULL (default), then ggplot2 will
#' automatically set ymax for each plot individually (different y-axes).
#'
#' @return List of ggplot2 plots and the data used for generating the plots. Use
#' ls.str() to explore each level.
#'
#' @examples
#'
#'
#'###########################################################
#'### tRNA analysis in seqpac
#'# (More details see vignette and manuals.)
#'
#'##-------------------------------##
#'## Setup environment for testing ##
#'
#'# First create an annotation blanc PAC with group means
#'load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata",
#' package = "seqpac", mustWork = TRUE))
#'anno(pac) <- anno(pac)[,1, drop=FALSE]
#'pac_trna <- PAC_summary(pac, norm = "cpm", type = "means",
#' pheno_target=list("stage"), merge_pac = TRUE)
#'
#'
#'# Then load paths to example trna ref and ss files
#'trna_ref <- system.file("extdata/trna2", "tRNA2.fa",
#' package = "seqpac", mustWork = TRUE)
#'
#'ss_file <- system.file("extdata/trna2", "tRNA2.ss",
#' package = "seqpac", mustWork = TRUE)
#'
#'
#'##--------------------------------------##
#'## Create a map object using PAC_mapper ##
#'map_object <- PAC_mapper(pac_trna, ref=trna_ref,
#' N_up = "NNN", N_down = "NNN",
#' mismatches=0, threads=2,
#' report_string=TRUE, override = TRUE)
#'# Warning: override = TRUE, will remove everything in temporary output path.
#'# Note: bowtie indexes are not nedded for PAC_mapper.
#'
#'
#'##------------------------------------------##
#'## Coverage plots of tRNA using PAC_covplot ##
#'
#'# Single tRNA targeting a summary data.frame
#'PAC_covplot(pac_trna, map=map_object, summary_target= list("cpmMeans_stage"),
#' map_target="tRNA-Ala-AGC-1-1")
#'
#'# Find tRNAs with many fragments and then plot
#'n_tRFs <- unlist(lapply(map_object, function(x){nrow(x[[2]])}))
#'selct <- (names(map_object)[n_tRFs>1])[c(1, 16, 25, 43)]
#'cov_plt <- PAC_covplot(pac_trna, map=map_object,
#' summary_target= list("cpmMeans_stage"),
#' map_target=selct)
#'cowplot::plot_grid(plotlist=cov_plt, nrow=2, ncol=2)
#'
#'
#'##-------------------------------------------##
#'## Classify tRNA fragment with map_rangetype ##
#'
#'# Classify according to loop structure using ss file provided with seqpac
#'map_object_ss <- map_rangetype(map_object, type="ss", ss=ss_file,
#' min_loop_width=4)
#'# Note 1: You may download your own ss file at for example GtRNAdb
#'# Note 2: The ss file must match the reference used in creating the map_object
#'
#'# Classify instead according to nucleotide position or percent intervals
#'map_object_rng <- map_rangetype(map_object, type="nucleotides",
#' intervals=list(start=1:5, end=95:100))
#'
#'map_object_prc <- map_rangetype(map_object, type="percent",
#' intervals=list(start=1:5, mid=45:50,
#' end=95:100))
#'
#'# Compare the output (same fragment different classifications)
#'map_object_ss[[2]]
#'map_object_prc[[2]]
#'map_object_rng[[2]]
#'
#'
#'##-------------------------------##
#'## Classify tRFs with tRNA_class ##
#'# Warning: this function currently only works using map objects
#'# created using ss files with specific names for loop1/2/3 columns:
#'colnames(map_object_ss[[2]][[2]])
#'
#'# Important: We added three "Ns" in PAC_mapper (above). If, terminal
#'# classification should correspond to the original tRNA sequences we need to
#'# account for these Ns when defining what is a terminal fragment. Here, we set
#'# "terminal=5", which means that PAC sequences will receive 5'/3'
#'# classification if they map to the first two or last two nucleotides of the
#'# original full-length tRNAs (2+NNN =5).
#'
#'pac_trna <- tRNA_class(pac_trna, map_object_ss, terminal=5)
#'
#'##---------------------------------##
#'## Plot some results with PAC_trna ##
#'
#'# Here is one example of tRNA visualization using our PAC_trna function:
#'trna_result <- PAC_trna(pac_trna, norm="cpm", filter = NULL,
#' join = TRUE, top = 15, log2fc = TRUE,
#' pheno_target = list("stage", c("Stage1", "Stage5")),
#' anno_target_1 = list("type"),
#' anno_target_2 = list("class"))
#'
#'cowplot::plot_grid(trna_result$plots$Expression_Anno_1$Grand_means,
#' trna_result$plots$Log2FC_Anno_1,
#' trna_result$plots$Percent_bars$Grand_means,
#' nrow=1, ncol=3)
#'# Note: There are no 3ยด fragments in our test data.
#'
#'# By setting join = FALSE you will get group means instead of grand means:
#'trna_result <- PAC_trna(pac_trna, norm="cpm", filter = NULL,
#' join = FALSE, top = 15, log2fc = TRUE,
#' pheno_target = list("stage", c("Stage1", "Stage3")),
#' anno_target_1 = list("type"),
#' anno_target_2 = list("class"))
#'
#'cowplot::plot_grid(trna_result$plots$Expression_Anno_1$Stage1,
#' trna_result$plots$Expression_Anno_1$Stage3,
#' trna_result$plots$Log2FC_Anno_1,
#' trna_result$plots$Percent_bars$Stage1,
#' trna_result$plots$Percent_bars$Stage3,
#' nrow=1, ncol=5)
#'
#'##-----------------------------------------##
#'## Clean up temp folder ##
#'# (Sometimes needed for automated examples)
#'
#'closeAllConnections()
#'fls_temp <- tempdir()
#'fls_temp <- list.files(fls_temp, recursive=TRUE,
#' full.names = TRUE)
#'suppressWarnings(file.remove(fls_temp))
#'
#' @export
PAC_trna <- function(PAC, norm="cpm", filter=100, join=FALSE, top=15,
log2fc=FALSE, pheno_target = NULL, anno_target_1 = NULL,
ymax_1=NULL, anno_target_2 = NULL, paired=FALSE,
paired_IDs=NULL) {
Group.1 <- means <- SE <- value <- ann1 <- perc <- ann2 <- NULL
## Setup ##
if(isS4(PAC)){
tp <- "S4"
PAC <- as(PAC, "list")
}else{
tp <- "S3"
}
if(!is.null(anno_target_1)){
if(!methods::is(anno_target_1, "list")){
anno_target_1 <-list(anno_target_1)}
if(length(anno_target_1)==1){
anno_target_1[[2]] <- unique(PAC$Anno[, anno_target_1[[1]]])
}
anno_target_1 <- lapply(anno_target_1, function(x){as.character(x)})
}else{
stop("You have provided an invalid anno_target list object.")}
if(!is.null(anno_target_2)){
if(!methods::is(anno_target_2, "list")){
anno_target_2 <-list(anno_target_2)}
if(length(anno_target_2)==1){
anno_target_2[[2]] <- unique(PAC$Anno[, anno_target_2[[1]]])
}
anno_target_2 <- lapply(anno_target_2, function(x){as.character(x)})
}else{
stop("You have provided an invalid anno_target list object.")}
if(!is.null(pheno_target)){
if(!methods::is(pheno_target, "list")){
pheno_target <- list(pheno_target)}
if(length(pheno_target)==1){
pheno_target[[2]] <- unique(PAC$Pheno[, pheno_target[[1]]])
}
pheno_target <- lapply(pheno_target, function(x){as.character(x)})
}else{
stop("You have provided an invalid anno_target list object.")}
PAC <- PAC_filter(PAC, subset_only=TRUE, anno_target=anno_target_1,
pheno_target=pheno_target)
PAC <- PAC_filter(PAC, subset_only=TRUE, anno_target=anno_target_2)
if(!is.null(filter)){
PAC <- PAC_filter(PAC, norm=norm, threshold=filter, coverage=100)
}
PAC$Pheno[,pheno_target[[1]]] <- factor(PAC$Pheno[,pheno_target[[1]]],
levels=pheno_target[[2]])
PAC$Anno[,anno_target_1[[1]]] <- factor(PAC$Anno[,anno_target_1[[1]]],
levels=anno_target_1[[2]])
PAC$Anno[,anno_target_2[[1]]] <- factor(PAC$Anno[,anno_target_2[[1]]],
levels=anno_target_2[[2]])
if(norm=="raw"){
data <- PAC$Counts
}else{
data <- PAC$norm[[norm]]
}
## Aggregate data for anno_target_1 ##
spl_lst <- split(as.data.frame(t(data)),
PAC$Pheno[,pheno_target[[1]]], drop=FALSE)
Ann1_agg_lst <- lapply(spl_lst, function(x){
x <- t(x)
x_agg <- stats::aggregate(x, list(PAC$Anno[[anno_target_1[[1]]]]), sum)
x_agg_long <- reshape2::melt(x_agg, id.var= "Group.1")
})
if(join==TRUE){
Ann1_agg_lst <- list(do.call("rbind", Ann1_agg_lst))
names(Ann1_agg_lst) <- "Grand_means"
}
## Aggregate data for anno_target_2 ##
spl_lst_2 <- split(as.data.frame(t(data)),
PAC$Pheno[,pheno_target[[1]]], drop=FALSE)
Ann2_agg_lst <- lapply(spl_lst_2, function(x){
x <- t(x)
x_agg <- stats::aggregate(x, list(PAC$Anno[[anno_target_2[[1]]]]), sum)
x_agg_long <- reshape2::melt(x_agg, id.var= "Group.1")
})
if(join==TRUE){
Ann2_agg_lst <- list(do.call("rbind", Ann2_agg_lst))
names(Ann2_agg_lst) <- "Grand_means"
}
## Aggregate data for anno_target_1 and anno_target_2 ##
spl_lst_12 <- split(as.data.frame(t(data)),
PAC$Pheno[,pheno_target[[1]]], drop=FALSE)
Ann12_agg_lst <- lapply(spl_lst_12, function(x){
x <- t(x)
x_agg <- stats::aggregate(x, list(paste(PAC$Anno[[anno_target_1[[1]]]],
PAC$Anno[[anno_target_2[[1]]]],
sep="____")), sum)
x_agg_long <- reshape2::melt(x_agg, id.var= "Group.1")
facts <- as.data.frame(do.call("rbind",
strsplit(x_agg_long$Group.1, "____")))
return(cbind(x_agg_long, data.frame(ann1=facts[,1], ann2=facts[,2])))
})
if(join==TRUE){
Ann12_agg_lst <- list(do.call("rbind", Ann12_agg_lst))
names(Ann12_agg_lst) <- "Grand_means"
}
## Fix missing values in each category and generate percent types
Ann12_perc <- lapply(Ann12_agg_lst, function(x){
spl_inside <- split(x, x$ann1)
ann12_lst <- lapply(spl_inside, function(y){
missing <- anno_target_2[[2]][!anno_target_2[[2]] %in% unique(y$ann2)]
if(length(missing)>0){
df <- data.frame(Group.1= paste(as.character(unique(y$ann1)),
missing, sep="____"),
variable= NA,
value= 0,
ann1= as.character(unique(y$ann1)),
ann2= missing)
df <- df[rep( seq.int(length(missing)), times=length(unique(y$variable))),]
df$variable <- rep( unique(y$variable), each=length(missing))
}else{
df <- NULL
}
df_fin <- rbind(y, df)
df_fin_agg <- stats::aggregate(df_fin$value, list(df_fin$Group.1), mean)
names(df_fin_agg)[names(df_fin_agg)=="x"] <- "values"
tot <- sum(df_fin_agg$values)
df_fin_agg$perc <- df_fin_agg$values/tot*100
df_fin_agg$perc[is.na(df_fin_agg$perc)] <- 0
return(df_fin_agg)
})
return(ann12_lst)
})
## Ordered according to sums of anno_target_1 in first object
ordr <- order(unlist(lapply(Ann12_perc[[1]], function(x){
sum(x$values)})), decreasing=TRUE)
ordr <- ordr[seq.int(top)] # Extract the top
lvls <- names(Ann12_perc[[1]])[ordr]
if(join==FALSE){
stopifnot(identical(lapply(Ann12_perc, names)[[1]],
lapply(Ann12_perc, names)[[2]]))
}
Ann12_perc_ord <- lapply(Ann12_perc, function(x){
ord_df <- do.call("rbind", x[ordr])
facts <- as.data.frame(do.call("rbind",
strsplit(as.character(ord_df$Group.1),
"____")))
return(cbind(ord_df, data.frame(ann1=facts[,1], ann2=facts[,2])))
})
## Generate colors
colfunc_ann1 <- grDevices::colorRampPalette(c("white", "black"))
rgb_vec_ann1 <- rev(colfunc_ann1(top))
colfunc_ann2<- grDevices::colorRampPalette(c("#094A6B", "#FFFFFF", "#9D0014"))
rgb_vec_ann2 <- rev(colfunc_ann2(length(anno_target_2[[2]])))
## Fix zeros for log10
Ann1_agg_lst <- lapply(Ann1_agg_lst, function(x) {
x$value[x$value == 0] <- 0.000001; return(x)
})
plot_lst <- list(NULL)
#######################################################################
## Plot mean expression anno_target_1
plot_lst$Expression_Anno_1 <- lapply(Ann1_agg_lst, function(x){
## Bars for mean RPM per type - log10
x$Group.1 <- factor(x$Group.1, levels=rev(lvls))
x <- x[!is.na(x$Group.1),]
dat <- stats::aggregate(x$value, list(x$Group.1), mean)
names(dat)[names(dat)=="x"] <- "means"
dat$SE <- (stats::aggregate(x$value, list(x$Group.1), function(y){
stats::sd(y)/(sqrt(length(y)))}))$x
dat$means[dat$means<1] <- 1
set_max <- max(dat$means + dat$SE)
if(set_max < 10000){
breaks <- c(1,10,100,1000,10000)
}
if(set_max >= 10000 && set_max <100000){
breaks <- c(1,10,100,1000,10000,100000)
}
if(set_max >= 100000){
breaks <- c(1,10,100,1000,10000,100000,1000000)
}
plot <- ggplot2::ggplot(dat, ggplot2::aes(x=Group.1, y=means, fill=Group.1 ,
ymax = means + SE,
ymin = means - SE)) +
ggplot2:: geom_errorbar(width=0.5, size=1.0, colour="black",
position = "identity") +
ggplot2::geom_col(width = 0.8, cex=0.2, colour="black")+
ggplot2::labs(title=paste0("Mean ", norm))+
ggplot2::ylab(paste0("Log10 ", norm, " +/- SE")) +
ggplot2::scale_fill_manual(values=rev(rgb_vec_ann1))+
ggplot2::theme_classic()+
ggplot2::theme(legend.position="none",
axis.title.y= ggplot2::element_blank(),
panel.grid.major.y=ggplot2::element_line(linetype="dashed",
colour="grey",
size=0.5),
panel.grid.major.x = ggplot2::element_line(colour="grey",
size=0.5),
axis.text.x = ggplot2::element_text(angle = 0, hjust = 0),
axis.text.y = ggplot2::element_text(angle = 0, hjust = 0),
axis.line.x = ggplot2::element_blank())+
ggplot2::scale_y_log10(limits = c(min(breaks),max(breaks)),
breaks=breaks)+
ggplot2::coord_flip()
if(!is.null(ymax_1)){
plot <- plot + ggplot2::scale_y_continuous(limits=c(0,ymax_1))
}
return(plot)
})
#Paired#######################################################################
if(paired==TRUE && log2fc==TRUE){
return(cat("\nPaired samples are not yet implemented in the function,",
"\nbut will be in later versions of seqpac."))
}
#Independent###############################################################
if(paired==FALSE && log2fc==TRUE){
## Error bars for log2_FC types - independent
if(join==TRUE){
Ann1_agg_lst <- split(Ann1_agg_lst[[1]],
factor(do.call("rbind",
strsplit(
rownames(Ann1_agg_lst[[1]]),
"\\."))[,1],
levels=pheno_target[[2]]))
}
data_lst <- lapply(Ann1_agg_lst, function(x){
agg <- stats::aggregate(x$value,
list(factor(x$Group.1, levels=lvls)), mean)
colnames(agg)[colnames(agg)=="x"] <- "value"
return(agg)
})
stopifnot(identical(data_lst[[1]]$Group.1, data_lst[[2]]$Group.1))
logfc <- data.frame(Group.1=data_lst[[1]]$Group.1,
value=log2(data_lst[[1]]$value/data_lst[[2]]$value))
logfc$Group.1 <- factor(logfc$Group.1, levels= rev(logfc$Group.1))
lim <- max(sqrt(logfc$value^2))
plot_lst$Log2FC_Anno_1 <- ggplot2::ggplot(logfc,
ggplot2::aes(x=Group.1,
y=value,
fill=Group.1,
ymin=value,
ymax=value)) +
ggplot2::geom_hline(yintercept = 0, linetype="dashed",
size=1, color="azure4")+
ggplot2::geom_errorbar(width=0.8, size=0.5, position = "identity") +
ggplot2::geom_point(shape=21, size=4, position = "identity") +
ggplot2::labs(title=paste0(pheno_target[[2]], collapse=" vs ")) +
ggplot2::ylab(paste0("Log2FC between groups (", norm, ") +/- SE")) +
ggplot2::scale_fill_manual(values=rev(rgb_vec_ann1)) +
ggplot2::scale_x_discrete(labels=levels(logfc$Group.1)) +
ggplot2::theme_classic()+
ggplot2::theme(legend.position="none",
axis.title.y = ggplot2::element_blank(),
panel.grid.major.y=
ggplot2::element_line(linetype="dashed",
colour="grey",
size=0.5),
panel.grid.major.x = ggplot2::element_line(colour="grey",
size=0.5),
axis.text.x = ggplot2::element_text(angle = 0, hjust = 0),
axis.text.y = ggplot2::element_blank(),
axis.line.x = ggplot2::element_blank(),
axis.line.y = ggplot2::element_blank())+
ggplot2::coord_flip(ylim = c((min(logfc$value)-1), (max(logfc$value)+1)))
}
## Percent filled bar (All)
plot_lst$Percent_bars <- lapply(Ann12_perc_ord, function(x){
x$ann1 <- factor(x$ann1, levels=rev(unique(x$ann1)))
x$ann2 <- factor(x$ann2, levels=rev(anno_target_2[[2]]))
plot <- ggplot2::ggplot(x, ggplot2::aes(x=ann1, y=perc, fill=ann2)) +
ggplot2::geom_col(width = 0.9, cex=0.2, colour="black", position="fill")+
ggplot2::labs(title="Mean percent content")+
ggplot2::ylab("%") +
ggplot2::scale_fill_manual(values=rev(rgb_vec_ann2))+
ggplot2::theme_classic()+
ggplot2::theme(axis.title.y= ggplot2::element_blank(),
axis.text.x = ggplot2::element_text(angle = 0,
hjust = 0),
axis.line.x = ggplot2::element_blank(),
axis.line.y = ggplot2::element_blank())+
ggplot2::coord_flip(ylim=c(0, 1))
return(plot)
})
return(list(plots=plot_lst[-1], data=list(anno_target_1=Ann1_agg_lst,
annot_target_2=Ann2_agg_lst,
Percent=Ann12_perc)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.