#' Plot sequence coverage over a reference
#'
#' \code{PAC_covplot} Plotting sequences in a PAC object using an PAC mapping
#' object.
#'
#' Given a PAC object and a map object generated by \emph{PAC_mapper} this
#' function will attempt to plot the sequence coverage over long references.
#'
#' @family PAC analysis
#'
#' @seealso \url{https://github.com/Danis102} for updates on the current
#' package.
#'
#' @param PAC PAC-list object.
#'
#' @param map PAC_map object generated by \emph{PAC_mapper}.
#'
#' @param summary_target List with 2 character vectors: 1st object: Character
#' indicating the name of the target PAC summary object which should be used
#' as input data for the plots. If left empty the 1st summary object will be
#' used. 2nd object: Character vector indicating the names of the columns in
#' the summary object from which individual graphs should be plotted. If left
#' empty all summary columns are plotted in the same graph. Summaries are
#' generated by the PAC_summary function or custom generated by the user and
#' stored in the summary 'folder' of a PAC object.
#'
#' @param map_target (optional) Character vector. Important: This is similar to
#' an anno_target, but instead extracts target references in the PAC_mapping
#' object. Should contain search terms that can find unique strings in the
#' reference names. The search terms are parsed to grepl("<search terms>,
#' names(<PAC_mapper object>)). (default=NULL)
#'
#' @param xseq Logical indicating whether the nucleotides should be plotted on
#' the x-axis. Plotting long references with xseq=FALSE will increase script
#' performance. (default=TRUE).
#'
#' @param colors Character vector indicating the rgb colors to be parsed to
#' ggplot2 for plotting the coverage lines (default = c("black", "red",
#' "grey", "blue"))
#'
#' @param style Character string. If style="line" (default) then graphs are
#' plotted as simple path/line plots (ggplot2::geom_path). If style="solid"
#' then the area under the line is filled with same color as the lines but
#' with added transparency (ggplot2::geom_line + geom_ribbon, alpha=0.5).
#'
#' @param check_overide Logical, whether all sequences names in PAC must be
#' found in the PAC_map object (TRUE) or not (FALSE). If the PAC_map object
#' was generated from the provided PAC objects, then all sequences should
#' match. If the PAC list object was subsetted or the PAC_map is used with an
#' independent PAC list object in which not all sequences are available, then
#' this function will create an error, unless check_overide=TRUE. In such
#' case, all missing sequences in the PAC object will automatically be
#' assigned a 0-value in all summary_target columns.
#'
#' @return Plot list with plots generated by ggplot2
#'
#' @examples
#'
#'###########################################################
#'### Simple example of how to use PAC_mapper and PAC_covplot
#' # Note: More details, see vignette and manuals.)
#' # Also see: ?map_rangetype, ?tRNA_class or ?PAC_trna for more examples
#' # on how to use PAC_mapper.
#'
#' ## Load PAC-object, make summaries and extract rRNA and tRNA
#' load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata",
#' package = "seqpac", mustWork = TRUE))
#'
#' pac <- PAC_summary(pac, norm = "cpm", type = "means",
#' pheno_target=list("stage", unique(pheno(pac)$stage)))
#'
#' pac_tRNA <- PAC_filter(pac, anno_target = list("Biotypes_mis0", "tRNA"))
#'
#'
#' ## Give paths to a fasta reference (with or without bowtie index)
#' # (Here we use an rRNA/tRNA fasta included in seqpac)
#'
#' ref_tRNA <- system.file("extdata/trna", "tRNA.fa",
#' package = "seqpac", mustWork = TRUE)
#'
#'
#' ## Map using PAC-mapper
#'
#' map_tRNA <- PAC_mapper(pac_tRNA, mismatches=0,
#' threads=1, ref=ref_tRNA, override=TRUE)
#'
#' ## Plot tRNA using xseq=TRUE gives you reference sequence as X-axis:
#' # (OBS! Long reference will not print well with xseq=TRUE)
#' cov_tRNA <- PAC_covplot(pac_tRNA, map_tRNA,
#' summary_target = list("cpmMeans_stage"),
#' xseq=TRUE, style="line",
#' color=c("red", "black", "blue"))
#'
#' cov_tRNA[[1]]
#'
#' ## Check which tRNA reached a decent number of unique fragments
#' # (OBS! This is a very down sampled dataset)
#' logi_hi <- unlist(lapply(map_tRNA, function(x){nrow(x$Alignments) > 10 }))
#'
#' table(logi_hi)
#' names(map_tRNA[logi_hi])
#'
#' targets <- c("Ala-AGC-1-1", "Lys-CTT-1-13","Ser-AGA-2-2")
#'
#' cov_tRNA_sub <- PAC_covplot(pac_tRNA, map_tRNA,
#' summary_target = list("cpmMeans_stage"),
#' map_target = targets,
#' xseq=TRUE, style="line",
#' color=c("red", "black", "blue"))
#'
#' cowplot::plot_grid(plotlist= cov_tRNA_sub)
#'
#' @export
PAC_covplot <- function(PAC, map, summary_target=NULL, map_target=NULL,
style="line", xseq=TRUE, colors=NULL,
check_overide=FALSE){
if(isS4(PAC)){
tp <- "S4"
PAC <- as(PAC, "list")
}else{
tp <- "S3"
}
if(is.null(summary_target)){
stop("\nYou need to specify a target object in ",
"\nPAC$summary with summary_target.")
}
if(is.null(names(PAC$summary[[summary_target[[1]]]]))){
stop("\nYou need to specify a valid summary_target.",
"\n(Hint: Double check correct object name in",
"\n PAC$summary or rerun the 'PAC_summary' function.)")
}
if(length(summary_target)==1){
summary_target[[2]] <- names(PAC$summary[[summary_target[[1]]]])
}
if(is.null(map_target)){
map_target <- names(map)
}
norm <- summary_target[[1]]
## Subsetting data
smry <- PAC$summary[[summary_target[[1]]]]
data <- smry[,summary_target[[2]], drop=FALSE]
data$empty_ <- 0 # Avoids problems with automatic vectorization
sub_map <- map[names(map) %in% map_target]
if(length(sub_map)==0){
sub_map <- map[grepl(paste(map_target, collapse="|"), names(map))]
}
sub_map<-sub_map[!unlist(lapply(sub_map, function(x){x[[2]][1,1] == "no_hits"}))]
uni_map <- do.call("c",lapply(sub_map, function(x){
d_sub <- gsub("\\.\\d", "", rownames(x$Alignments))
d_sub <- gsub('[[:digit:]]+', '', d_sub)
return(d_sub)
}))
uni_map <- as.character(unique(uni_map[!uni_map == "1"]))
PAC <- PAC_filter(PAC, anno_target=uni_map, subset_only=TRUE)
if(!nrow(PAC$Anno) == length(uni_map)){
warning("Only ",
nrow(PAC$Anno),
" of ",
length(uni_map),
" mapped sequences were found in PAC.",
"\n Will proceed with the ones that were found.",
"\n (Hint: Did you subset the PAC object after you made the map?)")
}
## Remove empty references
rm_filt <- !do.call("c", lapply(sub_map, function(x){
as.character(x$Alignments[1,1]) == "no_hits"
}))
cat(paste0("\nOf ",
length(rm_filt),
" references analyzed, ",
length(rm_filt[rm_filt==TRUE]),
" was covered\n"))
cat(paste0("by one or more query sequence in PAC.\n"))
if(length(names(sub_map)[!rm_filt]) > 0 ){
if(length(names(sub_map)[!rm_filt]) > 30){
cat("References with no coverage will be ignored.\n")
}
if(length(names(sub_map)[!rm_filt]) <= 30){
cat("These reference names was not covered by any query sequence:\n")
print(names(sub_map)[!rm_filt])
}
}
sub_map <- sub_map[rm_filt]
## Check for multialigning sequences
multi_seq_lst <- lapply(sub_map, function(x){
grepl("\\.\\d", rownames(x$Alignments))
})
if(any(do.call("c", multi_seq_lst))){
lst <- list(NA)
for(i in seq.int(length(multi_seq_lst))){
lst[[i]] <- sub_map[[i]]$Alignments[multi_seq_lst[[i]],]
}
names(lst) <- names(multi_seq_lst)
warning(
"Occurens of multiple identical alignments within the same ",
"\nreference was detected. Will proceed anyway, but the ",
"\nmultimapping sequence(s) are listed above.\n")
print(lst[do.call("c", lapply(multi_seq_lst, any))])
}
# Fix seqs column in sub_mab for each ref? Remove "." and digits.
sub_map <- lapply(sub_map, function(x){
d_sub <- gsub("\\.\\d", "", rownames(x$Alignments))
x$Alignments$seqs<- gsub('[[:digit:]]+', '', d_sub)
return(x)
})
# Extract data for each ref.
ref_data_lst <- lapply(sub_map, function(x){
sub_data <- data[rownames(data) %in% x$Alignments$seqs,,drop=FALSE]
fin <- sub_data[match(x$Alignments$seqs, rownames(sub_data)),,drop=FALSE]
return(fin)
})
if(check_overide==TRUE){
for(i in seq_along(ref_data_lst)){
rownames(ref_data_lst[[i]]) <- rownames(sub_map[[i]]$Alignments)
ref_data_lst[[i]][is.na(ref_data_lst[[i]])] <- 0
}}
test<- do.call("c", lapply(ref_data_lst, function(x){
d_sub <- gsub("\\.\\d", "", rownames(x))
gsub('[[:digit:]]+', '', d_sub)
}))
test2<-do.call("c", lapply(sub_map, function(x){
x$Alignments$seqs
}))
stopifnot(identical(test, test2))
stopifnot(identical(length(ref_data_lst), length(sub_map)))
## Generate coverage files using granges
rep_lst <- lapply(ref_data_lst, function(x){
lapply(as.list(x), function(y){
rep_vect <- rep(seq_along(rownames(x)), time=round(y))
if(length(rep_vect)<1){
rep_vect <- 0
}
return(rep_vect)
})
})
cov_lst <- list(NA)
for(i in seq.int(length(sub_map))){
df <- data.frame(seqnames="sequence",
start=sub_map[[i]]$Alignments$Align_start,
end=sub_map[[i]]$Alignments$Align_end,
ID=sub_map[[i]]$Alignments$seqs)
cov_lst[[i]] <- lapply(as.list(seq.int(length(summary_target[[2]]))), function(x){
df <- df[rep_lst[[i]][[x]],,drop=FALSE]
if(nrow(df)==0){
fin <- data.frame(
Position=as.factor(seq.int(IRanges::width(sub_map[[i]]$Ref_seq))),
Coverage=as.numeric(0))
}
if(nrow(df)>0){
gr <- GenomicRanges::GRanges(df)
GenomeInfoDb::seqlengths(gr) <- as.numeric(
IRanges::width(sub_map[[i]]$Ref_seq))
cov <- as.numeric(GenomicRanges::coverage(gr)[[1]])
fin <- data.frame(Position=ordered(as.factor(seq.int(length(cov)))),
Coverage=cov)
}
return(fin)
})
names(cov_lst[[i]]) <- summary_target[[2]]
names(cov_lst)[i] <- names(sub_map)[i]
}
## Plot graphs
# Setup colors
if(is.null(colors)){
n_colrs <- length(summary_target[[2]])
colfunc <- grDevices::colorRampPalette(c("#094A6B", "#EBEBA6", "#9D0014"))
colors <- colfunc(n_colrs)
names(colors) <- summary_target[[2]]
}
# Plot
plot_lst <- list(NA)
for(i in seq.int(length(cov_lst))){
Coverage <- Group <- Position <- NULL
cov_df <- cbind(data.frame(Position=cov_lst[[i]][[1]][,1]),
do.call("cbind", lapply(cov_lst[[i]], function(x){x[,2]})))
cov_df <- reshape2::melt(cov_df, id.vars="Position")
colnames(cov_df) <- c("Position", "Group", "Coverage")
if(xseq==TRUE){
x_lab <- c(unlist(strsplit(as.character(sub_map[[i]]$Ref_seq), ""),
use.names=FALSE))
tcks <- ggplot2::element_line()
}else{
x_lab <- NULL
tcks <- ggplot2::element_blank()}
########### Solid style ##########
if(style=="solid"){
if(max(cov_df$Coverage) >= 100){
plot_lst[[i]] <- ggplot2::ggplot(cov_df, ggplot2::aes(
x=Position, y=Coverage, group=Group, fill=Group)) +
ggplot2::geom_line(size=0.3) +
ggplot2::geom_ribbon(data=cov_df,
ggplot2::aes(x=Position, ymax=Coverage),
ymin=0, alpha=0.5) +
ggplot2::scale_fill_manual(name='', values=colors)+
ggplot2::geom_abline(intercept =0, slope=0)+
ggplot2::ylab(paste0("mean_", norm)) +
ggplot2::xlab(paste("Position on ", names(cov_lst)[i], sep=""))+
ggplot2::scale_x_discrete(labels=x_lab)+
ggplot2::theme_classic()+
ggplot2::theme(axis.ticks.x=tcks,
axis.text.x = ggplot2::element_text(size=8),
plot.title = ggplot2::element_text(size=10))}
if(max(cov_df$Coverage) < 100){
plot_lst[[i]] <- ggplot2::ggplot(cov_df, ggplot2::aes(
x=Position, y=Coverage, group=Group, fill=Group)) +
ggplot2::geom_line(size=0.3) +
ggplot2::geom_ribbon(
data=cov_df,
ggplot2::aes(x=Position, ymax=Coverage), ymin=0, alpha=0.5) +
ggplot2::scale_fill_manual(name='', values=colors)+
ggplot2::coord_cartesian(ylim=c(0,100))+
ggplot2::scale_y_continuous(breaks = seq(0, 100, 30))+
ggplot2::geom_abline(intercept =0, slope=0)+
ggplot2::ylab(paste0("mean_", norm)) +
ggplot2::xlab(paste("Position on ", names(cov_lst)[i], sep=""))+
ggplot2::scale_x_discrete(labels=x_lab)+
ggplot2::theme_classic()+
ggplot2::theme(axis.ticks.x=tcks,
axis.text.x = ggplot2::element_text(size=8),
plot.title = ggplot2::element_text(size=10))}
}
########### Line style ##########
if(style=="line"){
if(max(cov_df$Coverage) >= 100){
plot_lst[[i]] <- ggplot2::ggplot(cov_df, ggplot2::aes(
x=Position, y=Coverage, group=Group, color=Group, fill=Group)) +
ggplot2::geom_path(lineend="butt", linejoin="round",
linemitre=1, size=1.0)+
ggplot2::scale_color_manual(values=colors)+
ggplot2::labs(title=names(sub_map)[i])+
ggplot2::ylab(paste0("mean_", norm)) +
ggplot2::expand_limits(y=max(cov_df$Coverage)+20) +
ggplot2::scale_x_discrete(labels=x_lab)+
ggplot2::theme_classic()+
ggplot2::theme(axis.ticks.x=tcks,
axis.text.x = ggplot2::element_text(size=8),
plot.title = ggplot2::element_text(size=10))}
if(max(cov_df$Coverage) < 100){
plot_lst[[i]] <- ggplot2::ggplot(cov_df, ggplot2::aes(
x=Position, y=Coverage, group=Group, color=Group, fill=Group)) +
ggplot2::geom_path(lineend="butt", linejoin="round",
linemitre=1, size=1.0)+
ggplot2::scale_color_manual(values=colors)+
ggplot2::coord_cartesian(ylim=c(0,100))+
ggplot2::scale_y_continuous(breaks = seq(0, 100, 30))+
ggplot2::labs(title=names(sub_map)[i])+
ggplot2::ylab(paste0("mean_", norm)) +
ggplot2::expand_limits(y=max(cov_df$Coverage)+20) +
ggplot2::scale_x_discrete(labels=x_lab)+
ggplot2::theme_classic()+
ggplot2::theme(axis.ticks.x = tcks,
axis.text.x = ggplot2::element_text(size=8),
plot.title = ggplot2::element_text(size=10))}
names(plot_lst)[i] <- names(cov_lst)[i]
plot_lst[[i]]$seqs_rpm <- ref_data_lst[[i]]
}
}
names(plot_lst) <- names(sub_map)
return(plot_lst)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.