#' Make plot of ORFik QCreport
#'
#' From post-alignment QC relative to annotation, make a plot for all samples.
#' Will contain among others read lengths, reads overlapping leaders,
#' cds, trailers, mRNA / rRNA etc.
#' @param stats the experiment object or path to custom ORFik QC folder where a file
#' called "STATS.csv" is located.
#' @param output.dir NULL or character path, default: NULL, plot not saved to disc.
#' If defined saves plot to that directory with the name "/STATS_plot.pdf".
#' @param plot.ext character, default: ".pdf". Alternatives: ".png" or ".jpg".
#' @param as_gg_list logical, default FALSE. Return as a list of ggplot objects
#' instead of as a grob. Gives you the ability to modify plots more directly.
#' @return the plot object, a grob of ggplot objects of the the statistics data
#' @importFrom data.table melt
#' @importFrom gridExtra grid.arrange
#' @export
#' @examples
#' df <- ORFik.template.experiment()[3,]
#' ## First make QC report
#' # QCreport(df)
#' ## Now you can get plot
#' # QCstats.plot(df)
QCstats.plot <- function(stats, output.dir = NULL, plot.ext = ".pdf",
as_gg_list = FALSE) {
if (is(stats, "experiment")) {
path <- QCfolder(stats)
stats <- QCstats(stats)
if (is.null(stats))
stop("No QC report made for experiment, run ORFik QCreport")
} else { # From path to QC folder
path <- stats
stats <- fread(file.path(stats, "STATS.csv"))
}
if (colnames(stats)[1] == "V1") colnames(stats)[1] <- "sample_id"
temp_theme <- theme(legend.text=element_text(size=8),
legend.key.size = unit(0.3, "cm"),
plot.title = element_text(size=11),
strip.text.x = element_blank(),
panel.grid.minor = element_blank())
stats$sample_id <- factor(stats$Sample,
labels = as.character(seq(length(stats$Sample))),
levels = stats$Sample)
stats$Sample <- factor(stats$Sample, levels = stats$Sample)
colnames(stats) <- gsub("percentage", "%", colnames(stats))
# Update all values to numeric
stats[,(seq.int(ncol(stats))[-c(1,2)]):= lapply(.SD, as.numeric),
.SDcols = seq.int(ncol(stats))[-c(1,2)]]
dt_plot <- melt(stats, id.vars = c("Sample", "sample_id"))
step_counts <- c("mRNA", "ncRNA", "Introns", "Intergenic")
stat_regions <- colnames(stats)[c(which(colnames(stats) %in% step_counts))]
dt_STAT <- dt_plot[(variable %in% stat_regions),]
dt_STAT_normalized <- copy(dt_STAT)
dt_STAT_normalized[, sample_total := sum(value), by = .(sample_id)]
dt_STAT_normalized[, percentage := (value / sample_total)*100]
gg_STAT <- ggplot(data = dt_STAT_normalized, aes(x = sample_id, y = percentage)) +
geom_bar(aes(fill = variable), stat="identity", position = "stack")+
theme_minimal() +
ylab("% content") +
scale_y_continuous(breaks = c(50, 100)) +
temp_theme
# Read lengths
dt_read_lengths <- readLengthTable(NULL, output.dir = path)
dt_read_lengths <- dt_read_lengths[perc_of_counts_per_sample > 1, ]
dt_read_lengths[, sample_id := as.factor(sample_id)]
gg_read_lengths <- ggplot(dt_read_lengths, aes(y = perc_of_counts_per_sample, x = `read length`, fill = sample_id)) +
geom_bar(stat="identity", position = "dodge")+
ylab("% counts") +
facet_wrap( ~ sample_id, nrow = length(levels(dt_read_lengths$sample_id))) +
scale_y_continuous(breaks = c(15, 30)) +
theme_minimal() +
temp_theme
# mRNA regions
mRNA_regions <- colnames(stats)[colnames(stats) %in% c("LEADERS", "CDS", "TRAILERs")]
dt_mRNA_regions <- dt_plot[(variable %in% mRNA_regions),]
gg_mRNA_regions <- ggplot(dt_mRNA_regions, aes(y = value, x = sample_id)) +
geom_bar(aes(fill = variable), stat="identity", position = "fill")+
ylab("ratio") +
scale_y_continuous(breaks = c(0.5, 1.0)) +
theme_minimal() +
temp_theme
lay <- rbind(c(2),
c(2),
c(2),
c(1,3),
c(1,3))
plot_list <- list(gg_STAT, gg_read_lengths, gg_mRNA_regions)
final <- gridExtra::grid.arrange(grobs = plot_list, layout_matrix = lay)
if (!is.null(output.dir)) {
ggsave(file.path(output.dir, paste0("STATS_plot", plot.ext)), final, width = 13,
height = 8, dpi = 300)
}
if (as_gg_list) return(plot_list)
return(final)
}
#' Correlation plots between all samples
#'
#' Get correlation plot of raw counts and/or log2(count + 1) over
#' selected region in: c("mrna", "leaders", "cds", "trailers")\cr\cr
#' Note on correlation: Pearson correlation, using pairwise observations
#' to fill in NA values for the covariance matrix.
#' @inheritParams QCstats.plot
#' @inheritParams QCplots
#' @inheritParams cor_table
#' @inheritParams cor_plot
#' @param output.dir directory to save to, named : cor_plot,
#' cor_plot_log2 and/or cor_plot_simple with either .pdf or .png
#' @param type which value to use, "fpkm", alternative "counts".
#' @param height numeric, default 400 (in mm)
#' @param width numeric, default 400 (in mm)
#' @param size numeric, size of dots, default 0.15. Deprecated.
#' @param data_for_pairs a data.table from ORFik::countTable of counts wanted.
#' Default is fpkm of all mRNA counts over all libraries.
#' @return invisible(NULL) / if as_gg_list is TRUE, return a list of raw plots.
correlation.plots <- function(df, output.dir,
region = "mrna", type = "fpkm",
height = 400, width = 400, size = 0.15, plot.ext = ".pdf",
complex.correlation.plots = TRUE,
data_for_pairs = countTable(df, region, type = type),
as_gg_list = FALSE, text_size = 4,
method = c("pearson", "spearman")[1]) {
message("- Correlation plots")
if (nrow(df) == 1) { # Avoid error from ggplot2 backend
message("- Skipping correlation plots (only 1 sample)")
return(invisible(NULL))
}
message(" - raw scaled fpkm (simple)")
dt_cor <- cor_table(as.data.table(data_for_pairs), method = method)
cor_plot1 <- cor_plot(dt_cor, text_size = text_size,
label_name = paste0(method, "\ncorrelation"))
if (!is.null(output.dir)) {
ggsave(pasteDir(output.dir, paste0("cor_plot_simple", plot.ext)), cor_plot1,
height = height, width = width, units = 'mm', dpi = 300)
}
plot_list <- list(cor_plot1)
if (complex.correlation.plots) warning("Complex correlation plots are for now deprecated!")
if (as_gg_list) return(plot_list)
return(invisible(NULL))
}
#' Get correlation between columns
#' @param dt a data.table
#' @param method c("pearson", "spearman")[1]
#' @param upper_triangle logical, default TRUE. Make lower triangle values NA.
#' @param decimals numeric, default 2. How many decimals for correlation
#' @param melt logical, default TRUE.
#' @param na.rm.melt logical, default TRUE. Remove NA values from melted table.
#' @return a data.table with 3 columns, Var1, Var2 and Cor
#' @importFrom data.table melt.data.table setcolorder
cor_table <- function(dt, method = c("pearson", "spearman")[1],
upper_triangle = TRUE, decimals = 2, melt = TRUE,
na.rm.melt = TRUE) {
stopifnot(is(dt, "data.table"))
cor <- round(cor(dt, method = method), decimals)
# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
if (upper_triangle) cor <- get_upper_tri(cor)
if (melt) {
cor <- suppressWarnings(data.table::melt.data.table(as.data.table(cor),
value.name = "Cor",
variable.name = "Var2", na.rm = FALSE))
cor[, Var1 := rep(unique(Var2), times = length(unique(Var2)))]
data.table::setcolorder(cor, neworder = c(3,1,2))
if (na.rm.melt) cor <- cor[!is.na(Cor),]
}
return(as.data.table(cor))
}
#' Get correlation between columns
#' @param dt_cor a data.table, with column Cor
#' @param col colors c(low = "blue", high = "red", mid = "white", na.value = "white")
#' @param limit default (-1, 1), defined by:
#' \code{c(ifelse(min(dt_cor$Cor, na.rm = TRUE) < 0, -1, 0), 1)}
#' @param midpoint midpoint of correlation values in label coloring.
#' @param label_name name of correlation method, default
#' \code{"Pearson Correlation"} with newline after Pearson.
#' @param legend.position default c(0.4, 0.7), other: "top", "right",..
#' @param text_size size of correlation numbers
#' @param legend.direction default "horizontal", or "vertical"
#' @return a ggplot (heatmap)
cor_plot <- function(dt_cor, col = c(low = "blue", high = "red", mid = "white", na.value = "white"),
limit = c(ifelse(min(dt_cor$Cor, na.rm = TRUE) < 0, -1, 0), 1),
midpoint = mean(limit), label_name = "Pearson\nCorrelation",
text_size = 4, legend.position = c(0.4, 0.7),
legend.direction = "horizontal") {
stopifnot(is(dt_cor, "data.table"))
if (!all(c("low", "high", "mid", "na.value") %in% names(col)))
stop("'col' must have names low, high, mid and na.value")
ggheatmap <- ggplot(dt_cor, aes(Var2, Var1, fill = Cor)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = col["low"], high = col["high"], mid = col["mid"], na.value = col["na.value"],
midpoint = midpoint, limit = limit, space = "Lab",
name = label_name) +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_blank()) +
coord_fixed()
wh <- if (legend.direction == "horizontal") {c(7,1)} else c(1,7)
# Add text boxes
ggheatmap <- ggheatmap +
geom_text(aes(Var2, Var1, label = Cor), color = "black", size = text_size) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.justification = c(1, 0),
legend.position = legend.position,
legend.direction = legend.direction)+
guides(fill = guide_colorbar(barwidth = wh[1], barheight = wh[2],
title.position = "top", title.hjust = 0.5))
return(ggheatmap)
}
#' Simple PCA analysis
#'
#' Detect outlier libraries with PCA analysis.
#' Will output PCA plot of PCA component 1 (x-axis) vs
#' PCA component 2 (y-axis) for each library (colored by library),
#' shape by replicate. Will be extended to allow batch correction
#' in the future.
#' @inheritParams QCplots
#' @param output.dir default NULL, else character path to directory.
#' File saved as "PCAplot_(experiment name)(plot.ext)"
#' @param table data.table, default countTable(df, "cds", type = "fpkm"),
#' a data.table of counts per column (default normalized fpkm values).
#' @param title character, default "CDS fpkm".
#' @param subtitle character, default: \code{paste("Numer of genes:", nrow(table))}
#' @param color.by.group logical, default TRUE. Colors in PCA plot represent
#' unique library groups, if FALSE. Color each sample in seperate color
#' (harder to distinguish for > 10 samples)
#' @param return.data logical, default FALSE. Return data instead of plot
#' @return ggplot or invisible(NULL) if output.dir is defined or < 3 samples.
#' Returns data.table with PCA analysis if return.data is TRUE.
#' @export
#' @examples
#' df <- ORFik.template.experiment()
#' # Select only Ribo-seq and RNA-seq
#' pcaExperiment(df[df$libtype %in% c("RNA", "RFP"),])
pcaExperiment <- function(df, output.dir = NULL,
table = countTable(df, "cds", type = "fpkm"),
title = "PCA analysis by CDS fpkm",
subtitle = paste("Numer of genes/regions:", nrow(table)),
plot.ext = ".pdf",
return.data = FALSE,
color.by.group = TRUE) {
if (nrow(df) < 3) {
message("- Skipping PCA analysis (< 3 samples)")
return(invisible(NULL))
}
pca <- stats::prcomp(table, scale = FALSE)
dt <- data.table(pca$rotation, keep.rownames = TRUE)
if (color.by.group) {
dt$sample <- bamVarName(df, skip.replicate = TRUE)
} else dt$sample <- dt$rn
if (any(df$rep > 1, na.rm = TRUE)) {
dt$replicate <- df$rep
dt$replicate[is.na(dt$replicate)] <- 1
dt$replicate <- as.factor(dt$replicate)
} else dt$replicate <- as.factor("1")
plot <- ggplot(data = dt,
aes(x = PC1, y = PC2)) +
geom_point(aes(shape=replicate, color = sample),
size = 3, alpha = 0.8) +
scale_fill_brewer() +
ggtitle(title, subtitle) +
theme_bw() +
theme(legend.text=element_text(size=7))
if(!is.null(output.dir)) {
if (output.dir == "auto") {
path <- file.path(dirname(df$filepath[1]), "QC_STATS",
paste0("PCAplot_", df@experiment, plot.ext))
} else {
path <- file.path(output.dir,
paste0("PCAplot_", df@experiment, plot.ext))
}
ggsave(path, plot,
height = 4 + (nrow(df)*0.1), width = 5 + (nrow(df)*0.1))
if (return.data) return(dt)
return(invisible(NULL))
}
if (return.data) return(dt)
return(plot)
}
#' Quality control for pshifted Ribo-seq data
#'
#' Combines several statistics from the pshifted reads into a plot:\cr
#' -1 Coding frame distribution per read length\cr
#' -2 Alignment statistics\cr
#' -3 Biotype of non-exonic pshifted reads\cr
#' -4 mRNA localization of pshifted reads\cr
#' @param type type of library loaded, default pshifted,
#' warning if not pshifted might crash if too many read lengths!
#' @param width width of plot, default 6.6 (in inches)
#' @param height height of plot, default 4.5 (in inches)
#' @param weight which column in reads describe duplicates, default "score".
#' @param bar.position character, default "dodge". Should Ribo-seq frames
#' per read length be positioned as "dodge" or "stack" (on top of each other).
#' @inheritParams QCstats.plot
#' @inheritParams outputLibs
#' @return the plot object, a grob of ggplot objects of the the data
#' @importFrom ggplot2 theme
#' @export
#' @examples
#' df <- ORFik.template.experiment()
#' df <- df[9,] #lets only p-shift RFP sample at index 9
#' #shiftFootprintsByExperiment(df)
#' #RiboQC.plot(df, tempdir())
RiboQC.plot <- function(df, output.dir = QCfolder(df),
width = 6.6, height = 4.5, plot.ext = ".pdf",
type = "pshifted", weight = "score", bar.position = "dodge",
as_gg_list = FALSE,
BPPARAM = BiocParallel::SerialParam(progressbar = TRUE)) {
stats <- QCstats(df)
stopifnot(bar.position %in% c("stack", "dodge"))
if (colnames(stats)[1] == "V1") colnames(stats)[1] <- "sample_id"
stats$sample_id <- factor(stats$Sample,
labels = as.character(seq(length(stats$Sample))),
levels = stats$Sample)
stats$Sample <- factor(stats$Sample, levels = stats$Sample)
colnames(stats) <- gsub("percentage", "%", colnames(stats))
stats[,(seq.int(ncol(stats))[-c(1,2)]):= lapply(.SD, as.numeric),
.SDcols = seq.int(ncol(stats))[-c(1,2)]]
dt_plot <- melt(stats, id.vars = c("Sample", "sample_id"))
step_counts <- c("mRNA", "rRNA")
stat_regions <- colnames(stats)[c(which(colnames(stats) %in% step_counts))]
dt_STAT <- dt_plot[(variable %in% stat_regions),]
dt_STAT_normalized <- copy(dt_STAT)
dt_STAT_normalized[, sample_total := sum(value), by = .(sample_id)]
dt_STAT_normalized[, percentage := (value / sample_total)*100]
# Assign themes
temp_theme <- theme(legend.text=element_text(size=8),
legend.key.size = unit(0.3, "cm"),
plot.title = element_text(size=11),
strip.text.x = element_blank(),
panel.grid.minor = element_blank())
rm.minor <- theme(legend.text=element_text(size=8),
legend.key.size = unit(0.3, "cm"),
plot.title = element_text(size=11),
panel.grid.minor = element_blank())
# frame distributions
frame_sum_per <- orfFrameDistributions(df, type = type, weight = weight,
BPPARAM = BPPARAM)
gg_frame_per_stack <- ggplot(frame_sum_per, aes(x = length, y = percent)) +
geom_bar(aes(fill = frame), stat="identity", position = bar.position)+
scale_x_continuous(breaks = unique(frame_sum_per$length)) +
theme_minimal() +
rm.minor+
xlab("read length") +
ylab("percent") +
facet_wrap( ~ fraction, ncol = 1, scales = "free_y") +
scale_y_continuous(breaks = c(15, 35))
if (length(unique(frame_sum_per$length)) > 14) {
gg_frame_per_stack <- gg_frame_per_stack + scale_x_continuous(n.breaks = 9)
}
gg_frame_per_stack
# content: all_tx_types > 1%
all_tx_types <- which(colnames(stats) == "ratio_cds_leader") + 1
all_tx_regions <- colnames(stats)[c(all_tx_types:length(colnames(stats)))]
all_tx_regions <- c("mRNA", all_tx_regions)
dt_all_tx_regions<- dt_plot[(variable %in% all_tx_regions),]
dt_all_tx_regions[, sample_total := sum(value), by = .(sample_id)]
dt_all_tx_regions[, percentage := (value / sample_total)*100]
dt_all_tx_other <- dt_all_tx_regions[percentage < 1,]
dt_all_tx_regions[percentage < 1, variable := "other"]
gg_all_tx_regions <-
ggplot(dt_all_tx_regions, aes(y = percentage, x = sample_id)) +
geom_bar(aes(fill = variable), stat="identity", position = "stack")+
ylab("percent") +
xlab("sample id") +
theme_minimal() +
temp_theme +
labs(fill = "tx. type") +
scale_y_continuous(breaks = c(50, 100))
gg_all_tx_regions
# content: all_tx_types <= 1%
gg_all_tx_other <-
ggplot(dt_all_tx_other, aes(y = percentage, x = sample_id)) +
geom_bar(aes(fill = variable), stat="identity", position = "stack")+
ylab("percent") +
xlab("sample id") +
theme_minimal() +
temp_theme +
labs(fill = "tx. other") +
scale_fill_grey() +
scale_y_continuous(n.breaks = 3)
gg_all_tx_other
# Aligned reads
dt_aligned <- dt_plot[(variable %in% "Aligned_reads"),]
gg_all_mrna <- ggplot(dt_aligned, aes(y = value, x = sample_id)) +
geom_bar(stat="identity", position = "stack")+
ylab("alignments") +
xlab("sample id") +
theme_minimal() +
temp_theme +
labs(fill = "region") +
scale_y_continuous(n.breaks = 3, labels = function(x) format(x, scientific = TRUE))
gg_all_mrna
# 5' UTR, CDS & 3' UTR
mRNA_regions <- colnames(stats)[colnames(stats) %in% c("LEADERS", "CDS", "TRAILERs")]
dt_mRNA_regions <- dt_plot[(variable %in% mRNA_regions),]
dt_mRNA_regions[, variable := as.character(variable)]
dt_mRNA_regions[variable == "LEADERS", variable := "5'UTR"]
dt_mRNA_regions[variable == "TRAILERs", variable := "3'UTR"]
dt_mRNA_regions[, variable := factor(as.character(dt_mRNA_regions$variable),
levels = c(unique(dt_mRNA_regions$variable)),
ordered = TRUE)]
dt_mRNA_regions[, sample_total := sum(value), by = .(sample_id)]
dt_mRNA_regions[, percentage := (value / sample_total)*100]
gg_mRNA_regions <- ggplot(dt_mRNA_regions, aes(y = percentage, x = sample_id)) +
geom_bar(aes(fill = variable), stat="identity", position = "stack")+
ylab("percent") +
xlab("sample id") +
scale_y_continuous(breaks = c(50, 100)) +
theme_minimal() +
temp_theme +
labs(fill = "region")
lay <- rbind(c(2, 1),
c(2, 3),
c(2, 4),
c(2, 5))
plot_list <- list(gg_all_mrna, gg_frame_per_stack, gg_all_tx_regions, gg_all_tx_other, gg_mRNA_regions)
final <- gridExtra::grid.arrange(grobs = plot_list, layout_matrix = lay)
if (!is.null(output.dir)) {
ggsave(file.path(output.dir, paste0("STATS_plot_Ribo-seq_Check", plot.ext)), final,
width = width, height = height, dpi = 300)
}
if (as_gg_list) return(plot_list)
return(final)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.