#' @title Plot the Sequence Length Distribution
#'
#' @description Plot the Sequence Length Distribution across one or more FASTQC
#' reports
#'
#' @details
#' This extracts the Sequence Length Distribution from the supplied object and
#' generates a ggplot2 object, with a set of minimal defaults.
#' The output of this function can be further modified using the standard
#' ggplot2 methods.
#'
#' A cdf plot can also be generated to provide guidance for minimum
#' read length in some NGS workflows, by setting `plotType = "cdf"`.
#' If all libraries have reads of identical lengths, these plots may be less
#' informative.
#'
#' An alternative interactive plot is available by setting the argument
#' `usePlotly = TRUE`.
#'
#' @param x Can be a `FastqcData`, `FastqcDataList` or file paths
#' @param usePlotly `logical`. Output as ggplot2 or plotly object.
#' @param plotType `character`. Can only take the values
#' `plotType = "heatmap"` `plotType = "line"` or
#' `plotType = "cdf"`
#' @param labels An optional named vector of labels for the file names.
#' All filenames must be present in the names.
#' @param pattern Regex to remove from the end of any filenames
#' @param counts `logical` Should distributions be shown as counts or
#' frequencies (percentages)
#' @param colour Line colour
#' @param cluster `logical` default `FALSE`. If set to `TRUE`,
#' fastqc data will be clustered using hierarchical clustering
#' @param dendrogram `logical` redundant if `cluster` and
#' `usePlotly` are `FALSE`. If both `cluster` and
#' `dendrogram` are specified as `TRUE` then the dendrogram
#' will be displayed.
#' @param heat_w Relative width of any heatmap plot components
#' @param pwfCols Object of class [PwfCols()] to give colours for
#' pass, warning, and fail values in plot
#' @param showPwf logical(1) Show PASS/WARN/FAIL status
#' @param ... Used to pass additional attributes to theme()
#' @param expand.x Output from `expansion()` or numeric vector of
#' length 4. Passed to `scale_x_discrete`
#' @param scaleFill,scaleColour Optional ggplot scale objects
#' @param heatCol The colour scheme for the heatmap
#' @param plotlyLegend logical(1) Show legend for interactive line plots
#'
#' @return A standard ggplot2 object, or an interactive plotly object
#'
#' @examples
#'
#' # Get the files included with the package
#' packageDir <- system.file("extdata", package = "ngsReports")
#' fl <- list.files(packageDir, pattern = "fastqc.zip", full.names = TRUE)
#'
#' # Load the FASTQC data as a FastqcDataList object
#' fdl <- FastqcDataList(fl)
#'
#' # Plot as a frequency plot using lines
#' plotSeqLengthDistn(fdl)
#'
#' # Or plot the cdf
#' plotSeqLengthDistn(fdl, plotType = "cdf")
#'
#' @docType methods
#'
#' @importFrom dplyr vars mutate
#' @importFrom plotly ggplotly layout subplot
#' @importFrom tidyr pivot_wider complete nesting unnest
#' @importFrom tidyselect everything any_of
#' @importFrom grDevices hcl.colors
#' @importFrom rlang !! sym
#' @import ggplot2
#'
#' @name plotSeqLengthDistn
#' @rdname plotSeqLengthDistn-methods
#' @export
setGeneric(
"plotSeqLengthDistn",
function(x, usePlotly = FALSE, labels, pattern = ".(fast|fq|bam).*", ...) standardGeneric("plotSeqLengthDistn")
)
#' @rdname plotSeqLengthDistn-methods
#' @export
setMethod(
"plotSeqLengthDistn", signature = "ANY",
function(x, usePlotly = FALSE, labels, pattern = ".(fast|fq|bam).*", ...){.errNotImp(x)}
)
#' @rdname plotSeqLengthDistn-methods
#' @export
setMethod(
"plotSeqLengthDistn", signature = "character",
function(
x, usePlotly = FALSE, labels, pattern = ".(fast|fq|bam).*", ...
){
x <- FastqcDataList(x)
if (length(x) == 1) x <- x[[1]]
plotSeqLengthDistn(x, usePlotly, labels, pattern = pattern, ...)
}
)
#' @rdname plotSeqLengthDistn-methods
#' @export
setMethod(
"plotSeqLengthDistn", signature = "FastqcData",
function(
x, usePlotly = FALSE, labels, pattern = ".(fast|fq|bam).*", counts = TRUE,
plotType = c("line", "cdf"), expand.x = c(0, 0.2, 0, 0.2),
plotlyLegend = FALSE, colour = "red", ...
){
df <- getModule(x, "Sequence_Length_Distribution")
plotType <- match.arg(plotType)
if (!length(df)) {
p <- .emptyPlot("No Sequence Length Module Detected")
if (usePlotly) p <- ggplotly(p, tooltip = "")
return(p)
}
## Drop the suffix, or check the alternate labels
labels <- .makeLabels(x, labels, pattern)
labels <- labels[names(labels) %in% df$Filename]
df$Filename <- labels[df$Filename]
## Add zero counts for lengths either side of the included range
## This is only required if a single value exists
if (nrow(df) == 1) {
df <- dplyr::bind_rows(
df,
dplyr::mutate(df, Lower = Lower - 1, Count = 0),
dplyr::mutate(df, Lower = Lower + 1, Count = 0)
)
}
df$Lower <- as.integer(df$Lower)
df <- dplyr::arrange_at(df, vars("Lower"))
df <- df[c("Filename", "Length", "Lower", "Count")]
df$Cumulative <- cumsum(df$Count)
df$Length <- factor(df$Lower, levels = unique(df$Lower))
df$Frequency <- df$Count / sum(df$Count)
df[["Cumulative Frequency"]] <- cumsum(df$Frequency)
## Sort out some plotting parameters
stopifnot(is.numeric(expand.x), length(expand.x) == 4)
xLab <- "Sequence Length (bp)"
type <- ifelse(counts, "Count", "Frequency")
yLab <- paste(c(cdf = "Cumulative", line = "")[plotType], type)
plotY <- c(cdf = "Cumulative", line = "Count")[plotType]
if (!counts)
plotY <- c(line = "Frequency", cdf = "Cumulative Frequency")[plotType]
lab_fun <- scales::comma
if (!counts) lab_fun <- scales::percent
p <- ggplot(
df, aes(Length, !!sym(plotY), colour = Filename, group = Filename)
) +
geom_line(colour = colour) +
facet_wrap(~Filename) +
labs(x = xLab, y = yLab) +
scale_x_discrete(expand = expand.x) +
scale_y_continuous(labels = lab_fun) +
theme_bw()
p <- .updateThemeFromDots(p, ...)
if (usePlotly) {
if (!plotlyLegend) p <- p + theme(legend.position = "none")
p <- suppressMessages(plotly::ggplotly(p, tooltip = c("x", "y")))
}
p
}
)
#' @rdname plotSeqLengthDistn-methods
#' @export
setMethod(
"plotSeqLengthDistn", signature = "FastqcDataList",
function(
x, usePlotly = FALSE, labels, pattern = ".(fast|fq|bam).*", counts = FALSE,
plotType = c("heatmap", "line", "cdf"), cluster = FALSE, dendrogram = FALSE,
heat_w = 8, pwfCols, showPwf = TRUE, scaleFill = NULL, scaleColour = NULL,
heatCol = hcl.colors(50, "inferno"), plotlyLegend = FALSE, ...
){
mod <- "Sequence_Length_Distribution"
df <- getModule(x, mod)
if (!length(df)) {
p <- .emptyPlot("No Sequence Length Module Detected")
if (usePlotly) p <- ggplotly(p, tooltip = "")
return(p)
}
## Check for valid plotType
plotType <- match.arg(plotType)
## Drop the suffix, or check the alternate labels
labels <- .makeLabels(x, labels, pattern)
labels <- labels[names(labels) %in% df$Filename]
## Lengths will probably be binned so define the bins then expand
## the range at the lower and upper limits to add zero.
## This will enable replication of the default FastQC plot
## In reality, this will only be required when there are <2 bins
df <- complete(
df, Filename, nesting(Length, Lower, Upper), fill = list(Count = 0)
)
df <- dplyr::arrange(df, Lower)
df$Length <- forcats::fct_inorder(df$Length)
df <- dplyr::arrange(df, Filename, Length)
## Get the cdf count
df <- dplyr::group_by(df, Filename)
df <- mutate(df, Cumulative = cumsum(Count))
if (!counts) {
df <- mutate(
df,
Cumulative = Cumulative / max(Cumulative),
Freq = Count / sum(Count)
)
}
df <- dplyr::ungroup(df)
## Round the values for better plotting
freq_cols <- colnames(df) %in% c("Cumulative", "Freq")
df[freq_cols] <- lapply(df[freq_cols], round, digits = 4)
## Check axis expansion
if (missing(pwfCols)) pwfCols <- pwf
if (plotType %in% c("line", "cdf")) {
## Decide whether to plot the Count or cdf sum
## and set all labels
plotY <- dplyr::case_when(
plotType == "cdf" ~ "Cumulative",
plotType == "line" & counts ~ "Count",
plotType == "line" & !counts ~ "Freq"
)
yLab <- dplyr::case_when(
plotType == "cdf" & counts ~ "Cumulative Count",
plotType == "cdf" & !counts ~ "Cumulative (%)",
plotType == "line" & counts ~ "Count",
plotType == "line" & !counts ~ "Percent (%)"
)
yLabelFun <- ifelse(counts, scales::comma, scales::percent)
if (!is.null(scaleColour)) {
stopifnot(is(scaleColour, "ScaleDiscrete"))
stopifnot(scaleColour$aesthetics == "colour")
}
df$Filename <- labels[df$Filename]
p <- ggplot(
df,
aes(
Length, y = !!sym(plotY), colour = Filename,
group = Filename
)
) +
geom_line() +
labs(y = yLab) +
scale_y_continuous(labels = yLabelFun) +
scaleColour +
theme_bw()
p <- .updateThemeFromDots(p, ...)
if (usePlotly) {
ttip <- c("x", "y", "colour")
if (!plotlyLegend) p <- p + theme(legend.position = "none")
p <- suppressMessages(
suppressWarnings(
plotly::ggplotly(p, tooltip = ttip)
)
)
}
}
if (plotType == "heatmap") {
## Now define the order for a dendrogram if required
## This only applies to a heatmap
key <- names(labels)
cols <- c("Filename", "Length", "Freq")
clusterDend <- .makeDendro(df[cols], "Filename","Length", "Freq")
dx <- ggdendro::dendro_data(clusterDend)
if (dendrogram | cluster) key <- labels(clusterDend)
df$Filename <- factor(labels[df$Filename], levels = labels[key])
if (!dendrogram) dx$segments <- dx$segments[0,]
df$y <- as.integer(df$Filename)
df$Percent <- scales::percent(df$Freq, accuracy = 0.1)
df$Total <- scales::comma(df$Count)
if (is.null(scaleFill)) {
scaleFill <- scale_fill_viridis_c(
option = "inferno", labels = scales::percent,
limits = c(0, 1)
)
if (!is.null(heatCol))
scaleFill <- scale_fill_gradientn(
colours = heatCol, labels = scales::percent,
limits = c(0, 1)
)
}
stopifnot(is(scaleFill, "ScaleContinuous"))
stopifnot(scaleFill$aesthetics == "fill")
## Make the basic heatmap. The first aes sets the labels for plotly
aes <- aes(
`%` = Percent, Count = Total, Length = Length,
Filenane = Filename
)
hj <- 0.5 * heat_w / (heat_w + 1 + dendrogram)
p <- ggplot(df, aes) +
geom_rect(
aes(
xmin = Lower, xmax = Upper + 1,
ymin = y - 0.5, ymax = y + 0.5, fill = Freq
),
colour = NA
) +
labs(x = "Sequence Length", fill = "Percent") +
ggtitle(gsub("_", " ", mod)) +
scaleFill +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(
expand = c(0, 0), position = "right",
breaks = seq_along(levels(df$Filename)),
labels = levels(df$Filename)
) +
theme_bw() +
theme(
plot.title = element_text(hjust = hj),
axis.title.x = element_text(hjust = hj)
)
if (showPwf | dendrogram)
p <- p +
theme(plot.margin = unit(c(5.5, 5.5, 5.5, 0), "points"))
p <- .updateThemeFromDots(p, ...)
## Get the PWF status
status <- subset(getSummary(x), Category == gsub("_", " ", mod))
status$Filename <- factor(
labels[status$Filename], levels = labels[key]
)
if (!showPwf) status <- status[0,]
p <- .prepHeatmap(
p, status, dx$segments, usePlotly, heat_w, pwfCols
)
}
p
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.