handleCompression <- function(filename, skip=TRUE) {
if (R.utils::isBzipped(filename)) {
cat(paste0(paste("bunzip2",filename),"\n"))
return(R.utils::bunzip2(filename, temporary=TRUE, remove=FALSE, skip=skip, overwrite=!skip))
}
if (R.utils::isGzipped(filename)) {
cat(paste0(paste("gunzip",filename),"\n"))
return(R.utils::gunzip(filename, temporary=TRUE, remove=FALSE, skip=skip, overwrite=!skip))
}
return(filename)
}
#' load a sequencing_summary.txt file into memory
#'
#' importSequencingSummary loads a sequencing_summary.txt file into
#' memory and performs basic sanity checking to ensure that the file
#' is cleaned of potential duplicate headers from aggregation
#'
#' @importFrom R.utils bunzip2
#' @importFrom R.utils gunzip
#' @importFrom LaF laf_open_csv
#' @param seqsum is a path to a file
#' @param cache whether the object created should be cached in env
#' @param chunksize the number of reads to parse per iteration block
#' @param skip whether to skip uncompression if file already exists - may
#' cause collisions due to standard sequencing_summary nomenclature
#' @return data.frame of observations from the sequencing_summary.txt file
#' provided
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#'
#' @export
importSequencingSummary <- function(seqsum, cache=TRUE, chunksize=1000000, skip=FALSE) {
seqsum <- handleCompression(seqsum, skip=skip)
# identify the available columns ...
con <- file(seqsum, "r")
sample_lines <- readLines(con, n=2)
close(con)
columns <- strsplit(sample_lines[1], "\t")[[1]]
model <- LaF::detect_dm_csv(filename=seqsum, header=TRUE, sep="\t")
dat <- LaF::laf_open(model, ignore_failed_conversion=TRUE)
select_columns <- c(
"read_id", "channel", "start_time", "duration", "passes_filtering",
"sequence_length_template", "mean_qscore_template", "barcode_arrangement")
cids <- as.integer(na.omit(match(select_columns, columns)))
seqsumdata <- data.frame()
LaF::begin(dat)
while (TRUE) {
#cat(paste0(dim(seqsumdata), "\n"))
d <- LaF::next_block(dat, columns=cids, nrows=chunksize)
if ("passes_filtering" %in% columns) {
d$passes_filtering <- as.logical(d$passes_filtering)
}
if (nrow(d) == 0) break;
seqsumdata <- rbind(seqsumdata, d)
}
# remove the redundant headers from merged files
if (length(which(seqsumdata$read_id == "read_id")) > 0) {
seqsumdata <- seqsumdata[-which(seqsumdata$read_id == "read_id"), ]
}
if (!"passes_filtering" %in% colnames(seqsumdata)) {
# set all of the reads to pass? apply a cutoff?
seqsumdata$passes_filtering <- TRUE
}
if (cache) {
setCachedObject("seqsumdata", seqsumdata)
}
return(invisible(seqsumdata))
}
#' does the SequencingSummary information define a barcode?
#'
#' does the SequencingSummary information define a barcode?
#'
#' @return logical
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' SequencingSummaryHasBarcodeInfo()
#'
#' @export
SequencingSummaryHasBarcodeInfo <- function() {
seqsum <- handleSeqSumCache(NA)
if ("barcode_arrangement" %in% names(seqsum)) {
return(TRUE)
} else if (hasCachedObject("barcodedata")) {
return(TRUE)
}
return(FALSE)
}
#' prepare a gauge plot of sequencing_summary reads passing QC
#'
#' plots the basic eye-candy gauge plot of reads passing QC threshold
#'
#' @importFrom dplyr mutate
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return ggplot2 gauge plot
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryPassGauge(seqsum)
#'
#' @export
SequencingSummaryPassGauge <- function(seqsum = NA) {
seqsum <- handleSeqSumCache(seqsum)
df <- data.frame(matrix(nrow = 1, ncol = 3))
names(df) <- c("variable", "percentage", "label")
df$variable <- c("pass")
df$percentage <- c(round(
length(which(seqsum$passes_filtering == TRUE))/nrow(seqsum), 3))
df <- df %>% dplyr::mutate(
group = ifelse(df$percentage < 0.6, "red", ifelse(df$percentage >= 0.6 &
df$percentage < 0.8, "orange", "green")),
label = paste0(df$percentage * 100, "%"))
title = "Percentage of reads\npassing QC filter"
gaugePlot <- ggplot(df, aes_string(fill = "group", ymax = "percentage",
ymin = 0, xmax = 2, xmin = 1)) + geom_rect(aes(ymax = 1, ymin = 0, xmax
= 2, xmin = 1), fill = "#ece8bd") + geom_rect() + coord_polar(theta =
"y", start = -pi/2) + xlim(c(0, 2)) + ylim(c(0, 2)) + guides(fill =
FALSE) + guides(colour = FALSE) + theme_void() + theme(strip.background
= element_blank(), strip.text.x = element_blank()) + geom_text(
aes_string(x = 0, y = 0, label = "label"), size = 13) + geom_text(
aes_string(x = 1.5, y = 1.5, label = "title"), size = 11) +
scale_fill_manual(values = c(red = "#C9146C", orange = "#DA9112",
green = "#129188")) + scale_colour_manual(values = c(red = "#C9146C",
orange = "#DA9112", green = "#129188"))
return(ggplot2handler(gaugePlot))
}
#' prepare a channel activity plot from sequencing_summary reads file
#'
#' plots the basic eye-candy gauge channel activity plot of reads against
#' channel of origin
#'
#' @usage SequencingSummaryChannelActivity(
#' seqsum=NA, platform=NA, showcount=FALSE)
#' @importFrom reshape2 acast
#' @importFrom grDevices colorRamp colorRampPalette
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param platform is the nanopore platform [MinION/Flongle/PromethION]
#' @param showcount logical - show read counts per channel
#' @return ggplot2 channel activity plot
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryChannelActivity(seqsum)
#'
#' @export
SequencingSummaryChannelActivity <- function(
seqsum=NA, platform=NA, showcount=FALSE) {
seqsum <- handleSeqSumCache(seqsum)
if (is.na(platform)) {
platform <- SequencingSummaryGetPlatform(seqsum)
}
channelMap <- SequencingSummaryGetChannelMap(platform)
hm.palette <- colorRampPalette(brewer.pal(9, "Blues"), space = "Lab")
#RdPu, Oranges, Greens, YlOrRd, Purples
channelCounts <-
as.data.frame(matrix(rep(0, max(channelMap$channel)), ncol = 1))
channelCountRaw <-
as.data.frame(table(unlist(seqsum[, "channel"])), row.names = 1)
channelCounts[row.names(channelCountRaw), ] <- channelCountRaw[, 1]
channelMap <- merge(channelMap, channelCounts, by.x = "channel", by.y = 0)
colnames(channelMap)[4] <- "count"
channelMapMatrix <-
reshape2::acast(channelMap, col ~ row, value.var = "count")
theme_update(plot.title = element_text(hjust = 0.5))
activityPlot <- ggplot(
channelMap, aes_string(x = "col", y = "row", fill = "count")) +
geom_tile() + scale_x_discrete(breaks = NULL) + scale_y_discrete(
breaks = NULL) + coord_equal() + scale_fill_gradientn(colours =
hm.palette(100)) + scale_color_gradient2(low = hm.palette(100),
high = hm.palette(1)) + theme(axis.text.x = element_text(angle = 90,
hjust = 1)) + labs(title =
"Channel activity plot showing number of reads per flowcell channel") +
theme(panel.border = element_blank(), panel.grid.major=element_blank(),
panel.grid.minor = element_blank(), axis.title.x = element_blank(),
axis.title.y = element_blank(), legend.position = "bottom",
legend.key.width = unit(5.6, "cm"))
if (showcount) {
activityPlot <- activityPlot + geom_text(data = channelMap, aes_string(
x = "col", y = "row", label = "count", color = "count"),
show.legend = FALSE, size = 2.5)
}
return(ggplot2handler(activityPlot))
}
#' identify the most likely sequencing platform used to create
#' the summary data
#'
#' when provided with the seqsum object from a Sequencing_summary.txt file
#' identify the
#' most likely sequencing platform used
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return platform is the nanopore platform [MinION/Flongle/PromethION]
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' platform <- SequencingSummaryGetPlatform(seqsum)
#' platform
#'
#' @export
SequencingSummaryGetPlatform <- function(seqsum = NA) {
if (!is.data.frame(seqsum) && is.na(seqsum)) {
oname <- "seqsumdata"
if (hasCachedObject(oname)) {
seqsum <- getCachedObject(oname)
}
}
platform <- "MinION"
if (max(seqsum$channel) < 130) {
# this is likely to be a Flongle ...
platform <- "Flongle"
}
if (max(seqsum$channel) > 1000) {
# this is likely to be a PromethION
platform <- "PromethION"
}
return(platform)
}
# https://stackoverflow.com/questions/6461209/
# how-to-round-up-to-the-nearest-10-or-100-or-x
roundUpNice <- function(x, nice = seq(from = 1, to = 10, by = 0.25)) {
if (length(x) != 1)
stop("'x' must be of length 1")
10^floor(log10(x)) * nice[[which(x <= 10^floor(log10(x)) * nice)[[1]]]]
}
getBinAssignments <- function(seqsum, breaks) {
binAssignments <- cut(seqsum$sequence_length_template, breaks,
include.lowest = TRUE, right = FALSE)
return(binAssignments)
}
#' @importFrom stats quantile
getBinBreaks <- function(seqsum) {
# pick a friendly upper limit to render sequence lengths into a histogram
# here we're aiming for a robustly rounded up 97.5 quantile of the data
# (skip a few outliers ...)
upperLimit <- roundUpNice(as.numeric(quantile(
x = seqsum$sequence_length_template, probs = c(0.975))))
# an ideal histogram will have 40 or so bins
histogramBinCount <- 40
breakVal = roundUpNice(upperLimit/histogramBinCount)
breaks <- seq(0, to = upperLimit, by = breakVal)
return(breaks)
}
#' plot a weighted histogram of sequence read lengths
#'
#' plots the histogram of read lengths, weighted, and shaded by pass/fail status
#'
#' @importFrom dplyr last
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return ggplot2 showing weighted read length distribution
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryWeightedReadLength(seqsum)
#'
#' @export
SequencingSummaryWeightedReadLength <- function(seqsum=NA) {
seqsum <- handleSeqSumCache(seqsum)
breaks <- getBinBreaks(seqsum)
breakVal <- breaks[2] # assuming that the range is 0 based
upperLimit <- dplyr::last(breaks)
binAssignments <- getBinAssignments(seqsum, breaks)
passedSeqs <- seqsum[which(seqsum$passes_filtering), ]
N50 <- ncalc(passedSeqs$sequence_length_template, 0.5)
passedMeanLength = round(
mean(passedSeqs$sequence_length_template), digits = 0)
scrapeBinnedBases <- function(level, qcpass, binAssignments, seqsum) {
candidates <- which(binAssignments == level)
if (length(candidates) > 0) {
data <- seqsum[candidates, ]
candidates2 <- which(data$passes_filtering == qcpass)
if (length(candidates2) > 0) {
return(sum(data[candidates2, "sequence_length_template"]))
}
}
return(0)
}
passedBinnedBases <-unlist(lapply(levels(binAssignments), scrapeBinnedBases,
qcpass = TRUE, binAssignments = binAssignments,seqsum = seqsum))
failedBinnedBases <-unlist(lapply(levels(binAssignments), scrapeBinnedBases,
qcpass = FALSE, binAssignments = binAssignments,seqsum = seqsum))
binnedBaseDist <- data.frame(length = head(breaks, -1), pass =
passedBinnedBases, fail = failedBinnedBases)
binnedBaseMelt <- reshape2::melt(binnedBaseDist, id.vars = c("length"))
weightedReadLengths <- ggplot(binnedBaseMelt, aes_string(x = "length",
fill = "variable", y = "value")) + geom_bar(stat = "identity") +
xlab("Read length\n") + ylab("Number of bases sequenced\n") +
scale_fill_manual("QC", values = c(fail = brewer.pal(6, "Paired")[1],
pass = brewer.pal(6, "Paired")[2])) + scale_x_continuous(limits = c(
-breakVal,upperLimit),breaks=pretty(passedSeqs$sequence_length_template,
n = 40)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = paste0("Histogram showing the number of sequenced bases a",
"gainst sequence length"), fill = "QV filter") + geom_vline(xintercept =
N50, size = 1) + annotate("text", x = N50, y = max(passedBinnedBases +
failedBinnedBases), label = " N50", hjust = 0, colour = "SteelBlue") +
geom_vline(xintercept = passedMeanLength, size = 1) + annotate("text",
x = passedMeanLength, y = max(passedBinnedBases + failedBinnedBases),
label = " Mean", hjust = 0, colour = "SteelBlue")
return(ggplot2handler(weightedReadLengths))
}
#' plot a histogram of sequence read lengths
#'
#' plots the histogram of read lengths shaded by pass/fail status
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return ggplot2 showing read length distribution
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryReadLengthHistogram(seqsum)
#'
#' @export
SequencingSummaryReadLengthHistogram <- function(seqsum=NA) {
seqsum <- handleSeqSumCache(seqsum)
breaks <- getBinBreaks(seqsum)
breakVal <- breaks[2] # assuming that the range is 0 based
upperLimit <- dplyr::last(breaks)
binAssignments <- getBinAssignments(seqsum, breaks)
passedSeqs <- seqsum[which(seqsum$passes_filtering), ]
N50 <- ncalc(passedSeqs$sequence_length_template, 0.5)
passedMeanLength = round(mean(
passedSeqs$sequence_length_template), digits = 0)
scrapeBinnedReads <- function(level, qcpass) {
# length(subset(seqsum[which(binAssignments == level), ],
# `passes_filtering`==qcpass)$sequence_length_template)
length(which(seqsum[which(
binAssignments == level), ]$passes_filtering == qcpass))
}
passedBinnedReads <- unlist(lapply(levels(
binAssignments), scrapeBinnedReads, qcpass = TRUE))
failedBinnedReads <- unlist(lapply(levels(
binAssignments), scrapeBinnedReads, qcpass = FALSE))
binnedReadDist <- data.frame(length = head(
breaks, -1), pass = passedBinnedReads, fail = failedBinnedReads)
binnedReadMelt <- reshape2::melt(binnedReadDist, id.vars = c("length"))
lengthHistogram <- ggplot(binnedReadMelt, aes_string(x = "length", fill =
"variable", y = "value")) + geom_bar(stat = "identity") + xlab(
"Read length\n") + ylab("Number of reads\n") + scale_fill_manual("QC",
values = c(fail = brewer.pal(6, "Paired")[1], pass = brewer.pal(6,
"Paired")[2])) + scale_x_continuous(limits = c(-breakVal, upperLimit),
breaks = pretty(passedSeqs$sequence_length_template, n = 40)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(title =
paste0("Histogram showing distribution of read lengths across quality ",
"passing sequences"), fill = "QV filter") + geom_vline(xintercept = N50,
size = 1) + annotate("text", x = N50, y = max(passedBinnedReads +
failedBinnedReads), label = " N50", hjust = 0, colour = "SteelBlue") +
geom_vline(xintercept = passedMeanLength, size = 1) + annotate("text",
x = passedMeanLength, y = max(passedBinnedReads + failedBinnedReads),
label = " Mean", hjust = 0, colour = "SteelBlue")
return(ggplot2handler(lengthHistogram))
}
#' plot a histogram of sequence quality scores
#'
#' plots the histogram of read mean quality scores shaded by pass/fail status
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return ggplot2 showing read quality distribution
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryReadQualityHistogram(seqsum)
#'
#' @export
SequencingSummaryReadQualityHistogram <- function(seqsum=NA) {
seqsum <- handleSeqSumCache(seqsum)
qdist <- ggplot(seqsum, aes_string(x = "mean_qscore_template", fill =
"passes_filtering")) + geom_histogram(breaks = seq(from = 0,
to = 15, by = 0.1)) + scale_fill_manual(name = "QC", values = c(`TRUE`=
brewer.pal(6, "Paired")[2], `FALSE` = brewer.pal(6, "Paired")[1]),
labels = c("pass", "fail"), breaks = c("TRUE", "FALSE")) + labs(title =
"Plot showing distribution of quality scores across all reads") +
xlab("Mean Q score of read") + ylab("Number of reads")
return(ggplot2handler(qdist))
}
#' plot a density map of sequence lengths and quality scores
#'
#' plots the density plot of read length against read mean quality scores
#'
#' @usage SequencingSummaryReadLengthQualityDensity(seqsum=NA,
#' binFilter = 2,
#' qcThreshold = 7
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param binFilter is the minimum number of reads to include a cell in plot
#' (removes speckle)
#' @param qcThreshold is the QC threshold used
#' @return ggplot2 showing densities of read length and quality distribution
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryReadLengthQualityDensity(seqsum)
#'
#' @export
SequencingSummaryReadLengthQualityDensity <- function(
seqsum=NA, binFilter=2, qcThreshold=7) {
seqsum <- handleSeqSumCache(seqsum)
# prepare the density plot, but do not render
lq_dens <- ggplot(seqsum, aes(log10(seqsum$sequence_length_template),
seqsum$mean_qscore_template)) + geom_bin2d(bins = 100)
# extract the density map from the plot
lq_dens_counts <- ggplot_build(lq_dens)$data[[1]]
if (binFilter > 0) {
# remove the bins from the density map that do not contain sequence
# count above threshold
lq_dens_counts <- lq_dens_counts[-which(
lq_dens_counts$count <= binFilter), ]
}
# directly plot this modified density map (stat=='identity')
qldensityplot <- ggplot(lq_dens_counts) + geom_bin2d(aes_string("x", "y",
fill = "count"), stat = "identity") + scale_fill_distiller(palette =
"Blues", trans = "reverse") + geom_hline(yintercept = qcThreshold,
size = 1) + xlab("log10(read length)") + ylab("read mean quality") +
scale_x_continuous(breaks = c(1, 2, 3, 4, 5), labels = c("10", "100",
"1000", "10,000", "100,000")) + annotation_logticks(base = 10, sides =
"b", scaled = TRUE) + labs(title = paste0("Contour Plot showing distri",
"bution of quality scores against log10 read lengths (all reads)"))
return(ggplot2handler(qldensityplot))
}
getTemporalDataset <- function(seqsum, sampleHours, sampleIntervalMinutes) {
breaks = seq(0, sampleHours * 60 * 60, by = 60 * sampleIntervalMinutes)
# has this object already been created?
objectName <- paste0(
"seqsum.temporaldata.",sampleIntervalMinutes,".",length(breaks))
if (hasCachedObject(objectName)) {
return(getCachedObject(objectName))
}
binass <- findInterval(seqsum$start_time, breaks)
mergeItPerHour <- function(interval, binnedAssignments, filter) {
totalbases = 0
if (length(which(binnedAssignments == interval)) > 0) {
subset <- seqsum[which(binnedAssignments == interval), ]
if (length(which(subset$passes_filtering == filter)) > 0) {
totalbases = sum(subset[which(subset$passes_filtering ==
filter), "sequence_length_template"])
}
}
# need to scale what is being returned - totalbases value is total
# bases within an interval
# (sampleIntervalMinutes)
return(totalbases/1e+09/sampleIntervalMinutes * 60)
}
binnedTemporalDataPerHour <- data.frame(cbind(time = breaks, pass = unlist(
lapply(seq(breaks), mergeItPerHour, binnedAssignments = binass,
filter = TRUE)), fail = unlist(lapply(seq(breaks), mergeItPerHour,
binnedAssignments = binass, filter = FALSE))))
binnedTemporalDataPerHour$time <- binnedTemporalDataPerHour$time/60/60
setCachedObject(objectName, binnedTemporalDataPerHour)
return(binnedTemporalDataPerHour)
}
#' plot a sequence throughput against time for specified
#' sequencing_summary run
#'
#' plots a ggplot2 graph of performance against time for run and separates
#' passed and failed sequence reads
#'
#' @usage SequencingSummaryTemporalThroughput(seqsum,
#' sampleHours = NA,
#' sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param sampleHours is the number of hours to plot data for
#' @param sampleIntervalMinutes is the resolution to plot data at
#' @return ggplot2 showing temporal performance
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryTemporalThroughput(seqsum)
#'
#' @export
SequencingSummaryTemporalThroughput <- function(
seqsum=NA, sampleHours = NA, sampleIntervalMinutes = 60) {
seqsum <- handleSeqSumCache(seqsum)
if (is.na(sampleHours)) {
sampleHours <- SequencingSummaryExtractRuntime()
}
binnedTemporalDataPerHour <- getTemporalDataset(
seqsum, sampleHours, sampleIntervalMinutes)
plot <- ggplot(binnedTemporalDataPerHour, aes_string("time")) + geom_line(
aes(y = binnedTemporalDataPerHour$fail, colour = "fail"), size = 1) +
geom_line(aes(y = binnedTemporalDataPerHour$pass, colour = "pass"),
size = 1) + scale_color_manual(name = "QV", values = c(fail =
brewer.pal(6, "Paired")[1], pass = brewer.pal(6, "Paired")[2])) +
xlab("Time (hours)") + ylab("Gigabases sequenced per hour") +
labs(title = "Plot showing sequence throughput against time")
return(ggplot2handler(plot))
}
#' plot cumulative volumes of sequence bases
#'
#' plots a ggplot2 graph of accumulated sequenced bases against time for run
#' and separates bases called from passed and failed sequence reads
#'
#' @usage SequencingSummaryCumulativeBases(seqsum,
#' sampleHours = NA,
#' sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param sampleHours is the number of hours to plot data for
#' @param sampleIntervalMinutes is the resolution to plot data at
#' @return ggplot2 showing temporal performance
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryCumulativeBases(seqsum)
#'
#' @export
SequencingSummaryCumulativeBases <- function(seqsum=NA, sampleHours = NA,
sampleIntervalMinutes = 60) {
seqsum <- handleSeqSumCache(seqsum)
if (is.na(sampleHours)) {
sampleHours <- SequencingSummaryExtractRuntime()
}
binnedTemporalDataPerHour <- getTemporalDataset(
seqsum, sampleHours, sampleIntervalMinutes)
# binnedTemporalDataPerHour is scaled to Gbp per hour - rescale to raw for
# cumulative plotting
binnedTemporalDataPerHour$pass <- binnedTemporalDataPerHour$pass/60 *
sampleIntervalMinutes
binnedTemporalDataPerHour$fail <- binnedTemporalDataPerHour$fail/60 *
sampleIntervalMinutes
base50 <- SequencingSummaryBase50(seqsum, b = 0.5)
base90 <- SequencingSummaryBase50(seqsum, b = 0.9)
T50 <- SequencingSummaryT50(seqsum, t = 0.5, sampleHours = sampleHours,
sampleIntervalMinutes = sampleIntervalMinutes)
T90 <- SequencingSummaryT50(seqsum, t = 0.9, sampleHours = sampleHours,
sampleIntervalMinutes = sampleIntervalMinutes)
cumulativePlot <- ggplot(binnedTemporalDataPerHour, aes_string("time")) +
geom_line(aes(y = cumsum(binnedTemporalDataPerHour$fail), colour =
"fail"), size = 1) + geom_line(aes(y = cumsum(
binnedTemporalDataPerHour$pass), colour = "pass"), size = 1) +
scale_color_manual(name = "QV", values = c(fail = brewer.pal(6,
"Paired")[1], pass = brewer.pal(6, "Paired")[2])) + geom_segment(x =
T50$minimum, y = 0, xend = T50$minimum, yend = base50, colour =
"darkgray", size = 1) + geom_segment(x = 0, y = base50, xend =
T50$minimum, yend = base50, colour = "darkgray", size = 1) +
annotate("text", x = T50$minimum, y = base50, label = " T50", vjust =1,
hjust = 0, colour = "SteelBlue") + geom_segment(x = T90$minimum, y = 0,
xend = T90$minimum, yend = base90, colour = "darkgray", size = 1) +
geom_segment(x = 0, y = base90, xend = T90$minimum, yend = base90,
colour = "darkgray", size = 1) + annotate("text", x = T90$minimum,
y = base90, label = " T90", vjust = 1, hjust = 0, colour = "SteelBlue")+
xlab("Time (hours)") + ylab("Number of bases sequenced (Gigabases)") +
labs(title = "Plot showing cumulative bases sequenced against time")
return(ggplot2handler(cumulativePlot))
}
#' calculates the fractional number of bases according to
#' supplied b parameter
#'
#' an accessory method for various logicals; simple fractional base calculator
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param b is a fractional point through run against which time will be
#' calculated
#' @return a numeric value expressed in gigabases
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' Base50 <- SequencingSummaryBase50(seqsum)
#' Base50
#'
#' @export
SequencingSummaryBase50 <- function(seqsum, b = 0.5) {
passedSeqs <- seqsum[which(seqsum$passes_filtering), ]
base50 <- sum(passedSeqs$sequence_length_template)/1e+09 * b
return(base50)
}
#' calculates the timepoint within a sequencing run where
#' 50percent of the total data is produced
#'
#' an accessory method for identifying a timepoint where a given amount of data
#' has been produced
#'
#' @importFrom stats approxfun
#' @importFrom stats optimize
#' @usage SequencingSummaryT50(seqsum=NA,
#' t = 0.5,
#' sampleHours = NA,
#' sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param t is a fractional point through run against which time will be
#' calculated
#' @param sampleHours is the number of hours to consider
#' @param sampleIntervalMinutes is the resolution of the plot in minutes
#' @return a numeric value expressed in hours
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' T50 <- SequencingSummaryT50(seqsum)
#' T50
#'
#' @export
SequencingSummaryT50 <- function(
seqsum=NA, t = 0.5, sampleHours = NA, sampleIntervalMinutes = 60) {
seqsum <- handleSeqSumCache(seqsum)
if (is.na(sampleHours)) {
sampleHours <- SequencingSummaryExtractRuntime()
}
binnedTemporalDataPerHour <- getTemporalDataset(
seqsum, sampleHours, sampleIntervalMinutes)
# binnedTemporalDataPerHour is scaled to Gbp per hour - rescale to raw
# for cumulative plotting
binnedTemporalDataPerHour$pass <- binnedTemporalDataPerHour$pass/60 *
sampleIntervalMinutes
# https://stackoverflow.com/questions/31404679/
# can-ggplot2-find-the-intersections-or-is-there-any-other-neat-way
acquireTimePoints <- which(binnedTemporalDataPerHour$pass > 0)
targetInterpolate <- approxfun(x = binnedTemporalDataPerHour[
acquireTimePoints, "time"], y = cumsum(binnedTemporalDataPerHour[
acquireTimePoints, "pass"]))
base50 <- SequencingSummaryBase50(seqsum, b = t)
T50 <- optimize(function(t0) abs(targetInterpolate(t0) - base50),
interval = range(binnedTemporalDataPerHour[acquireTimePoints, "time"]))
return(T50)
}
#' plot cumulative volumes of sequence reads
#'
#' plots a ggplot2 graph of accumulated sequence reads against time for run and
#' separates passed and failed sequence reads
#'
#' @usage SequencingSummaryCumulativeReads(seqsum,
#' sampleHours = NA,
#' sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param sampleHours is the number of hours to plot data for
#' @param sampleIntervalMinutes is the resolution to plot data at
#' @return ggplot2 showing temporal performance
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryCumulativeReads(seqsum)
#'
#' @export
SequencingSummaryCumulativeReads <- function(seqsum=NA, sampleHours = NA,
sampleIntervalMinutes = 60) {
seqsum <- handleSeqSumCache(seqsum)
if (is.na(sampleHours)) {
sampleHours <- SequencingSummaryExtractRuntime()
}
breaks = seq(0, sampleHours * 60 * 60, by = 60 * sampleIntervalMinutes)
binass <- findInterval(seqsum$start_time, breaks)
mergeItReadsPerHour <- function(interval, binnedAssignments, filter) {
totalreads = 0
if (length(which(binnedAssignments == interval)) > 0) {
subset <- seqsum[which(binnedAssignments == interval), ]
if (length(which(subset$passes_filtering == filter)) > 0) {
totalreads = nrow(subset[which(
subset$passes_filtering == filter), ])
}
}
# scale results to mean millions of reads per hour
return(totalreads/1e+06/sampleIntervalMinutes * 60)
}
binnedTemporalDataReadsPerHour <- data.frame(cbind(time = breaks, pass =
unlist(lapply(seq(breaks), mergeItReadsPerHour, binnedAssignments =
binass, filter = TRUE)), fail = unlist(lapply(seq(breaks),
mergeItReadsPerHour, binnedAssignments = binass, filter = FALSE))))
binnedTemporalDataReadsPerHour$time <-
binnedTemporalDataReadsPerHour$time/60/60
# binnedTemporalDataReadsPerHour is scaled to Gbp per hour - rescale to raw
# for cumulative plotting
binnedTemporalDataReadsPerHour$pass <-
binnedTemporalDataReadsPerHour$pass/60 * sampleIntervalMinutes
binnedTemporalDataReadsPerHour$fail <-
binnedTemporalDataReadsPerHour$fail/60 * sampleIntervalMinutes
cumulativePlot <- ggplot(binnedTemporalDataReadsPerHour,aes_string("time"))+
geom_line(aes(y = cumsum(binnedTemporalDataReadsPerHour$fail),
colour = "fail"), size = 1) + geom_line(aes(y = cumsum(
binnedTemporalDataReadsPerHour$pass), colour = "pass"), size = 1) +
scale_color_manual(name = "QV", values = c(fail = brewer.pal(6,
"Paired")[1], pass = brewer.pal(6, "Paired")[2]))+xlab("Time (hours)")+
ylab("Number of reads sequenced (Millions)") + labs(title =
"Plot showing cumulative reads sequenced against time")
return(ggplot2handler(cumulativePlot))
}
#' plot speed of sequencing against time (bases per second distribution)
#'
#' plots a ggplot2 box-and-whisker plot for the distribution of sequencing
#' speeds against time
#'
#' @usage SequencingSummarySpeedPlot(seqsum = NA,
#' sampleHours = NA,
#' sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param sampleHours is the number of hours to plot data for
#' @param sampleIntervalMinutes is the resolution to plot data at
#' @return ggplot2 showing temporal performance
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummarySpeedPlot()
#'
#' @export
SequencingSummarySpeedPlot <- function(
seqsum = NA, sampleHours = NA, sampleIntervalMinutes = 60) {
seqsum <- handleSeqSumCache(seqsum)
if (is.na(sampleHours)) {
sampleHours <- SequencingSummaryExtractRuntime()
}
breaks = seq(0, sampleHours * 60 * 60, by = 60 * sampleIntervalMinutes)
binass <- findInterval(seqsum$start_time, breaks)
speedTime <- data.frame(segment = binass, rate =
seqsum$sequence_length_template/(seqsum$duration))
speedplot <- ggplot(speedTime, aes_string(x = "segment", y = "rate", group=
"segment")) + geom_boxplot(fill = "steelblue", outlier.shape = NA) +
scale_x_continuous(name = "Time (hours)") +
ylab("Sequencing rate (bases per second)") + labs(title =
"boxplot showing distribution of translocation speed against time")
return(ggplot2handler(speedplot))
}
#' plot number of observed channels actively producing data against time
#'
#' plots a ggplot2 plot of active channels against time
#'
#' @usage SequencingSummaryActiveChannelPlot(seqsum,
#' sampleHours = NA,
#' sampleIntervalMinutes = 60
#' )
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param sampleHours is the number of hours to plot data for
#' @param sampleIntervalMinutes is the resolution to plot data at
#' @return ggplot2 showing temporal performance
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequencingSummaryActiveChannelPlot(seqsum)
#'
#' @export
SequencingSummaryActiveChannelPlot <- function(
seqsum=NA, sampleHours = NA, sampleIntervalMinutes = 60) {
seqsum <- handleSeqSumCache(seqsum)
if (is.na(sampleHours)) {
sampleHours <- SequencingSummaryExtractRuntime()
}
breaks = seq(0, sampleHours * 60 * 60, by = 60 * sampleIntervalMinutes)
binass <- findInterval(seqsum$start_time, breaks)
mergeActiveChannels <- function(interval, binnedAssignments) {
totalChannels = 0
if (length(which(binnedAssignments == interval)) > 0) {
subset <- seqsum[which(binnedAssignments == interval), ]
totalChannels = length(unique(subset$channel))
}
return(totalChannels)
}
binnedTemporalChannels <- data.frame(time = breaks, channels =unlist(lapply(
seq(breaks), mergeActiveChannels, binnedAssignments = binass)))
binnedTemporalChannels$time <- binnedTemporalChannels$time/60/60
activityPlot <- ggplot(binnedTemporalChannels, aes_string("time")) +
geom_step(aes_string(y = "channels"), size = 1, colour = "Steelblue") +
xlab("Time (hours)") + ylab("Number of channels producing data") +
labs(title = "Plot showing number of functional channels against time")
return(ggplot2handler(activityPlot))
}
#' present an infographic styled executive summary of sequence_summary.txt
#' content
#'
#' present an infographic styled executive summary of sequence_summary.txt
#' content
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @param flowcellId is a label for the plot
#' @return file path to ggplot2 format file
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequenceSummaryExecutiveSummary(seqsum)
#'
#' @export
SequenceSummaryExecutiveSummary <- function(
seqsum=NA, flowcellId = "undefined") {
# calculate some basic, but key, metrics
seqsum <- handleSeqSumCache(seqsum)
passedSeqs <- seqsum[which(seqsum$passes_filtering), ]
readCount <- formatC(nrow(seqsum), big.mark = ",")
totalBases = sum(seqsum$sequence_length_template, na.rm = TRUE)/10^9
passedBases = sum(passedSeqs$sequence_length_template, na.rm = TRUE)/10^9
gigabases <- round(totalBases, 2)
# render an info-graphic-like plot for these observations
infoFile1 <- infoGraphicPlot3(identifier = "ExecutiveSummaryValueBoxes",
panelA = c(value = flowcellId, key = "flowcell", icon = "fa-qrcode"),
panelB = c(value = readCount, key = "Reads produced", icon="fa-filter"),
panelC = c(value = gigabases, key = "gigabases called", icon =
"fa-file-text-o"))
return(infoFile1)
}
#' present an infographic styled basic characteristics plot of
#' sequence_summary.txt content
#'
#' present an infographic styled basic characteristics plot of
#' sequence_summary.txt content
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return file path to ggplot2 format file
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' seqsum <- importSequencingSummary(seqsumFile)
#' plot <- SequenceSummaryBasicInfoPlot(seqsum)
#'
#' @export
SequenceSummaryBasicInfoPlot <- function(seqsum=NA) {
seqsum <- handleSeqSumCache(seqsum)
passedSeqs <- seqsum[which(seqsum$passes_filtering), ]
failedSeqs <- seqsum[which(!seqsum$passes_filtering), ]
passedMeanLength = round(mean(
passedSeqs$sequence_length_template), digits = 0)
N50 <- ncalc(passedSeqs$sequence_length_template, 0.5)
passedMeanQ = round(mean(passedSeqs$mean_qscore_template), digits = 1)
failedMeanQ = round(mean(failedSeqs$mean_qscore_template), digits = 1)
longestRead <- (scales::comma_format())(max(
passedSeqs$sequence_length_template))
# N50 length is the length of the shortest contig such that the sum of
# contigs of equal length oe longer is at least 50% of the total length of
# all contigs
infoFile2 <- infoGraphicPlot5(identifier="SequenceCharacteristicValueBoxes",
panelA = c(value = (scales::comma_format())(passedMeanLength), key =
"Mean Read Length (nt)", icon = "fa-bar-chart"), panelB = c(value =
(scales::comma_format())(N50), key = "N50", icon = "fa-play"), panelC =
c(value = passedMeanQ, key = "Mean Read Quality (QV)", icon =
"fa-area-chart"), panelD = c(value = failedMeanQ,key="Mean Failed QV",
icon = "fa-bug"), panelE = c(value = longestRead, key = "Longest Read",
icon = "fa-sort"))
return(infoFile2)
}
handleSeqSumCache <- function(seqsum) {
if (!is.data.frame(seqsum) && is.na(seqsum)) {
oname <- "seqsumdata"
if (hasCachedObject(oname)) {
seqsum <- getCachedObject(oname)
}
}
return(seqsum)
}
#' from sequencing_summary.txt return integer describing runtime in hours
#'
#' from provided sequencing_summary.txt file return an integer best describing
#' the runtime in hours
#'
#' @param seqsum is the data.frame object as prepared by importSequencingSummary
#' @return integer of runtime in hours
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' importSequencingSummary(seqsumFile)
#' SequencingSummaryExtractRuntime()
#'
#' @export
SequencingSummaryExtractRuntime <- function(seqsum=NA) {
seqsum <- handleSeqSumCache(seqsum)
runtime <- max(seqsum[,"start_time"]) / 60 / 60
expectedRuntimes <- c(4,8,12,24,36,48,64,72,96)
temporaldistance <- sqrt((expectedRuntimes - runtime)^2)
rruntime <- expectedRuntimes[[which(
temporaldistance==min(temporaldistance))]]
return(rruntime)
}
#' get flowcell id from sequencing_summary file
#'
#' The sequencing summary file contains a load of additional information;
#' this method will quickly extract and return the flowcell id
#'
#' @return character vector of flowcell id
#'
#' @examples
#' init()
#' seqsumFile <- system.file("extdata",
#' "sequencing_summary.txt.bz2", package = "nanopoRe")
#' FCid <- SequencingSummaryFlowCellID(seqsumFile)
#'
#' @export
SequencingSummaryFlowCellID <- function(seqsumFile) {
con <- file(seqsumFile, "r")
sample_lines <- readLines(con, n=2)
close(con)
columns <- strsplit(sample_lines[1], "\t")[[1]]
values <- strsplit(sample_lines[2], "\t")[[1]]
if ("filename" %in% columns) {
filename <- values[na.omit(match(columns, "filename"))]
filenamesplit <- strsplit(filename, "_")[[1]]
if (length(filenamesplit)==3) {
return(filenamesplit[[1]])
}
if (length(filenamesplit)==15) {
return(filenamesplit[[3]])
}
}
return("RuleMissing")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.