Nothing
#' @importFrom methods as
#' @importFrom tidyselect one_of contains ends_with
setAs(".FastqcFile", "FastqcData", function(from){
## Import each line as an element in a character vector then detect the
## modules and return as a list containing each module as an element
fqcLines <- .getFqcLines(from)
## The FastQC version number
## May not be present so is only returned if present
vers <- NA_character_ # Default to missing
nm <- tolower(names(fqcLines))
if (grepl("fastqc\\t", nm[1])) {
vers <- gsub("fastqc\\t(.+)", "\\1", nm[1])
## Remove the version module for easier module identification
fqcLines <- fqcLines[-1]
}
## Initialise an empty list with an empty element for each module present
## in the data, and the set of required modules
out <- .initialiseOutput(fqcLines)
## Now wrangle the data for each module and populate the output
## Missing or empty modules will return an empty dataframe
out[["Basic_Statistics"]] <- .getBasicStatistics(fqcLines)
out[["Per_base_sequence_quality"]] <- .getPerBaseSeqQuals(fqcLines)
out[["Per_tile_sequence_quality"]] <- .getPerTileSeqQuals(fqcLines)
out[["Per_sequence_quality_scores"]] <- .getPerSeqQualScores(fqcLines)
out[["Per_base_sequence_content"]] <- .getPerBaseSeqCont(fqcLines)
out[["Per_sequence_GC_content"]] <- .getPerSeqGcCont(fqcLines)
out[["Per_base_N_content"]] <- .getPerBaseNCont(fqcLines)
out[["Sequence_Length_Distribution"]] <- .getSeqLengthDist(fqcLines)
out[["Overrepresented_sequences"]] <- .getOverrepSeq(fqcLines)
out[["Adapter_Content"]] <- .getAdapterCont(fqcLines)
out[["Kmer_Content"]] <- .getKmerCont(fqcLines)
## Get the Sequence Duplication Levels. This is potentially a two element
## module within the parsed data, depending on the version of FastQC
seqDupLevels <- .getSeqDuplicationLevels(fqcLines)
dupMods <- names(seqDupLevels)
out[dupMods] <- seqDupLevels[dupMods]
## Get the summary information
Summary <- getSummary(from)
args <- list(
Class = "FastqcData",
path = path(from),
version = vers,
Summary = Summary
)
do.call("new", c(args, out))
})
#' @importFrom methods as
setAs("list", "FastqcDataList", function(from){
isFqcData <- vapply(from, is, logical(1), class2 = "FastqcData")
if (!all(isFqcData)) stop(
"All elements of the supplied list must be FastqcData objects"
)
new("FastqcDataList", from)
})
## This helper checks for compressed or extracted FastQC reports then
## imports the contents of fastqc_data.txt as a character vector.
## Comments (#) and lines denoting the end of a module are then removed
.getFqcLines <- function(x){
## Setup the path to the file & use tryCatch to ensure it exists
## when testing to see if it is compressed
path <- path(x)
stopifnot(file.exists(path))
comp <- isCompressed(path, type = "zip")
if (comp) {
## Get the internal path within the zip archive
fl <- file.path(gsub(".zip$", "", basename(path)), "fastqc_data.txt")
## Check the required file exists within the file
allFiles <- unzip(path, list = TRUE)$Name
stopifnot(fl %in% allFiles)
## # Open the connection & read the 12 lines
uz <- unz(path, fl)
fqcLines <- readLines(uz)
close(uz)
}
else{
## The existence of this file will have been checked at
## instantiation of the .FastqcFile. Check in case it has been
## deleted post-instantiation
fl <- file.path(path, "fastqc_data.txt")
if (!file.exists(fl)) stop("'fastqc_data.txt' could not be found.")
fqcLines <- readLines(fl)
}
## Modified from Repitools::readFastQC
## Remove any '#' symbols
fqcLines <- gsub("#", "", fqcLines)
## Remove any lines which specify '>>END_MODULE'
fqcLines <- fqcLines[!grepl(">>END_MODULE", fqcLines)]
## Split the data based on the '>>' pattern, which indicates the
## beginning of a new module
fqcLines <- split(fqcLines, cumsum(grepl("^>>", fqcLines)))
## Assign the module names
names(fqcLines) <- vapply(fqcLines, function(x){
## Get the first line before the tab separator
nm <- gsub(">>(.+)\\t.+", "\\1", x[[1]])
## Replace the space with underscore
gsub(" ", "_", nm)
}, character(1))
## Remove the module name (i.e. the 1st value) from each module's data
fqcLines <- lapply(fqcLines, function(x){x[-1]})
fqcLines
}
## This helper checks for the presence of required modules then initialises
## the empty list which will hold the module data
.initialiseOutput <- function(x){
## Get the module names from the lines that are present in the data
modules <- names(x)
## Define the standard modules in a FastQC report. These are required
reqModules <- c(
"Basic_Statistics",
"Per_base_sequence_quality",
"Per_tile_sequence_quality",
"Per_sequence_quality_scores",
"Per_base_sequence_content",
"Per_sequence_GC_content",
"Per_base_N_content",
"Sequence_Length_Distribution",
"Sequence_Duplication_Levels",
"Overrepresented_sequences",
"Adapter_Content",
"Kmer_Content"
)
## Check that at least one of the standard modules is present
## It's quite plausible for a report to be missing multiple modules
if (!any(modules %in% reqModules))
stop("None of the standard modules were found in the data.")
## Define the output to have the same structure as fastqcData,
## with an additional slot to account for the Sequence Duplication
## Levels output. Initialise an empty list based on the standard
## modules after checking for the Total_Deduplicated Percentage value.
allMods <- c(reqModules, modules, "Total_Deduplicated_Percentage")
allMods <- unique(allMods)
out <- vector("list", length(allMods))
names(out) <- allMods
## Now return the initialised output
out
}
# Define a series of quick functions for wrangling the data
# after splitting the input from readLines()
# These are all tested using testthat to ensure they're working correctly
.getBasicStatistics <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Basic_Statistics"
if (!mod %in% names(fqcLines)) return(data.frame(NULL))
if (length(fqcLines[[mod]]) == 0) return(data.frame(NULL))
## Form a data frame and check th structure
x <- .splitByTab(fqcLines[[mod]])
stopifnot(setequal(names(x), c("Measure", "Value")))
vals <- x[["Value"]]
names(vals) <- gsub(" ", "_", x[["Measure"]])
## Check for the required values
reqVals <- c(
"Filename",
"File_type",
"Encoding",
"Total_Sequences",
"Sequences_flagged_as_poor_quality",
"Sequence_length",
"%GC"
)
stopifnot(reqVals %in% names(vals))
## Setup the tibble with the correct value types
df <- as_tibble(as.list(vals))
df$Shortest_sequence <- gsub("(.*)-.*", "\\1", df$Sequence_length)
df$Longest_sequence <- gsub(".*-(.*)", "\\1", df$Sequence_length)
intVals <- c(
"Shortest_sequence",
"Longest_sequence",
"Total_Sequences",
"Sequences_flagged_as_poor_quality"
)
df[intVals] <- lapply(df[intVals], as.integer)
dplyr::select(
df,
"Filename",
"Total_Sequences",
contains("quality"),
ends_with("sequence"),
one_of("%GC", "File_type", "Encoding")
)
}
.getPerBaseSeqQuals <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Per_base_sequence_quality"
if (!mod %in% names(fqcLines)) return(data.frame(NULL))
if (length(fqcLines[[mod]]) == 0) return(data.frame(NULL))
df <- .splitByTab(fqcLines[[mod]])
names(df) <- gsub(" ", "_", names(df))
## Check for the required values
reqVals <- c(
"Base",
"Mean",
"Median",
"Lower_Quartile",
"Upper_Quartile",
"10th_Percentile",
"90th_Percentile"
)
stopifnot(reqVals %in% names(df))
## Change to numeric where appropriate
df[reqVals[-1]] <- lapply(df[reqVals[-1]], as.numeric)
as_tibble(df)
}
.getPerTileSeqQuals <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Per_tile_sequence_quality"
if (!mod %in% names(fqcLines)) return(data.frame(NULL))
if (length(fqcLines[[mod]]) == 0) return(data.frame(NULL))
df <- .splitByTab(fqcLines[[mod]])
## Check for the required values
reqVals <- c("Tile", "Base", "Mean")
stopifnot(reqVals %in% names(df))
df[["Mean"]] <- as.numeric(df[["Mean"]])
as_tibble(df)
}
.getPerSeqQualScores <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Per_sequence_quality_scores"
if (!mod %in% names(fqcLines)) return(data.frame(NULL))
if (length(fqcLines[[mod]]) == 0) return(data.frame(NULL))
df <- .splitByTab(fqcLines[[mod]])
## Check for the required values
reqVals <- c("Quality", "Count")
stopifnot(reqVals %in% names(df))
df <- lapply(df, as.integer)
as_tibble(df)
}
.getPerBaseSeqCont <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Per_base_sequence_content"
if (!mod %in% names(fqcLines)) return(data.frame(NULL))
if (length(fqcLines[[mod]]) == 0) return(data.frame(NULL))
df <- .splitByTab(fqcLines[[mod]])
## Check for the required values
reqVals <- c("Base", "G", "A", "T", "C")
stopifnot(reqVals %in% names(df))
## Set all as numeric except the 'Base'column
df[reqVals[-1]] <- lapply(df[reqVals[-1]], as.numeric)
as_tibble(df)
}
.getPerSeqGcCont <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Per_sequence_GC_content"
if (!mod %in% names(fqcLines)) return(data.frame(NULL))
if (length(fqcLines[[mod]]) == 0) return(data.frame(NULL))
df <- .splitByTab(fqcLines[[mod]])
names(df) <- gsub(" ", "_", names(df))
## Check for the required values
reqVals <- c("GC_Content", "Count")
stopifnot(reqVals %in% names(df))
## Convert to integers
df[["GC_Content"]] <- as.integer(df[["GC_Content"]])
df[["Count"]] <- as.numeric(df[["Count"]])
as_tibble(df)
}
.getPerBaseNCont <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Per_base_N_content"
if (!mod %in% names(fqcLines)) return(data.frame(NULL))
if (length(fqcLines[[mod]]) == 0) return(data.frame(NULL))
df <- .splitByTab(fqcLines[[mod]])
## Check for the required values
reqVals <- c("Base", "N-Count")
stopifnot(reqVals %in% names(df))
df[["N-Count"]] <- as.integer(df[["N-Count"]])
as_tibble(df)
}
.getSeqLengthDist <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Sequence_Length_Distribution"
if (!mod %in% names(fqcLines)) return(data.frame(NULL))
if (length(fqcLines[[mod]]) == 0) return(data.frame(NULL))
df <- .splitByTab(fqcLines[[mod]])
## Check for the required values
reqVals <- c("Length", "Count")
stopifnot(reqVals %in% names(df))
## If there is a range, separate into Lower & Upper values
df$Lower <- as.integer(gsub("(.*)-.*", "\\1", df$Length))
df$Upper <- as.integer(gsub(".*-(.*)", "\\1", df$Length))
df$Count <- as.integer(df$Count)
df <- as_tibble(df)
df[c("Length", "Lower", "Upper", "Count")]
}
.getSeqDuplicationLevels <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Sequence_Duplication_Levels"
if (!mod %in% names(fqcLines))
return(list(
Total_Deduplicated_Percentage = NA_real_,
Sequence_Duplication_Levels = data.frame(NULL)
))
if (length(fqcLines[[mod]]) == 0)
return(list(
Total_Deduplicated_Percentage = NA_real_,
Sequence_Duplication_Levels = data.frame(NULL)
))
x <- fqcLines[[mod]]
## Check for the presence of the Total Deduplicate Percentage value
hasTotDeDup <- grepl("Total Deduplicated Percentage", x)
Total_Deduplicated_Percentage <- NA_real_
if (any(hasTotDeDup)) {
Total_Deduplicated_Percentage <-
gsub(".+\\t(.*)", "\\1", x[hasTotDeDup])
Total_Deduplicated_Percentage <-
as.numeric(Total_Deduplicated_Percentage)
}
if (length(Total_Deduplicated_Percentage) > 1)
stop("Too many elements matched Total_Deduplicated_Percentage")
## Remove the Total value entry from the original object
df <- .splitByTab(x[!hasTotDeDup])
names(df) <- gsub(" ", "_", names(df))
## Check for the required values
reqVals <- c(
"Duplication_Level",
"Percentage_of_deduplicated",
"Percentage_of_total"
)
stopifnot(reqVals %in% names(df))
## Convert percentages to numeric
df[reqVals[-1]] <- lapply(df[reqVals[-1]], as.numeric)
## Return a list with both values
list(
Total_Deduplicated_Percentage = Total_Deduplicated_Percentage,
Sequence_Duplication_Levels = as_tibble(df)
)
}
.getOverrepSeq <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Overrepresented_sequences"
if (!mod %in% names(fqcLines)) return(data.frame(NULL))
if (length(fqcLines[[mod]]) == 0) return(data.frame(NULL))
df <- .splitByTab(fqcLines[[mod]])
names(df) <- gsub(" ", "_", names(df))
## Check for the required values
reqVals <- c("Sequence", "Count", "Percentage", "Possible_Source")
stopifnot(reqVals %in% names(df))
df$Count <- as.integer(df$Count)
df$Percentage <- as.numeric(df$Percentage)
as_tibble(df)
}
.getAdapterCont <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Adapter_Content"
if (!mod %in% names(fqcLines)) return(data.frame(NULL))
if (length(fqcLines[[mod]]) == 0) return(data.frame(NULL))
df <- .splitByTab(fqcLines[[mod]])
names(df) <- gsub(" ", "_", names(df))
## Check for the required values
reqVals <- "Position"
stopifnot(reqVals %in% names(df))
stopifnot(ncol(df) > 1)
numCols <- !names(df) %in% reqVals
df[numCols] <- lapply(df[numCols], as.numeric)
as_tibble(df)
}
.getKmerCont <- function(fqcLines){
## Return empty df if the module is missing
mod <- "Kmer_Content"
if (!mod %in% names(fqcLines)) return(data.frame(NULL))
if (length(fqcLines[[mod]]) == 0) return(data.frame(NULL))
df <- .splitByTab(fqcLines[[mod]])
names(df) <- gsub(" ", "_", names(df))
## Check for the required values
reqVals <-
c("Sequence", "Count", "PValue", "Obs/Exp_Max", "Max_Obs/Exp_Position")
stopifnot(reqVals %in% names(df))
df$Count <- as.integer(df$Count)
df$PValue <- as.numeric(df$PValue)
df$`Obs/Exp_Max` <- as.numeric(df$`Obs/Exp_Max`)
as_tibble(df)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.