setClassUnion("missingOrNULL", c("missing", "NULL"))
#' @rdname bcbioRNASeq-class
#' @aliases NULL
#' @exportClass bcbioRNASeq
#' @usage NULL
bcbioRNASeq <- setClass(
Class = "bcbioRNASeq",
contains = "RangedSummarizedExperiment"
)
# Constructors =================================================================
#' `bcbioRNASeq` Object and Constructor
#'
#' `bcbioRNASeq` is an S4 class that extends `RangedSummarizedExperiment`, and
#' is designed to store a [bcbio](https://bcbio-nextgen.readthedocs.org) RNA-seq
#' analysis.
#'
#' Simply point to the final upload directory generated by
#' [bcbio](https://bcbio-nextgen.readthedocs.io/), and this constructor function
#' will take care of the rest. It automatically imports RNA-seq counts,
#' metadata, and the program versions used.
#'
#' This class contains raw read counts and length-scaled
#' transcripts per million (TPM) generated by [tximport::tximport()]. Counts can
#' be loaded at gene or transcript level.
#'
#' @section Metadata:
#'
#' The [metadata()] accessor contains:
#'
#' - Sample quality control metrics.
#' - Ensembl annotations.
#' - Server run paths.
#' - R session information (e.g. [utils::sessionInfo()]).
#'
#' @section DESeq2:
#'
#' DESeq2 is run automatically when [bcbioRNASeq()] is called, and variance
#' stabilized counts are slotted into [assays()]. If the number of samples is
#' bigger than the `transformationLimit` argument, `rlog` and `vst` counts will
#' not be slotted into `assays()`. In this case, we recommend visualization
#' using [tmm()] counts, which are automatically calculated using edgeR.
#'
#' @section Remote Data:
#'
#' When working in RStudio, we recommend connecting to the bcbio run directory
#' as a remote connection over
#' [sshfs](https://github.com/osxfuse/osxfuse/wiki/SSHFS).
#'
#' @note `bcbioRNASeq` extended `SummarizedExperiment` prior to v0.2.0, where we
#' migrated to `RangedSummarizedExperiment`.
#'
#' @rdname bcbioRNASeq-class
#' @aliases bcbioRNASeq-class
#' @docType class
#' @family S4 Class Definition
#' @author Michael Steinbaugh, Lorena Pantano, Rory Kirchner, Victor Barrera
#'
#' @inheritParams bcbioBase::prepareSummarizedExperiment
#' @inheritParams general
#' @param uploadDir Path to final upload directory. This path is set when
#' running `bcbio_nextgen -w template`.
#' @param level Import counts as "`genes`" (default) or "`transcripts`".
#' @param caller Expression caller. Supports "`salmon`" (default), "`kallisto`",
#' or "`sailfish`".
#' @param organism Organism name. Use the full latin name (e.g.
#' "Homo sapiens"), since this will be input downstream to
#' AnnotationHub and ensembldb, unless `gffFile` is set. If set `NULL`
#' (*advanced use; not recommended*), the function call will skip loading
#' gene/transcript-level annotations into `rowRanges()`. This can be useful
#' for poorly annotation genomes or experiments involving multiple genomes.
#' @param samples *Optional.* Specify a subset of samples to load. The names
#' must match the `description` specified in the bcbio YAML metadata. If a
#' `sampleMetadataFile` is provided, that will take priority for sample
#' selection. Typically this can be left unset.
#' @param sampleMetadataFile *Optional.* Custom metadata file containing
#' sample information. Otherwise defaults to sample metadata saved in the YAML
#' file. Remote URLs are supported. Typically this can be left unset.
#' @param genomeBuild *Optional.* Ensembl genome build name (e.g. "GRCh38").
#' This will be passed to AnnotationHub for `EnsDb` annotation matching,
#' unless `gffFile` is set.
#' @param ensemblRelease *Optional.* Ensembl release version. If unset,
#' defaults to current release, and does not typically need to be
#' user-defined. Passed to AnnotationHub for `EnsDb` annotation matching,
#' unless `gffFile` is set.
#' @param gffFile *Advanced use; not recommended.* By default, we recommend
#' leaving this `NULL` for genomes that are supported on Ensembl. In this
#' case, the row annotations ([rowRanges()]) will be obtained automatically
#' from Ensembl by passing the `organism`, `genomeBuild`, and `ensemblRelease`
#' arguments to AnnotationHub and ensembldb. For a genome that is not
#' supported on Ensembl and/or AnnotationHub, a GFF/GTF (General Feature
#' Format) file is required. Generally, we recommend using a GTF (GFFv2) file
#' here over a GFF3 file if possible, although all GFF formats are supported.
#' The function will internally generate a `TxDb` containing
#' transcript-to-gene mappings and construct a `GRanges` object containing the
#' genomic ranges ([rowRanges()]).
#' @param transformationLimit Maximum number of samples to calculate
#' [DESeq2::rlog()] and [DESeq2::varianceStabilizingTransformation()] matrix.
#' For large datasets, DESeq2 is slow to apply variance stabilization. In this
#' case, we recommend loading up the dataset in a high-performance computing
#' environment. Use `Inf` to always apply and `-Inf` to always skip.
#' @param ... Additional arguments, slotted into the [metadata()] accessor.
#'
#' @return `bcbioRNASeq`.
#' @export
#'
#' @seealso
#' - [SummarizedExperiment::SummarizedExperiment()].
#' - [methods::initialize()].
#' - [methods::validObject()].
#' - [BiocGenerics::updateObject()].
#' - `.S4methods(class = "bcbioRNASeq")`.
#'
#' @examples
#' uploadDir <- system.file("extdata/bcbio", package = "bcbioRNASeq")
#'
#' # Gene level
#' x <- bcbioRNASeq(
#' uploadDir = uploadDir,
#' level = "genes",
#' caller = "salmon",
#' organism = "Mus musculus",
#' ensemblRelease = 87L
#' )
#' show(x)
#' is(x, "RangedSummarizedExperiment")
#' validObject(x)
#'
#' # Transcript level
#' x <- bcbioRNASeq(
#' uploadDir = uploadDir,
#' level = "transcripts",
#' caller = "salmon",
#' organism = "Mus musculus",
#' ensemblRelease = 87L
#' )
#' show(x)
#' validObject(x)
bcbioRNASeq <- function(
uploadDir,
level = c("genes", "transcripts"),
caller = c("salmon", "kallisto", "sailfish"),
organism,
samples = NULL,
sampleMetadataFile = NULL,
interestingGroups = "sampleName",
ensemblRelease = NULL,
genomeBuild = NULL,
transgeneNames = NULL,
spikeNames = NULL,
gffFile = NULL,
transformationLimit = 50L,
...
) {
dots <- list(...)
# Legacy arguments =========================================================
call <- match.call(expand.dots = TRUE)
# annotable
if ("annotable" %in% names(call)) {
stop("`annotable` is defunct. Consider using `gffFile` instead.")
}
# ensemblVersion
if ("ensemblVersion" %in% names(call)) {
warning("Use `ensemblRelease` instead of `ensemblVersion`")
ensemblRelease <- call[["ensemblVersion"]]
dots[["ensemblVersion"]] <- NULL
}
# organism missing
if (!"organism" %in% names(call)) {
stop("`organism` is now required")
}
dots <- Filter(Negate(is.null), dots)
# Assert checks ============================================================
assert_is_a_string(uploadDir)
assert_all_are_dirs(uploadDir)
level <- match.arg(level)
caller <- match.arg(caller)
assertIsAStringOrNULL(sampleMetadataFile)
assertIsCharacterOrNULL(samples)
assert_is_character(interestingGroups)
assertIsAStringOrNULL(organism)
assertIsAnImplicitIntegerOrNULL(ensemblRelease)
assertIsAStringOrNULL(genomeBuild)
assertIsCharacterOrNULL(transgeneNames)
assertIsCharacterOrNULL(spikeNames)
assertIsAStringOrNULL(gffFile)
if (is_a_string(gffFile)) {
assert_all_are_existing_files(gffFile)
}
assert_is_a_number(transformationLimit)
# Directory paths ==========================================================
uploadDir <- normalizePath(uploadDir, winslash = "/", mustWork = TRUE)
projectDir <- list.files(
path = uploadDir,
pattern = bcbioBase::projectDirPattern,
full.names = FALSE,
recursive = FALSE
)
assert_is_a_string(projectDir)
message(projectDir)
match <- str_match(projectDir, bcbioBase::projectDirPattern)
runDate <- as.Date(match[[2L]])
template <- match[[3L]]
projectDir <- file.path(uploadDir, projectDir)
assert_all_are_dirs(projectDir)
sampleDirs <- sampleDirs(uploadDir)
assert_all_are_dirs(sampleDirs)
# Sequencing lanes =========================================================
if (any(grepl(x = sampleDirs, pattern = bcbioBase::lanePattern))) {
lanes <- str_match(names(sampleDirs), bcbioBase::lanePattern) %>%
.[, 2L] %>%
unique() %>%
length()
message(paste(
lanes, "sequencing lane detected", "(technical replicates)"
))
} else {
lanes <- 1L
}
assert_is_an_integer(lanes)
# Project summary YAML =====================================================
yamlFile <- file.path(projectDir, "project-summary.yaml")
assert_all_are_existing_files(yamlFile)
yaml <- readYAML(yamlFile)
# Column data ==============================================================
colData <- readYAMLSampleData(yamlFile)
# Replace columns with external, user-defined metadata, if desired. This is
# nice for correcting metadata issues that aren't easy to fix by editing the
# bcbio YAML output.
if (is_a_string(sampleMetadataFile)) {
userColData <- readSampleData(sampleMetadataFile, lanes = lanes)
# Drop columns that are defined from the auto metadata
setdiff <- setdiff(colnames(colData), colnames(userColData))
# Note that we're allowing the user to subset the samples by the
# metadata input here
colData <- colData[rownames(userColData), setdiff, drop = FALSE]
colData <- cbind(userColData, colData)
}
# Allow easy subsetting by sample (must match description)
if (is.character(samples)) {
assert_is_subset(samples, colData[["description"]])
colData <- colData %>%
.[which(samples %in% .[["description"]]), , drop = FALSE]
}
# Sanitize into factors
colData <- sanitizeSampleData(colData)
# Sample metrics ===========================================================
# Note that sample metrics used for QC plots are not currently generated
# when using fast RNA-seq workflow. This depends upon MultiQC and aligned
# counts generated with STAR.
message("Reading sample metrics")
metrics <- readYAMLSampleMetrics(yamlFile)
assert_is_data.frame(metrics)
assert_are_disjoint_sets(colnames(colData), colnames(metrics))
# Now safe to add the metrics to colData
colData <- cbind(colData, metrics)
# Interesting groups =======================================================
interestingGroups <- camel(interestingGroups)
assert_is_subset(interestingGroups, colnames(colData))
# Subset sample directories by metadata ====================================
samples <- rownames(colData)
assert_is_subset(samples, names(sampleDirs))
if (length(samples) < length(sampleDirs)) {
message(paste(
"Loading a subset of samples:",
str_trunc(toString(samples), width = 80L),
sep = "\n"
))
allSamples <- FALSE
sampleDirs <- sampleDirs[samples]
} else {
allSamples <- TRUE
}
# bcbio run information ====================================================
dataVersions <- readDataVersions(
file = file.path(projectDir, "data_versions.csv")
)
assert_is_tbl_df(dataVersions)
programVersions <- readProgramVersions(
file = file.path(projectDir, "programs.txt")
)
assert_is_tbl_df(programVersions)
bcbioLog <- readLog(
file = file.path(projectDir, "bcbio-nextgen.log")
)
assert_is_character(bcbioLog)
bcbioCommandsLog <- readLog(
file = file.path(projectDir, "bcbio-nextgen-commands.log")
)
assert_is_character(bcbioCommandsLog)
# tximport =================================================================
# Run this step only after the required metadata has imported successfully
if (level == "transcripts") {
txOut <- TRUE
} else {
txOut <- FALSE
}
# tx2gene
tx2geneFile <- file.path(projectDir, "tx2gene.csv")
if (file.exists(tx2geneFile)) {
# CSV file always takes priority
tx2gene <- readTx2gene(tx2geneFile)
} else if (is_a_string(gffFile)) {
# Fall back to using GFF, if declared
tx2gene <- makeTx2geneFromGFF(gffFile)
} else {
stop(paste(
"tx2gene assignment failure.",
"tx2gene.csv or GFF file are required."
))
}
txi <- .tximport(
sampleDirs = sampleDirs,
type = caller,
txIn = TRUE,
txOut = txOut,
tx2gene = tx2gene
)
# TPM/abundance: transcripts per million
tpm <- txi[["abundance"]]
assert_is_matrix(tpm)
# counts: raw counts
counts <- txi[["counts"]]
assert_is_matrix(counts)
# length: average transcript length
length <- txi[["length"]]
assert_is_matrix(length)
# countsFromAbundance: character describing TPM
countsFromAbundance <- txi[["countsFromAbundance"]]
assert_is_character(countsFromAbundance)
# Ensure `colData` matches the colnames in `assays()`
colData <- colData[colnames(counts), , drop = FALSE]
# Row data =================================================================
rowRangesMetadata <- NULL
if (is_a_string(gffFile)) {
rowRanges <- makeGRangesFromGFF(gffFile)
} else if (is_a_string(organism)) {
# ah: AnnotationHub
ah <- makeGRangesFromEnsembl(
organism = organism,
format = level,
genomeBuild = genomeBuild,
release = ensemblRelease,
metadata = TRUE
)
assert_is_list(ah)
assert_are_identical(names(ah), c("data", "metadata"))
rowRanges <- ah[["data"]]
assert_is_all_of(rowRanges, "GRanges")
rowRangesMetadata <- ah[["metadata"]]
assert_is_data.frame(rowRangesMetadata)
} else {
rowRanges <- emptyRanges(rownames(counts))
}
# Gene-level variance stabilization ========================================
normalized <- NULL
rlog <- NULL
vst <- NULL
if (level == "genes") {
message(paste(
"Generating DESeqDataSet with DESeq2",
packageVersion("DESeq2")
))
dds <- DESeqDataSetFromTximport(
txi = txi,
colData = colData,
# Use an empty design formula
design = ~ 1 # nolint
)
# Suppress warning about empty design formula
dds <- suppressWarnings(DESeq(dds))
normalized <- counts(dds, normalized = TRUE)
if (nrow(colData) <= transformationLimit) {
message("Applying rlog transformation")
rlog <- assay(rlog(dds))
message("Applying variance stabilizing transformation")
vst <- assay(varianceStabilizingTransformation(dds))
} else {
warning(paste(
"Dataset contains many samples.",
"Skipping variance stabilizing transformations."
))
}
}
# Assays ===================================================================
assays <- list(
"counts" = counts,
"tpm" = tpm,
"length" = length,
"normalized" = normalized,
"rlog" = rlog,
"vst" = vst
)
# Metadata =================================================================
metadata <- list(
"version" = packageVersion,
"level" = level,
"caller" = caller,
"countsFromAbundance" = countsFromAbundance,
"uploadDir" = uploadDir,
"sampleDirs" = sampleDirs,
"sampleMetadataFile" = as.character(sampleMetadataFile),
"projectDir" = projectDir,
"template" = template,
"runDate" = runDate,
"interestingGroups" = interestingGroups,
"organism" = as.character(organism),
"genomeBuild" = as.character(genomeBuild),
"ensemblRelease" = as.integer(ensemblRelease),
"rowRangesMetadata" = rowRangesMetadata,
"gffFile" = as.character(gffFile),
"tx2gene" = tx2gene,
"lanes" = lanes,
"yaml" = yaml,
"dataVersions" = dataVersions,
"programVersions" = programVersions,
"bcbioLog" = bcbioLog,
"bcbioCommandsLog" = bcbioCommandsLog,
"allSamples" = allSamples,
"call" = match.call()
)
# Add user-defined custom metadata, if specified
if (length(dots)) {
assert_are_disjoint_sets(metadata, dots)
metadata <- c(metadata, dots)
}
# Return ===================================================================
.new.bcbioRNASeq(
assays = assays,
rowRanges = rowRanges,
colData = colData,
metadata = metadata,
transgeneNames = transgeneNames,
spikeNames = spikeNames
)
}
# TODO Consider erroring on `sampleID` and `description` columns in colData
# Validity Checks ==============================================================
setValidity(
"bcbioRNASeq",
function(object) {
stopifnot(metadata(object)[["version"]] >= 0.2)
assert_is_all_of(object, "RangedSummarizedExperiment")
assert_has_dimnames(object)
# Assays ===============================================================
# Note that `rlog` and `vst` DESeqTransform objects are optional
assert_is_subset(requiredAssays, assayNames(object))
# Check that all assays are matrices
assayCheck <- vapply(
X = assays(object),
FUN = is.matrix,
FUN.VALUE = logical(1L),
USE.NAMES = TRUE
)
if (!all(assayCheck)) {
stop(paste(
paste(
"Assays that are not matrix:",
toString(names(assayCheck[!assayCheck]))
),
bcbioBase::updateMessage,
sep = "\n"
))
}
# Gene-level specific
if (metadata(object)[["level"]] == "genes") {
assert_is_subset("normalized", names(assays(object)))
}
# Row data =============================================================
assert_is_all_of(rowRanges(object), "GRanges")
assert_is_all_of(rowData(object), "DataFrame")
# Column data ==========================================================
# metrics
if (is.data.frame(metadata(object)[["metrics"]])) {
stop(paste(
"`metrics` saved in `metadata()` instead of `colData()`.",
bcbioBase::updateMessage
))
}
assert_are_disjoint_sets(
x = colnames(colData(object)),
y = legacyMetricsCols
)
# Metadata =============================================================
metadata <- metadata(object)
# Detect legacy slots
legacyMetadata <- c(
"design",
"ensemblVersion",
"gtf",
"gtfFile",
"missingGenes",
"programs",
"yamlFile"
)
intersect <- intersect(names(metadata), legacyMetadata)
if (length(intersect)) {
stop(paste(
paste(
"Legacy metadata slots:",
toString(sort(intersect))
),
bcbioBase::updateMessage,
sep = "\n"
))
}
# Class checks (order independent)
requiredMetadata <- list(
"allSamples" = "logical",
"bcbioCommandsLog" = "character",
"bcbioLog" = "character",
"caller" = "character",
"countsFromAbundance" = "character",
"date" = "Date",
"devtoolsSessionInfo" = "session_info",
"ensemblRelease" = "integer",
"genomeBuild" = "character",
"gffFile" = "character",
"interestingGroups" = "character",
"lanes" = "integer",
"level" = "character",
"organism" = "character",
"programVersions" = "tbl_df",
"projectDir" = "character",
"runDate" = "Date",
"sampleDirs" = "character",
"sampleMetadataFile" = "character",
"template" = "character",
"tx2gene" = "data.frame",
"uploadDir" = "character",
"utilsSessionInfo" = "sessionInfo",
"version" = "package_version",
"wd" = "character",
"yaml" = "list"
)
classChecks <- invisible(mapply(
name <- names(requiredMetadata),
expected <- requiredMetadata,
MoreArgs = list(metadata = metadata),
FUN = function(name, expected, metadata) {
actual <- class(metadata[[name]])
if (!length(intersect(expected, actual))) {
FALSE
} else {
TRUE
}
},
SIMPLIFY = TRUE,
USE.NAMES = TRUE
))
if (!all(classChecks)) {
print(classChecks)
stop(paste(
"Metadata class checks failed.",
bcbioBase::updateMessage,
sep = "\n"
))
}
# Additional assert checks
# caller
assert_is_subset(
x = metadata[["caller"]],
y = validCallers
)
# level
assert_is_subset(
x = metadata[["level"]],
y = c("genes", "transcripts")
)
# tx2gene
tx2gene <- metadata[["tx2gene"]]
assertIsTx2gene(tx2gene)
TRUE
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.