#' Read 10X Genomics Cell Ranger Data
#'
#' Read [10x Genomics Chromium](https://www.10xgenomics.com/software/) cell
#' counts from `barcodes.tsv`, `genes.tsv`, and `matrix.mtx` files.
#'
#' @section Directory structure:
#' Cell Ranger can vary in its output directory structure, but we're requiring a
#' single, consistent data structure for datasets containing multiple samples.
#' Note that Cell Ranger data may not always contain per sample subdirectories,
#' or the "`outs`" subdirectory. We may make this more flexible in the future,
#' but for now we're making this strict to ensure reproducibility.
#'
#' \preformatted{
#' file.path(
#' "<uploadDir>",
#' "<sampleName>",
#' "outs",
#' "filtered_gene_bc_matrices*",
#' "outs",
#' "<genomeBuild>",
#' "matrix.mtx"
#' )
#' }
#'
#' @section Sample metadata:
#' A user-supplied sample metadata file defined by `sampleMetadataFile` is
#' required for multiplexed datasets. Otherwise this can be left `NULL`, and
#' minimal sample data will be used, based on the directory names.
#'
#' @section Reference data:
#' We strongly recommend supplying the corresponding reference data required for
#' Cell Ranger with the `refdataDir` argument. When set, the function will
#' detect the `organism`, `ensemblRelease`, and `genomeBuild` automatically,
#' based on the 10X `refdataDir` YAML metadata. Additionally, it will convert
#' the gene annotations defined in the GTF file into a `GRanges` object, which
#' get slotted in [rowRanges()]. Otherwise, the function will attempt to use the
#' most current annotations available from Ensembl, and some gene IDs may not
#' match, due to deprecation in the current Ensembl release.
#'
#' @family Read Functions
#' @author Michael Steinbaugh
#'
#' @inheritParams bcbioSingleCell
#' @inheritParams general
#' @param uploadDir Path to Cell Ranger output directory. This directory path
#' must contain `filtered_gene_bc_matrices*` as a child directory.
#' @param refdataDir Directory path to Cell Ranger reference annotation data.
#'
#' @return `SingleCellExperiment`.
#' @export
#'
#' @examples
#' uploadDir <- system.file("extdata/cellranger", package = "bcbioSingleCell")
#' x <- readCellRanger(uploadDir)
#' show(x)
readCellRanger <- function(
uploadDir,
sampleMetadataFile = NULL,
refdataDir = NULL,
interestingGroups = "sampleName",
transgeneNames = NULL,
spikeNames = NULL,
...
) {
assert_is_a_string(uploadDir)
assert_all_are_dirs(uploadDir)
uploadDir <- normalizePath(uploadDir, winslash = "/", mustWork = TRUE)
assertIsAStringOrNULL(sampleMetadataFile)
assertIsAStringOrNULL(refdataDir)
if (is_a_string(refdataDir)) {
assert_all_are_dirs(refdataDir)
refdataDir <- normalizePath(refdataDir, winslash = "/", mustWork = TRUE)
}
assert_is_character(interestingGroups)
assert_is_any_of(transgeneNames, c("character", "NULL"))
assert_is_any_of(spikeNames, c("character", "NULL"))
dots <- list(...)
pipeline <- "cellranger"
level <- "genes"
umiType <- "chromium"
# Legacy arguments =========================================================
# annotable
if ("annotable" %in% names(call)) {
stop("`annotable` argument is defunct")
}
dots <- Filter(Negate(is.null), dots)
# Directory paths ==========================================================
# This step can be slow when running over sshfs
matrixFiles <- list.files(
path = uploadDir,
pattern = "matrix.mtx",
include.dirs = FALSE,
full.names = TRUE,
recursive = TRUE
) %>%
# Subset to only include `filtered_gene_bc_matrices*`. Note that
# aggregation output is labeled `filtered_gene_bc_matrices_mex` by
# default.
.[grepl("filtered_gene_bc_matrices", .)]
assert_is_non_empty(matrixFiles)
# Combined or per sample matrix support
if (is_a_string(matrixFiles)) {
sampleDirs <- uploadDir
} else {
message(paste(length(matrixFiles), "samples detected"))
message(paste(
"Required per sample output directory structure:",
file.path(
"<uploadDir>",
"<sampleName>",
"outs",
"filtered_gene_bc_matrices*",
"<genomeBuild>",
"matrix.mtx"
),
sep = "\n"
))
assert_all_are_matching_regex(
x = matrixFiles,
pattern = file.path(
paste0("^", uploadDir),
"[^/]+", # sampleName
"outs",
"filtered_gene_bc_matrices([^/]+)?",
"[^/]+",
paste0("matrix.mtx", "$")
)
)
# Sample directories nest the matrix files 4 levels deep
sampleDirs <- matrixFiles %>%
dirname() %>%
dirname() %>%
dirname() %>%
dirname()
}
names(sampleDirs) <- makeNames(basename(sampleDirs), unique = TRUE)
# Sample metadata ==========================================================
if (is_a_string(sampleMetadataFile)) {
sampleData <- readSampleData(sampleMetadataFile)
} else {
sampleData <- minimalSampleData(basename(sampleDirs))
}
# Interesting groups =======================================================
# Ensure internal formatting in camelCase
interestingGroups <- camel(interestingGroups, strict = FALSE)
assertFormalInterestingGroups(sampleData, interestingGroups)
# Subset sample directories by metadata ====================================
if (nrow(sampleData) < length(sampleDirs)) {
message("Loading a subset of samples, defined by the metadata file")
allSamples <- FALSE
sampleDirs <- sampleDirs %>%
.[names(sampleDirs) %in% rownames(sampleData)]
message(paste(length(sampleDirs), "samples matched by metadata"))
} else {
allSamples <- TRUE
}
# Gene annotations =========================================================
refJSON <- NULL
ensemblRelease <- NULL
rowRangesMetadata <- NULL
# Stop on multiple genomes (not supported in a single SCE object)
genomeBuild <- matrixFiles %>%
dirname() %>%
basename() %>%
unique()
assert_is_a_string(genomeBuild)
organism <- detectOrganism(genomeBuild)
assert_is_a_string(organism)
# Prepare gene annotations as GRanges
if (is_a_string(refdataDir)) {
# JSON data
refJSONFile <- file.path(refdataDir, "reference.json")
assert_all_are_existing_files(refJSONFile)
refJSON <- read_json(refJSONFile)
# Convert the GTF file to GRanges
gffFile <- file.path(refdataDir, "genes", "genes.gtf")
assert_is_a_string(gffFile)
rowRanges <- makeGRangesFromGFF(gffFile)
# Get the Ensembl version from the GTF file name.
# Example: "Homo_sapiens.GRCh37.82.filtered.gtf"
ensemblRelease <- gffFile %>%
str_split("\\.", simplify = TRUE) %>%
.[1L, 3L] %>%
as.integer()
} else {
# CellRanger uses Ensembl refdata internally. Here we're fetching the
# annotations with AnnotationHub rather than pulling from the GTF file
# in the refdata directory. It will also drop genes that are now dead in
# the current Ensembl release. Don't warn about old Ensembl release
# version.
ah <- suppressWarnings(makeGRangesFromEnsembl(
organism = organism,
format = level,
genomeBuild = genomeBuild,
metadata = TRUE
))
assert_is_list(ah)
assert_are_identical(names(ah), c("data", "metadata"))
rowRanges <- ah[["data"]]
assert_is_all_of(rowRanges, "GRanges")
rowRangesMetadata <- ah[["metadata"]]
assert_is_data.frame(rowRangesMetadata)
}
assert_is_subset(rownames(counts), names(rowRanges))
assert_is_subset(c("geneID", "geneName"), names(mcols(rowRanges)))
rowData <- as.data.frame(rowRanges)
rownames(rowData) <- names(rowRanges)
# Counts ===================================================================
message("Reading counts at gene level")
# This step can be slow over sshfs, recommend running on an HPC
sparseCountsList <- .sparseCountsList(
sampleDirs = sampleDirs,
pipeline = pipeline,
umiType = umiType
)
counts <- do.call(cbind, sparseCountsList)
# Multiplexed sample check =================================================
# Check to see if multiplexed samples are present and require metadata
multiplexedPattern <- "^(.+)_(\\d+)_([ACGT]+)$"
if (any(grepl(multiplexedPattern, colnames(counts)))) {
# Prepare data.frame of barcode mappings
# Example:
# cellID: cellranger_AAACCTGGTTTACTCT_1
# description: cellranger
# barcode: AAACCTGGTTTACTCT
# index: 1
cellMap <- str_match(
string = colnames(counts),
pattern = "^(.+)_(\\d+)_([ACGT]+)$"
) %>%
as.data.frame() %>%
set_colnames(c(
"cellID",
"description",
"index",
"barcode"
)) %>%
mutate_all(as.factor)
# Check for single sample and fix sampleData automatically if necessary
if (
identical(levels(cellMap[["index"]]), "1") &&
!"index" %in% colnames(sampleData)
) {
sampleData[["index"]] <- factor("1")
rownames(sampleData) <- paste0(rownames(sampleData), "_1")
}
# Require user defined metadata
if (!"index" %in% colnames(sampleData)) {
stop(paste(
"`index` column must be defined using",
"`sampleMetadataFile` for multiplexed samples"
))
}
}
# Column data ==============================================================
# Always prefilter, removing very low quality cells with no UMIs or genes
metrics <- metrics(counts, rowData = rowData, prefilter = TRUE)
# Subset the counts to match the prefiltered metrics
counts <- counts[, rownames(metrics), drop = FALSE]
colData <- as(metrics, "DataFrame")
colData[["cellID"]] <- rownames(colData)
cell2sample <- mapCellsToSamples(
cells = rownames(colData),
samples = rownames(sampleData)
)
colData[["sampleID"]] <- cell2sample
sampleData[["sampleID"]] <- rownames(sampleData)
colData <- merge(
x = colData,
y = sampleData,
by = "sampleID",
all.x = TRUE
)
rownames(colData) <- colData[["cellID"]]
colData[["cellID"]] <- NULL
sampleData[["sampleID"]] <- NULL
# Metadata =================================================================
metadata <- list(
"version" = packageVersion,
"pipeline" = pipeline,
"level" = level,
"uploadDir" = uploadDir,
"sampleDirs" = sampleDirs,
"sampleMetadataFile" = as.character(sampleMetadataFile),
"sampleData" = sampleData,
"interestingGroups" = interestingGroups,
"cell2sample" = as.factor(cell2sample),
"organism" = organism,
"genomeBuild" = as.character(genomeBuild),
"ensemblRelease" = as.integer(ensemblRelease),
"rowRangesMetadata" = rowRangesMetadata,
"umiType" = umiType,
"allSamples" = allSamples,
# cellranger pipeline-specific -----------------------------------------
"refdataDir" = refdataDir,
"refJSON" = refJSON,
"call" = match.call()
)
# Add user-defined custom metadata, if specified
if (length(dots)) {
assert_are_disjoint_sets(names(metadata), names(dots))
metadata <- c(metadata, dots)
}
# Return ===================================================================
.new.SingleCellExperiment(
assays = list("counts" = counts),
rowRanges = rowRanges,
colData = colData,
metadata = metadata,
transgeneNames = transgeneNames,
spikeNames = spikeNames
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.