#' TENxH5: The HDF5 file representation class for 10X Data
#'
#' @description This class is designed to work with 10x Single Cell datasets.
#' It was developed using the PBMC 3k 10X dataset from the CellRanger v2
#' pipeline.
#'
#' @slot version character(1) There are currently two recognized versions
#' associated with 10X data, either version "2" or "3". See details for more
#' information.
#'
#' @slot group character(1) The HDF5 group embedded within the file structure,
#' this is usually either the "matrix" or "outs" group but other groups are
#' supported as well.
#'
#' @slot ranges character(1) The HDF5 internal folder location embedded within
#' the file that points to the ranged data information, e.g.,
#' "/features/interval".
#'
#' @details The data version "3" mainly includes a "matrix" group and "interval"
#' information within the file. Version "2" data does not include
#' ranged-based information and has a different directory structure compared
#' to version "3". See the internal `data.frame`: `TENxIO:::h5.version.map` for
#' a map of fields and their corresponding file locations within the H5 file.
#' This map is used to create the `rowData` structure from the file.
#'
#' @section import:
#' The `import` method uses `DelayedArray::TENxMatrix` to represent matrix
#' data. Generally, version 3 datasets contain associated genomic coordinates.
#' The associated feature data, as displayed by the `rowData` method, is
#' queried for the "Type" column which will indicate that a `splitAltExps`
#' operation is appropriate. If a `ref` input is provided to the constructor
#' function `TENxH5`, it will be used as the main experiment; otherwise, the
#' most frequent category in the "Type" column will be used. For example,
#' the Multiome ATAC + Gene Expression feature data contains both 'Gene
#' Expression' and 'Peaks' labels in the "Type" column.
#'
#' @return A `TENxH5` class object
#'
#' @seealso [TENxH5]
#'
#' @include TENxFile-class.R
#'
#' @exportClass TENxH5
.TENxH5 <- setClass(
Class = "TENxH5",
contains = "TENxFile",
slots = c(version = "character", group = "character", ranges = "character")
)
.get_h5_group <- function(fpath, remote) {
l1 <- rhdf5::h5ls(fpath, recursive = FALSE, s3 = remote)
l1[l1$otype == "H5I_GROUP", "name"]
}
.KNOWN_H5_GROUPS <- c("matrix", "outs")
.KNOWN_VERSIONS <- c("3", "2")
.check_h5_group <- function(group) {
g_msg <- paste(.KNOWN_H5_GROUPS, collapse = ", ")
if (!group %in% .KNOWN_H5_GROUPS)
warning("'group' not in known 10X groups: ", g_msg, call. = FALSE)
}
.getDim <- function(file, group, remote) {
rhdf5::h5read(file, paste0(group, "/", "shape"), s3 = remote)
}
.get_tenx_version <- function(group) {
.KNOWN_VERSIONS[match(group, .KNOWN_H5_GROUPS)]
}
# Constructor -------------------------------------------------------------
#' TENxH5: Represent H5 files from 10X
#'
#' This constructor function was developed using the PBMC 3K dataset from 10X
#' Genomics (version 3). Other versions are supported and input arguments
#' `version` and `group` can be overridden.
#'
#' @details The various `TENxH5` methods including `rowData` and `rowRanges`,
#' provide a snapshot of the data using a length 12 head and tail subset for
#' efficiency. In contrast, methods such as `dimnames` and `dim` give
#' a full view of the dimensions of the data. The `show` method provides
#' relevant information regarding the dimensions of the data including
#' metadata such as `rowData` and "Type" column, if available. The data files
#' are `import`ed as `SingleCellExperiment` class objects.
#'
#' An additional `ref` argument can be provided when the file contains
#' multiple `feature_type` in the file or "Type" in the `rowData`. By default,
#' the first type reported in `table()` is set as the `mainExpName` in the
#' `SingleCellExperiment` object.
#'
#' For data that do not contain genomic coordinate information, the `TENxH5`
#' will fail to read `"/features/interval"` and will set the `ranges`
#' argument to `NA_character_`.
#'
#' The data version "3" mainly includes a "matrix" group and "interval"
#' information within the file. Version "2" data does not include
#' ranged-based information and has a different directory structure compared
#' to version "3". See the internal `data.frame`: `TENxIO:::h5.version.map`
#' for a map of fields and their corresponding file locations within the H5
#' file. This map is used to create the `rowData` structure from the file.
#'
#' @inheritParams TENxFile
#'
#' @param version character(1) There are currently two recognized versions
#' associated with 10X data, either version "2" or "3". See details for more
#' information.
#'
#' @param group character(1) The HDF5 group embedded within the file structure,
#' this is usually either the "matrix" or "outs" group but other groups are
#' supported as well (e.g., "mm10").
#'
#' @param ranges character(1) The HDF5 internal folder location embedded within
#' the file that points to the ranged data information, e.g.,
#' "/features/interval". Set to `NA_character_` if range information is not
#' present.
#'
#' @param rowidx,colidx numeric() A vector of indices corresponding to either
#' rows or columns that will dictate the data imported from the file. The
#' indices will be passed on to the `[` method of the `TENxMatrix`
#' representation.
#'
#' @return Usually, a `SingleCellExperiment` instance
#'
#' @seealso `import` section in [TENxH5-class]
#'
#' @examples
#'
#' h5f <- system.file(
#' "extdata", "pbmc_granulocyte_ff_bc_ex.h5",
#' package = "TENxIO", mustWork = TRUE
#' )
#'
#' TENxH5(h5f)
#'
#' import(TENxH5(h5f))
#'
#' h5f <- system.file(
#' "extdata", "10k_pbmc_ATACv2_f_bc_ex.h5",
#' package = "TENxIO", mustWork = TRUE
#' )
#'
#' ## Optional ref input, most frequent Type used by default
#' th5 <- TENxH5(h5f, ranges = "/features/id", ref = "Peaks")
#' th5
#' TENxH5(h5f, ranges = "/features/id")
#' import(th5)
#'
#' @export
TENxH5 <-
function(resource, version, group, ranges, rowidx, colidx, ...)
{
remote <- R.utils::isUrl(resource)
group <- .get_h5_group(resource, remote)
.check_h5_group(group)
if (missing(version))
version <- .get_tenx_version(group)
dims <- .getDim(resource, group, remote)
ext <- list(...)[["extension"]]
if (is.null(ext))
ext <- .get_ext(resource)
if (!identical(tolower(ext), "h5"))
warning("File extension is not 'h5'; import may fail", call. = FALSE)
if (missing(ranges))
ranges <- .selectByVersion(h5.version.map, version, "Ranges")
ranges <- .validateRanges(resource, group, ranges, remote)
if (missing(rowidx))
rowidx <- seq_len(dims[[1L]])
if (missing(colidx))
colidx <- seq_len(dims[[2L]])
.TENxH5(
resource = resource, group = group, version = version, ranges = ranges,
rowidx = rowidx,
colidx = colidx,
remote = remote,
extension = ext
)
}
h5.version.map <- data.frame(
Version = c("3", "2"),
ID = c("/features/id", "/genes"),
Symbol = c("/features/name", "/gene_names"),
Type = c("/features/feature_type", NA_character_),
Ranges = c("/features/interval", NA_character_)
)
.selectByVersion <-
function(df, version, select = !names(df) %in% c("Version", "Ranges"))
{
if (missing(version) || is.na(version))
NA_character_
else
df[df[["Version"]] == version, select]
}
.validateRanges <- function(file, group, ranges, remote) {
if (!isScalarCharacter(ranges))
NA_character_
else
tryCatch({
rhdf5::h5read(
file,
name = paste0(group, ranges),
index = list(1L),
s3 = remote
)
ranges
}, error = function(e) {
warning(
simpleWarning(
message = conditionMessage(e), call = conditionCall(e)
)
)
NA_character_
})
}
.read_rowData <- function(x, nrows) {
gm <- .selectByVersion(h5.version.map, x@version)
remote <- x@remote
gm[] <- Filter(Negate(is.na), gm)
res <- lapply(gm, function(colval) {
readname <- paste0(x@group, colval)
as.character(
rhdf5::h5read(path(x), index = list(nrows), readname, s3 = remote)
)
})
DF <- as(res, "DataFrame")
if ("Type" %in% names(DF))
DF[["Type"]] <- as.factor(DF[["Type"]])
rownames(DF) <- nrows
DF
}
#' @describeIn TENxH5 Generate the rowData ad hoc from a TENxH5 file
#'
#' @param x A `TENxH5` object
#'
#' @inheritParams SummarizedExperiment::rowData
#'
#' @export
setMethod("rowData", "TENxH5", function(x, use.names = TRUE, ...) {
nrows <- list(...)[["rows"]]
## Implement a smaller index for display purposes only
mxrow <- max(x@rowidx)
if (is.null(nrows) && mxrow > 12)
nrows <- c(seq(6), mxrow - 5:0)
.read_rowData(x, nrows)
})
#' @describeIn TENxH5 Get the dimensions of the data as stored in the file
#' @export
setMethod("dim", "TENxH5", function(x) {
c(length(x@rowidx), length(x@colidx))
})
#' @describeIn TENxH5 Get the dimension names from the file
#' @export
setMethod("dimnames", "TENxH5", function(x) {
id <- .selectByVersion(h5.version.map, x@version, "ID")
list(
rhdf5::h5read(path(x), paste0(x@group, "/", id), s3 = x@remote),
rhdf5::h5read(path(x), paste0(x@group, "/", "barcodes"), s3 = x@remote)
)
})
#' @describeIn TENxH5 Read genome string from file
#' @importFrom GenomeInfoDb genome genome<-
#' @export
setMethod("genome", "TENxH5", function(x) {
group <- x@group
version <- x@version
remote <- x@remote
if (is.na(x@ranges))
stop("'rowRanges' data not available, e.g., in '/features/interval'")
gens <- rhdf5::h5read(
path(x), paste0(group, "/", "features/genome"), s3 = remote
)
ugens <- unique(gens)
intervals <- rhdf5::h5read(
path(x), paste0(group, "/", x@ranges), s3 = remote
)
splitints <- strsplit(intervals, ":", fixed = TRUE)
seqnames <- vapply(splitints, `[[`, character(1L), 1L)
if (any(seqnames == "NA"))
warning("'seqlevels' contain NA values")
if (identical(length(ugens), 1L)) {
useq <- unique(seqnames)
.setNames(rep(ugens, length(useq)), useq)
} else
vapply(split(gens, seqnames), unique, character(1L))
})
#' @describeIn TENxH5 Read interval data and represent as GRanges
#' @importFrom S4Vectors mcols<-
#' @export
setMethod("rowRanges", "TENxH5", function(x, ...) {
group <- x@group
version <- x@version
remote <- x@remote
if (is.na(x@ranges))
stop("'rowRanges' data not available, e.g., in '/features/interval'")
rows <- list(...)[["rows"]]
## Implement a smaller index for display purposes only
mxrow <- max(x@rowidx)
if (is.null(rows) && mxrow > 12)
rows <- c(seq(6), mxrow - 5:0)
interval <- rhdf5::h5read(
path(x), paste0(group, x@ranges), list(rows), s3 = remote
)
## Hack to allow NA ranges for later removal (keeping data parallel)
interval[interval == "NA"] <- "NA_character_:0"
gr <- as(as.character(interval), "GRanges")
names(gr) <- rows
mcols(gr) <- rowData(x, rows = rows)
genome(gr) <- genome(x)
cat("preview <= 12 rowRanges:", basename(x@resource), "\n")
gr
})
#' @describeIn TENxH5 Import TENxH5 data as a SingleCellExperiment; see section
#' below
#'
#' @importFrom MatrixGenerics rowRanges
#' @importFrom BiocBaseUtils isScalarCharacter
#' @importFrom S4Vectors isEmpty
#'
#' @inheritParams BiocIO::import
#'
#' @export
setMethod("import", "TENxH5", function(con, format, text, ...) {
matrixdata <-
HDF5Array::TENxMatrix(path(con), con@group)[con@rowidx, con@colidx]
dots <- list(...)
if (!con@version %in% c("2", "3"))
stop("Version not supported.")
sce <- SingleCellExperiment(
assays = list(counts = matrixdata),
metadata = metadata(con)
)
if (anyDuplicated(rownames(sce))) {
warning(
"'rownames' in assay are not unique; using 'make.unique()'"
)
rownames(sce) <- make.unique(rownames(sce))
}
if (identical(con@version, "3")) {
if (!is.na(con@ranges)) {
rr <- rowRanges(con, rows = con@rowidx)
names(rr) <- mcols(rr)[["ID"]]
rowRanges(sce) <- rr
okrows <- seqnames(rr) != "NA_character_"
## remove stand-in NA values
sce <- sce[okrows, ]
con@rowidx <- con@rowidx[as.logical(okrows)]
}
if (!isEmpty(rowData(con))) {
rowData(sce) <- .read_rowData(con, con@rowidx)
}
}
types <- rowData(sce)[["Type"]]
if (!is.null(types)) {
ref <- dots[["ref"]]
if (is.null(ref) || is.na(ref))
ref <- names(table(types)[1L])
if (isScalarCharacter(ref))
sce <- splitAltExps(sce, types, ref = ref)
}
sce
})
#' @describeIn TENxH5 Display a snapshot of the contents within a TENxH5 file
#' before import
#'
#' @param object A `TENxH5` class object
#'
#' @importFrom BiocBaseUtils selectSome
#' @export
setMethod("show", "TENxH5", function(object) {
rn <- cn <- NULL
rno <- rownames(object)
cno <- colnames(object)
if (length(rno))
rn <- selectSome(rno)
if (length(cno))
cn <- selectSome(cno)
rd <- rowData(object)
rdnames <- selectSome(names(rd))
rdnamecount <- paste0("names(", length(rd), "):")
cat(
class(object), "object",
"\nresource:", path(object),
"\ndim:", dim(object),
"\nrownames:", rn,
"\nrowData", rdnamecount, rdnames,
if (length(rd[["Type"]]))
"\n Type:", selectSome(levels(rd[["Type"]])),
"\ncolnames:", cn,
"\n", sep = " "
)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.