R/read10xCounts.R

Defines functions .read_from_hdf5 .check_for_compressed .read_from_sparse .tenx_loader read10xCounts

Documented in read10xCounts

#' Load data from a 10X Genomics experiment
#' 
#' Creates a \linkS4class{SingleCellExperiment} from the CellRanger output directories for 10X Genomics data.
#' 
#' @param samples A character vector containing one or more directory names, each corresponding to a 10X sample.
#' Each directory should contain a matrix file, a gene/feature annotation file, and a barcode annotation file.
#' 
#' Alternatively, each string may contain a path to a HDF5 file in the sparse matrix format generated by 10X.
#' These can be mixed with directory names when \code{type="auto"}.
#'
#' Alternatively, each string may contain a prefix of names for the three-file system described above,
#' where the rest of the name of each file follows the standard 10X output.
#' @param sample.names A character vector of length equal to \code{samples}, containing the sample names to store in the column metadata of the output object.
#' If \code{NULL}, the file paths in \code{samples} are used directly.
#' @param col.names A logical scalar indicating whether the columns of the output object should be named with the cell barcodes.
#' @param type String specifying the type of 10X format to read data from.
#' @param version String specifying the version of the 10X format to read data from.
#' @param genome String specifying the genome if \code{type="HDF5"} and \code{version='2'}.
#' @param compressed Logical scalar indicating whether the text files are compressed for \code{type="sparse"} or \code{"prefix"}.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how loading should be parallelized for multiple \code{samples}.
#' 
#' @return A \linkS4class{SingleCellExperiment} object containing count data for each gene (row) and cell (column) across all \code{samples}.
#' \itemize{
#' \item Row metadata will contain the fields \code{"ID"} and \code{"Symbol"}.
#' The former is the gene identifier (usually Ensembl), while the latter is the gene name.
#' If \code{version="3"}, it will also contain the \code{"Type"} field specifying the type of feature (e.g., gene or antibody).
#' \item Column metadata will contain the fields \code{"Sample"} and \code{"Barcode"}.
#' The former contains the name of the sample (or if not supplied, the path in \code{samples}) from which each column was obtained.
#' The latter contains to the cell barcode sequence and GEM group for each cell library. 
#' \item Rows are named with the gene identifier.
#' Columns are named with the cell barcode in certain settings, see Details.
#' \item The assays will contain a single \code{"counts"} matrix, containing UMI counts for each gene in each cell.
#' Note that the matrix representation will depend on the format of the \code{samples}, see Details.
#' \item The metadata contains a \code{"Samples"} field, containing the input \code{samples} character vector.
#' }
#' 
#' @details
#' This function has a long and storied past.
#' It was originally developed as the \code{read10xResults} function in \pkg{scater}, inspired by the \code{Read10X} function from the \pkg{Seurat} package.
#' It was then migrated to this package in an effort to consolidate some 10X-related functionality across various packages.
#' 
#' If \code{type="auto"}, the format of the input file is automatically detected for each \code{samples} based on whether it ends with \code{".h5"}.
#' If so, \code{type} is set to \code{"HDF5"}; otherwise it is set to \code{"sparse"}.
#' \itemize{
#' \item If \code{type="sparse"}, count data are loaded as a \linkS4class{dgCMatrix} object.
#' This is a conventional column-sparse compressed matrix format produced by the CellRanger pipeline,
#' consisting of a (possibly Gzipped) MatrixMarket text file (\code{"matrix.mtx"})
#' with additional tab-delimited files for barcodes (\code{"barcodes.tsv"})
#' and gene annotation (\code{"features.tsv"} for version 3 or \code{"genes.tsv"} for version 2).
#' \item If \code{type="prefix"}, count data are also loaded as a \linkS4class{dgCMatrix} object.
#' This assumes the same three-file structure for each sample as described for \code{type="sparse"},
#' but each sample is defined here by a prefix in the file names rather than by being a separate directory.
#' For example, if the \code{samples} entry is \code{"xyx_"},
#' the files are expected to be \code{"xyz_matrix.mtx"}, \code{"xyz_barcodes.tsv"}, etc.
#' \item If \code{type="HDF5"}, count data are assumed to follow the 10X sparse HDF5 format for large data sets.
#' It is loaded as a \linkS4class{TENxMatrix} object, which is a stub object that refers back to the file in \code{samples}.
#' Users may need to set \code{genome} if it cannot be automatically determined when \code{version="2"}.
#' }
#'
#' When \code{type="sparse"} or \code{"prefix"} and \code{compressed=NULL},
#' the function will automatically search for both the unzipped and Gzipped versions of the files.
#' This assumes that the compressed files have an additional \code{".gz"} suffix.
#' We can restrict to only compressed or uncompressed files by setting \code{compressed=TRUE} or \code{FALSE}, respectively.
#' 
#' CellRanger 3.0 introduced a major change in the format of the output files for both \code{type}s.
#' If \code{version="auto"}, the version of the format is automatically detected from the supplied paths.
#' For \code{type="sparse"}, this is based on whether there is a \code{"features.tsv.gz"} file in the directory.
#' For \code{type="HDF5"}, this is based on whether there is a top-level \code{"matrix"} group with a \code{"matrix/features"} subgroup in the file.
#' 
#' Matrices are combined by column if multiple \code{samples} were specified.
#' This will throw an error if the gene information is not consistent across \code{samples}.
#' 
#' If \code{col.names=TRUE} and \code{length(sample)==1}, each column is named by the cell barcode.
#' For multiple samples, the index of each sample in \code{samples} is concatenated to the cell barcode to form the column name.
#' This avoids problems with multiple instances of the same cell barcodes in different samples.
#' 
#' Note that user-level manipulation of sparse matrices requires loading of the \pkg{Matrix} package.
#' Otherwise, calculation of \code{rowSums}, \code{colSums}, etc. will result in errors.
#' 
#' @author
#' Davis McCarthy, with modifications from Aaron Lun
#' 
#' @seealso
#' \code{\link{splitAltExps}}, to split alternative feature sets (e.g., antibody tags) into their own Experiments.
#' 
#' \code{\link{write10xCounts}}, to create 10X-formatted file(s) from a count matrix.
#' 
#' @examples
#' # Mocking up some 10X genomics output.
#' example(write10xCounts)
#' 
#' # Reading it in.
#' sce10x <- read10xCounts(tmpdir)
#' 
#' # Column names are dropped with multiple 'samples'.
#' sce10x2 <- read10xCounts(c(tmpdir, tmpdir))
#' 
#' @references
#' Zheng GX, Terry JM, Belgrader P, and others (2017).
#' Massively parallel digital transcriptional profiling of single cells. 
#' \emph{Nat Commun} 8:14049.
#' 
#' 10X Genomics (2017).
#' Gene-Barcode Matrices.
#' \url{https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/2.2/output/matrices}
#' 
#' 10X Genomics (2018).
#' Feature-Barcode Matrices.
#' \url{https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/matrices}
#' 
#' 10X Genomics (2018).
#' HDF5 Gene-Barcode Matrix Format.
#' \url{https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/2.2/advanced/h5_matrices}
#' 
#' 10X Genomics (2018).
#' HDF5 Feature Barcode Matrix Format.
#' \url{https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/h5_matrices}
#'
#' @export
#' @importFrom S4Vectors DataFrame
#' @importFrom SingleCellExperiment SingleCellExperiment
#' @importFrom BiocParallel SerialParam bplapply
read10xCounts <- function(samples, sample.names=names(samples), col.names=FALSE, 
    type=c("auto", "sparse", "HDF5", "prefix"), version=c("auto", "2", "3"), 
    genome=NULL, compressed=NULL, BPPARAM=SerialParam())
{
    type <- match.arg(type)
    version <- match.arg(version)
    if (is.null(sample.names)) {
        sample.names <- samples
    }

    load.out <- bplapply(samples, FUN=.tenx_loader, type=type, version=version, 
        genome=genome, compressed=compressed, BPPARAM=BPPARAM)

    nsets <- length(samples)
    full_data <- vector("list", nsets)
    gene_info_list <- vector("list", nsets)
    cell_info_list <- vector("list", nsets)

    for (i in seq_len(nsets)) { 
        current <- load.out[[i]]
        full_data[[i]] <- current$mat
        gene_info_list[[i]] <- current$gene.info
        cell.names <- current$cell.names
        cell_info_list[[i]] <- DataFrame(
            Sample = rep(sample.names[i], length(cell.names)), 
            Barcode = cell.names, row.names=NULL)
    }

    # Checking gene uniqueness. 
    if (nsets > 1 && length(unique(gene_info_list)) != 1L) {
        stop("gene information differs between runs")
    }
    gene_info <- gene_info_list[[1]]
    rownames(gene_info) <- gene_info$ID

    # Forming the full data matrix.
    full_data <- do.call(cbind, full_data)

    # Adding the cell data.
    cell_info <- do.call(rbind, cell_info_list)
    if (col.names) {
        if (nsets == 1L) {
            cnames <- cell_info$Barcode
        } else {
            sid <- rep(seq_along(cell_info_list), vapply(cell_info_list, nrow, 1L))
            cnames <- paste0(sid, "_", cell_info$Barcode)
        }
        colnames(full_data) <- cnames
    }

    SingleCellExperiment(list(counts = full_data), rowData = gene_info, colData = cell_info, metadata=list(Samples=samples))
}

.tenx_loader <- function(run, type, version, genome, compressed) {
    cur.type <- .type_chooser(run, type)
    if (cur.type=="sparse") {
        .read_from_sparse(run, version=version, compressed=compressed)
    } else if (cur.type=="prefix") {
        .read_from_sparse(run, version=version, is.prefix=TRUE, compressed=compressed)
    } else {
        .read_from_hdf5(run, genome=genome, version=version)
    }
}

#' @importFrom methods as
#' @importClassesFrom Matrix dgCMatrix
#' @importFrom Matrix readMM
#' @importFrom utils read.delim
.read_from_sparse <- function(path, version, is.prefix=FALSE, compressed=NULL) {
    FUN <- if (is.prefix) paste0 else file.path

    if (version=="auto") {
        target <- FUN(path, "features.tsv")
        target <- .check_for_compressed(target, compressed, error=FALSE)
        version <- if (file.exists(target)) "3" else "2"
    }

    bname <- "barcodes.tsv"
    mname <- "matrix.mtx"
    if (version=="3") {
        gname <- "features.tsv"
    } else {
        gname <- "genes.tsv"
    }

    barcode.loc <- FUN(path, bname)
    gene.loc <- FUN(path, gname)
    matrix.loc <- FUN(path, mname)

    barcode.loc <- .check_for_compressed(barcode.loc, compressed)
    gene.loc <- .check_for_compressed(gene.loc, compressed)
    matrix.loc <- .check_for_compressed(matrix.loc, compressed)

    gene.info <- read.delim(gene.loc, header=FALSE, colClasses="character", 
        stringsAsFactors=FALSE, quote="", comment.char="")
    if (version=="3") {
        colnames(gene.info) <- c("ID", "Symbol", "Type")
    } else {
        colnames(gene.info) <- c("ID", "Symbol")
    }

    list(
        mat=as(readMM(matrix.loc), "dgCMatrix"),
        cell.names=readLines(barcode.loc),
        gene.info=gene.info
    )
}

.check_for_compressed <- function(path, compressed, error=TRUE) {
    original <- path
    if (isTRUE(compressed)) {
        path <- paste0(path, ".gz")
    } else if (is.null(compressed) && !file.exists(path)) {
        path <- paste0(path, ".gz")
        if (error && !file.exists(path)) {
            # Explicit error here to avoid users getting confused.
            stop(sprintf("cannot find '%s' or its gzip-compressed form", original)) 
        }
    }
    path
}

#' @importFrom rhdf5 h5ls h5read
#' @importFrom HDF5Array TENxMatrix
#' @importFrom utils head
.read_from_hdf5 <- function(path, genome=NULL, version) {
    path <- path.expand(path) # eliminate tilde's.
    available <- h5ls(path, recursive=FALSE)
    available <- available[available$otype=="H5I_GROUP",]

    if (version=="auto") {
        version <- if ("matrix" %in% available$name) "3" else "2"
    }

    if (version=="2") {
        group <- genome
        if (is.null(group)) {
            if (nrow(available) > 1L) {
                to.see <- head(available$name, 3)
                if (length(to.see)==3L) {
                    to.see[3] <- "..."
                }
                stop("more than one available group (", paste(to.see, collapse=", "), ")")
            } else if (nrow(available) == 0L) {
                stop("no available groups")
            }
            group <- available$name
        }

        gene.info <- data.frame(
            ID=as.character(h5read(path, paste0(group, "/genes"))),
            Symbol=as.character(h5read(path, paste0(group, "/gene_names"))),
            stringsAsFactors=FALSE
        )
    } else {
        group <- "matrix"
        gene.info <- data.frame(
            ID=as.character(h5read(path, paste0(group, "/features/id"))),
            Symbol=as.character(h5read(path, paste0(group, "/features/name"))),
            Type=as.character(h5read(path, paste0(group, "/features/feature_type"))),
            stringsAsFactors=FALSE
        )
    }

    mat <- TENxMatrix(path, group)
    dimnames(mat) <- NULL # for consistency.
    list(
        mat=mat,
        cell.names=as.character(h5read(path, paste0(group, "/barcodes"))),
        gene.info=gene.info
    )
}
LTLA/crio documentation built on Dec. 18, 2021, 3:40 a.m.