R/readVisium.R

Defines functions .extract_indices .read_scale_factors .read_spot_pos counts2h5 read10Xh5 readVisium

Documented in counts2h5 .extract_indices read10Xh5 .read_spot_pos readVisium

#' Load a Visium spatial dataset as a SingleCellExperiment.
#'
#' @param dirname Path to spaceranger output directory (e.g. "sampleID/outs/").
#'   This directory must contain the counts matrix and feature/barcode TSVs in
#'   \code{filtered_feature_bc_matrix/} for \code{readVisium}, or in
#'   \code{filtered_feature_bc_matrix.h5} for \code{read10Xh5}. Besides, it
#'   must also contain a file for spot positions named
#'   \code{spatial/tissue_positions_list.csv} (before Space Ranger V2.0) or
#'   \code{spatial/tissue_positions.csv} (since Space Ranger V2.0), as well as
#'   a file containing scale factors named \code{spatial/scalefactors_json.json}.
#'   (To understand the output directory, refer to the corresponding
#'   \href{https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/overview}{10X Genomics help page}.)
#' @param rm.feats.pat Patterns for features (genes) to remove.
#' @param fname File name of the h5 file. It should be inside \code{dirname}.
#'   (By default "filtered_feature_bc_matrix.h5")
#'
#' @return SingleCellExperiment containing the counts matrix in \code{counts}
#'   and spatial data in \code{colData}. Array coordinates for each spot are
#'   stored in columns \code{array_row} and \code{array_col}, while image
#'   coordinates are stored in columns \code{pxl_row_in_fullres} and
#'   \code{pxl_col_in_fullres}.
#'
#' @details We store two variables associated with downstream BayesSpace
#'   functions in a list called \code{BayesSpace.data} in the
#'   SingleCellExperiment's \code{metadata}.
#'   \itemize{
#'     \item \code{platform} is set to "Visium", and is used to determine spot
#'       layout and neighborhood structure.
#'     \item \code{is.enhanced} is set to \code{FALSE} to denote the object
#'       contains spot-level data.
#'   }
#'
#' @examples
#' \dontrun{
#' sce <- readVisium("path/to/outs/")
#' }
#'
#' @name readVisium
NULL

#' @export
#' @importFrom utils read.csv read.table
#' @importFrom scater uniquifyFeatureNames
#' @importFrom Matrix readMM colSums
#' @importFrom SingleCellExperiment SingleCellExperiment counts
#' @importFrom S4Vectors metadata metadata<-
#' @rdname readVisium
readVisium <- function(
    dirname,
    rm.feats.pat = c("^NegControl.*", "^BLANK.*", "^DEPRECATED.*")
) {
    spatial_dir <- file.path(dirname, "spatial")
    matrix_dir <- file.path(dirname, "filtered_feature_bc_matrix")

    if (!dir.exists(matrix_dir)) {
        stop("Matrix directory does not exist:\n  ", matrix_dir)
    }
    if (!dir.exists(spatial_dir)) {
        stop("Spatial directory does not exist:\n  ", spatial_dir)
    }

    rowData <- read.table(file.path(matrix_dir, "features.tsv.gz"), header = FALSE, sep = "\t")
    colnames(rowData) <- c("gene_id", "gene_name", "feature_type")
    rowData <- rowData[, c("gene_id", "gene_name")]
    rownames(rowData) <- scater::uniquifyFeatureNames(rowData$gene_id, rowData$gene_name)

    .counts <- Matrix::readMM(file.path(matrix_dir, "matrix.mtx.gz"))
    barcodes <- read.table(file.path(matrix_dir, "barcodes.tsv.gz"), header = FALSE, sep = "\t")
    colData <- .read_spot_pos(spatial_dir, barcodes)
    colnames(.counts) <- barcodes$V1
    rownames(.counts) <- rownames(rowData)
    .counts <- .counts[, rownames(colData)]
    
    if (!is.null(rm.feats.pat) && length(rm.feats.pat) > 0) {
      .rm.feats.pat <- paste(rm.feats.pat, collapse = "|")
      rowData <- rowData[!grepl(.rm.feats.pat, rowData[["gene_name"]]), ]
      
      .counts <- .counts[rownames(rowData), ]
    }
    
    scalef <- .read_scale_factors(spatial_dir)

    sce <- SingleCellExperiment(
        assays = list(counts = .counts),
        rowData = rowData,
        colData = colData
    )

    # Remove spots with no reads for all genes.
    sce <- sce[, Matrix::colSums(counts(sce)) > 0]

    metadata(sce)$BayesSpace.data <- list()
    metadata(sce)$BayesSpace.data$platform <- "Visium"
    metadata(sce)$BayesSpace.data$is.enhanced <- FALSE
    metadata(sce)$BayesSpace.data$scalef <- scalef

    sce
}

#' @export
#' @importFrom Matrix sparseMatrix colSums readMM
#' @importFrom SingleCellExperiment SingleCellExperiment counts
#' @importFrom S4Vectors metadata metadata<-
#' @importFrom rhdf5 h5read
#' @importFrom dplyr %>% group_by mutate select n case_when
#' @importFrom tibble column_to_rownames
#' @importFrom rlang .data
#' @rdname readVisium
read10Xh5 <- function(
    dirname,
    fname = "filtered_feature_bc_matrix.h5",
    rm.feats.pat = c("^NegControl.*", "^BLANK.*", "^DEPRECATED.*")
) {
    spatial_dir <- file.path(dirname, "spatial")
    h5_file <- file.path(dirname, fname)

    if (!dir.exists(spatial_dir)) {
      stop("Spatial directory does not exist:\n  ", spatial_dir)
    }

    if (!file.exists(h5_file)) {
      stop("H5 file does not exist:\n  ", h5_file)
    }

    colData <- .read_spot_pos(spatial_dir)

    non.zero.indices <- .extract_indices(
      h5read(h5_file, "matrix/indices"),
      h5read(h5_file, "matrix/indptr")
    )

    rowData <- data.frame(
      gene_id = h5read(h5_file, "matrix/features/id"),
      gene_name = h5read(h5_file, "matrix/features/name")
    ) %>%
      group_by(
        .data$gene_name
      ) %>%
      mutate(
        idx = 1:n(),
        row_name = case_when(
          max(idx) > 1 ~ paste(gene_name, gene_id, sep = "_"),
          TRUE ~ gene_name
        )
      ) %>%
      column_to_rownames("row_name") %>%
      select(
        -.data$idx
      )

    .counts <- sparseMatrix(
      i = non.zero.indices$i,
      j = non.zero.indices$j,
      x = h5read(h5_file, "matrix/data"),
      dims = h5read(h5_file, "matrix/shape"),
      dimnames = list(
        rownames(rowData),
        h5read(h5_file, "matrix/barcodes")
      ),
      index1 = FALSE
    )
    .counts <- .counts[, rownames(colData)]
    
    if (!is.null(rm.feats.pat) && length(rm.feats.pat) > 0) {
      .rm.feats.pat <- paste(rm.feats.pat, collapse = "|")
      rowData <- rowData[!grepl(.rm.feats.pat, rowData[["gene_name"]]), ]
      
      .counts <- .counts[rownames(rowData), ]
    }
    
    scalef <- .read_scale_factors(spatial_dir)

    sce <- SingleCellExperiment(
      assays = list(
        counts = .counts
      ),
      rowData = rowData,
      colData = colData
    )

    # Remove spots with no reads for all genes.
    sce <- sce[, Matrix::colSums(counts(sce)) > 0]

    metadata(sce)$BayesSpace.data <- list()
    metadata(sce)$BayesSpace.data$platform <- "Visium"
    metadata(sce)$BayesSpace.data$is.enhanced <- FALSE
    metadata(sce)$BayesSpace.data$scalef <- scalef

    sce
}

#' @export
#' @importFrom rhdf5 h5read h5createFile h5createGroup h5write
#' @rdname readVisium
counts2h5 <- function(dirname) {
  h5_file <- file.path(dirname, "filtered_feature_bc_matrix.h5")
  spatial_dir <- file.path(dirname, "spatial")
  matrix_dir <- file.path(dirname, "filtered_feature_bc_matrix")
  
  if (file.exists(h5_file)) {
    stop("H5 file exists:\n ", h5_file)
  }
  
  if (!dir.exists(matrix_dir)) {
    stop("Matrix directory does not exist:\n  ", matrix_dir)
  }
  if (!dir.exists(spatial_dir)) {
    stop("Spatial directory does not exist:\n  ", spatial_dir)
  }
  
  rowData <- read.table(file.path(matrix_dir, "features.tsv.gz"), header = FALSE, sep = "\t")
  colnames(rowData) <- c("gene_id", "gene_name", "feature_type")
  
  .counts <- readMM(file.path(matrix_dir, "matrix.mtx.gz"))
  counts <- matrix(
    as.integer(as.matrix(.counts)),
    nrow = dim(.counts)[1]
  )
  .counts <- as(.counts, "CsparseMatrix")
  barcodes <- read.table(file.path(matrix_dir, "barcodes.tsv.gz"), header = FALSE, sep = "\t")
  colData <- .read_spot_pos(spatial_dir, barcodes)
  
  h5createFile(h5_file)
  
  h5createGroup(h5_file, "matrix")
  
  h5write(barcodes[[1]], h5_file, "matrix/barcodes")
  h5write(counts[counts > 0], h5_file, "matrix/data")
  
  h5createGroup(h5_file, "matrix/features")
  h5write("genome", h5_file, "/matrix/features/_all_tag_keys")
  h5write(rowData$feature_type, h5_file, "/matrix/features/feature_type")
  h5write(rowData$gene_id, h5_file, "/matrix/features/id")
  h5write(rowData$gene_name, h5_file, "/matrix/features/name")
  
  h5write(.counts@i, h5_file, "matrix/indices")
  h5write(.counts@p, h5_file, "matrix/indptr")
  h5write(dim(counts), h5_file, "matrix/shape")
  
  NULL
}

#' Load spot positions.
#'
#' @param dirname Path to spaceranger outputs of spatial pipeline, i.e., "outs/spatial".
#'     This directory must contain a file for the spot positions at
#'     \code{tissue_positions_list.csv} (before Space Ranger V2.0) or
#'     \code{tissue_positions.csv} (since Space Ranger V2.0).
#'
#' @return Data frame of spot positions.
#'
#' @keywords internal
#'
#' @importFrom utils read.csv
#' @importFrom magrittr %>%
#' @importFrom dplyr inner_join rename
#' @importFrom tidyr uncount
#' @importFrom arrow read_parquet
#' @importFrom stats cor
#' @importFrom methods as
#' @importFrom rlang .data
.read_spot_pos <- function(dirname, barcodes = NULL) {
  if (file.exists(file.path(dirname, "tissue_positions_list.csv"))) {
      message("Loading Visium with SpaceRanger version < V2.0")
      colData <- read.csv(file.path(dirname, "tissue_positions_list.csv"), header = FALSE)
      colnames(colData) <- c("barcode", "in_tissue", "array_row", "array_col", "pxl_row_in_fullres", "pxl_col_in_fullres")
  } else if (file.exists(file.path(dirname, "tissue_positions.csv"))) {
      message("Loading Visium with SpaceRanger version >= V2.0")
      colData <- read.csv(file.path(dirname, "tissue_positions.csv"))
  } else if (file.exists(file.path(dirname, "tissue_positions.parquet"))) {
      message("Loading Visium HD")
      colData <- read_parquet(file.path(dirname, "tissue_positions.parquet")) %>%
        as.data.frame()
  } else {
      stop("No file for spot positions found in ", dirname)
  }

  if (!is.null(barcodes)) {
    colData <- inner_join(
      colData,
      barcodes,
      by = c("barcode" = "V1")
    )
  }
  
  # Sanity check.
  if (
    abs(cor(colData$array_row, colData$pxl_row_in_fullres)) < abs(cor(colData$array_row, colData$pxl_col_in_fullres)) &&
    abs(cor(colData$array_col, colData$pxl_col_in_fullres)) < abs(cor(colData$array_col, colData$pxl_row_in_fullres))
  ) {
    message("Warning! The coordinates with indices and pixels do not match. Swapping the pixel values between the row and column...")
    
    colData <- colData %>% rename(
      pxl_row_in_fullres = .data$pxl_col_in_fullres,
      pxl_col_in_fullres = .data$pxl_row_in_fullres
    )
  }
  
  rownames(colData) <- colData$barcode
  colData <- colData[colData$in_tissue > 0, ]
  return(colData)
}

#' @keywords internal
#' 
#' @importFrom rjson fromJSON
.read_scale_factors <- function(dirname) {
  filename <- file.path(dirname, "scalefactors_json.json")
  
  if (!file.exists(filename)) {
    stop(paste(filename, "does not exist!"))
  }
  
  fromJSON(file = filename)
}

#' Extract row and column indices of the count matrix from h5 file.
#'
#' @param idx Row index of corresponding element in the non-zero count matrix.
#' @param new.start Index of the start of each column corresponding to
#'     \code{idx} and the non-zero count matrix.
#' @param zero.based Whether the \code{} and \code{} are zero-based or not.
#'     (By default is TRUE)
#'
#' @return List of row (i) and column (j) indices of the non-zero elements
#'     in the count matrix.
#'
#' @keywords internal
#'
#' @importFrom tibble as_tibble
#' @importFrom tidyr uncount
.extract_indices <- function(idx, new.start, zero.based = TRUE) {
    if (length(idx) < 1) {
        return(NULL)
    }

    idx.cnts <- do.call(
        rbind,
        lapply(
            seq_len(length(new.start))[-1],
            function(x) c(x - ifelse(zero.based, 2, 1), new.start[[x]] - new.start[[x - 1]])
        )
    )
    colnames(idx.cnts) <- c("id", "n")

    return(
        list(
            i = idx,
            j = as.integer(uncount(as_tibble(idx.cnts), n)[[1]]),
            new.start = new.start
        )
    )
}
edward130603/BayesSpace documentation built on Oct. 23, 2024, 7:04 a.m.