R/aggregate.R

Defines functions .make_grid_samples .aggregate_SFE .aggregate_sample .aggregate_sample_tx .aggregate_sample_cell .aggregate_num aggregateTxTech aggregateTx

Documented in aggregateTx aggregateTxTech

#' Aggregate transcript spots from file
#'
#' This function reads the transcript spot file from the standard output of the
#' commercial technologies (not GeoParquet) for spatial aggregation where the
#' spots are assigned to polygons such as cells or spatial bins. Presets for
#' Xenium, MERFISH, and CosMX are available. For Vizgen and Xenium, the images
#' can be added when \code{add_images = TRUE}.
#'
#' @param df If the file is already loaded into memory, a data frame (sf) with
#'   columns for the x, y, and optionally z coordinates and gene assignment of
#'   each transcript spot. If specified, then argument \code{file} will be
#'   ignored.
#' @param by A \code{sfc} or \code{sf} object for spatial aggregation.
#' @param new_geometry_name Name to give to the new \code{colGeometry} in the
#'   output. Defaults to "bins".
#' @param flip_geometry Logical, whether to flip the transcript spot geometries
#'   to match the images if added later.
#' @param image String, which image(s) to add to the output SFE object. Not
#'   applicable to CosMX. See \code{\link{readVizgen}} and
#'   \code{\link{readXenium}} for options and multiple images can be specified.
#'   If \code{NULL}, then the default from the read function for the technology
#'   will be used.
#' @param sparse Logical, whether the gene count matrix from aggregating
#'   transcript spots should be sparse. When the bins are large, the matrix will
#'   not be very sparse so using sparse matrix will not save memory, but when
#'   the bins are small, sparsity is worth it.
#' @param BPPARAM bpparam object to specify parallel computing over genes. If a
#'   lot of memory is used, then stick to `SerialParam()`.
#' @param .orig_nrows Only used internally in the SFE method of \code{aggregate}
#' @inheritParams formatTxTech
#' @inheritParams formatTxSpots
#' @inheritParams readXenium
#' @inheritParams sf::st_make_grid
#' @inheritParams SpatialFeatureExperiment
#' @note The resulting SFE object often includes geometries (e.g. grid cells)
#'   outside tissue, because there can be transcript spots detected outside the
#'   tissue. Also, bins at the edge of the tissue that don't fully overlap with
#'   the tissue will have lower transcript counts; this may have implications to
#'   downstream spatial analyses.
#' @return A SFE object with count matrix for number of spots of each gene in
#'   each geometry. Geometries with no spot are removed.
#' @importFrom data.table as.data.table
#' @importFrom sf st_make_grid
#' @importFrom rlang %||%
#' @concept Geometric operations
#' @export
aggregateTx <- function(file, df = NULL, by = NULL, sample_id = "sample01",
                        spatialCoordsNames = c("X", "Y", "Z"),
                        gene_col = "gene",
                        phred_col = "qv", min_phred = 20, flip_geometry = FALSE,
                        cellsize = NULL, square = TRUE, flat_topped = FALSE,
                        new_geometry_name = "bins", unit = "micron", sparse = FALSE,
                        BPPARAM = SerialParam(), .orig_nrows = NULL) {
    # This is only for one file, one sample
    if (!is.null(df)) file <- df
    mols <- .check_tx_file(file, spatialCoordsNames, gene_col, phred_col,
                           min_phred, flip_geometry)
    if (inherits(mols, "data.table"))
        mols <- mols[,c(spatialCoordsNames, "gene"), with=FALSE]
    else mols <- mols[,c(spatialCoordsNames, "gene")]
    mols <- df2sf(mols, spatialCoordsNames = spatialCoordsNames,
                  geometryType = "POINT")
    if (is.null(by))
        by <- st_make_grid(mols, cellsize = cellsize, square = square,
                           flat_topped = flat_topped)
    else if (inherits(by, "sf")) by <- st_geometry(by)
    mols <- split(st_geometry(mols), mols[["gene"]])
    if (sparse) { # TODO: This uses a lot of memory. Try Arrow dataset, querying one gene at a time, 
        # then create TileDB right from the beginning
        # Iterate over the genes, count number of transcripts in each bin for the gene
        
        # Special case from .aggregate_SFE, where numeric L1 is used for genes
        if (gene_col == "L1") {
            ml <- bplapply(names(mols), function(i) {
                x <- mols[[i]]
                ll <- lengths(st_intersects(by, x))
                j <- which(ll > 0) # When the spots fall outside all bins
                if (!length(j)) return(NULL)
                data.frame(i = as.integer(i), j = j, x = ll[j])
            }, BPPARAM = BPPARAM)
        } else {
            ml <- bplapply(seq_along(mols), function(i) {
                x <- mols[[i]]
                ll <- lengths(st_intersects(by, x))
                j <- which(ll > 0)
                data.frame(i = i, j = j, x = ll[j])
            }, BPPARAM = BPPARAM)
        }
        ml <- data.table::rbindlist(ml)
        if (gene_col == "L1") {
            nrows_use <- .orig_nrows %||% as.integer(tail(names(mols), 1))
            new_mat <- sparseMatrix(i = ml$i, j = ml$j, x = ml$x, dims = c(nrows_use, length(by)),
                                    dimnames = list(as.character(seq_len(nrows_use)), seq_along(by)))
        } else {
            new_mat <- sparseMatrix(i = ml$i, j = ml$j, x = ml$x, dims = c(length(mols), length(by)),
                                    dimnames = list(names(mols), seq_along(by)))
        }
    } else {
        ml <- bplapply(mols, function(x) {
            inds <- st_intersects(by, x)
            lengths(inds)
        }, BPPARAM = BPPARAM)
        new_mat <- matrix(unlist(ml), nrow = length(by), ncol = length(mols),
                          dimnames = list(seq_along(by), names(mols)))
        new_mat <- t(new_mat)
    }
    new_mat <- new_mat[,colSums(new_mat) > 0] # Remove empty grid cells
    grid_sf <- st_sf(geometry = by)
    cgs <- list(bins = grid_sf[colnames(new_mat), "geometry"])
    names(cgs) <- new_geometry_name
    SpatialFeatureExperiment(assays = list(counts = new_mat),
                             colGeometries = cgs, unit = unit, sample_id = sample_id)
}

#' @rdname aggregateTx
#' @export
aggregateTxTech <- function(data_dir, df = NULL, by = NULL,
                            tech = c("Vizgen", "Xenium", "CosMX"),
                            sample_id = "sample01",
                            image = NULL,
                            min_phred = 20, flip = c("geometry", "image", "none"),
                            max_flip = "50 MB",
                            cellsize = NULL, square = TRUE, flat_topped = FALSE,
                            new_geometry_name = "bins", sparse = FALSE,
                            BPPARAM = SerialParam()) {
    tech <- match.arg(tech)
    flip <- match.arg(flip)
    c(spatialCoordsNames, gene_col, cell_col, fn) %<-%
        getTechTxFields(tech, data_dir)
    if (tech == "Xenium") {
        c(xoa_version, major_version, minor_version, instrument_version) %<-%
            .get_XOA_version(data_dir)
    }
    img_choices <- image <- switch (
        tech,
        Vizgen = c("DAPI", "PolyT", "Cellbound"),
        Xenium = if (major_version > 1L) "morphology_focus" else c("morphology_focus", "morphology_mip"),
        NA
    )
    if (is.null(image)) {
        image <- img_choices
    } else {
        image <- match.arg(image, img_choices, several.ok = TRUE)
    }
    if (tech == "Xenium") {
        c(img_df, flip) %<-% .get_xenium_images(data_dir, image, major_version,
                                                flip, max_flip, sample_id)
    } else if (tech == "Vizgen") {
        c(img_df, flip) %<-% .get_vizgen_images(data_dir, image, flip, max_flip,
                                                z = "all", sample_id = sample_id)
    } else {
        img_df <- NULL
        flip <- "none"
    }
    sfe <- aggregateTx(file = fn, df = df, by = by, sample_id = sample_id,
                spatialCoordsNames = spatialCoordsNames,
                gene_col = gene_col,
                phred_col = "qv", min_phred = min_phred,
                flip_geometry = (flip == "geometry"),
                cellsize = cellsize, square = square, flat_topped = flat_topped,
                new_geometry_name = new_geometry_name, BPPARAM = BPPARAM, 
                sparse = sparse)
    imgData(sfe) <- img_df
    sfe
}

.aggregate_num <- function(mat, inds, FUN, fun_name, BPPARAM) {
    if (fun_name %in% c("sum", "mean")) {
        i <- unlist(inds) # cell index
        ll <- lengths(inds) # bin index
        j <- rep(seq_along(inds), times = ll)
        if (fun_name == "sum")
            mat_agg <- sparseMatrix(i = i, j = j, x = 1,
                                    dims = c(ncol(mat), length(inds)))
        else if (fun_name == "mean") {
            mat_agg <- sparseMatrix(i = i, j = j,
                                    x = rep(1/ll, times = ll),
                                    dims = c(ncol(mat), length(inds)))
        }
        # mat has genes in rows and cells in columns
        out_agg <- mat %*% mat_agg
        # Variance and standard deviation can also be expressed with matrix
        # algebra. Not implementing yet since most likely they're not commonly
        # used for aggregation purposes.
    } else if (fun_name %in% getNamespaceExports("sparseMatrixStats")) {
        out_agg <- bplapply(inds, function(i) {
            m <- mat[,i,drop = FALSE]
            FUN(m)
        }, BPPARAM = BPPARAM)
        out_agg <- matrix(unlist(out_agg), ncol = length(inds))
        rownames(out_agg) <- rownames(mat)
    } else stop("Function ", fun_name, " not supported for aggregating SFE.")
    if (inherits(out_agg, "dgeMatrix")) out_agg <- as.matrix(out_agg)
    out_agg
}

# For one sample
.aggregate_sample_cell <- function(x, by, FUN, fun_name, colGeometryName, join,
                                   new_geometry_name, BPPARAM) {
    cg <- colGeometry(x, type = colGeometryName)
    # Preliminary check of overlap
    if (!join(st_as_sfc(st_bbox(by)), st_as_sfc(st_bbox(cg)), sparse = FALSE))
        stop("`by` does not overlap with this sample")
    inds <- join(by, cg) # somewhat faster than join(cg, by) when by has fewer geometries than cg
    not_empty <- which(lengths(inds) > 0L)
    if (!length(not_empty)) stop("`by` does not overlap with this sample")
    by <- by[not_empty]
    inds <- inds[not_empty]
    cnts <- counts(x)
    # Aggregate the gene count matrix, only do the counts assay
    # Express this as a matrix multiplication for sum. Use sweep to deal with mean
    cnts_agg <- .aggregate_num(cnts, inds, FUN, fun_name, BPPARAM)
    # Aggregate numeric columns of colData; logical are converted to numeric
    df <- colData(x)
    df$sample_id <- NULL
    which_logical <- which(vapply(df, is.logical, FUN.VALUE = logical(1)))
    for (i in which_logical) {
        df[,i] <- as.integer(df[,i])
    }
    numeric_cols <- vapply(df, is.numeric, FUN.VALUE = logical(1))
    if (any(numeric_cols)) {
        cd <- t(as.matrix(df[,numeric_cols, drop = FALSE]))
        cd_agg <- .aggregate_num(cd, inds, FUN, fun_name, BPPARAM) |> Matrix::t()
        cd_agg <- as(cd_agg, "DFrame")
    } else cd_agg <- make_zero_col_DFrame(length(inds))
    # Deal with anything that is neither numerical nor logical
    # Primarily character and factor. What about list columns? I don't know.
    # For now, the list columns will be split just like categorical variables
    # and become nested lists
    if (!all(numeric_cols)) {
        df_bin <- data.frame(bin = rep(seq_along(inds), times = lengths(inds)),
                             index = unlist(inds))
        df_bin <- df_bin[order(df_bin$index),]
        names_not_num <- names(df)[!numeric_cols]
        cat_agg <- matrix(NA, nrow = length(inds), ncol = length(names_not_num))
        colnames(cat_agg) <- names_not_num
        cat_agg <- data.frame(cat_agg)
        if (nrow(df_bin) != nrow(df)) {
            df_inds <- data.frame(index = seq_len(nrow(df)))
            df_bin <- merge(df_inds, df_bin, all.x = TRUE, by = "index")
        }
        for (n in names_not_num)
            cat_agg[[n]] <- split(df[[n]], df_bin$bin)
        cd_agg <- cbind(cat_agg, cd_agg)
    }
    cgs <- list(bins = st_sf(geometry = by))
    names(cgs) <- new_geometry_name
    out <- SpatialFeatureExperiment(assays = list(counts = cnts_agg), colData = cd_agg,
                                    sample_id = sampleIDs(x),
                                    colGeometries = cgs)
    colnames(out) <- seq_along(by)
    rownames(out) <- rownames(x)
    out
}

# Might turn this into an exported function
.aggregate_sample_tx <- function(x, by, rowGeometryName, new_geometry_name, 
                                 sparse = FALSE, BPPARAM = SerialParam(),
                                 .orig_nrows = NULL) {
    rg <- rowGeometry(x, rowGeometryName)
    if (!is.null(st_z_range(rg)))
        rg <- st_zm(rg)
    if (st_geometry_type(rg, by_geometry = FALSE) == "GEOMETRY") {
        # Happens after cropping and producing empty geometries
        rg <- st_cast(rg, "MULTIPOINT") |> st_zm()
    }
    grid_sf <- st_sf(grid_id = seq_along(by), geometry = by)
    tx_coords <- st_coordinates(rg) |> as.data.frame()
    scn <- c("X", "Y")
    out <- aggregateTx(df = tx_coords, spatialCoordsNames = scn,
                       gene_col = "L1", by = by, sparse = sparse,
                       BPPARAM = BPPARAM, .orig_nrows = .orig_nrows,
                       sample_id = sampleIDs(x))
    rownames(out) <- rownames(x)
    out
}

.aggregate_sample <- function(x, by = NULL, FUN = sum, fun_name,
                              colGeometryName = 1L, rowGeometryName = NULL,
                              join = st_intersects, new_geometry_name = "bins",
                              sparse = FALSE,
                              BPPARAM = SerialParam(), .orig_nrows = NULL) {
    # Here x is an SFE object with one sample
    # by is sfc
    # Can't do S4 method with signature for `by` because the argument `by` isn't
    # in the generic and I don't want to mess with the `aggregate` function in
    # other packages
    if (!is.null(rowGeometryName)) {
        .aggregate_sample_tx(x, by, rowGeometryName, new_geometry_name, 
                             sparse = sparse, BPPARAM = BPPARAM, 
                             .orig_nrows = .orig_nrows)
    } else {
        .aggregate_sample_cell(x, by, FUN, fun_name, colGeometryName, join,
                               new_geometry_name, BPPARAM)
    }
}

.aggregate_SFE <- function(x, by = NULL, FUN = sum, sample_id = "all",
                           colGeometryName = 1L, rowGeometryName = NULL,
                           cellsize = NULL, square = TRUE, flat_topped = FALSE,
                           new_geometry_name = "bins", join = st_intersects,
                           sparse = FALSE,
                           BPPARAM = SerialParam()) {
    sample_id <- .check_sample_id(x, sample_id, one = FALSE)
    if (is.null(by) && is.null(cellsize)) {
        stop("Either `by` or `cellsize` must be specified.")
    }
    # Make grid for multiple samples if `by` is not specified
    if (is.null(by)) {
        by <- .make_grid_samples(x, sample_id,
                                 cellsize, square, flat_topped)
    }
    if (is.list(by) && !inherits(by, "sfc") && !inherits(by, "sf")) {
        if (!any(sample_id %in% names(by)))
            stop("None of the geometries in `by` correspond to sample_id")
        by <- by[intersect(sample_id, names(by))]
    } else {
        if (!inherits(by, "sfc") && !inherits(by, "sf"))
            stop("`by` must be either sf or sfc.")
        if (length(sample_id) > 1L) {
            if (inherits(by, "sfc") || !"sample_id" %in% names(by))
                stop("`by` must be an sf data frame with a column `sample_id`")
            by <- split(st_geometry(by), by$sample_id)
        } else if (inherits(by, "sf")) {
            by <- st_geometry(by)
        }
    }
    if (inherits(by, "sfc")) by <- setNames(list(by), sample_id)
    fun_name <- as.character(substitute(FUN))
    sfes <- splitSamples(x) # Output list should have sample IDs as names
    if (!is.null(rowGeometryName)) {
        any_empty <- vapply(sfes, function(x) any(st_is_empty(rowGeometry(x, rowGeometryName))),
                            FUN.VALUE = logical(1)) |> any()
        if (any_empty) sparse <- TRUE
    }
    sfes <- lapply(sample_id, function(s) {
        .aggregate_sample(sfes[[s]], by = by[[s]], FUN = FUN,
                          colGeometryName = colGeometryName,
                          rowGeometryName = rowGeometryName,
                          join = join, fun_name = fun_name,
                          new_geometry_name = new_geometry_name,
                          sparse = sparse, BPPARAM = BPPARAM, 
                          .orig_nrows = nrow(sfes[[s]]))
    })
    out <- do.call(cbind, sfes)
    # Add the original rowGeometries back
    rowGeometries(out) <- rowGeometries(x, sample_id = sample_id)
    # Keep imgData
    id_orig <- imgData(x)
    imgData(out) <- id_orig[id_orig$sample_id %in% sample_id,]
    out
}

#' Aggregate data in SFE using geometry
#'
#' Gene expression and numeric columns of \code{colData} will be aggregated with
#' the function specified in \code{FUN}, according to another geometry supplied
#' and a geometry predicate (such as \code{st_intersects}). For example, when
#' the predicate is \code{st_intersects} and a spatial grid is used to
#' aggregate, then the data associated with all cells that intersect with each
#' grid cell will be aggregated with \code{FUN}, such as \code{mean} or
#' \code{sum}. The categorical columns will be collected into list columns, and
#' logical columns will be converted into numeric before applying \code{FUN}.
#'
#' For smFISH-based data where the transcript spots are available, the
#' transcript spots can be used instead of cells to aggregate the gene count
#' matrix, in which case all assays other than \code{counts} will be dropped and
#' \code{FUN} only applies to \code{colData} because the transcript spots are
#' simply counted.
#'
#' What this function does is similar to
#' \href{https://github.com/JEFworks-Lab/SEraster}{SEraster} but more general
#' because any geometry and more aggregation function can be used, not just
#' regular grids, and the aggregation can be performed on the transcript spots.
#'
#' @inheritParams sf::st_make_grid
#' @inheritParams sf::aggregate
#' @inheritParams aggregateTx
#' @param x An SFE object to be aggregated.
#' @param by A \code{sf} data frame whose geometry column is used for
#'   aggregation or \code{sfc} or for multiple samples a list of \code{sfc}
#'   whose names are the sample IDs. For multiple samples, the \code{sf} data
#'   frame must have a column \code{sample_id} to indicate which geometry for
#'   which sample. This argument is optional if \code{cellsize} is specified.
#' @param FUN Function to aggregate the numerical columns in \code{colData} and
#'   the gene count matrix. This can be \code{sum}, \code{mean}, or any function
#'   that takes a numeric matrix as input and returns a numeric vector whose
#'   length is same as the number of rows in the input matrix, such as
#'   \code{rowMedians}. See package \code{matrixStats}. Depending on the
#'   function used for aggregation, numeric columns of \code{colData} may need
#'   to be interpreted differently after aggregation. Aggregation is not done
#'   when aggregating by transcript spots in \code{rowGeometry}. When it's sum
#'   or mean, matrix multiplication is used for aggregation rather than calling
#'   the sum or mean function itself; this is much faster than looping through
#'   the bins and calling the function on each of them.
#' @param sample_id Which samples to aggregate, defaults to "all".
#' @param colGeometryName Which \code{colGeometry} to spatially aggregate the
#'   data, by default the first one.
#' @param rowGeometryName Which \code{rowGeometry} to spatially aggregate
#' @param new_geometry_name Name to give to the new \code{colGeometry} in the
#'   output. Defaults to "bins".
#' @param BPPARAM A \code{\link{BiocParallelParam}} object specifying parallel
#'   computing when aggregating data with functions other than sum and mean when
#'   aggregating cells. When aggregating transcript spots, this specifies
#'   parallel computing over genes. Defaults to \code{SerialParam()}.
#' @return An SFE object with \code{colGeometry} the same as the geometry
#'   specified in \code{by} or same as the grid specified in \code{cellsize}.
#'   \code{rowGeometries} and \code{rowData} remain the same as in the input
#'   \code{x}. \code{reducedDims}, \code{localResults}, \code{colFeatureData}
#'   (and its \code{colGeometry}, \code{annotGeometry}, and \code{reducedDim}
#'   counterparts), and \code{spatialGraphs} are dropped because those results
#'   no longer apply after aggregation.
#' @note For developers: When debugging this function after calling
#'   \code{devtools::load_all(".")}, you may get an error that comes from S3
#'   dispatch of \code{aggregate.Vector} from the \code{S4Vectors} package. When
#'   that happens, either restart the R session, or run
#'   \code{setGeneric("aggregate", function(x, ...)
#'   standardGeneric("aggregate"))} in the console to make an S4 generic as done
#'   in the \code{terra} package to prioritize S4 dispatch.
#' @export
#' @importFrom stats aggregate
#' @concept Geometric operations
#' @examples
#' # example code
#' 
setMethod("aggregate", "SpatialFeatureExperiment", .aggregate_SFE)

# Function to make grid for multiple samples
.make_grid_samples <- function(x, sample_id, cellsize, square,
                               flat_topped) {
    bboxes <- .bbox2sf(bbox(x, sample_id = sample_id), sample_id)
    out <- lapply(st_geometry(bboxes), st_make_grid, cellsize = cellsize,
                  square = square, flat_topped = flat_topped)
    names(out) <- sample_id
    out
}
pachterlab/SpatialFeatureExperiment documentation built on Jan. 29, 2025, 10:32 p.m.