R/write_h5mu.R

Defines functions writeList write_elem writeAttribute writeDataset guess_chunk writeArrayToMuData MuDataFileRealizationSink registerHDF5ArrayMethods write_data_frame write_matrix writeH5MU writeH5AD

Documented in writeH5AD writeH5MU

#' @importClassesFrom Matrix Matrix
#' @importClassesFrom DelayedArray DelayedMatrix
setClassUnion("Matrix_OR_DelayedMatrix", c("matrix", "Matrix", "DelayedMatrix"))

#' @importClassesFrom S4Vectors DataFrame
setClassUnion("data.frame_OR_DataFrame", c("data.frame", "DataFrame"))

#' Save an experiment to an .h5ad file.
#'
#' Note that NA values are not supported by HDF5, and therefore by h5ad. The behavior of this
#' function if NAs are present is undefined.
#'
#' @param object The object to save.
#' @param file Name of the file to save to.
#' @param overwrite Currently unused.
#'
#' @returns NULL, invisibly
#'
#' @examples
#' data(miniACC, package="MultiAssayExperiment")
#' writeH5AD(miniACC[[1]], "miniacc.h5ad")
#'
#' @importFrom rhdf5 H5Iget_type H5Gcreate H5Gclose
#' @importFrom SummarizedExperiment colData assays
#' @importFrom S4Vectors metadata
#' @importFrom methods hasMethod as
#' @importFrom SingleCellExperiment altExps rowData colData colPairNames colPair rowPairNames rowPair reducedDims
#'
#' @export
writeH5AD <- function(object, file, overwrite) {
    need_finalize <- FALSE
    written_object <- FALSE
    if (is.character(file)) {
        file <- open_h5(file)
        need_finalize <- TRUE
    } else if (!(H5Iget_type(file) %in% c("H5I_FILE", "H5I_GROUP")))
        stop("file must be a character, file or group")

    cls <- class(object)
    if (is(object, "RangedSummarizedExperiment") && !is(object, "SingleCellExperiment")) {
        warning("Ranged data is currently unsupported. Coercing to SummarizedExperiment...")
        object <- as(object, "SummarizedExperiment")
    }
    if (is(object, "SingleCellExperiment")) {
        write_data_frame(file, "var", rowData(object))
        obsm <- reducedDims(object)
        if (length(obsm) > 0) {
            obsmgrp <- H5Gcreate(file, "obsm")
            mapply(function(name, data) {
                if (is(data, "data.frame_OR_DataFrame")) {
                    rownames(data) <- rownames(colData(object))
                    write_data_frame(obsmgrp, name, data)
                } else {
                    if (length(dim(data)) == 1)
                        data <- as.vector(data)
                    else
                        data <- data
                    write_matrix(obsmgrp, name, data)
                }
            }, names(obsm), obsm)
            H5Gclose(obsmgrp)
        }

        lapply(list(list(name="obsp", names=colPairNames, getter=colPair), list(name="varp", names=rowPairNames, getter=rowPair)), function(cp) {
            names <- cp$names(object)
            if (length(names) > 0) {
                pairgrp <- H5Gcreate(file, cp$name)
                lapply(names, function(cname) {
                    write_matrix(pairgrp, cname, cp$getter(object, cname, asSparse=TRUE))
                })
                H5Gclose(pairgrp)
            }
        })
        object <- as(object, "SummarizedExperiment")
        written_object <- TRUE
    }
    if (is(object, "SummarizedExperiment")) {
        write_data_frame(file, "obs", colData(object))
        rdata <- rowData(object)
        if (ncol(rdata) > 0 || !is.null(rownames(rdata)))
            write_data_frame(file, "var", rdata)

        assays <- assays(object)
        nassays <- length(assays)
        write_matrix(file, "X", assays[[1]], needTranspose=FALSE)
        if (nassays > 1) {
            layersgrp <- H5Gcreate(file, "layers")
            mapply(function(name, mat) {
                write_matrix(layersgrp, name, mat, needTranspose=FALSE)
            }, names(assays[2:nassays]), assays[2:nassays])
            H5Gclose(layersgrp)
        }

        writeList(file, "uns", metadata(object))

        if (hasMethod("altExps", class(object))) {
            naltexps <- length(altExps(object))
            if (naltexps > 1) {
                warning("Alternative experiments are currently unsupported. Construct a MultiAssayExperiment object or write them as individual H5AD files.")
            }
        }
        written_object <- TRUE
    }
    if (is(object, "Matrix_OR_DelayedMatrix")) {
        write_matrix(file, "X", object, needTranspose=FALSE)
        rownames <- rownames(object)
        colnames <- colnames(object)
        if (is.null(rownames))
            rownames <- as.character(seq_len(nrow(object)))
        if (is.null(colnames))
            colnames <- as.character(seq_len(ncol(object)))
        var <- data.frame(row.names=rownames)
        obs <- data.frame(row.names=colnames)
        write_data_frame(file, "obs", obs)
        write_data_frame(file, "var", var)
        written_object <- TRUE
    }

    if (!written_object) {
        warning("Objects of class ", class(object), " are currently unsupported, skipping...")
    } else {
        write_object_class(file, cls[1])
        finalize_anndata_internal(file)
        if (need_finalize)
            finalize_anndata(file)
    }
    invisible(NULL)
}

#' Save a \code{\linkS4class{MultiAssayExperiment}} to an .h5mu file.
#'
#' Note that NA values are not supported by HDF5, and therefore by h5mu. The behavior of this
#' function if NAs are present is undefined.
#'
#' @param object A \code{\linkS4class{MultiAssayExperiment}}.
#' @param file Name of the file to save to.
#' @param overwrite Currently unused.
#'
#' @returns NULL, invisibly
#'
#' @examples
#' data(miniACC, package="MultiAssayExperiment")
#' writeH5MU(miniACC, "miniacc.h5mu")
#'
#' @importFrom rhdf5 H5Gcreate H5Gclose
#' @importFrom MultiAssayExperiment colData experiments sampleMap metadata
#'
#' @export
writeH5MU <- function(object, file, overwrite) {
    if (!is(object, "MultiAssayExperiment"))
        stop("Only MultiAssayExperiment objects are currently supported.")
    if (is.character(file)) {
        h5 <- open_h5(file)
        need_finalize <- TRUE
    } else
        stop("file must be a character")

    obs <- as.data.frame(colData(object), stringsAsFactors = FALSE)
    write_data_frame(h5, "obs", obs)

    mods <- H5Gcreate(h5, "mod")
    obsmgrp <- H5Gcreate(h5, "obsm")
    obsmapgrp <- H5Gcreate(h5, "obsmap")
    samplemap <- sampleMap(object)
    globalrownames <- rownames(obs)
    vars <- mapply(function(mname, mod) {
        mod_group <- H5Gcreate(mods, mname)
        writeH5AD(mod, mod_group)
        H5Gclose(mod_group)

        cmap <- samplemap[samplemap$assay == mname,]
        cmaporder <- match(globalrownames, cmap$primary)
        localorder <- match(cmap$colname, colnames(mod))
        obsmap <- vapply(cmaporder, function(o)ifelse(is.na(o), 0L, localorder[o]), 0L)
        writeDataset(obsmapgrp, mname, obsmap)
        writeDataset(obsmgrp, mname, as.integer(!is.na(cmaporder)))

        data.frame(row.names = rownames(mod))
    }, names(object), object)
    writeAttribute(mods, "order", names(object))
    H5Gclose(mods)
    H5Gclose(obsmgrp)
    H5Gclose(obsmapgrp)

    var <- do.call(rbind, vars)
    write_data_frame(h5, "var", var)

    writeList(h5, "uns", metadata(object))

    write_object_class(h5, class(object)[1])
    finalize_mudata(h5)
    invisible(NULL)
}

#' @importFrom rhdf5 H5Gcreate H5Gclose
#' @importFrom methods is as
write_matrix <- function(parent, key, mat, needTranspose=TRUE) {
    if (is(mat, "dgeMatrix"))
        mat <- as.matrix(mat)
    if (is.matrix(mat) || is.vector(mat) || is.array(mat) || is.numeric(mat) || is.integer(mat) || is.logical(mat) || is.character(mat)) { # is.vector returns false for vectors with attributes
        isscalar <- length(mat) == 1 & !is.null(attr(mat, "encoding-scalar"))
        naidx <- is.na(mat)
        hasna <- any(naidx)
        if (needTranspose) {
            if (is.matrix(mat))
                mat <- t(mat)
            else if (is.array(mat))
                mat <- aperm(mat, length(dim(mat)):1)
        }
        if (hasna && is.double(mat)) {
            # FIXME: extend anndata spec to handle double NAs?
            mat[naidx] <- NaN
            hasna <- FALSE
        } else if (hasna && is.character(mat)) {
            # FIXME: extend anndata spec to handle string NAs?
            mat[naidx] <- ""
            hasna <- FALSE
        }

        if (isscalar || !hasna) {
            writeDataset(parent, key, mat, scalar=isscalar)
            dset <- h5autoclose(parent & key)

            if (!isscalar)
                writeAttribute(dset, "encoding-type", ifelse(is.character(mat), "string-array", "array"))
            else
                writeAttribute(dset, "encoding-type", ifelse(is.character(mat), "string", "numeric-scalar"))
            writeAttribute(dset, "encoding-version", "0.2.0")
        } else {
            grp <- H5Gcreate(parent, key)
            if (is.character(mat))
                mat[naidx] <- ""
            else
                mat[naidx] <- as(0, type(mat))
            write_matrix(grp, "values", mat)
            write_matrix(grp, "mask", naidx)
            writeAttribute(grp, "encoding-type", ifelse(is.logical(mat), "nullable-boolean", "nullable-integer"))
            writeAttribute(grp, "encoding-version", "0.1.0")
            H5Gclose(grp)
        }
    } else if (is.factor(mat)) {
        grp <- H5Gcreate(parent, key)
        codes <- as.integer(mat)
        codes[is.na(mat)] <- 0L
        write_matrix(grp, "codes", codes - 1L)
        write_matrix(grp, "categories", levels(mat))
        writeAttribute(grp, "ordered", is.ordered(mat))
        writeAttribute(grp, "encoding-type", "categorical")
        writeAttribute(grp, "encoding-version", "0.2.0")
        H5Gclose(grp)
    } else if (is(mat, "dgCMatrix") || is(mat, "dgRMatrix") || is(mat, "DelayedArray") && DelayedArray::is_sparse(mat)) {
        if (is(mat, "DelayedArray"))
            mat <- as(mat, "RsparseMatrix")

        grp <- H5Gcreate(parent, key)
        writeDataset(grp, "indptr", mat@p)
        writeDataset(grp, "data", mat@x)
        writeAttribute(grp, "shape", if (needTranspose) dim(mat) else rev(dim(mat)), scalar=FALSE)
        writeAttribute(grp, "encoding-version", "0.1.0")
        if (is(mat, "dgCMatrix")) {
            writeDataset(grp, "indices", mat@i)
            writeAttribute(grp, "encoding-type", ifelse(needTranspose, "csc_matrix", "csr_matrix"))
        } else {
            writeDataset(grp, "indices", mat@j)
            writeAttribute(grp, "encoding-type", ifelse(needTranspose, "csr_matrix", "csc_matrix"))
        }
        H5Gclose(grp)
    } else if (is(mat, "DelayedArray") && requireNamespace("HDF5Array", quietly=TRUE)) {
        writeArrayToMuData(mat, parent, key)
    } else {
        stop("Writing matrices of type ", class(mat), " is not implemented.")
    }
}

#' @importFrom rhdf5 H5Gcreate H5Gclose h5createAttribute
write_data_frame <- function(parent, key, df) {
    group <- H5Gcreate(parent, key)

    columns <- colnames(df)
    index <- rownames(df)
    if (is.null(index))
        index <- as.character(1:nrow(df))
    df[["_index"]] <- index
    for (col in colnames(df)) {
        write_matrix(group, col, df[[col]])
    }

    # Write attributes
    writeAttribute(group, "_index", "_index")
    writeAttribute(group, "encoding-type", "dataframe")
    writeAttribute(group, "encoding-version", "0.2.0")
    if (length(columns) > 0) {
        writeAttribute(group, "column-order", columns, scalar=FALSE)
    } else {
        # When there are no columns, null buffer can't be written to a file.
        h5createAttribute(group, "column-order", dims=0)
    }
    H5Gclose(group)
    invisible(NULL)
}

.registeredHDF5ArrayMethods <- new.env()
.registeredHDF5ArrayMethods$registered <- FALSE
#' @importFrom rhdf5 h5write H5Dclose
#' @importFrom DelayedArray write_block start width
#' @importFrom methods setClass
registerHDF5ArrayMethods <- function() {
    if (!.registeredHDF5ArrayMethods$registered) {
        haveHDF5Array <- requireNamespace("HDF5Array", quietly=TRUE)
        if (!haveHDF5Array)
            return(FALSE)

        setClass("MuDataFileRealizationSink",
                 contains="HDF5RealizationSink",
                 slots=c(parent="H5IdComponent",
                         datasetname="character"),
                 where=.registeredHDF5ArrayMethods)


        setMethod(write_block, "MuDataFileRealizationSink", function(sink, viewport, block) {
            if (!is.array(block))
                block <- as.array(block)
            h5write(block, sink@parent, sink@datasetname, start=start(viewport), count=width(viewport))
            sink
        }, where=.registeredHDF5ArrayMethods)

        .registeredHDF5ArrayMethods$registered <- TRUE
    }
    .registeredHDF5ArrayMethods$registered
}

#' @importFrom rhdf5 h5createDataset H5Fget_name H5Iget_name
#' @importFrom methods new
MuDataFileRealizationSink <- function(dim, type, parent, key, dimnames=NULL, as.sparse=FALSE) {
    chunkdim <- HDF5Array::getHDF5DumpChunkDim(dim)
    h5createDataset(parent, key, dim, storage.mode=type, chunk=chunkdim, level=9, shuffle=TRUE)
    file <- H5Fget_name(parent)
    path <- paste(H5Iget_name(parent), key, sep="/")
    new("MuDataFileRealizationSink", dim=dim, dimnames=dimnames, type=type, as_sparse=as.sparse,
                                     filepath=file, name=path, chunkdim=chunkdim, parent=parent, datasetname=key)
}

#' @importFrom DelayedArray is_sparse type BLOCK_write_to_sink
writeArrayToMuData <- function(x, parent, key, verbose=NA) {
    if (!registerHDF5ArrayMethods())
        stop("The HDF5Array packages must be installed to save DelayedArrays.")
    as.sparse <- is_sparse(x)
    sink_dimnames <- dimnames(x)
    sink <- MuDataFileRealizationSink(dim(x), type(x), parent, key, sink_dimnames, as.sparse)

    verbose <- DelayedArray:::normarg_verbose(verbose)
    sink <- BLOCK_write_to_sink(sink, x, verbose=verbose)
    invisible(NULL)
}

# this is a straight port of the h5py guess_chunk function
chunk_base <- 16 * 1024
chunk_min <- 8 * 1024
chunk_max <- 1024 * 1024
guess_chunk <- function(shape, storage.mode) {
    nbytes <- switch(storage.mode, double=8, integer=4, logical=1, character=8)
    ndims <- length(shape)

    dset_size <- prod(shape) * nbytes
    target_size <- chunk_base * (2^log10(dset_size / 1024^2))
    if (target_size > chunk_max)
        target_size <- chunk_max
    else if (target_size < chunk_min)
        target_size <- chunk_min

   idx <- 0
   while(TRUE) {
       chunk_bytes <- prod(shape) * nbytes
       if ((chunk_bytes < target_size || abs(chunk_bytes - target_size) / target_size < 0.5) && chunk_bytes < chunk_max)
           break
       if (prod(shape) == 1)
           break

       shape[idx %% ndims + 1] <- ceiling(shape[idx %% ndims + 1] / 2)
       idx <- idx + 1
   }
   as.integer(shape)
}

#' @importFrom rhdf5 h5createDataset h5writeDataset
writeDataset <- function(parent, key, data, scalar=FALSE) {
    shape <- dim(data)
    if (is.null(shape))
        shape <- length(data)
    if (length(shape) == 1 && shape == 1 && scalar) {
        shape <- chunksize <- NULL
        level <- 0
    } else {
        chunksize <- guess_chunk(shape, storage.mode(data))
        level <- 9
    }
    h5createDataset(parent, key, shape, storage.mode=storage.mode(data), chunk=chunksize, level=level, shuffle=TRUE, encoding="UTF-8")
    h5writeDataset(data, parent, key, variableLengthString=TRUE, encoding="UTF-8")
}

#' @importFrom rhdf5 h5writeAttribute
writeAttribute <- function(obj, name, value, scalar=TRUE) {
    if (is.logical(value))
        value <- as.integer(value) # rhdf5 hasn't implemented logical attributes yet
    args <- list(attr=value, h5obj=obj, name=name)
    if (is.character(value))
        args$variableLengthString <- TRUE
    if (length(value) == 1 && scalar)
        args$asScalar <- TRUE
    do.call(h5writeAttribute, args)
}

#' @importFrom rhdf5 H5Gcreate H5Gclose
#' @importFrom methods slotNames slot
write_elem <- function(parent, key, data) {
    if (is(data, "list_OR_List"))
        writeList(parent, key, data)
    else if (is(data, "Matrix_OR_DelayedMatrix") || is(data, "vector_OR_Vector"))
        write_matrix(parent, key, data)
    else if (is(data, "data.frame_OR_DataFrame"))
        write_data_frame(parent, key, data)
    else if (isS4(data)) {
        grp <- H5Gcreate(parent, key)
        for (slotnm in slotNames(data)) {
            write_elem(grp, slotnm, slot(data, slotnm))
        }
        writeAttribute(grp, "encoding-type", "dict")
        writeAttribute(grp, "encoding-version", "0.1.0")
        H5Gclose(grp)
    } else
        warning(paste("Cannot write object of class", class(data), "skipping..."))
}

#' @importFrom rhdf5 H5Gcreate H5Gclose
writeList <- function(parent, key, data) {
    if (length(data) > 0) {
        nms <- names(data)
        if (is.null(nms))
            nms <- as.character(seq_along(data))
        grp <- H5Gcreate(parent, key)
        mapply(function(name, data) {
            write_elem(grp, name, data)
        }, nms, data)
        writeAttribute(grp, "encoding-type", "dict")
        writeAttribute(grp, "encoding-version", "0.1.0")
        H5Gclose(grp)
    }
}
PMBio/MuDataMAE documentation built on Oct. 20, 2023, 12:14 p.m.