#' An S4 class to represent a dataset in a HDF5 file.
#' @import BiocGenerics httr methods rjson
#' @slot file An object of type HSDSFile; the file in which the dataset is resident.
#' @slot path The dataset's path in the internal HDF5 hiearchy.
#' @slot uuid The unique unit ID by which the dataset is accessed in the server
#' database system.
#' @slot shape The dimensions of the dataset
#' @slot type The dataset's HDF5 datatype
setClass("HSDSDataset", representation(file="HSDSFile", path="character", uuid="character",
shape="numeric", type="list"))
#' Construct an object of type HSDSDataset
#'
#' A HSDSDataset is a representation of a dataset in a HDF5 file.
#'
#' @name HSDSDataset
#' @param file An object of type HSDSFile which hosts the dataset
#' @param path The complete intrafile path to the dataset
#' @examples
#' src <- HSDSSource('http://hsdshdflab.hdfgroup.org')
#' f <- HSDSFile(src, '/home/spollack/testzero.h5')
#' d <- HSDSDataset(f, '/grpA/grpAB/dsetX')
#' @export
HSDSDataset <- function(file, path) {
idx <- which(file@dsetdf[,1] == path)
if (length(idx) == 0)
stop("no such dataset")
uuid <- file@dsetdf[idx,2]
request <- paste0(file@src@endpoint, '/datasets/', uuid, '?domain=', file@domain)
response <- submitRequest(request)
shape <- response$shape$dims
type <- list(class=response$type$class, base=response$type$base)
new("HSDSDataset", file=file, path=path, uuid=uuid,
shape=shape, type=type)
}
#' extract elements of a one or two-dimensional HSDSDataset
#'
#' @name [
#' @param x object of type HSDSDataset
#' @param i vector of indices (first dimension)
#' @aliases [,HSDSDataset-method [,HSDSDataset,numeric-method
#' [,HSDSDataset,numeric,numeric-method [,HSDSDataset,numeric,ANY-method
#' [,HSDSDataset,numeric,ANY,ANY-method
#' @docType methods
#' @rdname extract-methods
#'
# special case: one-dimensional arrays
setMethod('[', c("HSDSDataset", "numeric"),
function(x, i) {
getDataList(x, list(i), transfermode='JSON')
})
# special case: two-dimensional arrays
setMethod('[', c("HSDSDataset", "numeric", "numeric"),
function(x, i, j) {
getDataList(x, list(i, j), transfermode='JSON')
})
#' Fetch data from a remote dataset
#'
#' The servers require data to be fetched in slices, i.e., in sets of
#' for which the indices of each dimension are of the form start:stop:step.
#' More complex sets of indices will be split into slices and fetched in
#' multiple requests. This is opaque to the user, but may enter into
#' considerations of data access patterns, e.g., for performance-tuning.
#'
#' @param dataset An object of type HSDSDataset, the dataset to access.
#'
#' @param indices The indices of the data to fetch
#'
#' @param transfermode Either (default) 'JSON' or 'binary'
#'
#' @return an Array containing the data fetched from the server
#'
#' @docType methods
#' @rdname getData-methods
#'
#' @examples
#' s <- HSDSSource('http://hsdshdflab.hdfgroup.org')
#' f <- HSDSFile(s, '/shared/bioconductor/tenx_full.h5')
#' d <- HSDSDataset(f, '/newassay001')
#' x <- getData(d, c('1:4', '1:27998'), transfermode='JSON')
#' # x <- getData(d, c(1:4, 1:27998), transfermode='JSON') # method missing?
#' x <- d[1:4,1:27998]
#' @export
setGeneric("getData", function(dataset, indices, transfermode) standardGeneric("getData"))
#' @rdname getData-methods
#' @aliases getData,HSDSDataset,character,character-method
setMethod("getData", c("HSDSDataset", "character", "character"),
function(dataset, indices, transfermode) {
getDataVec(dataset, indices, transfermode)
})
#' @rdname getData-methods
#' @aliases getData,HSDSDataset,character,missing-method
setMethod("getData", c("HSDSDataset", "character", "missing"),
function(dataset, indices) {
getDataVec(dataset, indices, 'JSON')
})
#' @rdname getData-methods
#' @aliases getData,HSDSDataset,list,character-method
setMethod("getData", c("HSDSDataset", "list", "character"),
function(dataset, indices, transfermode) {
getDataList(dataset, indices, transfermode)
})
#' @rdname getData-methods
#' @aliases getData,HSDSDataset,list,,missing-method
setMethod("getData", c("HSDSDataset", "list", "missing"),
function(dataset, indices) {
getDataList(dataset, indices, 'JSON')
})
# private - perform a single fetch; indices is a vector of
# type character with one slice per dimension.
#' @useDynLib rhdf5client2
getDataVec <- function(dataset, indices, transfermode = 'JSON') {
indices <- checkSlices(dataset@shape, indices)
if (length(indices) == 0)
stop("bad slices")
if (!(transfermode %in% c('JSON', 'binary'))) {
warning('unrecognized transfermode, using JSON')
transfermode <- 'JSON'
}
sdims <- vapply(indices, slicelen, numeric(1))
# rdims is dimensions of response$value
singlefetch <- FALSE
if (all(sdims == 1)) {
# this is a single-value fetch
rdims <- c(1)
singlefetch <- TRUE
} else {
# h5pyd drops single-width dimensions at the end.
# dimensions of the result
rdims <- sdims[which(sdims != 1)]
while(sdims[length(sdims)] == 1)
sdims <- sdims[1:(length(sdims)-1)]
}
indices <- vapply(indices, function(s) { s }, character(1))
sel <- paste0('[', paste(indices, collapse=','), ']')
endpoint <- dataset@file@src@endpoint
domain <- dataset@file@domain
request <- paste0(endpoint, '/datasets/', dataset@uuid,
'/value?domain=', domain, '&select=', sel)
response <- submitRequest(request, transfermode=transfermode)
if (singlefetch) {
return(as.numeric(response$value))
}
if (length(rdims) == 1 && length(sdims) == 1) { # 1D array quick bypass
return(as.numeric(response$value))
}
if (length(rdims) == 2 && length(sdims) == 2) { # 2D array quick bypass
if (transfermode == 'JSON') {
result <- response$value
A <- matrix(nrow = rdims[1], ncol = rdims[2])
for (i in 1:rdims[1]) {
A[i,] <- as.numeric(result[[i]])
}
return(A)
} else if (transfermode == 'binary') {
result <- extractBinary(dataset@type, prod(rdims), response)
A <- matrix(data=result, nrow = rdims[2], ncol = rdims[1])
return(t(A))
}
}
nn <- prod(rdims)
A <- array(rep(0, nn))
if (transfermode == 'JSON') {
# unpack response into an R array
result <- response$value
# multi-dimensional data is returned as a list.
if (is.list(result)) {
A[1:nn] <- .Call("extractJSON", as.integer(length(sdims)), as.integer(sdims), result)
} else { # one-dimensional data is returned as a vector
A[1:nn] <- as.numeric(result)
}
} else if (transfermode == 'binary') {
result <- extractBinary(dataset@type, nn, response)
A[1:nn] <- .Call("extractBin", as.integer(length(sdims)), as.integer(sdims), result)
}
# Question: should AA be forced to rdims or sdims?
# For a slice of a multi-dimensional array, we want
# to force to the subset. (rdims)
# But if DelayedArray requires a 1 x N array, we
# want the returned value to be 1 x N, not a
# vector of N. (sdims)
# Arbitrary choice: force flat (rdims) and let
# the calling routine redimension.
AA <- array(A, dim = rdims)
AA
}
# private - split numeric vectors into slices and fetch
# data in one or more requests.
getDataList <- function(dataset, indices, transfermode = 'JSON') {
if (length(dataset@shape) != length(indices))
stop("wrong length of indexlist")
slicelist <- lapply(indices, slicify)
slclen <- lapply(slicelist, length)
if (any(unlist(slclen) > 1)) {
# assemble block arrays
AA <- multifetch(slicelist, dataset)
} else {
# simple case: one block
AA <- getDataVec(dataset, unlist(slicelist), transfermode)
}
AA
}
# private - in which we try to anticipate all the invalid things
# users will try to enter for indices
checkSlices <- function(shape, slices) {
ok <- TRUE
if (length(slices) != length(shape)) {
message("wrong number of indices")
ok <- FALSE
}
slicelist <- vector("list", length(slices))
for (i in seq_along(slices)) {
slice <- slices[i]
start <- -1
stop <- -1
step <- -1
st <- strsplit(slice, ':')[[1]]
ss <- as.numeric(st)
ss[which(is.na(ss))] <- -1
if (slice == ':') {
start <- 1
stop <- shape[i]
step <- 1
} else {
if (length(ss) == 1) { # this is a slice like '5:'
start <- ss[1]
stop <- shape[i]
step <- 1
} else if (length(ss) == 2) {
if (st[1] == '') { # ':5'
start <- 1
stop <- ss[2]
step <- 1
} else {
start <- ss[1]
stop <- ss[2]
step <- 1
}
} else if (length(ss) == 3) {
start <- ss[1]
stop <- ss[2]
step <- ss[3]
} else {
message(paste0("malformed slice ", i))
ok <- FALSE
}
}
slicevec <- c(start, stop, step)
if (any(slicevec < 0)) {
message(paste0("malformed slice ", i))
ok <- FALSE
}
if (any(slicevec != trunc(slicevec))) {
message(paste0("malformed slice ", i))
ok <- FALSE
}
slicelist[[i]] <- slicevec
if (0 >= start || start > shape[i]) {
message(paste0("slice start out of range in slice ", i))
ok <- FALSE
}
if (0 >= stop || stop > shape[i]) {
message(paste0("slice stop out of range in slice ", i))
ok <- FALSE
}
if (stop < start) {
message(paste0("slice stop less than slice start in slice ", i))
ok <- FALSE
}
}
if (!ok)
return(list())
strslices <- lapply(slicelist,
function(slc) { sprintf('%d:%d:%d', slc[1], slc[2], slc[3]) })
pyslices <- lapply(strslices, r2pyslice)
}
# private - convert R-slice (string) to python-slice (string)
r2pyslice <- function(slcstr) {
slc <- as.numeric(strsplit(slcstr, ':')[[1]])
# python indices from 0 instead of 1
slc[1] <- slc[1]-1
slc[2] <- slc[2]-1
# if (stop-start) % step == 0, python slice excludes stop
if ( (slc[2]-slc[1]) %% slc[3] == 0 )
slc[2] <- slc[2] + 1
slcstr <- sprintf('%d:%d:%d', slc[1], slc[2], slc[3])
}
# private - convert R-slice (vec) to python-slice (vec)
r2pyslcvec <- function(slc) {
# python indices from 0 instead of 1
slc[1] <- slc[1]-1
slc[2] <- slc[2]-1
# if (stop-start) % step == 0, python slice excludes stop
if ( (slc[2]-slc[1]) %% slc[3] == 0 )
slc[2] <- slc[2] + 1
slc
}
# Note: for more than two dimensions, "column-major" means
# "first-fastest" and "row-major" means "last-fastest"
# private - extract column-major subscripts for linear index
csub4idx <- function(D, ind) {
m <- length(D)
X <- rep(-1, m)
off <- ind-1
X <- vapply(0:(m-1), function(j) {
if (j == 0) {
s <- off %% D[1]
} else if (j == m-1) {
s <- off %/% prod(D[1:m-1])
} else {
s <- (off %/% prod(D[1:j])) %% D[j+1]
}
s
}, numeric(1))
X <- X+1
X
}
# private - extract column-major subscripts for linear index
rsub4idx <- function(D, ind) {
m <- length(D)
X <- rep(-1, m)
off <- ind-1
X <- vapply(0:(m-1), function(j) {
if (j == 0) {
s <- off %/% prod(D[2:m])
} else if (j == (m-1)) {
s <- off %% D[m]
} else {
s <- (off %/% prod(D[(j+2):m])) %% D[j+1]
}
s
}, numeric(1))
X <- X+1
X
}
# private - extract column-major linear index for subscripts
cidx4sub <- function(D, S) {
m <- length(D)
S <- S-1
off <- 0
for (j in 0:(m-1)) {
p <- ifelse(j == 0, 1, prod(D[1:j]))
off <- off + S[j+1]*p
}
off+1
}
# private - extract row-major linear index for subscripts
ridx4sub <- function(D, S) {
m <- length(D)
S <- S-1
off <- 0
for (j in 0:(m-1)) {
p <- ifelse(j == m-1, 1, prod(D[(j+2):m]))
off <- off + S[j+1]*p
}
off+1
}
# private - length of valid (Python) slice
slicelen <- function(slc) {
ss <- as.numeric(strsplit(slc, ':')[[1]])
sdim <- (ss[2]-ss[1]) %/% ss[3]
if ((ss[2]-ss[1]) %% ss[3] != 0)
sdim <- sdim + 1
sdim
}
# private - extract binary data from response
# reference: https://support.hdfgroup.org/HDF5/doc1.8/RM/PredefDTypes.html
#' @importFrom utils strcapture
extractBinary <- function(typ, nele, rsp) {
# standard defaults
what <- 'integer'
size <- NA_integer_
signed <- TRUE
endian <- .Platform$endian # just a guess - the server determines the endianness
df <- strcapture('H5T_([[:alnum:]]*)_([IFUBD])([[:digit:]]+)([LB]E)', typ$base,
data.frame(cl=character(), wh=character(), sz=integer(),
en=character(), stringsAsFactors=FALSE))
if (nrow(df) != 1)
stop(paste0("binary transfer for type ", typ$base, " not implemented yet"))
if (!(df[1,1] %in% c('STD', 'IEEE')))
stop(paste0("binary transfer for type ", typ$base, " not implemented yet"))
if (df[1,2] == 'I' && df[1,3] == '32') {
what <- 'integer'
size <- 4
} else if (df[1,2] == 'I' && df[1,3] == '64') {
what <- 'integer'
size <- 8
} else if (df[1,2] == 'F' && df[1,3] == '64') {
what <- 'double'
size <- 8
} else {
stop(paste0("binary transfer for type ", typ$base, " not implemented yet"))
}
endian <- ifelse(df[1,4] == 'LE', 'little', 'big')
result <- readBin(rsp$content, what=what, n=nele, size=size, signed=signed, endian=endian)
result
}
# slicify - convert an arbitrary vector into slices
# This is an unsightly kludge, but it is designed to ensure
# that there is no more than one slice of width one.
# All other singletons should end up squashed into
# pairs with each other. The idea is that the time
# required to execute the loop is dwarfed by the time
# required to execute an extra remote fetch.
slicelst <- function(v) {
ll <- vector("list", length = length(v))
vec <- rep(0, length(v))
il <- 1
if (length(v) <= 2) {
ll <- list(v)
} else {
vec[1] <- v[1]
vec[2] <- v[2]
nv <- 2
i <- 3
while (i <= length(v)) {
if (v[i]-v[i-1] == v[i-1]-v[i-2]) {
nv <- nv + 1
vec[nv] <- v[i]
i = i + 1
} else {
ll[[il]] <- vec[1:nv]
il <- il + 1
if (i < length(v)) {
vec[1] <- v[i]
vec[2] <- v[i+1]
nv <- 2
i = i + 2
} else {
vec[1] <- v[i]
nv <- 1
i = i + 1
}
}
}
if (nv > 0) {
ll[[il]] <- vec[1:nv]
}
ll <- ll[-which(sapply(ll, is.null))]
}
ll
}
# make list of vecs into string slices
slicify <- function(v) {
ll <- slicelst(v)
slices <- vapply(ll, function(vec) {
start <- vec[1]
stop <- vec[length(vec)]
step <- ifelse(length(vec) == 1, 1, vec[2]-vec[1])
sprintf("%d:%d:%d", start, stop, step)
}, character(1))
}
# private - fetch and assemble dataset blocks
multifetch <- function(LL, dataset) {
slicelen <- function(slc) {
ss <- as.numeric(strsplit(slc, ':')[[1]])
r <- (ss[2]-ss[1]) %/% ss[3]
r+1
}
# MM[[d]][[i]] is the length of the ith slice in dimension d
MM <- lapply(LL, function(L) lapply(L, function(slc) slicelen(slc)))
# N[d] is the length of the result array in dimension d
N <- unlist(lapply(seq_along(MM), function(m) sum(unlist(MM[[m]]))))
# B[d] is the number of slices in dimension d
B <- unlist(lapply(seq_along(MM), function(m) length(unlist(MM[[m]]))))
# pre-allocate result array
R <- array(rep(0,prod(N)), dim=N)
# the total number of fetches is the product of the numbers of slices
nf <- prod(B)
# loop over fetches (this can be vapply later)
for (i in 1:nf) {
# select ith block to fetch
sbs <- rsub4idx(B, i)
scs <- lapply(seq_along(sbs), function(j) LL[[j]][[sbs[j]]])
# fetch block
blk <- getData(dataset, unlist(scs)) # is scs right?
# put into correct subarray of R
nd <- length(LL)
arglst <- vector(mode="list", length = nd)
for(d in 1:nd) {
umm <- unlist(MM[[d]])
startpos <- 1
if (sbs[d] != 1) {
startpos <- 1+sum(umm[1:(sbs[d]-1)])
}
length <- MM[[d]][[sbs[d]]]
arglst[[d]] <- seq(startpos, startpos+length-1)
}
R <- do.call('[<-', c(list(R), arglst, list(blk)))
}
# squash flat dimensions out
# note: This is kind of a kludge, it would be better to
# figure out the final dimensions ahead of time and
# modify the for loop with conditionals. But if this
# works, it evades unnecessary code complexity.
NN <- N[which(N != 1)]
R <- array(R, dim=NN)
R
}
setMethod("show", "HSDSDataset", function(object) {
cat(paste("rhdf5client2 HSDSDataset instance, with shape "))
dput(object@shape)
cat(" use getData(...) or square brackets to retrieve content.\n")
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.