#' take a step in sklearn IncrementalPCA partial fit procedure
#' @importFrom methods is
#' @importFrom methods new
#' @param mat a matrix -- can be R matrix or numpy.ndarray
#' @param n_components number of PCA to retrieve
#' @param obj sklearn.decomposition.IncrementalPCA instance
#' @note if obj is missing, the process is initialized with the matrix provided
#' @return trained IncrementalPCA reference, to which 'transform' method can be applied to obtain projection for any compliant input
#' @examples
#' # these steps are not basilisk-compliant, you need to acquire references
#' irloc = system.file("csv/iris.csv", package="BiocSklearn")
#' np = reticulate::import("numpy", delay_load=TRUE, convert=FALSE)
#' irismat = np$genfromtxt(irloc, delimiter=',')
#' ta = np$take
#' ipc = skPartialPCA_step(ta(irismat,0:49,0L))
#' ipc = skPartialPCA_step(ta(irismat,50:99,0L), obj=ipc)
#' ipc = skPartialPCA_step(ta(irismat,100:149,0L), obj=ipc)
#' head(names(ipc))
#' ipc$transform(ta(irismat,0:5,0L))
#' fullproj = ipc$transform(irismat)
#' fullpc = prcomp(data.matrix(iris[,1:4]))$x
#' round(cor(fullpc,fullproj),3)
#' @export
skPartialPCA_step = function(mat, n_components, obj) {
if (is(mat, "matrix")) {
nr = nrow(mat)
nc = ncol(mat)
}
else if (is(mat, "numpy.ndarray")) {
d = py_to_r(mat$shape)
nr = d[[1]]
nc = d[[2]]
}
if (missing(n_components)) n_components = as.integer(
min(c(nc,nr)))
if (missing(obj)) {
skd <- reticulate::import("sklearn.decomposition", delay_load=TRUE)
obj = skd$IncrementalPCA(n_components=n_components)
}
obj$partial_fit(mat)
}
#h5gen = function(rows, cols) {
# HDF5_FILE = system.file("hdf5/tenx10kmat.h5", package="BiocSklearn")
# t( h5read(HDF5_FILE, DSNAME, list(rows, cols)) )
#}
#
#tgm =
# TENxMatrix("~/Research/TENEX/1M_neurons_filtered_gene_bc_matrices_h5.h5")
#
#mmgen = function(rows, cols) {
# t(as.matrix(tgm[rows, cols]))
#}
#
submatGenerator = function(srcfun, rows, cols) {
srcfun(rows=rows, cols=cols)
}
#
#
#ch = chunk(SAMP_INDS, chunk.size=CHUNK_SIZE)
#
#date()
#' optionally fault tolerant incremental partial PCA for projection of samples from SummarizedExperiment
#' @param se instance of SummarizedExperiment
#' @param chunksize integer number of samples per step
#' @param n_components integer number of PCs to compute
#' @param assayind not used, assumed set to 1
#' @param picklePath if non-null, incremental results saved here via joblib.dump, for each chunk. If NULL, no saving of incremental results.
#' @param matTx a function defaulting to force() that accepts a matrix and returns a matrix with identical dimensions, e.g., \code{function(x) log(x+1)}
#' @param \dots not used
#' @import SummarizedExperiment
#' @return python instance of \code{sklearn.decomposition.incremental_pca.IncrementalPCA}
#' @aliases skIncrPPCA,SummarizedExperiment-method
#' @note Will treat samples as records and all features (rows)
#' as attributes, projecting.
#' to an \code{n_components}-dimensional space. Method will
#' acquire chunk of assay data
#' and transpose before computing PCA contributions.
#' In case of crash, restore from \code{picklePath} using
#' \code{joblib$load} after loading reticulate. You can
#' use the \code{n_samples_seen_} component of the restored
#' python reference to determine where to restart.
#' You can manage resumption using \code{skPartialPCA_step}.
#' @examples
#' # demo SE made with TENxGenomics:
#' # mm = matrixSummarizedExperiment(h5path, 1:27998, 1:750)
#' # saveHDF5SummarizedExperiment(mm, "tenx_750")
#' #
#' if (FALSE) {
#' if (requireNamespace("HDF5Array")) {
#' se750 = HDF5Array::loadHDF5SummarizedExperiment(
#' system.file("hdf5/tenx_750", package="BiocSklearn"))
#' lit = skIncrPPCA(se750[, 1:50], chunksize=5, n_components=4)
#' round(cor(pypc <- lit$transform(dat <- t(as.matrix(assay(se750[,1:50]))))),3)
#' rpc = prcomp(dat)
#' round(cor(rpc$x[,1:4], pypc), 3)
#' } # this has to be made basilisk-compliant
#' } # and is blocked until then
#' @exportMethod skIncrPPCA
setGeneric("skIncrPPCA", function(se, chunksize, n_components, assayind=1, picklePath="./skIdump.pkl", matTx = force, ...)
standardGeneric("skIncrPPCA"))
setMethod("skIncrPPCA", "SummarizedExperiment",
function(se, chunksize, n_components, assayind=1, ...) {
stopifnot(assayind==1)
n_components = as.integer(n_components)
chunksize = as.integer(chunksize)
rowvec = seq_len(nrow(se))
colvec = seq_len(ncol(se))
chs = biosk_chunk(colvec, chunk.size=chunksize)
matgen = function(rows, cols) t(matTx(as.matrix(assay(se[rows, cols])))) # assayind handling?
joblib = reticulate::import("joblib", delay_load=TRUE) # could lead to trouble 6/2022
cur = skPartialPCA_step(
submatGenerator( matgen, rowvec, chs[[1]] ), n_components )
if (!is.null(picklePath)) {
message(paste("Will save pickled interim per-chunk results in", picklePath))
joblib$dump(cur, filename=picklePath)
}
for (i in 2:length(chs)) {
cat(i)
cur = skPartialPCA_step(
submatGenerator( matgen, rowvec, chs[[i]]), n_components, cur)
if (!is.null(picklePath)) joblib$dump(cur, filename=picklePath)
}
cur
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.