#' Start and stop \pkg{basilisk}-related processes
#'
#' Creates a \pkg{basilisk} process in which Python operations (via \pkg{reticulate})
#' can be safely performed with the correct versions of Python packages.
#'
#' @param env A \linkS4class{BasiliskEnvironment} object specifying the \pkg{basilisk} environment to use.
#'
#' Alternatively, a string specifying the path to an environment, though this should only be used for testing purposes.
#'
#' Alternatively, \code{NULL} to indicate that the base Conda installation should be used as the environment.
#' @param full.activation Logical scalar, see \code{\link{activateEnvironment}} for more details.
#' @param proc A process object generated by \code{basiliskStart}.
#' @param fork Logical scalar indicating whether forking should be performed on non-Windows systems,
#' see \code{\link{getBasiliskFork}}.
#' If \code{FALSE}, a new worker process is created using communication over sockets.
#' @param shared Logical scalar indicating whether \code{basiliskStart} is allowed
#' to load a shared Python instance into the current R process, see \code{\link{getBasiliskShared}}.
#' @param fun A function to be executed in the \pkg{basilisk} process.
#' This should return a \dQuote{pure R} object, see details.
#' @param persist Logical scalar indicating whether to pass a persistent store to \code{fun}.
#' If \code{TRUE}, \code{fun} should accept a \code{store} argument.
#' @param testload Character vector specifying the Python packages to load into the process during set-up.
#' This is used to check that packages can be correctly loaded, switching to a fallback on \code{GLIBCXX} dynamic linking failures.
#' @param ... Further arguments to be passed to \code{fun}.
#'
#' @return
#' \code{basiliskStart} returns a process object, the exact nature of which depends on \code{fork} and \code{shared}.
#' This object should only be used in \code{basiliskRun} and \code{basiliskStop}.
#'
#' \code{basiliskRun} returns the output of \code{fun(...)}, possibly executed inside the separate process.
#'
#' \code{basiliskStop} stops the process in \code{proc}.
#'
#' @details
#' These functions ensure that any Python operations in \code{fun} will use the environment specified by \code{env}.
#' This avoids version conflicts in the presence of other Python instances or environments loaded by other packages or by the user.
#' Thus, \pkg{basilisk} clients are not affected by (and if \code{shared=FALSE}, do not affect) the activity of other R packages.
#'
#' It is good practice to call \code{basiliskStop} once computation is finished to terminate the process.
#' Any Python-related operations between \code{basiliskStart} and \code{basiliskStop} should only occur via \code{basiliskRun}.
#' Calling \pkg{reticulate} functions directly will have unpredictable consequences.
#' Similarly, it would be unwise to interact with \code{proc} via any function other than the ones listed here.
#'
#' If \code{proc=NULL} in \code{basiliskRun}, a process will be created and closed automatically.
#' This may be convenient in functions where persistence is not required.
#' Note that doing so requires specification of \code{env}.
#'
#' If the base Conda installation provided with \pkg{basilisk} satisfies the requirements of the client package,
#' developers can set \code{env=NULL} in this function to use that base installation rather than constructing a separate environment.
#'
#' @section Choice of process type:
#' \itemize{
#' \item If \code{shared=TRUE} and no Python version has already been loaded,
#' \code{basiliskStart} will load Python directly into the R session from the specified environment.
#' Similarly, if the existing environment is the same as the requested environment, \code{basiliskStart} will use that directly.
#' This mode is most efficient as it avoids creating any new processes,
#' but the use of a shared Python configuration may prevent non-\pkg{basilisk} packages from working correctly in the same session.
#' \item If \code{fork=TRUE}, no Python version has already been loaded (or the loaded version comes from the requested environment) and we are not on Windows,
#' \code{basiliskStart} will create a new process by forking.
#' In the forked process, \code{basiliskStart} will load the specified environment for operations in Python.
#' This is less efficient as it needs to create a new process
#' but it avoids forcing a Python configuration on other packages in the same R session.
#' \item Otherwise, \code{basiliskStart} will create a parallel socket process containing a separate R session.
#' In the new process, \code{basiliskStart} will load the specified environment for Python operations.
#' This is the least efficient as it needs to transfer data over sockets but is guaranteed to work.
#' }
#'
#' Developers can control these choices directly by explicitly specifying \code{shared} and \code{fork},
#' while users can control them indirectly with \code{\link{setBasiliskFork}} and related functions.
#'
#' @section Testing package loads:
#' If \code{testload} is provided, \code{basiliskStart} will attempt to load those Python packages into the newly created process.
#' This is used to detect loading failures due to differences in the versions of the shared libraries.
#' Most typically, a conda-supplied Python package (often \pkg{scipy} submodules) will have been compiled against a certain version of \code{libstdc++} but R is compiled against an older version.
#' R's version takes precedence when \pkg{reticulate} attempts to load the Python package, causing cryptic \dQuote{GLIBCXX version not found} errors.
#'
#' By checking the specified \code{testload}, \code{basiliskStart} can check for loading failures in potentially problematic packages.
#' Upon any failure, \code{basiliskStart} will fall back to a separate socket process running a conda-supplied R installation.
#' The idea is that, if both Python and R are sourced from conda, they will be using the same version of \code{libstdc++} and other libraries.
#' This avoids loading errors and/or segmentation faults due to version mismatches.
#'
#' Use of this "last resort fallback" overrides any choice of process type from \code{fork} and \code{shared}.
#' If no failures are encountered, a process will be created using the current R installation.
#'
#' Note that the fallback R installation is very minimalistic; only \pkg{reticulate} is guaranteed to be available.
#' This places some limitations on the code that can be executed inside \code{fun} for \pkg{basilisk} environments that might trigger use of the fallback.
#'
#' @section Constraints on user-defined functions:
#' In \code{basiliskRun}, there is no guarantee that \code{fun} has access to \code{basiliskRun}'s calling environment.
#' This has several consequences for code in the body of \code{fun}:
#' \itemize{
#' \item Variables used inside \code{fun} should be explicitly passed as an argument to \code{fun}.
#' Developers should not rely on closures to capture variables in the calling environment of \code{basiliskRun}.
#' \item Developers should \emph{not} attempt to pass complex objects in or out of \code{fun}.
#' This mostly refers to objects that contain custom pointers to memory, e.g., file handles, pointers to \pkg{reticulate} objects.
#' Both the arguments and return values of \code{fun} should be pure R objects.
#' \item Functions or variables from non-base R packages should be prefixed with the package name via \code{::}, or those packages should be reloaded inside \code{fun}.
#' However, if \code{fun} loads Python packages that might trigger the last resort fallback, no functions or variables should be used from non-base R packages.
#' }
#'
#' Developers can test that their function behaves correctly in \code{basiliskRun} by setting \code{\link{setBasiliskShared}} and \code{\link{setBasiliskFork}} to \code{FALSE}.
#' This forces the execution of \code{fun} in a new process; any incorrect assumption of shared environments will cause errors.
#' If \code{fun} involves fallback-inducing Python packages, developers can further set \code{\link{setBasiliskForceFallback}} before running \code{basiliskRun}.
#' This tests that \code{fun} works with the minimal conda-supplied R installation.
#'
#' @section Persisting objects across calls:
#' Objects created inside \code{fun} can be persisted across calls to \code{basiliskRun} by setting \code{persist=TRUE}.
#' This will instruct \code{basiliskRun} to pass a \code{store} argument to \code{fun} that can be used to store arbitrary objects.
#' Those same objects can be retrieved from \code{store} in later calls to \code{basiliskRun} using the same \code{proc}.
#' Any object can be stored in \code{store} but will remain strictly internal to \code{proc}.
#'
#' This capability is primarily useful when a Python workflow is split across multiple \code{basiliskRun} calls.
#' Each subsequent call can pick up from temporary intermediate objects generated by the previous call.
#' In this manner, \pkg{basilisk} enables modular function design where developers can easily mix and match different \code{basiliskRun} invocations.
#' See Examples for a working demonstration.
#'
#' @section Use of lazy installation:
#' If the specified \pkg{basilisk} environment is not present and \code{env} is a \linkS4class{BasiliskEnvironment} object, the environment will be created upon first use of \code{basiliskStart}.
#' If the base Conda installation is not present, it will also be installed upon first use of \code{basiliskStart}.
#' We do not provide Conda with the \pkg{basilisk} package binaries to avoid portability problems with hard-coded paths (as well as potential licensing issues from redistribution).
#'
#' By default, both the base conda installation and the environments will be placed in an external user-writable directory defined by \pkg{rappdirs} via \code{\link{getExternalDir}}.
#' The location of this directory can be changed by setting the \code{BASILISK_EXTERNAL_DIR} environment variable to the desired path.
#' This may occasionally be necessary if the file path to the default location is too long for Windows, or if the default path has spaces that break the Miniconda/Anaconda installer.
#'
#' Advanced users may consider setting the environment variable \code{BASILISK_USE_SYSTEM_DIR} to 1 when installing \pkg{basilisk} and its client packages from source.
#' This will place both the base installation and the environments in the R system directory, which simplifies permission management and avoids duplication in enterprise settings.
#'
#' @section Persistence of environment variables:
#' When \code{shared=TRUE} and if no Python instance has already been loaded into the current R session,
#' a side-effect of \code{basiliskStart} is that it will modify a number of environment variables.
#' This is done to mimic activation of the Conda environment located at \code{env}.
#' Importantly, old values for these variables will \emph{not} be restored upon \code{basiliskStop}.
#'
#' This behavior is intentional as (i) the correct use of the Conda-derived Python depends on activation and (ii) the loaded Python persists for the entire R session.
#' It may not be safe to reset the environment variables and \dQuote{deactivate} the environment while the Conda-derived Python instance is effectively still in use.
#' (In practice, lack of activation is most problematic on Windows due to its dependence on correct \code{PATH} specification for dynamic linking.)
#'
#' If persistence is not desirable, users should set \code{shared=FALSE} via \code{\link{setBasiliskShared}}.
#' This will limit any modifications to the environment variables to a separate R process.
#'
#' @author Aaron Lun
#'
#' @seealso
#' \code{\link{setupBasiliskEnv}}, to set up the conda environments.
#'
#' \code{\link[basilisk.utils]{activateEnvironment}} in the
#' \pkg{basilisk.utils} package.
#'
#' \code{\link{getBasiliskFork}} and \code{\link{getBasiliskShared}}, to control various global options.
#'
#' @examples
#' if (.Platform$OS.type != "windows") {
#' \dontshow{basilisk.utils::installConda()}
#'
#' # Creating an environment (note, this is not necessary
#' # when supplying a BasiliskEnvironment to basiliskStart):
#' tmploc <- file.path(tempdir(), "my_package_A")
#' if (!file.exists(tmploc)) {
#' setupBasiliskEnv(tmploc, c('pandas=1.4.3'))
#' }
#'
#' # Pulling out the pandas version, as a demonstration:
#' cl <- basiliskStart(tmploc, testload="pandas")
#' basiliskRun(proc=cl, function() {
#' X <- reticulate::import("pandas"); X$`__version__`
#' })
#' basiliskStop(cl)
#'
#' # This happily co-exists with our other environment:
#' tmploc2 <- file.path(tempdir(), "my_package_B")
#' if (!file.exists(tmploc2)) {
#' setupBasiliskEnv(tmploc2, c('pandas=1.4.2'))
#' }
#'
#' cl2 <- basiliskStart(tmploc2, testload="pandas")
#' basiliskRun(proc=cl2, function() {
#' X <- reticulate::import("pandas"); X$`__version__`
#' })
#' basiliskStop(cl2)
#'
#' # Persistence of variables is possible within a Start/Stop pair.
#' cl <- basiliskStart(tmploc)
#' basiliskRun(proc=cl, function(store) {
#' store$snake.in.my.shoes <- 1
#' invisible(NULL)
#' }, persist=TRUE)
#' basiliskRun(proc=cl, function(store) {
#' return(store$snake.in.my.shoes)
#' }, persist=TRUE)
#' basiliskStop(cl)
#' }
#'
#' @export
#' @importFrom parallel makePSOCKcluster clusterCall makeForkCluster
#' @importFrom reticulate py_config py_available
#' @importFrom basilisk.utils activateEnvironment getFallbackREnv
basiliskStart <- function(env, full.activation=NA, fork=getBasiliskFork(), shared=getBasiliskShared(), testload=NULL) {
    # Strategy, in order of preference:
    #   1. the last-resort fallback (a socket worker running the conda-supplied
    #      R), if it is forced or a previous test load failed for this path;
    #   2. a shared Python instance inside the current R session;
    #   3. a separate worker, either forked or connected over sockets.
    envpath <- obtainEnvironmentPath(env)

    # Last-resort fallback uses the internal conda-supplied R.
    if (getBasiliskForceFallback() || isTRUE(glibcxx_failed$failures[[envpath]])) {
        rscript <- file.path(getFallbackREnv(), "bin", "Rscript")
        proc <- makePSOCKcluster(1, rscript=rscript) # can't suppress the warning, oh well.

        # Transmit internals required for useBasiliskEnv to work properly
        # inside the mini-R. This requires some work to strip the function
        # environment so that we avoid sending the basilisk namespace to the
        # child. Doing so would drag in the current session's reticulate in the
        # child session, which defeats the purpose of the fallback.
        envstripper <- function(fun) {
            environment(fun) <- .GlobalEnv
            fun
        }

        clusterCall(proc, envstripper(function() {
            requireNamespace("reticulate", lib.loc = file.path(R.home(), "library"))
        }))

        # Helpers are copied into the child's global environment by name, so
        # that the stripped functions below can find them there.
        assigner <- envstripper(function(name, value) assign(name, value, envir=.GlobalEnv))
        clusterCall(proc, assigner, name=".activate_condaenv", value=envstripper(basilisk.utils:::.activate_condaenv))
        clusterCall(proc, assigner, name="isWindows", value=isWindows)

        clusterCall(proc, envstripper(activateEnvironment), envpath=envpath, full.activation=full.activation, loc=getCondaDir())
        clusterCall(proc, assigner, name="activateEnvironment", value=list) # no-op as we already ran it.
        clusterCall(proc, envstripper(useBasiliskEnv), envpath=envpath)
        clusterCall(proc, envstripper(.instantiate_store))

        return(proc)
    }

    if (shared) {
        # The shared instance is usable if nothing is loaded yet (in which
        # case we load the requested environment), or if the environment that
        # is already loaded is the requested one.
        ok <- FALSE
        if (py_available()) {
            if (.same_as_loaded(envpath)) {
                ok <- TRUE
            }
        } else {
            useBasiliskEnv(envpath, full.activation)
            ok <- TRUE
        }

        if (ok) {
            # In shared mode the process handle is a plain environment.
            proc <- new.env()
            proc <- .activate_fallback(proc, testload, env=env, envpath=envpath, full.activation=full.activation)
            return(proc)
        }
    }

    # Create a separate R process if the shared instance doesn't work.
    if (fork && !isWindows() && (!py_available() || .same_as_loaded(envpath))) {
        proc <- makeForkCluster(1)
    } else {
        proc <- makePSOCKcluster(1)
    }

    clusterCall(proc, useBasiliskEnv, envpath=envpath, full.activation=full.activation)

    # The store must be created BEFORE the test load. If the test load fails,
    # .activate_fallback() replaces 'proc' with the fallback worker, which has
    # already set up its own store using environment-stripped functions; we
    # must not send the namespace-bound .instantiate_store to that worker.
    clusterCall(proc, .instantiate_store)
    proc <- .activate_fallback(proc, testload, env=env, envpath=envpath, full.activation=full.activation)

    proc
}
.activate_fallback <- function(proc, testload, env, envpath, full.activation) {
    # Tries to import each Python package named in 'testload' inside 'proc'.
    #
    # Returns 'proc' unchanged if 'testload' is NULL or every import succeeds.
    # If an import fails with a message matching the shared-library version
    # pattern below, the failure is recorded for 'envpath', 'proc' is stopped
    # and a replacement process from basiliskStart() is returned.
    # Any other import failure is re-raised after stopping 'proc'.
    if (is.null(testload)) {
        return(proc)
    }

    test <- try({
        basiliskRun(proc, function(pkgs) {
            for (pkg in pkgs) {
                reticulate::import(pkg)
            }
        }, pkgs=testload)
    })

    if (!inherits(test, "try-error")) {
        glibcxx_failed$failures[[envpath]] <- FALSE
        return(proc)
    }

    msg <- conditionMessage(attr(test, "condition"))

    # Switching to the last-resort fallback upon detecting GLIBCXX errors.
    if (grepl("ImportError:.*version.*LIB.*not found", msg)) {
        glibcxx_failed$failures[[envpath]] <- TRUE
        basiliskStop(proc)
        return(basiliskStart(env, full.activation=full.activation))
    }

    # The caller never receives 'proc' when we throw, so shut it down here to
    # avoid leaking a worker. Cleanup is best-effort so that the original
    # import error is the one that gets reported.
    try(basiliskStop(proc), silent=TRUE)
    stop(msg)
}
# Mutable package-level state: 'failures' is a list keyed by environment path
# that records whether a test load for that path hit a library version error.
glibcxx_failed <- new.env()
assign("failures", list(), envir=glibcxx_failed)
.instantiate_store <- function() {
    # Creates a fresh, empty environment and binds it to '.basilisk.store' in
    # the global environment of whichever R process runs this function.
    # Only base R is used, as this function is also sent to worker processes.
    assign(x=".basilisk.store", value=new.env(), envir=globalenv())
}
#' @export
#' @rdname basiliskStart
#' @importFrom parallel stopCluster
basiliskStop <- function(proc) {
    # An environment handle has no separate process behind it, so there is
    # nothing to shut down.
    if (is.environment(proc)) {
        return(invisible(NULL))
    }

    # Anything else is treated as a cluster object and stopped.
    stopCluster(proc)
}
#' @export
#' @rdname basiliskStart
#' @importFrom parallel clusterCall
basiliskRun <- function(proc=NULL, fun, ..., env, full.activation=NA, persist=FALSE, fork=getBasiliskFork(), shared=getBasiliskShared(), testload=NULL) {
    # Without a supplied process, start a temporary one that is stopped again
    # when this function exits.
    if (is.null(proc)) {
        proc <- basiliskStart(env, full.activation=full.activation, fork=fork, shared=shared, testload=testload)
        on.exit(basiliskStop(proc), add=TRUE)
    }

    if (!is.environment(proc)) {
        # Cluster handle: run 'fun' on the worker and take its single result.
        if (persist) {
            # The store lives in the worker's global environment.
            remote <- function(.fun, ...) .fun(..., store=get(".basilisk.store", envir=.GlobalEnv))
            replies <- clusterCall(proc, fun=remote, .fun=fun, ...)
        } else {
            replies <- clusterCall(proc, fun=fun, ...)
        }
        value <- replies[[1]]
    } else if (persist) {
        # Environment handle: call 'fun' directly, with the handle as the store.
        value <- fun(..., store=proc)
    } else {
        value <- fun(...)
    }

    value
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.