Nothing
#' Multi-sample splitting
#'
#' The data is randomly split in two halves w.r.t. the observations and
#' variable selection using Lasso is performed on one half. Whereas the second
#' half and the selected variables are later used for testing by the function
#' \code{\link{test_only_hierarchy}}. This is repeated multiple times.
#'
#' @param x a matrix or list of matrices for multiple data sets. The matrix or
#' matrices have to be of type numeric and are required to have column names
#' / variable names. The rows and the columns represent the observations and
#' the variables, respectively.
#' @param y a vector, a matrix with one column, or list of the aforementioned
#' objects for multiple data sets. The vector, vectors, matrix, or matrices
#' have to be of type numeric. For \code{family = "binomial"}, the response
#' is required to be a binary vector taking values 0 and 1.
#' @param clvar a matrix or list of matrices of control variables.
#' @param B number of sample splits.
#' @param proportion.select proportion of variables to be selected by Lasso in
#' the multi-sample splitting step.
#' @param standardize a logical value indicating whether the variables should be
#' standardized.
#' @param family a character string naming a family of the error distribution;
#' either \code{"gaussian"} or \code{"binomial"}.
#' @param parallel type of parallel computation to be used. See the 'Details' section.
#' @param ncpus number of processes to be run in parallel.
#' @param cl an optional \strong{parallel} or \strong{snow} cluster used if
#' \code{parallel = "snow"}. If not supplied, a cluster on the local machine is created.
#' @param check.input a logical value indicating whether the function should
#' check the input. This argument is used to call
#' \code{\link{multisplit}} within
#' \code{\link{test_hierarchy}}.
#'
#' @details A given data with \code{nobs} is randomly split in two halves w.r.t.
#' the observations and \code{nobs * proportion.select} variables are selected
#' using Lasso (implemented in \code{\link{glmnet}}) on one half.
#' Control variables are not penalized if supplied
#' using the argument \code{clvar}. This is repeated \code{B} times for each
#' data set if multiple data sets are supplied. Those splits (i.e. second
#' halves of observations) and corresponding selected variables are used to
#' perform hierarchical testing by the function
#' \code{\link{test_only_hierarchy}}.
#'
#' The multi-sample split step can be run in parallel across the different
#' sample splits (\code{B} corresponds to number of sample splits) by
#' specifying the arguments \code{parallel} and \code{ncpus}.
#' There is an optional argument \code{cl} if \code{parallel = "snow"}.
#' There are three possibilities to set the argument \code{parallel}:
#' \code{parallel = "no"} for serial evaluation (default),
#' \code{parallel = "multicore"} for parallel evaluation
#' using forking, and \code{parallel = "snow"} for parallel evaluation
#' using a parallel socket cluster. It is recommended to select
#' \code{\link{RNGkind}("L'Ecuyer-CMRG")} and set a seed to ensure that
#' the parallel computing of the package \code{hierinf} is reproducible.
#' This way each processor gets a different substream of the pseudo random
#' number generator stream which makes the results reproducible if the arguments
#' (as \code{sort.parallel} and \code{ncpus}) remain unchanged. See the vignette
#' or the reference for more details.
#'
#' @return The returned value is an object of class \code{"hierM"}, consisting
#' of a list with number of elements corresponding to the number of data sets.
#' Each element (corresponding to a data set
#' % with \code{nobs} observations) contains a list with two matrices.
#' The first matrix
#' % of size \code{B x [nobs / 2]}
#' contains the indices of the second half of variables (which were not used
#' to select the variables). The second matrix
#' % of size \code{B x [nobs * proportion.select]}
#' contains the column names / variable names of the selected variables.
#'
#'
# A data frame with 2 components. A matrix of size \code{B x [nobs/2]}
# containing the second subsample of each split, and a matrix of size
# \code{B x [nobs/6]} containing the selected variables in each split.
#'
#' @seealso \code{\link{cluster_var}},
#' \code{\link{cluster_position}},
#' \code{\link{test_only_hierarchy}},
#' \code{\link{test_hierarchy}}, and
#' \code{\link{compute_r2}}.
#'
#' @examples
#' n <- 200
#' p <- 500
#' library(MASS)
#' set.seed(3)
#' x <- mvrnorm(n, mu = rep(0, p), Sigma = diag(p))
#' colnames(x) <- paste0("Var", 1:p)
#' beta <- rep(0, p)
#' beta[c(5, 20, 46)] <- 1
#' y <- x %*% beta + rnorm(n)
#'
#' set.seed(84)
#' res.multisplit <- multisplit(x = x, y = y, family = "gaussian")
#'
#' @references Renaux, C. et al. (2018), Hierarchical inference for genome-wide
#' association studies: a view on methodology with software. (arXiv:1805.02988)
#'
#' Meinshausen, N., Meier, L. and Buhlmann, P. (2009), P-values for
#' high-dimensional regression, Journal of the American Statistical Association
#' 104, 1671-1681.
#'
#' @name multisplit
#' @export
multisplit <- function(x, y, clvar = NULL, B = 50, proportion.select = 1/6,
standardize = FALSE, family = c("gaussian", "binomial"),
parallel = c("no", "multicore", "snow"), ncpus = 1L,
cl = NULL, check.input = TRUE) {
family <- match.arg(family)
parallel <- match.arg(parallel)
do.parallel <- (parallel != "no" && ncpus > 1L)
if (do.parallel && parallel == "multicore" && .Platform$OS.type == "windows") {
stop("The argument parallel = 'multicore' is not available for windows. Use parallel = 'snow' for parallel computing or parallel = 'no' for serial execution of the code.")
}
## check input
if (check.input) {
res <- check_input_testing(x = x, y = y, clvar = clvar, family = family,
# check result of the function multisplit
check_res_multisplit = FALSE,
res.multisplit = NULL,
# arguments for the function multisplit
check_multisplit_arguments = TRUE,
B = B, proportion.select = proportion.select,
standardize = standardize,
# arguments for the function
# test_hierarchy_given_multisplit
check_testing_arguments = FALSE,
dendr = NULL, block = NULL, alpha = NULL,
global.test = NULL, agg.method = NULL,
verbose = NULL)
x <- res$x
y <- res$y
clvar <- res$clvar
rm(list = c("res"))
}
## create the splits
len.x <- length(x)
ret.split <- lapply(seq_len(len.x), create_splits_one_data, y = y, B = B)
## run the function multisplit_one_data for each of the data sets
ret.sel <- lapply(seq_len(len.x), multisplit_one_data, x = x, y = y, clvar = clvar,
B = B, proportion.select = proportion.select,
standardize = standardize, family = family,
ret.split = ret.split, parallel = parallel,
ncpus = ncpus, cl = cl, do.parallel = do.parallel)
RET <- do.call(cbind, ret.sel)
resM.all <- structure(RET["resM", ], class = c("hierM", "list"))
names(resM.all) <- NULL
# do.call() returns NULL if there occurred no errors.
attr(resM.all,"errorMsgs") <- do.call(c, RET["errorMsgs", ])
attr(resM.all, "warningMsgs") <- do.call(c, RET["warningMsgs", ])
if (!is.null(attr(resM.all, "errorMsgs"))) {
warning("There occurred some errors while multi-sample splitting. See attribute 'errorMsgs' of the return object for more details.")
}
if (!is.null(attr(resM.all, "warningMsgs"))) {
warning("There occurred some warnings while multi-sample splitting. See attribute 'warningMsgs' of the return object for more details.")
}
# return the values
return(resM.all)
} # {multisplit}
# Create \code{B} splits for a given data set
#
# Returns indices for the \code{B} splits, i.e. in.sample and out.sample.
create_splits_one_data <- function(ind, y, B){
# create the splits
no.samples <- length(y[[ind]])
in.sample <- lapply(seq_len(B), function(i, no.samples) {
sort(sample(seq_len(no.samples), floor(no.samples/2)))
}, no.samples = no.samples)
out.sample <- lapply(in.sample, function(in.sample, no.samples) {
setdiff(seq_len(no.samples), in.sample)
}, no.samples = no.samples)
return(list("in.sample" = in.sample, "out.sample" = out.sample))
} # {create_splits_one_data}
# Multi-sample split for a givne data set
#
# For each split, variables are selected using Lasso on the \code{in.sample}.
# A matrix of the selected variables (column names) and a matrix of
# indices of \code{out.sample} is returned. Both have \code{B} rows.
multisplit_one_data <- function(ind, x, y, clvar, B, proportion.select,
standardize, family, ret.split, parallel,
ncpus, cl, do.parallel){
## prepare variables
in.out.sample <- ret.split[[ind]]
y.one <- y[[ind]]
no.samples <- length(y.one)
xc <- cbind(clvar[[ind]], x[[ind]])
no.xc <- ncol(xc)
no.clvar <-
if (is.null(clvar[[ind]])) {
0
} else {
ncol(clvar[[ind]])
}
# don't penalize the control variables
pen <- c(rep(0, no.clvar), rep(1, (no.xc - no.clvar)))
# The concept of how to elegantly parallelize a function call (and save
# all warning and error messages) is taken from the package boot
# respectively lme4. Both are nearly identical in that respect.
# See the source code of the package boot: R/bootfuns.q in the function
# boot().
# See the source code of the package lme4: R/bootMer.R in the function
# bootMer().
# Using a closure, the function below can access all the variables of the
# environment in which it was created. This makes parallel computation
# leaner or simpler, i.e. there are less arguments or we do not have to
# export objects to the workers in the PSOCKcluster case
one_split <- local({
xc
y.one
ind
no.clvar
pen
family
proportion.select
standardize
no.samples
in.out.sample
function(i) {
tryCatch_W_E(lasso_select(x = xc, y = y.one, no.clvar = no.clvar,
pen = pen, family = family,
proportion.select = proportion.select,
standardize = standardize,
no.samples = no.samples,
in.sample = in.out.sample$in.sample[[i]],
out.sample = in.out.sample$out.sample[[i]]),
ret.obj = list("out.sample" = rep(NA, no.samples - floor(no.samples/2)),
"sel.coef" = rep(NA, no.samples * proportion.select)))
}})
ret <- if (do.parallel) {
if (parallel == "multicore") {
parallel::mclapply(seq_len(B), one_split, mc.cores = ncpus)
} else if (parallel == "snow") {
if (is.null(cl)) {
cl <- parallel::makePSOCKcluster(rep("localhost", ncpus))
# export the namespace of hierinf in order for the use the functions
# of the package hierinf on the workers
parallel::clusterExport(cl, varlist = getNamespaceExports("hierinf"))
if(RNGkind()[1L] == "L'Ecuyer-CMRG")
parallel::clusterSetRNGStream(cl)
res <- parallel::parLapply(cl, seq_len(B), one_split)
parallel::stopCluster(cl)
res
} else parallel::parLapply(cl, seq_len(B), one_split)
}
} else lapply(seq_len(B), one_split)
# Simplify the result
ret <- do.call(cbind, ret)
# prepare the return object
rr <- do.call(cbind, ret["value", ])
resM <- list("out.sample" = do.call(rbind, rr[1, ]),
"sel.coef" = do.call(rbind, rr[2, ]))
return(list("resM" = resM,
"errorMsgs" = do.call(c, ret["error", ]),
"warningMsgs" = do.call(c, ret["warning", ])))
} # {multisplit_one_data}
# Selects coefficents using Lasso
#
# For a given split, variables are selected using Lasso based on the
# observations / rows \code{in.sample}.
#
# @param pen a vector of 0 and 1 encoding the penalty. Control variables are
# unpenalized.
#
# @return a list with two elements, i.e. the indices of the second half of
# observations (\code{out.sample}) and the column / variable names of the
# selected variables (\code{sel.coef}).
lasso_select <- function(x, y, no.clvar, pen, family, proportion.select,
standardize, no.samples, in.sample, out.sample) {
# perform lasso on the in.sample
lasso.output <- glmnet::glmnet(x = x[in.sample, ], y = y[in.sample],
family = family, standardize = standardize,
intercept = TRUE, penalty.factor = pen)
# save the first no.samples * proportion.select regression coefficients that
# enter the lasso path
lasso.coef.path <- lasso.output$beta@i + 1
unique.lasso.coef <- unique(lasso.coef.path)
# Note that no.samples * proportion.select may not be an integer but numeric
# values are coerced to integer as by as.integer(), i.e. truncated towards zero.
selected.lasso.coef <-
if (no.clvar > 0) {
setdiff(unique.lasso.coef, seq(1, no.clvar))[seq_len(no.samples * proportion.select)]
} else {
unique.lasso.coef[seq_len(no.samples * proportion.select)]
}
return(list("out.sample" = out.sample,
"sel.coef" = rownames(lasso.output$beta)[selected.lasso.coef]))
} # {lasso_select}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.