#############################################################################################################
# Authors:
# Florian Rohart, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD
# Kim-Anh Le Cao, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD
# Benoit Gautier, The University of Queensland, The University of Queensland Diamantina Institute, Translational Research Institute, Brisbane, QLD
# Francois Bartolo, Institut National des Sciences Appliquees et Institut de Mathematiques, Universite de Toulouse et CNRS (UMR 5219), France
#
# created: 2013
# last modified: 05-10-2017
#
# Copyright (C) 2013
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#############################################################################################################
# ========================================================================================================
# tune.spls: choose the optimal number of variables (keepX/keepY) per component for a spls model
# ========================================================================================================
# TODO details on nrepeat
# TODO details on pls vs spls
# TODO tidy outputs preferably in an array
#' Tuning functions for sPLS and PLS methods
#'
#' @template description/tune
#'
#' @template section/folds
#' @template section/nrepeat
#' @template section/measure-pls
#' @template section/t-test-process
#'
#' @section more:
#' See also \code{?perf} for more details.
#'
#' @inheritParams spls
#' @template arg/test.keepX-X.matrix
#' @template arg/test.keepY
#' @template arg/validation
#' @template arg/folds
#' @template arg/nrepeat
#' @param measure The tuning measure to use; for PLS2 models, one of \code{'cor'} or \code{'RSS'}. See details.
#' @template arg/progressBar
#' @template arg/BPPARAM
#' @param seed Integer seed to obtain reproducible outputs; not recommended
#' during exploratory analysis. Note that if \code{RNGseed} is set in \code{BPPARAM}, it will be overridden by \code{seed}.
#' @param limQ2 Q2 threshold for recommending optimal \code{ncomp}.
#' @param ... Optional parameters passed to \code{\link{spls}}
#' @return A list that contains: \item{cor.pred}{The correlation of predicted vs
#' actual components from X (t) and Y (u) for each
#' component}\item{RSS.pred}{The Residual Sum of Squares of predicted vs
#' actual components from X (t) and Y (u) for each component}
#' \item{choice.keepX}{returns the number of variables selected for X (optimal
#' keepX) on each component.} \item{choice.keepY}{returns the number of
#' variables selected for Y (optimal keepY) on each component.}
#' \item{choice.ncomp}{returns the optimal number of components for the model
#' fitted with \code{$choice.keepX} and \code{$choice.keepY} } \item{call}{The
#' function call including the parameters used.}
#' @author Kim-Anh Lê Cao, Al J Abadi, Benoit Gautier, Francois Bartolo,
#' Florian Rohart.
#' @seealso \code{\link{splsda}}, \code{\link{predict.splsda}} and
#' http://www.mixOmics.org for more details.
#' @references mixOmics article:
#'
#' Rohart F, Gautier B, Singh A, Lê Cao K-A. mixOmics: an R package for 'omics
#' feature selection and multiple data integration. PLoS Comput Biol 13(11):
#' e1005752
#'
#' PLS and PLS criteria for PLS regression: Tenenhaus, M. (1998). La regression
#' PLS: theorie et pratique. Paris: Editions Technic.
#'
#' Chavent, Marie and Patouille, Brigitte (2003). Calcul des coefficients de
#' regression et du PRESS en regression PLS1. Modulad n°30, 1-11. (this is the
#' formula we use to calculate the Q2 in perf.pls and perf.spls)
#'
#' Mevik, B.-H., Cederkvist, H. R. (2004). Mean Squared Error of Prediction
#' (MSEP) Estimates for Principal Component Regression (PCR) and Partial Least
#' Squares Regression (PLSR). Journal of Chemometrics 18(9), 422-429.
#'
#' sparse PLS regression mode:
#'
#' Lê Cao, K. A., Rossouw D., Robert-Granie, C. and Besse, P. (2008). A sparse
#' PLS for variable selection when integrating Omics data. Statistical
#' Applications in Genetics and Molecular Biology 7, article 35.
#'
#' One-sided t-tests (suppl material):
#'
#' Rohart F, Mason EA, Matigian N, Mosbergen R, Korn O, Chen T, Butcher S,
#' Patel J, Atkinson K, Khosrotehrani K, Fisk NM, Lê Cao K-A&, Wells CA&
#' (2016). A Molecular Classification of Human Mesenchymal Stromal Cells. PeerJ
#' 4:e1845.
#' @keywords regression multivariate
#' @export
#' @examples
#'
#' \dontrun{
#' data(liver.toxicity)
#' X <- liver.toxicity$gene
#' Y <- liver.toxicity$clinic
#' set.seed(42)
#' tune.res = tune.spls( X, Y, ncomp = 3,
#' test.keepX = c(5, 10, 15),
#' test.keepY = c(3, 6, 8), measure = "cor",
#' folds = 5, nrepeat = 3, progressBar = TRUE)
#' tune.res$choice.ncomp
#' tune.res$choice.keepX
#' tune.res$choice.keepY
#' # plot the results
#' plot(tune.res)
#' }
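#'
#' ## An additional minimal sketch using the RSS criterion instead of the
#' ## correlation; the keepX/keepY grids and fold settings are illustrative only
#' \dontrun{
#' data(liver.toxicity)
#' tune.rss <- tune.spls(liver.toxicity$gene, liver.toxicity$clinic, ncomp = 2,
#' test.keepX = c(5, 10), test.keepY = c(3, 6),
#' measure = "RSS", folds = 5, nrepeat = 3, seed = 42)
#' tune.rss$choice.keepX
#' tune.rss$choice.keepY
#' }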
# TODO change this so that it simply wraps perf
tune.spls <-
function(X,
Y,
test.keepX = NULL,
test.keepY = NULL,
ncomp,
validation = c('Mfold', 'loo'),
nrepeat = 1,
folds,
mode = c('regression', 'canonical', 'classic'),
measure = NULL,
BPPARAM = SerialParam(),
seed = NULL,
progressBar = FALSE,
limQ2 = 0.0975,
...
) {
out = list()
mode <- match.arg(mode)
BPPARAM$RNGseed <- seed
X <- .check_numeric_matrix(X, block_name = 'X')
Y <- .check_numeric_matrix(Y, block_name = 'Y')
check_cv <- .check_cv_args(validation = validation,
nrepeat = nrepeat, folds = folds,
N = nrow(X))
validation <- check_cv$validation
nrepeat <- check_cv$nrepeat
folds <- check_cv$folds
test.keepX <- .change_if_null(arg = test.keepX, default = ncol(X))
test.keepY <- .change_if_null(arg = test.keepY, default = ncol(Y))
test.keepX <- unique(test.keepX)
test.keepY <- unique(test.keepY)
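# flag whether more than one keepX/keepY combination is tested; if not, selection
# is not tuned and the output is not given the 'tune.spls' class at the end of this function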
spls.model <- !(length(test.keepX) == 1 & length(test.keepY) == 1)
if (ncol(Y) == 1) {
res <- tune.spls1(X = X,
Y = Y,
ncomp = ncomp,
test.keepX = test.keepX,
validation = validation,
folds = folds,
measure = measure, # can do a R2 per Y (correlation, linear regression R2), need to call MSEP (see perf.spls).
progressBar = progressBar,
nrepeat = nrepeat,
BPPARAM = BPPARAM,
seed = seed,
...
)
## --- call
res$call <- NULL
## eval all but X and Y
mc <- mget(names(formals())[-1:-2], sys.frame(sys.nframe()))
## replace function, X and Y with unevaluated call
mc <- as.call(c(as.list(match.call())[1:3], mc))
res <- c(list(call = mc), res)
class(res) <- 'tune.spls1'
return(res)
}
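## PLS2 path: keepX and keepY are tuned jointly, using the correlation or RSS
## between predicted and actual latent components (t for X, u for Y)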
measure <- match.arg(measure, choices = c('cor', 'RSS'))
choice.keepX = choice.keepY = NULL
measure.table.cols <- c('comp', 'keepX', 'keepY', 'repeat', 't', 'u', 'cor', 'RSS')
# if (spls.model)
# {
measure.pred <- expand.grid(keepX = test.keepX,
keepY = test.keepY,
V = c('u', 't'),
measure = c('cor', 'RSS'),
comp = seq_len(ncomp),
optimum.keepA = FALSE)
# } else {
# # Q2 only
# measure.pred <- expand.grid(
# comp = seq_len(ncomp),
# optimum = FALSE)
# }
measure.pred <- data.frame(measure.pred)
measure.pred <- cbind(measure.pred,
value.v = I(rep(list(data.frame(matrix(NA_real_, ncol= 1, nrow = nrepeat))),
times = nrow(measure.pred))),
value.Q2.total = I(rep(list(data.frame(matrix(NA_real_, ncol= 1, nrow = nrepeat))),
times = nrow(measure.pred)))
)
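# value.v and value.Q2.total are list-columns: each cell stores the nrepeat
# cross-validation values for one (comp, keepX, keepY, V, measure) combination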
# measure.pred$value <- lapply(measure.pred$value, as.data.frame)
use_progressBar <- progressBar & (is(BPPARAM, 'SerialParam'))
n_keepA <- length(test.keepX) * length(test.keepY)
## initialise optimal keepX/Y
measure.pred$optimum.keepA <- FALSE
measure.pred[
measure.pred$keepX == test.keepX[1] &
measure.pred$keepY == test.keepY[1]
,]$optimum.keepA <- TRUE
# list of all keepX/keepY passed to this via bplapply. calculates the
# correlation and RSS of components for given input keepX/Y and returns
# a subset of measure.pred containing these updated values
iter_keep.vals <- function(keep.vals) { # keep.vals = list(keepX, keepY)
keepX <- keep.vals$keepX # set keep values
keepY <- keep.vals$keepY
if (use_progressBar) {
# use_progressBar implies SerialParam (see above), so the counter in the
# enclosing frame can safely be updated with <<-
n_tested <<- n_tested + 1
prog_level <- n_tested / n_keepA
.progressBar(prog_level, title = 'of features tested')
}
# generate model using input parameters
pls.model <- spls(X = X, Y = Y,
keepX = c(choice.keepX, keepX),
keepY = c(choice.keepY, keepY),
ncomp = comp, mode = mode, ...)
# run perf in serial to avoid nested parallel processes, but make sure seed is passed into perf
pls.perf <- perf(pls.model, validation = validation, folds = folds, nrepeat = nrepeat,
BPPARAM = SerialParam(), seed = seed)
# ensures the only rows that are returned are those manipulated in this
# iteration. otherwise, later component iterations will undo this work
adjusted.rows <- c()
for (measure_i in c('cor', 'RSS')) # for each measure...
{
for (v in c('u', 't')) # for each set of components ...
{
## extract the relevant row index in measure.pred FOR VALUE.V
vpred.idx <- rownames(measure.pred[measure.pred$comp == comp &
measure.pred$keepX == keepX &
measure.pred$keepY == keepY &
measure.pred$V == v &
measure.pred$measure == measure_i,])
# extract relevant values
measure.vpred <- pls.perf$measures[[sprintf("%s.%spred", measure_i, v)]]$values
measure.vpred <- measure.vpred[measure.vpred$comp == comp,]
measure.pred[vpred.idx,]$value.v[[1]] <- measure.vpred$value # adjust measure.pred
## extract the relevant row index in measure.pred FOR Q2
Q2.idx <- rownames(measure.pred[measure.pred$comp == comp &
measure.pred$keepX == keepX &
measure.pred$keepY == keepY &
# TODO the following filtering criteria needs review - not sure if Q2 is per c('u' , 't')
measure.pred$V == v &
measure.pred$measure == measure_i
,])
# extract relevant values for the current component (as done for value.v above)
value.Q2.total <- pls.perf$measures$Q2.total$values
value.Q2.total <- value.Q2.total[value.Q2.total$comp == comp, ]$value
measure.pred[Q2.idx,]$value.Q2.total[[1]] <- value.Q2.total # adjust measure.pred
adjusted.rows <- c(adjusted.rows, vpred.idx, Q2.idx) # append adjusted rows to list
}
}
return(measure.pred[unique(adjusted.rows),])
}
# takes in the final measure.preds data.frame and adjusts the optimum.keepA
# feature to reflect the optimal values depending on the t.test
DetermineOptimalKeepVals <- function(measure.pred, comp, keep.vals, nrepeat) {
## output TRUE if value improves opt for cor or RSS, else FALSE
.check_improvement <- function(opt, value, measure, nrepeat) {
if (nrepeat > 2) {
t.test.res <- tryCatch(t.test(x = opt, y = value, alternative = ifelse(measure == 'cor', 'less', 'greater')),
error = function(e) e)
## a failed t-test (e.g. essentially constant data) is treated as no improvement
improved <- if (inherits(t.test.res, 'error')) FALSE else isTRUE(t.test.res$p.value < 0.05)
} else
{
## compare values
improved <- if (measure == 'cor') mean(opt) < mean(value) else mean(opt) > mean(value)
}
improved
}
for (keep.val in keep.vals) {
trial.keepX <- keep.val$keepX # set keep values
trial.keepY <- keep.val$keepY
# current best for X components
optimum.u <- measure.pred[measure.pred$comp == comp &
measure.pred$optimum.keepA == TRUE &
measure.pred$V == 'u' &
measure.pred$measure == measure
,]$value.v[[1]]
# current best for Y components
optimum.t <- measure.pred[measure.pred$comp == comp &
measure.pred$optimum.keepA == TRUE &
measure.pred$V == 't' &
measure.pred$measure == measure
,]$value.v[[1]]
# new keepX/Y pair to try for X components
value.u <- measure.pred[measure.pred$comp == comp &
measure.pred$keepX == trial.keepX &
measure.pred$keepY == trial.keepY &
measure.pred$V == 'u' &
measure.pred$measure == measure
,]$value.v[[1]]
# new keepX/Y pair to try for Y components
value.t <- measure.pred[measure.pred$comp == comp &
measure.pred$keepX == trial.keepX &
measure.pred$keepY == trial.keepY &
measure.pred$V == 't' &
measure.pred$measure == measure
,]$value.v[[1]]
## workaround for constant values in t.test
offset.eps <- seq(1, 2, length.out = length(optimum.u))/1e6
optimum.t <- optimum.t + offset.eps
optimum.u <- optimum.u + offset.eps
value.t <- value.t + offset.eps
value.u <- value.u + offset.eps
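## the tiny strictly increasing offset keeps t.test() from failing when all
## per-repeat values are identical ("data are essentially constant")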
# see if there is significant improvement in measure based on new keepX/Y pair
u.improved <-.check_improvement(opt = optimum.u, value = value.u, measure = measure, nrepeat = nrepeat)
t.improved <-.check_improvement(opt = optimum.t, value = value.t, measure = measure, nrepeat = nrepeat)
improved <- if (mode == 'canonical') u.improved & t.improved else u.improved
if (improved) # if so ...
{
## set previous to FALSE
measure.pred[measure.pred$comp == comp
,]$optimum.keepA <- FALSE
## update optimum
measure.pred[measure.pred$comp == comp &
measure.pred$keepX == trial.keepX &
measure.pred$keepY == trial.keepY
,]$optimum.keepA <- TRUE
}
}
return(measure.pred)
}
# generate list of lists containing all keepX/Y pairs to test
keep.vals <- list()
i <- 1
for(keepX in test.keepX){
for(keepY in test.keepY){
keep.vals[[i]] <- list(keepX = keepX, keepY=keepY)
i<-i+1
}
}
for (comp in seq_len(ncomp)){ # for each component to test...
if (use_progressBar) {
n_tested <- 0
cat(sprintf("\ntuning component: %s\n", comp))
}
# calculate measure values for specific comp, keepX/Y
m.p <- bplapply(X=keep.vals, FUN = iter_keep.vals,
BPPARAM = BPPARAM)
for (d in m.p) { # output will be a list, so adjust global measure.pred
# to contain the desired values
measure.pred[rownames(d), ] <- d
}
# changes measure.pred$optimum.keepA to reflect results of t.tests
measure.pred <- DetermineOptimalKeepVals(measure.pred, comp, keep.vals, nrepeat)
# extract optimal keepX for this component
choice.keepX.ncomp <- measure.pred[measure.pred$comp == comp &
measure.pred$optimum.keepA == TRUE &
measure.pred$measure == measure &
measure.pred$V == 't' ## doesn't matter t or u
,]$keepX
# extract optimal keepY for this component
choice.keepY.ncomp <- measure.pred[measure.pred$comp == comp &
measure.pred$optimum.keepA == TRUE &
measure.pred$measure == measure &
measure.pred$V == 't'
,]$keepY
choice.keepX = c(choice.keepX, choice.keepX.ncomp)
choice.keepY = c(choice.keepY, choice.keepY.ncomp)
} # end comp
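# recommend ncomp from Q2: choice.ncomp is advanced to comp + 1 for every
# component whose mean Q2.total stays at or above limQ2 (0.0975 by default)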
choice.ncomp <- 1
for (comp in seq_len(ncomp))
{
Q2.total <- measure.pred[measure.pred$comp == comp &
measure.pred$keepX == keepX &
measure.pred$keepY == keepY &
measure.pred$V == 'u'
,]$value.Q2.total[[1]]
keep.comp <- mean(Q2.total) >= limQ2
if (keep.comp)
# we want the first component that drops Q2 below limQ2
choice.ncomp <- comp + 1
}
names(choice.keepX) <- names(choice.keepY) <- paste0('comp', seq_len(ncomp))
out$choice.keepX = choice.keepX
out$choice.keepY = choice.keepY
out$choice.ncomp = choice.ncomp
## add mean and sd
val <- unclass(measure.pred$value.v)
measure.pred$mean <- sapply(measure.pred$value.v, function(x){
mean(c(as.matrix(x)), na.rm = TRUE)
})
measure.pred$sd <- sapply(measure.pred$value.v, function(x){
sd(c(as.matrix(x)), na.rm = TRUE)
})
out$measure.pred = measure.pred
### evaluate all for output except X and Y to save memory
## eval all but X and Y
mc <- mget(names(formals())[-1:-2], sys.frame(sys.nframe()))
## replace function, X and Y with unevaluated call
mc <- as.call(c(as.list(match.call())[1:3], mc))
out <- c(list(call = mc), out)
class <- c('tune.pls')
if (spls.model)
class <- c(class, 'tune.spls')
class(out) <- class
return(out)
}