#' @title Ensemble Classification & Feature Selection
#' @description Applies ensembles of models to high-dimensional data to both
#' classify and determine important features for classification. The
#' function bootstraps a user-specified number of times to facilitate
#' stability metrics of the features selected, thereby providing an important
#' metric for biomarker investigations, namely whether the important
#' variables can be identified if the models are refit on 'different' data.
#' @param X A matrix containing numeric values of each feature
#' @param Y A factor vector containing group membership of samples
#' @param method A vector listing models to be fit.
#' Available options are \code{"plsda"} (Partial Least Squares
#' Discriminant Analysis), \code{"rf"} (Random Forest), \code{"gbm"}
#' (Gradient Boosting Machine), \code{"svm"} (Support Vector Machines),
#' \code{"glmnet"} (Elastic-net Generalized Linear Model),
#' and \code{"pam"} (Prediction Analysis of Microarrays)
#' @param k Number of bootstrap iterations
#' @param p Proportion of the data to be used for training
#' @param f Number of features desired. Default is the top 10%,
#' \code{"f = ceiling(ncol(X)/10)"}.
#' If rank correlation is desired, set \code{"f = NULL"}
#' @param bags Number of iterations for ensemble bagging. Default
#' \code{"bags = 40"}
#' @param aggregation.metric String indicating which aggregation metric
#' to apply to the features selected during bagging. Available options are
#' \code{"CLA"} (Complete Linear Aggregation), \code{"EM"} (Ensemble Mean),
#' \code{"ES"} (Ensemble Stability), and \code{"EE"} (Ensemble Exponential)
#' @param stability.metric String indicating the type of stability metric.
#' Available options are \code{"jaccard"} (Jaccard Index/Tanimoto Distance),
#' \code{"sorensen"} (Dice-Sorensen's Index), \code{"ochiai"} (Ochiai's Index),
#' \code{"pof"} (Percent of Overlapping Features), \code{"kuncheva"}
#' (Kuncheva's Stability Measures), \code{"spearman"} (Spearman Rank
#' Correlation), and \code{"canberra"} (Canberra Distance)
#' @param optimize Logical argument determining if each model should
#' be optimized. Default \code{"optimize = TRUE"}
#' @param optimize.resample Logical argument determining if each resample
#' should be re-optimized. Default \code{"optimize.resample = FALSE"}; only
#' one optimization run is performed and subsequent models use the initially
#' determined parameters
#' @param tuning.grid Optional list of grids containing parameters to optimize
#' for each algorithm. Default \code{"tuning.grid = NULL"} lets the function
#' create a grid determined by \code{"resolution"}
#' @param k.folds Number of folds generated during cross-validation. May
#' optionally be set to \code{"LOO"} for leave-one-out cross-validation.
#' Default \code{"k.folds = 10"}
#' @param repeats Number of times cross-validation is repeated.
#' Default \code{"repeats = 3"}
#' @param resolution Optional resolution of the model optimization grid.
#' Default \code{"resolution = 3"}
#' @param metric Criteria for model optimization. Available options are
#' \code{"Accuracy"} (Predication Accuracy), \code{"Kappa"} (Kappa Statistic),
#' and \code{"AUC-ROC"} (Area Under the Curve - Receiver Operator Curve)
#' @param model.features Logical argument indicating whether the number of
#' features selected should be determined by the individual model runs.
#' Default \code{"model.features = FALSE"}
#' @param allowParallel Logical argument dictating if parallel processing is
#' allowed via the foreach package. Default \code{allowParallel = FALSE}
#' @param verbose Character argument specifying how much progress output
#' to print. Options are 'none', 'minimal', or 'full'.
#' @param ... Extra arguments that the user would like to apply to the models
#'
#' @return \item{methods}{Vector of models fit to the data}
#' @return \item{performance}{Performance metrics of each model and
#' bootstrap iteration}
#' @return \item{RPT}{Robustness-Performance Trade-Off as defined in
#' Saeys 2008}
#' @return \item{features}{List concerning features determined via each
#' algorithm's feature selection criteria.}
#' @return \itemize{
#' \item{metric: Stability metric applied}
#' \item{features: Matrix of selected features}
#' \item{stability: Matrix of pairwise comparisons and average stability}
#' }
#' @return \item{stability.models}{Function perturbation metric - i.e. how
#' similar are the features selected by each model.}
#' @return \item{all.tunes}{If \code{"optimize.resample = TRUE"} then returns
#' a list of optimized parameters for each bagging and bootstrap iteration.}
#' @return \item{final.best.tunes}{If \code{"optimize.resample = TRUE"} then
#' returns a list of optimized parameters for each bootstrap of the bagged models
#' refit to aggregated selected features.}
#' @return \item{specs}{List with the
#' following elements:}
#' @return \itemize{
#' \item{total.samples: Number of samples in original dataset}
#' \item{number.features: Number of features in original dataset}
#' \item{number.groups: Number of groups}
#' \item{group.levels: The specific levels of the groups}
#' \item{number.observations.group: Number of observations in each group}}
#' @author Charles Determan Jr
#' @references Saeys Y., Abeel T., et. al. (2008) \emph{Machine Learning and
#' Knowledge Discovery in Databases}. 313-325.
#' http://link.springer.com/chapter/10.1007/978-3-540-87481-2_21
#' @examples
#' \dontrun{
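#' # Illustrative setup only (not part of the package): simulate a small
#' # dataset so the call below is self-contained.
#' set.seed(123)
#' vars <- matrix(rnorm(100 * 20), nrow = 100,
#'                dimnames = list(NULL, paste0("X", 1:20)))
#' groups <- factor(rep(c("A", "B"), each = 50))
#'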
#' fits <- fs.ensembl.stability(vars,
#' groups,
#' method = c("plsda", "rf"),
#' f = 10,
#' k = 3,
#' k.folds = 10,
#' verbose = 'none')
#' }
#' @import DiscriMiner
#' @import randomForest
#' @import plyr
#' @importFrom caret createDataPartition
#' @import e1071
#' @import gbm
#' @import pamr
#' @import glmnet
#' @export
fs.ensembl.stability <-
function(X,
Y,
method,
k = 10,
p = 0.9,
f = ceiling(ncol(X)/10),
bags=40,
aggregation.metric = "CLA",
stability.metric = "jaccard",
optimize = TRUE,
optimize.resample = FALSE,
tuning.grid = NULL,
k.folds = if(optimize) 10 else NULL,
repeats = if(identical(k.folds, "LOO")) NULL else if(optimize) 3 else NULL,
resolution = if(optimize) 3 else NULL,
metric = "Accuracy",
model.features = FALSE,
allowParallel = FALSE,
verbose = 'none',
...
)
{
theDots <- list(...)
verify(x = X, y = Y, method = method, na.rm = FALSE)
f <- verify_fs(f = as.integer(f),
stability.metric = stability.metric,
model.features = model.features,
no.fs=FALSE)
if (is.null(colnames(X))){
colnames(X) <- paste0("X", seq_len(ncol(X)))
}
if (is.null(rownames(X))) rownames(X) <- seq_len(nrow(X))
if(model.features == TRUE){
stop("Model-derived features cannot be used for the ensemble ",
"because all features are ranked and then subset. ",
"Set model.features = FALSE")
}
raw.data <- as.data.frame(X)
raw.data$.classes <- Y
method <- tolower(method)
nr<-nrow(X)
nc<-ncol(X)
# number of groups
num.group <- nlevels(Y)
# what the groups are
grp.levs <- levels(Y)
# how many obs in each group
num.obs.group <- as.vector(table(Y))
xNames <- colnames(X)
# Create empty list for features identified by each chosen algorithm
features <- vector("list", k)
names(features) <- paste("Resample", seq(k), sep = ".")
if(optimize == TRUE & optimize.resample == TRUE){
bagged.tunes <- vector("list", k)
names(bagged.tunes) <- paste("Resample", seq(k), sep=".")
}else{
bagged.tunes <- NULL
}
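# Create k stratified resamples: createDataPartition preserves the group
# proportions of Y, so inTrain holds the row indices used for training
# (proportion p) and outTrain the held-out rows used later for
# performance estimation.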
inTrain <- rlply(k, createDataPartition(Y, p = p, list = FALSE))
#inTrain <- rlply(k, sample(nr, round(p*nr)))
outTrain <- lapply(inTrain,
function(inTrain, total) total[-unique(inTrain)],
total = seq(nr))
#i <- 1
### Stability Loop
for (i in 1:k){
trainX <- X[inTrain[[i]],, drop=FALSE]
trainY <- Y[inTrain[[i]], drop=FALSE]
trainData <- as.data.frame(trainX)
trainData$.classes <- trainY
## Bagging loop
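# bagging.wrapper fits each model 'bags' times on resamples of the
# training data and aggregates the per-bag feature selections with the
# chosen aggregation.metric, returning the ensemble-selected features
# (and the best tuning parameters when re-optimizing each resample).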
results.bagging <-
bagging.wrapper(X = trainX,
Y = trainY,
method = method,
bags = bags,
f = f,
aggregation.metric = aggregation.metric,
k.folds = k.folds,
repeats = repeats,
res = resolution,
tuning.grid = tuning.grid,
optimize = optimize,
optimize.resample = optimize.resample,
metric = metric,
model.features = model.features,
verbose = verbose,
allowParallel = allowParallel,
theDots = theDots)
# store the best tune parameters for each iteration
if(!is.null(bagged.tunes)){
bagged.tunes[[i]] <- results.bagging$bestTunes
}
# Store the features selected for stability analysis
features[[i]] <- as.list(results.bagging$results$ensemble.results)
### Re-fitting models to reduced features
# subset only those features which were selected
trainData.new <-
lapply(features[[i]],
FUN = function(x){
trainData[,colnames(trainData)
%in% c(as.vector(rownames(x)), ".classes")]
})
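# With optimization enabled, the reduced models are either re-tuned on
# every resample (optimize.resample = TRUE) or tuned once on the first
# resample, with those parameters reused for subsequent resamples.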
if(optimize == TRUE){
if(optimize.resample == TRUE){
tunedModel.new <- vector("list", length(method))
for(m in seq(along = method)){
tunedModel.new[[m]] <-
optimize.model(
trainVars = trainData.new[[m]][,!colnames(
trainData.new[[m]]) %in% c(".classes")],
trainGroup = trainData.new[[m]]$.classes,
method = method[m],
k.folds = k.folds,
repeats = repeats,
res = resolution,
grid = tuning.grid,
metric = metric,
allowParallel = allowParallel,
verbose = verbose,
theDots = theDots)
}
if(i == 1){
finalModel.new <-
sapply(tunedModel.new,
FUN = function(x) x$finalModels)
new.best.tunes <-
sapply(tunedModel.new,
FUN = function(x) x$bestTune)
names(new.best.tunes) <- method
}else{
tmp.model <-
sapply(tunedModel.new,
FUN = function(x) x$finalModels)
tmp.tunes <-
sapply(tunedModel.new,
FUN = function(x) x$bestTune)
names(tmp.tunes) <- method
finalModel.new <- c(finalModel.new, tmp.model)
new.best.tunes <- append(new.best.tunes, tmp.tunes)
}
# end of full optimize loop
}else{
if(i == 1){
tunedModel.new <- vector("list", length(method))
for(m in seq(along = method)){
tunedModel.new[[m]] <-
optimize.model(
trainVars =
trainData.new[[m]][,!colnames(
trainData.new[[m]]
) %in% c(".classes")],
trainGroup = trainData.new[[m]]$.classes,
method = method[m],
k.folds = k.folds,
repeats = repeats,
res = resolution,
grid = tuning.grid,
metric = metric,
allowParallel = allowParallel,
verbose = verbose,
theDots = theDots)
}
}else{
tmp <- vector("list", length(method))
names(tmp) <- method
for(d in seq(along = method)){
tmp[[d]] <-
training(
data = trainData.new[[d]],
method = method[d],
tuneValue = data.frame(
t(
unlist(
tunedModel.new[[d]]$bestTune
)
)
),
obsLevels = grp.levs,
theDots = theDots)$fit
}
}
if(i == 1){
finalModel.new <-
sapply(tunedModel.new,
FUN = function(x) x$finalModels)
new.best.tunes <-
sapply(tunedModel.new,
FUN = function(x) x$bestTune)
}else{
tmp.model <- lapply(tmp, FUN = function(x) x)
finalModel.new <- c(finalModel.new, tmp.model)
}
} # end of single optimize loop
# end of optimize loop
}else{
names(theDots) <- paste(".", names(theDots), sep="")
# sequester appropriate parameters to fit models
args.seq <- sequester(theDots, method)
# remove arguments used from theDots - also remove '.'
# from each
names(theDots) <- sub(".", "", names(theDots))
moreDots <- theDots[!names(theDots) %in% args.seq$pnames]
if(length(moreDots) == 0){
moreDots <- NULL
}
tunedModel.new <- vector("list", length(method))
for(q in seq(along = method)){
tunedModel.new[[q]] <-
training(data = trainData.new[[q]],
method = method[q],
tuneValue = args.seq$parameters[[q]],
obsLevels = grp.levs,
theDots = moreDots)$fit
}
if(i == 1){
finalModel.new <-
sapply(tunedModel.new, FUN = function(x) x)
}else{
tmp.model <- lapply(tunedModel.new, FUN = function(x) x)
finalModel.new <- c(finalModel.new, tmp.model)
}
} # end of non-optimized loop
} # end stability loop
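# After the k resamples, features[[i]] holds the ensemble-selected
# features of each model on resample i and finalModel.new holds the
# corresponding refit models.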
# print(features)
# print(str(features))
if(stability.metric %in% c("spearman", "canberra")){
final.features <- lapply(unlist(features, recursive=FALSE),
function(x) as.numeric(as.vector(unlist(x[xNames,]))))
}else{
final.features <- lapply(unlist(features, recursive=FALSE),
function(x) as.vector(rownames(x)))
}
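# For rank-based metrics (spearman, canberra) the full feature ranking is
# retained; otherwise only the names of the selected features are kept.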
names(final.features) <-
rep(method, length(finalModel.new)/length(method))
### Performance Metrics of Reduced Models
final.metrics <- prediction.metrics(finalModel = finalModel.new,
method = method,
raw.data = raw.data,
inTrain = inTrain,
outTrain = outTrain,
bestTune = new.best.tunes,
features = final.features,
grp.levs = grp.levs,
stability.metric = stability.metric)
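# final.metrics has one row per model/resample; the columns prefixed
# 'cell' (presumably the raw confusion-matrix cells) are dropped before
# the performance summaries are averaged below.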
### Extract Performance Metrics
if(optimize == TRUE){
if(optimize.resample == TRUE){
colnames(final.metrics) <-
gsub("^\\.", "", colnames(final.metrics))
performance <- vector("list", length(method))
names(performance) <- method
final.metrics <-
final.metrics[,!grepl("^cell",
colnames(final.metrics)),
drop = FALSE]
# sort the list elements so applied in the proper order
method.names <-
unlist(
lapply(method,
FUN = function(x){
c(rep(x,
length(new.best.tunes)/length(method)
)
)
}))
new.best.tunes <-
new.best.tunes[order(names(new.best.tunes),
levels = method.names)]
# dataframe retains colnames after split
rownames(final.metrics) <- letters[seq(nrow(final.metrics))]
test.final.metrics <- split(as.data.frame(final.metrics),
rownames(final.metrics))
for(i in seq(along = test.final.metrics)){
rownames(test.final.metrics[[i]]) <- 1
}
rownames(final.metrics) <- method.names
all.model.perfs <-
mapply(new.best.tunes,
FUN = function(x, y) list(x,y),
y = test.final.metrics,
SIMPLIFY = FALSE)
# name resamples within all.model.perfs
samps <-
rep(
rep(
paste("Resample", seq(k), sep = "."),
each = 2),
length(method)
)
for(i in seq(length(all.model.perfs))){
if(i == 1){
start = i
end = i+1
}
names(all.model.perfs[[i]]) <- samps[start:end]
start = start + 2
end = end + 2
}
for(h in seq(along = method)){
x <- final.metrics[,!grepl("^cell",
colnames(final.metrics)),
drop = FALSE]
tmp <- subset(x, rownames(x) == method[h])
performance[[h]] <- c(colMeans(tmp, na.rm = TRUE),
apply(tmp, 2, sd, na.rm = TRUE))
names(performance[[h]])[-(1:ncol(tmp))] <-
paste(names(performance[[h]])[-(1:ncol(tmp))],
"SD",
sep = " ")
performance[[h]] <- t(data.frame(performance[[h]]))
rownames(performance[[h]]) <- 1
}
}else{
colnames(final.metrics) <-
gsub("^\\.", "", colnames(final.metrics))
performance <- vector("list", length(method))
names(performance) <- method
for(h in seq(along = method)){
x <- final.metrics[,!grepl("^cell",
colnames(final.metrics)),
drop = FALSE]
tmp <- subset(x, rownames(x) == method[h])
performance[[h]] <- c(colMeans(tmp, na.rm = TRUE),
apply(tmp, 2, sd, na.rm = TRUE))
names(performance[[h]])[-(1:ncol(tmp))] <-
paste(names(performance[[h]])[-(1:ncol(tmp))],
"SD",
sep = " ")
performance[[h]] <-
do.call(cbind,
c(as.vector(new.best.tunes[[h]]),
performance[[h]]))
colnames(performance[[h]]) <-
gsub("^\\.", "", colnames(performance[[h]]))
rownames(performance[[h]]) <- 1
}
}
}else{
colnames(final.metrics) <-
gsub("^\\.", "", colnames(final.metrics))
performance <- vector("list", length(method))
names(performance) <- method
for(h in seq(along = method)){
x <- final.metrics[,!grepl("^cell",
colnames(final.metrics)),
drop = FALSE]
tmp <- subset(x, rownames(x) == method[h])
performance[[h]] <- c(colMeans(tmp, na.rm = TRUE),
apply(tmp, 2, sd, na.rm = TRUE))
names(performance[[h]])[-(1:ncol(tmp))] <-
paste(names(performance[[h]])[-(1:ncol(tmp))],
"SD",
sep = " ")
performance[[h]] <-
do.call(cbind,
c(as.vector(args.seq$parameters[[h]]),
performance[[h]]))
colnames(performance[[h]]) <-
gsub("^\\.", "", colnames(performance[[h]]))
rownames(performance[[h]]) <- 1
}
}
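# 'performance' now holds, for each model, the mean and SD of each
# performance statistic across the k resamples (plus the tuning
# parameters when a single optimization run or fixed parameters were used).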
# need to split features into length(method) dataframes
# for pairwise.stability
results.stability <- vector("list", length(method))
names(results.stability) <- method
for(c in seq(along = method)){
if(stability.metric
%in% c("jaccard", "kuncheva", "pof", "ochiai", "sorenson")){
results.stability[[c]] <-
do.call("cbind",
final.features[which(
names(final.features) == method[c]
)]
)
}else{
results.stability[[c]] <-
final.features[which(
names(final.features) == method[c]
)]
results.stability[[c]] <-
sapply(results.stability[[c]],
setNames,
colnames(X))
}
}
# Calculate All Pairwise Stability Measurements for each
# algorithm's set of features
stability <- lapply(results.stability,
pairwise.stability,
stability.metric = stability.metric,
nc= nc)
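# For example, with the Jaccard index each pairwise value is
# |A intersect B| / |A union B| for the feature sets of two resamples,
# and the 'overall' element is the mean across all pairs.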
# stability across algorithms
# (i.e. 'function perturbation' ensemble analysis)
if(length(method) > 1){
stability.models <-
pairwise.model.stability(features = results.stability,
stability.metric = stability.metric,
nc = nc)
}else{
stability.models <- NULL
}
# harmonic mean of stability and performance
rpt.stab <- lapply(stability, FUN = function(x) x$overall)
rpt.perf <- lapply(performance,
FUN = function(x) as.data.frame(x)$Accuracy)
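# RPT (Saeys 2008) is an F-measure-style harmonic mean of stability and
# accuracy; with equal weighting it equals 2 * S * P / (S + P).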
rpt <-
mapply(rpt.stab,
FUN = function(x,y) RPT(stability = x, performance = y),
y = rpt.perf)
# add stability metric to each respective algorithm
for(i in 1:length(method)){
results.stability[[i]] <-
list(metric = stability.metric,
features = results.stability[[i]],
stability = stability[[i]])
}
specs = list(total.samples=nr,
number.features=nc,
number.groups=num.group,
group.levels=grp.levs,
number.observations.group=num.obs.group)
## add remainder of data
overall <-
list(methods = method,
performance = performance,
RPT = rpt,
features = results.stability,
stability.models = stability.models,
all.tunes = bagged.tunes,
final.best.tunes = if(optimize.resample){all.model.perfs
}else{NULL},
specs = specs
)
return(overall)
}