#' Lasso, Ridge, and Elasticnet Regularized Generalized Linear Models for Binary Outcomes
#'
#' @description `PomaLasso` fits LASSO, Ridge, and Elasticnet regularized logistic regression models for feature selection and prediction with binary outcomes.
#'
#' @param data A `SummarizedExperiment` object.
#' @param alpha Numeric. Indicates the elasticnet mixing parameter: `alpha = 1` is the LASSO penalty and `alpha = 0` is the Ridge penalty. Default is 1.
#' @param ntest Numeric. Indicates the percentage of observations (between 5 and 50) that will be used as the test set. Default is NULL (no test set).
#' @param nfolds Numeric. Indicates the number of folds for cross-validation (default is 10). Although nfolds can be as large as the sample size (leave-one-out CV), this is not recommended for large datasets. The smallest value allowed is nfolds = 3.
#' @param lambda Numeric. Indicates a user-supplied lambda sequence. Typical usage is to have the program compute its own lambda sequence based on `nlambda` and `lambda.min.ratio`. Default is NULL. See `?glmnet::glmnet()`.
#' @param labels Logical. Indicates if feature names should be displayed in the coefficient plot. Default is FALSE.
#'
#' @export
#'
#' @return A `list` with objects `coefficients` (tibble of non-zero coefficients at `lambda.min`), `coefficients_plot` (ggplot2 object), `cv_plot` (ggplot2 object), and `model` (cv.glmnet object). If `ntest` is not NULL, the list also includes `confusion_matrix` and the train/test partitions (`train_x`, `train_y`, `test_x`, `test_y`).
#' @references Jerome Friedman, Trevor Hastie, Robert Tibshirani (2010). Regularization Paths for Generalized Linear Models via Coordinate Descent. Journal of Statistical Software, 33(1), 1-22. URL http://www.jstatsoft.org/v33/i01/.
#' @author Pol Castellano-Escuder
#'
#' @importFrom magrittr %>%
#'
#' @examples
#' data <- POMA::st000336 %>% # Example SummarizedExperiment object included in POMA
#'   PomaImpute() %>%
#'   PomaNorm()
#'
#' ## Output is a list with objects `coefficients` (tibble), `coefficients_plot` (ggplot2 object), `cv_plot` (ggplot2 object), and `model` (cv.glmnet object)
#' # LASSO
#' data %>%
#'   PomaLasso(alpha = 1,
#'             ntest = NULL,
#'             nfolds = 10,
#'             lambda = NULL,
#'             labels = TRUE)
#'
#' # Elasticnet
#' data %>%
#'   PomaLasso(alpha = 0.5,
#'             ntest = NULL,
#'             nfolds = 10,
#'             lambda = NULL,
#'             labels = TRUE)
#'
#' # Ridge Regression
#' data %>%
#'   PomaLasso(alpha = 0,
#'             ntest = NULL,
#'             nfolds = 10,
#'             lambda = NULL,
#'             labels = FALSE)
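#'
#' # With a held-out test set (illustrative: `ntest = 20` reserves 20% of samples;
#' # the output then also contains `confusion_matrix` and the train/test
#' # partitions `train_x`, `train_y`, `test_x`, and `test_y`)
#' data %>%
#'   PomaLasso(alpha = 1,
#'             ntest = 20,
#'             nfolds = 10,
#'             lambda = NULL,
#'             labels = FALSE)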
PomaLasso <- function(data,
                      alpha = 1,
                      ntest = NULL,
                      nfolds = 10,
                      lambda = NULL,
                      labels = FALSE) {
  # Input validation
  if (!is(data, "SummarizedExperiment")) {
    stop("data is not a SummarizedExperiment object. \nSee POMA::PomaCreateObject or SummarizedExperiment::SummarizedExperiment")
  }
  if (alpha > 1 | alpha < 0) {
    stop("alpha must be a number between 0 and 1")
  }
  if (!is.null(ntest)) {
    if (ntest > 50 | ntest < 5) {
      stop("ntest must be a number between 5 and 50 (%)")
    }
  }
  # Extract feature matrix (samples as rows) and the binary grouping factor
  features <- t(SummarizedExperiment::assay(data))
  group_factor <- as.factor(SummarizedExperiment::colData(data)[, 1])
  to_lasso <- cbind(group_factor, features)
  if (length(table(group_factor)[table(group_factor) != 0]) != 2) {
    stop("Grouping factor must have exactly 2 levels (first column of the metadata file)")
  }
  if (!is.null(ntest)) {
    # Random train/test split; resample until both partitions contain both classes
    repeat {
      idx_test <- sample(1:nrow(to_lasso), (ntest/100) * nrow(to_lasso), replace = FALSE)
      test <- to_lasso[idx_test, ]
      test_x <- test[, -1]
      test_y <- test[, 1]
      train <- to_lasso[-idx_test, ]
      train_x <- train[, -1]
      train_y <- train[, 1]
      if (length(levels(as.factor(train_y))) == 2 & length(levels(as.factor(test_y))) == 2) {
        break
      }
    }
    # Cross-validated regularized logistic regression on the training set
    cv_fit <- glmnet::cv.glmnet(data.matrix(train_x),
                                as.matrix(train_y),
                                family = "binomial",
                                nfolds = nfolds,
                                lambda = lambda,
                                alpha = alpha)
  } else {
    # No test set: fit on all observations
    cv_fit <- glmnet::cv.glmnet(features,
                                group_factor,
                                family = "binomial",
                                nfolds = nfolds,
                                lambda = lambda,
                                alpha = alpha)
  }
  # Cross-validation curve: error estimate against log10(lambda)
  tidied_cv <- broom::tidy(cv_fit)
  glance_cv <- broom::glance(cv_fit)
  cvlasso <- ggplot2::ggplot(tidied_cv, ggplot2::aes(lambda, estimate)) +
    ggplot2::geom_line(color = "blue") +
    ggplot2::geom_ribbon(ggplot2::aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +
    ggplot2::scale_x_log10() +
    ggplot2::labs(x = "log10(Lambda)",
                  y = "Estimate") +
    ggplot2::geom_vline(xintercept = glance_cv$lambda.min, lty = 2) +
    theme_poma()
  # Non-zero coefficients at the lambda that minimizes CV error
  tmp_coeffs <- glmnet::coef.glmnet(cv_fit, s = "lambda.min")
  final_coef <- data.frame(feature = tmp_coeffs@Dimnames[[1]][tmp_coeffs@i + 1],
                           coefficient = tmp_coeffs@x) %>%
    dplyr::as_tibble()
  if (!is.null(ntest)) {
    # Class predictions and confusion matrix on the held-out test set
    lasso_pred <- predict(cv_fit, s = cv_fit$lambda.min, newx = data.matrix(test_x), type = "class")
    cm <- caret::confusionMatrix(as.factor(lasso_pred), as.factor(test_y))
  }
  # Coefficient paths along the lambda sequence; each path is optionally
  # labeled at the point of its largest absolute coefficient
  tidied_cv2 <- broom::tidy(cv_fit$glmnet.fit)
  tidied_cv2_names <- tidied_cv2 %>%
    dplyr::arrange(dplyr::desc(abs(estimate))) %>%
    dplyr::group_by(term) %>%
    dplyr::slice(1)
  coefficientplot <- ggplot2::ggplot(tidied_cv2, ggplot2::aes(lambda, estimate, color = term)) +
    ggplot2::scale_x_log10() +
    ggplot2::labs(x = "log10(Lambda)",
                  y = "Coefficients") +
    ggplot2::geom_line() +
    ggplot2::geom_vline(xintercept = glance_cv$lambda.min, lty = 2) +
    ggplot2::theme_bw() +
    {if (labels) ggplot2::geom_label(data = tidied_cv2_names, ggplot2::aes(label = term))} +
    theme_poma(legend_position = "none") +
    scale_color_poma_d()
  if (!is.null(ntest)) {
    return(list(coefficients = final_coef,
                coefficients_plot = coefficientplot,
                cv_plot = cvlasso,
                confusion_matrix = cm,
                train_x = train_x,
                train_y = train_y,
                test_x = test_x,
                test_y = test_y,
                model = cv_fit))
  } else {
    return(list(coefficients = final_coef,
                coefficients_plot = coefficientplot,
                cv_plot = cvlasso,
                model = cv_fit))
  }
}