#' Partial Least Squares Methods
#'
#' @description `PomaPLS` performs Partial Least Squares (PLS) regression, Partial Least Squares Discriminant Analysis (PLS-DA) to classify samples, and Sparse Partial Least Squares Discriminant Analysis (sPLS-DA) to classify samples (supervised analysis) and select variables.
#'
#' @param data A `SummarizedExperiment` object.
#' @param method Character. PLS method. Options include "pls", "plsda", and "splsda".
#' @param y Character. Indicates the name of `colData` columns to be used as dependent variable. If it's set to NULL, the first variable in `colData` will be used as the dependent variable.
#' @param ncomp Numeric. Number of components in the model. Default is 5.
#' @param labels Logical. Indicates if sample names should be displayed.
#' @param ellipse Logical. Indicates whether a 95 percent confidence interval ellipse should be displayed. Default is TRUE.
#' @param cross_validation Logical. Indicates if cross-validation should be performed for PLS-DA ("plsda") and sPLS-DA ("splsda") methods. Default is FALSE.
#' @param validation Character. (Only for "plsda" and "splsda" methods). Indicates the cross-validation method. Options are "Mfold" and "loo" (Leave-One-Out).
#' @param folds Numeric. (Only for "plsda" and "splsda" methods). Number of folds for "Mfold" cross-validation method (default is 5). If the validation method is "loo", this value is set to 1.
#' @param nrepeat Numeric. (Only for "plsda" and "splsda" methods). Number of times the cross-validation process is repeated.
#' @param vip Numeric. (Only for "plsda" method). Indicates the variable importance in the projection (VIP) cutoff.
#' @param num_features Numeric. (Only for "splsda" method). Number of features to discriminate groups.
#' @param theme_params List. Indicates `theme_poma` parameters.
#'
#' @export
#'
#' @return A `list` with results including plots and tables.
#' @author Pol Castellano-Escuder
#'
#' @importFrom magrittr %>%
#'
#' @examples
#' data <- POMA::st000284 %>% # Example SummarizedExperiment object included in POMA
#' PomaImpute() %>%
#' PomaNorm()
#'
#' ## Output is a list with objects `factors` (tibble), `factors_plot` (ggplot2 object), `loadings` (tibble), and `loadings_plot` (ggplot2 object)
#' # PLS
#' data %>%
#' PomaPLS(method = "pls",
#' y = NULL,
#' ncomp = 5,
#' labels = FALSE,
#' ellipse = FALSE)
#'
#' ## Output is a list with objects `factors` (tibble), `factors_plot` (ggplot2 object), `vip_values` (tibble), and `vip_plot` (ggplot2 object)
#' # PLS-DA
#' data %>%
#' PomaPLS(method = "plsda",
#' y = NULL,
#' ncomp = 5,
#' labels = FALSE,
#' ellipse = TRUE,
#' cross_validation = FALSE,
#' vip = 1)
#'
#' # Alternative outcome (dependent variable)
#' data %>%
#' PomaPLS(method = "plsda",
#' y = "gender",
#' ncomp = 5,
#' labels = FALSE,
#' ellipse = TRUE,
#' cross_validation = FALSE,
#' vip = 1)
#'
#' ## Output is a list with objects `factors` (tibble), `factors_plot` (ggplot2 object), `vip_values` (tibble), `vip_plot` (ggplot2 object), `errors` (tibble), and `errors_plot` (ggplot2 object)
#' # PLS-DA with Cross-Validation
#' data %>%
#' PomaPLS(method = "plsda",
#' y = NULL,
#' ncomp = 5,
#' labels = FALSE,
#' ellipse = TRUE,
#' cross_validation = TRUE,
#' validation = "Mfold",
#' folds = 5,
#' nrepeat = 10,
#' vip = 1)
#'
#' ## Output is a list with objects `factors` (tibble), `factors_plot` (ggplot2 object), `selected_features` (tibble), and `selected_features_plot` (ggplot2 object)
#' # sPLS-DA
#' data %>%
#' PomaPLS(method = "splsda",
#' y = NULL,
#' ncomp = 5,
#' labels = FALSE,
#' ellipse = TRUE,
#' cross_validation = FALSE,
#' num_features = 10)
#'
#' ## Output is a list with objects `factors` (tibble), `factors_plot` (ggplot2 object), `selected_features` (tibble), `selected_features_plot` (ggplot2 object), `errors` (tibble), `errors_plot` (ggplot2 object), `optimal_components` (numeric value), and `optimal_features` (vector with optimal features per component)
#' # sPLS-DA with Cross-Validation
#' data %>%
#' PomaPLS(method = "splsda",
#' y = NULL,
#' ncomp = 3,
#' labels = FALSE,
#' ellipse = TRUE,
#' cross_validation = TRUE,
#' validation = "Mfold",
#' folds = 5,
#' nrepeat = 10,
#' num_features = 10)
PomaPLS <- function(data,
method = "pls",
y = NULL,
ncomp = 5,
labels = FALSE,
ellipse = TRUE,
cross_validation = FALSE,
validation = "Mfold",
folds = 5,
nrepeat = 10,
vip = 1,
num_features = 10,
theme_params = list()) {
if(!is(data, "SummarizedExperiment")){
stop("data is not a SummarizedExperiment object. \nSee POMA::PomaCreateObject or SummarizedExperiment::SummarizedExperiment")
}
if (!(method %in% c("pls", "plsda", "splsda"))) {
stop("Incorrect value for method argument")
}
if (!(validation %in% c("Mfold", "loo"))) {
stop("Incorrect value for validation argument. Options are: 'Mfold' and 'loo'")
}
to_pls <- t(SummarizedExperiment::assay(data))
if (method == "pls") {
dependent_variable <- SummarizedExperiment::colData(data) %>%
dplyr::as_tibble() %>%
dplyr::select_if(is.numeric)
if (ncol(dependent_variable) == 0) {
stop("No numeric variables to be used as dependent variable in metadata file")
}
if (is.null(y)) {
y <- colnames(dependent_variable)[1]
}
dependent_variable <- dependent_variable %>%
dplyr::select(dplyr::all_of(y)) %>%
as.matrix()
pls_res <- mixOmics::pls(X = to_pls,
Y = dependent_variable,
ncomp = ncomp)
# factors
pls_res_df <- data.frame(y = as.numeric(dependent_variable), pls_res$variates$X) %>%
tibble::rownames_to_column("sample_id") %>%
dplyr::as_tibble()
# factors plot
factors_plot <- ggplot2::ggplot(pls_res_df, ggplot2::aes(x = comp1, y = comp2))+
{if(!labels)ggplot2::geom_point(ggplot2::aes(fill = y), size = 3, alpha = 0.8, pch = 21)} +
ggplot2::labs(x = "Component 1",
y = "Component 2",
fill = "Dependent\nvariable (Y)",
color = "Dependent\nvariable (Y)") +
{if(ellipse)ggplot2::stat_ellipse(type = "norm", show.legend = FALSE)} +
{if(labels)ggplot2::geom_text(ggplot2::aes(color = y, label = sample_id))} +
do.call(theme_poma, theme_params) +
scale_fill_poma_c() +
scale_color_poma_c()
# loadings
pls_loadings_df <- data.frame(pls_res$loadings$X) %>%
tibble::rownames_to_column("feature") %>%
dplyr::as_tibble()
# loadings plot
loadings_plot <- pls_loadings_df %>%
dplyr::arrange(dplyr::desc(abs(comp1))) %>%
dplyr::select(feature, comp1:paste0("comp", ncomp)) %>%
dplyr::slice(1L:10L) %>%
tidyr::pivot_longer(cols = -feature) %>%
ggplot2::ggplot(ggplot2::aes(x = reorder(feature, value),
y = value,
fill = name)) +
ggplot2::geom_col(position = "dodge2") +
ggplot2::labs(x = NULL,
y = "Loadings",
fill = NULL) +
theme_poma(axis_x_rotate = TRUE) +
scale_fill_poma_d()
return(list(factors = pls_res_df,
factors_plot = factors_plot,
loadings = pls_loadings_df,
loadings_plot = loadings_plot)
)
}
else if (method == "plsda") {
dependent_variable <- SummarizedExperiment::colData(data) %>%
dplyr::as_tibble() %>%
dplyr::select_if(is.factor)
if (ncol(dependent_variable) == 0) {
stop("No factor variables to be used as dependent variable in metadata file")
}
if (is.null(y)) {
y <- colnames(dependent_variable)[1]
}
dependent_variable <- dependent_variable %>%
dplyr::select(dplyr::all_of(y[1])) %>%
dplyr::pull(1)
plsda_res <- mixOmics::plsda(X = to_pls,
Y = dependent_variable,
ncomp = ncomp)
# factors
plsda_res_df <- data.frame(y = dependent_variable, plsda_res$variates$X) %>%
tibble::rownames_to_column("sample_id") %>%
dplyr::as_tibble()
# factors plot
factors_plot <- ggplot2::ggplot(plsda_res_df, ggplot2::aes(x = comp1, y = comp2))+
{if(!labels)ggplot2::geom_point(ggplot2::aes(fill = y), size = 3, alpha = 0.8, pch = 21)} +
ggplot2::labs(x = "Component 1",
y = "Component 2",
fill = "Dependent\nvariable (Y)",
color = "Dependent\nvariable (Y)") +
{if(ellipse)ggplot2::stat_ellipse(ggplot2::aes(color = y), type = "norm", show.legend = FALSE)} +
{if(labels)ggplot2::geom_text(ggplot2::aes(color = y, label = sample_id))} +
do.call(theme_poma, theme_params) +
scale_fill_poma_d() +
scale_color_poma_d()
# VIP
plsda_vip_df <- data.frame(mixOmics::vip(plsda_res)) %>%
tibble::rownames_to_column("feature") %>%
dplyr::arrange(dplyr::desc(comp1)) %>%
dplyr::as_tibble()
# VIP plot
plsda_vip_top <- plsda_vip_df %>%
dplyr::filter(comp1 >= vip)
vip_plot <- ggplot2::ggplot(plsda_vip_top, ggplot2::aes(x = comp1, y = reorder(feature, comp1), fill = comp1)) +
ggplot2::geom_col() +
ggplot2::labs(x = "Variable Importance\nin the Projection (VIP)",
y = NULL,
fill = "Component 1\nVIP") +
theme_poma(legend_position = "right") +
scale_fill_poma_c()
# cross-validation
if (cross_validation) {
perf_plsda <- mixOmics::perf(plsda_res, validation = validation, folds = folds,
progressBar = FALSE, auc = TRUE, nrepeat = nrepeat)
ber_errors <- data.frame(perf_plsda$error.rate$BER) %>%
janitor::clean_names() %>%
tibble::rownames_to_column("component") %>%
tidyr::pivot_longer(cols = -component) %>%
dplyr::mutate(error = "BER") %>%
dplyr::as_tibble()
errors_plsda <- data.frame(perf_plsda$error.rate$overall) %>%
janitor::clean_names() %>%
tibble::rownames_to_column("component") %>%
tidyr::pivot_longer(cols = -component) %>%
dplyr::mutate(error = "Overall") %>%
dplyr::bind_rows(ber_errors)
# errors plot
errors_plsda_plot <- ggplot2::ggplot(data = errors_plsda, ggplot2::aes(x = component, y = value)) +
ggplot2::geom_line(ggplot2::aes(color = name, group = name), show.legend = FALSE) +
ggplot2::geom_point(ggplot2::aes(fill = name), size = 3, alpha = 0.8, pch = 21) +
ggplot2::labs(x = "Component",
y = "Error",
fill = "Error\nType") +
ggplot2::facet_wrap(~error) +
do.call(theme_poma, theme_params) +
scale_color_poma_d() +
scale_fill_poma_d()
# errors
errors <- errors_plsda %>%
dplyr::mutate(component = paste0(component, "_", error)) %>%
dplyr::select(-error) %>%
tidyr::pivot_wider(id_cols = component, names_from = name, values_from = value) %>%
dplyr::as_tibble()
return(list(factors = plsda_res_df,
factors_plot = factors_plot,
vip_values = plsda_vip_df,
vip_plot = vip_plot,
errors = errors_plsda,
errors_plot = errors_plsda_plot)
)
} else {
return(list(factors = plsda_res_df,
factors_plot = factors_plot,
vip_values = plsda_vip_df,
vip_plot = vip_plot)
)
}
}
else if (method == "splsda") {
dependent_variable <- SummarizedExperiment::colData(data) %>%
dplyr::as_tibble() %>%
dplyr::select_if(is.factor)
if (ncol(dependent_variable) == 0) {
stop("No factor variables to be used as dependent variable in metadata file")
}
if (is.null(y)) {
y <- colnames(dependent_variable)[1]
}
dependent_variable <- dependent_variable %>%
dplyr::select(dplyr::all_of(y[1])) %>%
dplyr::pull(1)
splsda_res <- mixOmics::splsda(X = to_pls,
Y = dependent_variable,
ncomp = ncomp,
keepX = num_features)
# factors
splsda_res_df <- data.frame(y = dependent_variable, splsda_res$variates$X) %>%
tibble::rownames_to_column("sample_id") %>%
dplyr::as_tibble()
# factors plot
factors_plot <- ggplot2::ggplot(splsda_res_df, ggplot2::aes(x = comp1, y = comp2))+
{if(!labels)ggplot2::geom_point(ggplot2::aes(fill = y), size = 3, alpha = 0.8, pch = 21)} +
ggplot2::labs(x = "Component 1",
y = "Component 2",
fill = "Dependent\nvariable (Y)",
color = "Dependent\nvariable (Y)") +
{if(ellipse)ggplot2::stat_ellipse(ggplot2::aes(color = y), type = "norm", show.legend = FALSE)} +
{if(labels)ggplot2::geom_text(ggplot2::aes(color = y, label = sample_id))} +
do.call(theme_poma, theme_params) +
scale_fill_poma_d() +
scale_color_poma_d()
# selected features
selected_features <- mixOmics::selectVar(splsda_res, comp = 1)$value %>%
dplyr::rename(value = 1) %>%
tibble::rownames_to_column("feature") %>%
dplyr::as_tibble()
# selected features plot
selected_features_plot <- ggplot2::ggplot(selected_features, ggplot2::aes(x = value, y = reorder(feature, value), fill = value)) +
ggplot2::geom_col() +
ggplot2::labs(x = "Value",
y = NULL,
fill = "Component 1\nValue") +
theme_poma(legend_position = "right") +
scale_fill_poma_c()
# cross-validation
if (cross_validation) {
tune_splsda <- mixOmics::tune.splsda(X = to_pls,
Y = dependent_variable,
ncomp = ncomp, validation = validation, folds = folds,
progressBar = FALSE, dist = 'max.dist', measure = "BER",
test.keepX = c(1:num_features), nrepeat = nrepeat)
opt_ncomp <- tune_splsda$choice.ncomp$ncomp # optimal number of components based on t-tests
select_keepX <- tune_splsda$choice.keepX[1:ncomp] # optimal number of variables to select
errors_splsda_sd <- data.frame(tune_splsda$error.rate.sd) %>%
tibble::rownames_to_column("feature_sd") %>%
tidyr::pivot_longer(cols = -feature_sd) %>%
dplyr::select(sd = value)
errors_splsda <- data.frame(tune_splsda$error.rate) %>%
tibble::rownames_to_column("feature") %>%
tidyr::pivot_longer(cols = -feature) %>%
dplyr::bind_cols(errors_splsda_sd) %>%
dplyr::as_tibble()
# errors plot
errors_splsda_plot <- ggplot2::ggplot(errors_splsda, ggplot2::aes(x = as.factor(as.numeric(feature)), y = value)) +
ggplot2::geom_line(ggplot2::aes(color = name, group = name), show.legend = FALSE) +
ggplot2::geom_point(ggplot2::aes(fill = name), size = 3, alpha = 0.8, pch = 21) +
ggplot2::geom_errorbar(ggplot2::aes(ymin = value - sd, ymax = value + sd, color = name), width = 0.1, show.legend = FALSE) +
ggplot2::labs(x = "# Features",
y = "Error",
fill = "Component") +
do.call(theme_poma, theme_params) +
scale_color_poma_d() +
scale_fill_poma_d()
# errors
errors <- errors_splsda %>%
dplyr::select(feature, value, name) %>%
tidyr::pivot_wider(id_cols = feature, names_from = name, values_from = value) %>%
dplyr::as_tibble()
return(list(factors = splsda_res_df,
factors_plot = factors_plot,
selected_features = selected_features,
selected_features_plot = selected_features_plot,
errors = errors_splsda,
errors_plot = errors_splsda_plot,
optimal_components = opt_ncomp,
optimal_features = select_keepX)
)
} else {
return(list(factors = splsda_res_df,
factors_plot = factors_plot,
selected_features = selected_features,
selected_features_plot = selected_features_plot)
)
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.