Nothing
#' Canonical Correlation Value for SmCCA
#'
#' Calculate canonical correlation value for SmCCA given canonical weight vectors and scaling factor
#'
#'@param X A list of data each with same number of subjects.
#'@param CCcoef A vector of scaling factors indicating weights for each pairwise canonical correlation.
#'@param CCWeight A list of canonical weight vectors corresponds to each data in \eqn{X}.
#'@param Y A phenotype matrix, should have only one column.
#'@return A numeric value of the total canonical correlation
#'@examples
#'library(SmCCNet)
#'data("ExampleData")
#'getCanCorMulti(list(X1,X2), CCcoef = c(1,1,1),
#'CCWeight = list(rnorm(500,0,1), rnorm(100,0,1)), Y = Y)
#'@export
getCanCorMulti <- function(X, CCcoef, CCWeight, Y)
{
# sort out all combinations
paircomb <- utils::combn(length(X), 2)
# projection
omics_projection <- purrr::map(1:length(X), function(xx){
X[[xx]] %*% CCWeight[[xx]]
})
omics_projection <- do.call(cbind, omics_projection)
# calculate correlation between omics projection
omics_cor <- stats::cor(omics_projection)
# only extract the upper triangle of correlation matrix
omics_cor <- omics_cor[upper.tri(omics_cor)]
# calculate canonical correlation (pairwise between-omics)
CCBetween <- sum(CCcoef[1:length(X)] * omics_cor)
# calculate canonical correlation (pairwise omics-phenotype)
pheno_cor <- sum(stats::cor(omics_projection, Y) * CCcoef[(length(X) + 1):length(CCcoef)])
# adding two components together
OverallCC <- CCBetween + pheno_cor
return(OverallCC)
}
#' @title Scaling Factor Input Prompt
#'
#' @description
#' Input the vector of the annotation of each type of dataset in the data list X (e.g., \code{c('gene', 'protein')}), and return prompt ask the user to supply the scaling
#' factor for SmCCNet algorithm to prioritize the correlation structures of
#' interest. All scaling factor values supplied should be numeric and nonnegative.
#' @param DataType A character vector that contains the annotation of each type of omics dataset in X.
#' @return A numeric vector of scaling factors.
#' @examples
#' # not run
#' # scalingFactorInput(c('gene','mirna', 'phenotype'))
#'
#' @export
#'
scalingFactorInput <- function(DataType = NULL)
{
# find all possible combinations
combs <- utils::combn(length(DataType), 2)
scalingfactors <- rep(0, ncol(combs))
# ask user to supply the scaling factor for each combination:
for (i in 1:ncol(combs))
{
# Initialize condition as not satisfied
satisfied <- FALSE
suppressWarnings(while (!satisfied) {
# Ask for user input
input <- readline(prompt = paste0("Please enter scaling factor value for correlation between ",
DataType[combs[1,i]], ' and ',DataType[combs[2,i]], ":"))
# Check if the input is numeric
if (is.numeric(as.numeric(input)) && !is.na(as.numeric(input))) {
# Check if the input is nonnegative
if (as.numeric(input) >= 0) {
satisfied <- TRUE # Set condition to satisfied
} else {
cat("Invalid input. Please enter a nonnegative number.\n")
}
} else {
cat("Invalid input. Please enter a number.\n")
}
})
scalingfactors[i] <- as.numeric(input)
}
for (j in 1:ncol(combs))
{
cat( paste0("The scaling factor value for correlation between ",
DataType[combs[1,j]], ' and ',DataType[combs[2,j]], " is: ",
scalingfactors[j], '\n'))
}
return(scalingfactors)
}
#'@title Evaluation of Binary Classifier with Different Evaluation Metrics
#'@description Evaluate binary classifier's performance with respect to user-selected
#'metric (accuracy, auc score, precision, recall, f1 score) for binary phenotype.
#'@param obs Observed phenotype, vector consists of 0, 1.
#'@param pred Predicted probability of the phenotype, vector consists of any value between 0 and 1
#'@param EvalMethod Binary classifier evaluation method, should be one of the following:
#''accuracy' (default), 'auc', 'precision', 'recall', and 'f1'.
#'@param BinarizeThreshold Cutoff threshold to binarize the predicted probability, default is set
#'to 0.5.
#'@param print_score Whether to print out the evaluation score, default is set to \code{TRUE}.
#'@return An evaluation score corresponding to the selected metric.
#'@examples
#'# simulate observed binary phenotype
#'obs <- rbinom(100,1,0.5)
#'# simulate predicted probability
#'pred <- runif(100, 0,1)
#'# calculate the score
#'pred_score <- classifierEval(obs, pred, EvalMethod = 'f1', print_score = FALSE)
#'
#'@export
#'
classifierEval <- function(obs, pred, EvalMethod = 'accuracy',
BinarizeThreshold = 0.5, print_score = TRUE)
{
# specifying which method to use as evaluation metric
if (EvalMethod == 'accuracy')
{
# binarize prediction
pred_binary <- ifelse(pred > BinarizeThreshold, 1, 0)
# calculate prediction accuracy:
eval_score <- mean(as.factor(pred_binary) == as.factor(obs))
}else if (EvalMethod == 'auc')
{
# calculate auc score
suppressMessages(eval_score <- pROC::auc(as.factor(obs), pred))
} else if (EvalMethod == 'precision')
{
# binarize prediction
pred_binary <- ifelse(pred > BinarizeThreshold, 1, 0)
# calculate precision
TP <- sum(obs == 1 & pred_binary == 1)
FP <- sum(obs == 0 & pred_binary == 1)
eval_score <- TP / (TP + FP)
} else if (EvalMethod == 'recall')
{
# binarize prediction
pred_binary <- ifelse(pred > BinarizeThreshold, 1, 0)
# calculate precision
TP <- sum(obs == 1 & pred_binary == 1)
FN <- sum(obs == 1 & pred_binary == 0)
eval_score <- TP / (TP + FN)
} else if (EvalMethod == 'f1')
{
# binarize prediction
pred_binary <- ifelse(pred > BinarizeThreshold, 1, 0)
# calculate f1 score
TP <- sum(obs == 1 & pred_binary == 1)
TN <- sum(obs == 0 & pred_binary == 0)
FP <- sum(obs == 0 & pred_binary == 1)
FN <- sum(obs == 1 & pred_binary == 0)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
eval_score <- 2 * (precision * recall) / (precision + recall)
} else{
stop('correct evaluation metric should be suuplied, selections are among
accuracy, auc, precision, recall, f1 score.')
}
# print out the evaluation score
if (print_score == TRUE)
cat(paste0('The ', EvalMethod, ' score is: ', eval_score, '\n'))
# return the score
return(eval_score)
}
#' @title Automated SmCCNet to Streamline the SmCCNet Pipeline
#'
#' @description Automated SmCCNet automatically identifies the project problem (single-omics vs multi-omics),
#' and type of analysis (CCA for quantitative phenotype vs. PLS for binary phenotype)
#' based on the input data that is provided. This method automatically preprocesses data,
#' chooses scaling factors, subsampling percentage, and optimal penalty terms,
#' then runs through the complete SmCCNet pipeline without the requirement for users to provide additional information.
#' This function will store all the subnetwork information to a user-defined directory, as well as return all the global network and evaluation information.
#' Refer to the automated SmCCNet vignette for more information.
#'
#'
#' @param X A list of matrices with same set and order of subjects (\eqn{n}).
#' @param Y Phenotype variable of either numeric or binary, for binary variable, for binary \eqn{Y}, it should be binarized to 0,1 before running this function.
#' @param AdjustedCovar A data frame of covariates of interest to be adjusted for through regressing-out approach, argument preprocess need to be set to TRUE if adjusting covariates are supplied.
#' @param Kfold Number of folds for cross-validation, default is set to 5.
#' @param EvalMethod The evaluation methods used to selected the optimal penalty parameter(s) when binary phenotype is given. The selections is among 'accuracy', 'auc', 'precision', 'recall', and 'f1', default is set to 'accuracy'.
#' @param subSampNum Number of subsampling to run, the higher the better in terms of accuracy, but at a cost of computational time, we generally recommend 500-1000 to increase robustness for larger data, default is set to 100.
#' @param BetweenShrinkage A real number > 0 that helps shrink the importance of omics-omics correlation component, the larger this number
#' is, the greater the shrinkage it is, default is set to 2.
#' @param ScalingPen A numeric vector of length 2 used as the penalty terms for scaling factor determination method: default set to 0.1 for both datasets, and
#' should be between 0 and 1.
#' @param DataType A vector indicating annotation of each dataset of \eqn{X}, example would be \code{c('gene', 'miRNA')}.
#' @param CutHeight A numeric value specifying the cut height for hierarchical clustering, should be between 0 and 1, default is set to 1 - 0.1^10.
#' @param min_size Minimally possible subnetwork size after network pruning, default set to 10.
#' @param max_size Maximally possible subnetwork size after network pruning, default set to 100.
#' @param summarization Summarization method used for network pruning and summarization, should be either 'NetSHy' or 'PCA'.
#' @param saving_dir Directory where user would like to store the subnetwork results, default is set to the current working directory.
#' @param preprocess Whether the data preprocessing step should be conducted, default is set to FALSE. If regressing out covariates is needed, provide corresponding covariates to AdjustCovar argument.
#' @param ncomp_pls Number of components for PLS algorithm, only used when binary phenotype is given, default is set to 3.
#' @param tuneLength The total number of candidate penalty term values for each omics data, default is set to 5.
#' @param tuneRangeCCA A vector of length 2 that represents the range of candidate penalty term values for each omics data based on canonical correlation analysis,
#' default is set to \code{c(0.1,0.5)}.
#' @param tuneRangePLS A vector of length 2 that represents the range of candidate penalty term values for each omics data based on partial least squared discriminant analysis,
#' default is set to \code{c(0.5,0.9)}.
#' @param seed Random seed for result reproducibility, default is set to 123.
#' @return This function returns the global adjacency matrix, omics data details, network clustering outcomes, and cross-validation results. Pruned subnetwork modules are saved in the directory specified by the user.
#' @examples
#'
#'
#' # library(SmCCNet)
#' # set.seed(123)
#' # data("ExampleData")
#' # Y_binary <- ifelse(Y > quantile(Y, 0.5), 1, 0)
#' ## single-omics PLS
#' # result <- fastAutoSmCCNet(X = list(X1), Y = as.factor(Y_binary), Kfold = 3,
#' # subSampNum = 100, DataType = c('Gene'),
#' # saving_dir = getwd(), EvalMethod = 'auc',
#' # summarization = 'NetSHy',
#' # CutHeight = 1 - 0.1^10, ncomp_pls = 5)
#' ## single-omics CCA
#' # result <- fastAutoSmCCNet(X = list(X1), Y = Y, Kfold = 3, preprocess = FALSE,
#' # subSampNum = 50, DataType = c('Gene'),
#' # saving_dir = getwd(), summarization = 'NetSHy',
#' # CutHeight = 1 - 0.1^10)
#' ## multi-omics PLS
#' # result <- fastAutoSmCCNet(X = list(X1,X2), Y = as.factor(Y_binary),
#' # Kfold = 3, subSampNum = 50,
#' # DataType = c('Gene', 'miRNA'),
#' # CutHeight = 1 - 0.1^10,
#' # saving_dir = getwd(), EvalMethod = 'auc',
#' # summarization = 'NetSHy',
#' # BetweenShrinkage = 5, ncomp_pls = 3)
#' ## multi-omics CCA
#' # result <- fastAutoSmCCNet(X = list(X1,X2), Y = Y,
#' # K = 3, subSampNum = 50, DataType = c('Gene', 'miRNA'),
#' # CutHeight = 1 - 0.1^10,
#' # saving_dir = getwd(),
#' # summarization = 'NetSHy',
#' # BetweenShrinkage = 5)
#'
#' @export
fastAutoSmCCNet <- function(X, Y, AdjustedCovar = NULL, preprocess = FALSE, Kfold = 5,
EvalMethod = 'accuracy', subSampNum = 100, DataType,
BetweenShrinkage = 2, ScalingPen = c(0.1,0.1),
CutHeight = 1 - 0.1^10,
min_size = 10,
max_size = 100, summarization = 'NetSHy', saving_dir = getwd(),
ncomp_pls = 3,
tuneLength = 5,
tuneRangeCCA = c(0.1,0.5),
tuneRangePLS = c(0.5,0.9),
seed = 123)
{
# set random seed
seed <- set.seed(seed)
cat("\n")
cat("**********************************\n")
cat("* Welcome to Automated SmCCNet! *\n")
cat("**********************************\n")
cat("\n")
# Preprocess?
if (preprocess == TRUE)
{
cat("\n")
cat("--------------------------------------------------\n")
cat(">> Starting data preprocessing...\n")
cat("--------------------------------------------------\n")
cat("\n")
if (!is.null(AdjustedCovar))
{
cat('Covariate(s) are provided, now regressing out covariate effects from omics data.', '\n')
AdjustedCovar <- as.data.frame(AdjustedCovar)
}
# preprocess data
X <- purrr::map(1:length(X), function(xx){
dataPreprocess(X = as.data.frame(X[[xx]]), covariates = AdjustedCovar, is_cv = FALSE,
center = TRUE, scale = TRUE)
})
X <- lapply(X, as.matrix)
}
# Define problem: Single-Omics? Multi-Omics? Categorical (Binary? MultiLevel?)? Continuous?
if (length(X) > 1)
{
AnalysisType <- 'multiomics'
}else{
if (length(X) == 1)
{
AnalysisType <- 'singleomics'
}else{
stop('User must provide a list of dataframe.')
}
}
# check the outcome
if (ncol(as.matrix(Y)) > 1)
{
method <- 'CCA'
}else{
if (is.factor(Y))
{
method <- 'PLS'
}
if (is.numeric(Y))
{
method <- 'CCA'
}
}
cat(paste0('This project uses ', AnalysisType, ' ', method), '\n')
# Automatically set CC coefficients: check pairwise canonical correlation
if (AnalysisType == 'multiomics')
{
cat("\n")
cat("--------------------------------------------------\n")
cat(">> Now determining the scaling factor for multi-omics analysis...\n")
cat("--------------------------------------------------\n")
cat("\n")
AllComb <- utils::combn(length(X), 2)
DataTypePheno <- c(DataType, 'Phenotype')
ScalingFactor <- rep(0, ncol(AllComb))
names(ScalingFactor) <- apply(AllComb,2, function(x){
paste0(DataTypePheno[x[1]],'-',DataTypePheno[x[2]])
})
for (i in 1:ncol(AllComb))
{
# define the pair of matrices
X_pair <- X[AllComb[,i]]
# extract canonical weight
CC_weight <- getCanWeightsMulti(X_pair, Trait = NULL, Lambda = ScalingPen, NoTrait = TRUE)
# define scaling factor
ScalingFactor[i] <- abs(stats::cor(X_pair[[1]]%*% CC_weight[[1]], X_pair[[2]]%*% CC_weight[[2]]))
}
# shrink between-omics scaling factor based on shrinkage parameter
ScalingFactor <- ScalingFactor/BetweenShrinkage
if (method == 'PLS')
{
# set between-discriminated ratio
BDRatio <- c(mean(ScalingFactor), 1)
cat('The between-omics and omics-phenotype importance ratio is: ', BDRatio[1], ':', BDRatio[2], '\n')
}
# include scaling factors for multi-omics-phenotype relationship
ScalingFactorCC <- c(ScalingFactor, rep(1, length(X)))
CombPheno <- utils::combn(length(X) + 1,2)
NonPhenoIndex <- apply(CombPheno, 2, function(x){x[2] != (length(X) + 1)})
ScalingFactorTemp <- rep(1, ncol(CombPheno))
DataTypePheno <- c(DataType, 'Phenotype')
names(ScalingFactorTemp) <- apply(CombPheno,2, function(x){
paste0(DataTypePheno[x[1]],'-',DataTypePheno[x[2]])
})
ScalingFactorTemp[NonPhenoIndex] <- ScalingFactor
ScalingFactor <- ScalingFactorTemp
# if multi-omics PLS is used
if (method == 'PLS')
{
ScalingFactor <- ScalingFactor[NonPhenoIndex]
}
cat("\n")
cat('The scaling factor selection is: ', '\n')
cat("\n")
for (i in 1:length(ScalingFactor))
{
cat(paste0(names(ScalingFactor)[i],": ", ScalingFactor[i]), '\n')
}
} else{
cat('single omics analysis, skip scaling factor.', '\n')
}
cat("\n")
cat("--------------------------------------------------\n")
cat(">> Determining the best penalty selection through cross-validation...\n")
cat("--------------------------------------------------\n")
cat("\n")
# Automatically select a penalty candidates
if (method == 'CCA')
{
# define penalty columns
pen <- matrix(0, nrow = tuneLength, ncol = length(X))
# fill in penalty candidate
for (i in 1:ncol(pen))
{
pen[,i] <- seq(from = tuneRangeCCA[1],
to = tuneRangeCCA[2],
length.out = tuneLength)
}
# convert matrix to a list of columns
list_cols <- as.list(as.data.frame(pen))
# generate all possible combinations
PenComb <- do.call(expand.grid, list_cols)
}
if (method == 'PLS')
{
# check if single-omics or multiomics
if (AnalysisType == 'singleomics')
{
PenComb <- seq(from = tuneRangePLS[1],
to = tuneRangePLS[2],
length.out = tuneLength)
}
if (AnalysisType == 'multiomics')
{
# define penalty columns
pen <- matrix(0, nrow = tuneLength, ncol = length(X) + 1)
# fill in penalty candidate
for (i in 1:ncol(pen))
{
pen[,i] <- seq(from = tuneRangePLS[1],
to = tuneRangePLS[2],
length.out = tuneLength)
}
# convert matrix to a list of columns
list_cols <- as.list(as.data.frame(pen))
# generate all possible combinations
PenComb <- do.call(expand.grid, list_cols)
}
}
# Automatically select percent of subsampling
SubsamplingPercent <- rep(0, length(X))
for (i in 1:length(X))
{
if(ncol(X[[i]]) < 300)
{
SubsamplingPercent[i] <- 0.9
}else{
SubsamplingPercent[i] <- 0.7
}
}
# split data into folds
X <- lapply(X, scale)
# check to see whether Y need to be scaled or not
if (method == 'CCA')
{
Y <- scale(Y)
}
foldIdx <- split(1:nrow(X[[1]]), sample(1:nrow(X[[1]]), Kfold))
folddata <- purrr::map(1:length(foldIdx), function(x){
Y <- as.matrix(Y)
X_train <- list()
X_test <- list()
Y_train <- list()
Y_test <- list()
for (i in 1:length(X))
{
X_train[[i]] <- X[[i]][-foldIdx[[x]],]
X_test[[i]] <- X[[i]][foldIdx[[x]],]
}
Y_train <- Y[-foldIdx[[x]],]
Y_test <- Y[foldIdx[[x]],]
return(list(X_train = X_train, X_test = X_test,Y_train = Y_train, Y_test = Y_test))
})
names(folddata) <- paste0('fold_', 1:Kfold)
# Run SmCCNet (for single-omics)
if (AnalysisType == 'singleomics')
{
# case 1: continuous outcome
if (method == 'CCA')
{
# define parallel computing multisession
future::plan(future::multisession, workers = Kfold)
CVResult <- furrr::future_map(1:Kfold, function(xx) {
# select the kth fold
omicsdata <- folddata[[xx]]
PenComb <- PenComb[,1]
RhoTrain <- rep(0, length(PenComb))
RhoTest <- rep(0, length(PenComb))
DeltaCor <- rep(0, length(PenComb))
for(idx in 1:length(PenComb)){
# choose the penalty term
l1 <- PenComb[idx]
# run single omics SmCCA with continuous outcome
Ws <- getRobustWeightsSingle(omicsdata[[1]][[1]], as.matrix(omicsdata[[3]]), l1, 1,
SubsamplingNum = 1)
rho.train <- stats::cor(omicsdata[[1]][[1]] %*% Ws, as.matrix(omicsdata[[3]]))
rho.test <- stats::cor(omicsdata[[2]][[1]] %*% Ws, as.matrix(omicsdata[[4]]))
# store cross-validation result
RhoTrain[idx] <- round(rho.train, digits = 5)
RhoTest[idx] <- round(rho.test, digits = 5)
DeltaCor[idx] <- abs(rho.train - rho.test)
}
# combine cross-validation result
CVResult <- as.data.frame(cbind(RhoTrain, RhoTest, DeltaCor))
return(CVResult)
},.progress = TRUE,.options = furrr::furrr_options(seed = TRUE))
# aggregate CV result and select the best penalty term
AggregatedCVResult <- Reduce("+", CVResult) / length(CVResult)
EvalMetric <- apply(AggregatedCVResult, 1, function(x) {x[3]/abs(x[2])})
BestPen <- PenComb[which.min(EvalMetric),1]
cat(paste0('\n','The best penalty term after ', Kfold,'-fold cross-validation is: ', BestPen), '\n')
cat(paste0('with testing canonical correlation = ', round(AggregatedCVResult$RhoTest[which.min(EvalMetric)],3),
', and prediction error = ', round(AggregatedCVResult$DeltaCor[which.min(EvalMetric)],3)), '\n')
cat('Now running single-omics CCA with best penalty term on the complete dataset.', '\n')
# run single-omics CCA
Ws <- getRobustWeightsSingle(X1 = X[[1]], Trait = as.matrix(as.numeric(Y)), Lambda1 = BestPen,
s1 = SubsamplingPercent, SubsamplingNum = subSampNum)
# construct global network module
Abar <- getAbar(Ws, FeatureLabel = colnames(X[[1]]))
}
# case when there is categorical outcome
if (method == 'PLS')
{
# check if it is non-binary
if (length(summary(as.factor(Y))) > 2)
stop('Currently not support non-binary phenotype.')
# define parallel computing multisession
future::plan(future::multisession, workers = Kfold)
CVResult <- furrr::future_map(1:Kfold, function(xx) {
# select the kth fold
omicsdata <- folddata[[xx]]
TrainMetric <- rep(0, length(PenComb))
TestMetric <- rep(0, length(PenComb))
for(idx in 1:length(PenComb)){
# choose the penalty term
l1 <- PenComb[idx]
# run single omics SmCCA with continuous outcome
Ws <- spls::splsda(omicsdata[[1]][[1]], omicsdata[[3]], K = ncomp_pls, eta = l1, kappa=0.5,
classifier= 'logistic', scale.x=FALSE)
weight <- matrix(0,nrow = ncol(omicsdata[[1]][[1]]), ncol = ncomp_pls)
weight[Ws[["A"]],] <- Ws[["W"]]
# create training and testing data, and fit logistic regression model
train_data <- data.frame(x = (omicsdata[[1]][[1]] %*% weight)[,1:ncomp_pls], y = as.factor(omicsdata[[3]]))
test_data <- data.frame(x = (omicsdata[[2]][[1]] %*% weight)[,1:ncomp_pls])
logisticFit <- stats::glm(y~., family = 'binomial',data = train_data)
# make prediction for train/test set
train_pred <- stats::predict(logisticFit, train_data, type = 'response')
test_pred <- stats::predict(logisticFit, test_data, type = 'response')
# specifying which method to use as evaluation metric
if (EvalMethod == 'accuracy')
{
# binarize prediction
train_pred <- ifelse(train_pred > 0.5, 1, 0)
test_pred <- ifelse(test_pred > 0.5, 1, 0)
# calculate prediction accuracy:
acc.train <- mean(as.factor(train_pred) == as.factor(omicsdata[[3]]))
acc.test <- mean(as.factor(test_pred) == as.factor(omicsdata[[4]]))
# store prediction metric result
TrainMetric[idx] <- acc.train
TestMetric[idx] <- acc.test
}
if (EvalMethod == 'auc')
{
# calculate auc score
auc.train <- pROC::auc(as.factor(omicsdata[[3]]), train_pred)
auc.test <- pROC::auc(as.factor(omicsdata[[4]]), test_pred)
# store prediction metric result
TrainMetric[idx] <- auc.train
TestMetric[idx] <- auc.test
}
if (EvalMethod == 'precision')
{
# binarize prediction
train_pred <- ifelse(train_pred > 0.5, 1, 0)
test_pred <- ifelse(test_pred > 0.5, 1, 0)
# calculate precision
TP_train <- sum(omicsdata[[3]] == 1 & train_pred == 1)
FP_train <- sum(omicsdata[[3]] == 0 & train_pred == 1)
precision.train <- TP_train / (TP_train + FP_train)
TP_test <- sum(omicsdata[[4]] == 1 & test_pred == 1)
FP_test <- sum(omicsdata[[4]] == 0 & test_pred == 1)
precision.test <- TP_test / (TP_test + FP_test)
# store prediction metric result
TrainMetric[idx] <- precision.train
TestMetric[idx] <- precision.test
}
if (EvalMethod == 'recall')
{
# binarize prediction
train_pred <- ifelse(train_pred > 0.5, 1, 0)
test_pred <- ifelse(test_pred > 0.5, 1, 0)
# calculate precision
TP_train <- sum(omicsdata[[3]] == 1 & train_pred == 1)
FN_train <- sum(omicsdata[[3]] == 1 & train_pred == 0)
recall.train <- TP_train / (TP_train + FN_train)
TP_test <- sum(omicsdata[[4]] == 1 & test_pred == 1)
FN_test <- sum(omicsdata[[4]] == 1 & test_pred == 0)
recall.test <- TP_test / (TP_test + FN_test)
# store prediction metric result
TrainMetric[idx] <- recall.train
TestMetric[idx] <- recall.test
}
if (EvalMethod == 'f1')
{
# binarize prediction
train_pred <- ifelse(train_pred > 0.5, 1, 0)
test_pred <- ifelse(test_pred > 0.5, 1, 0)
# calculate precision
TP_train <- sum(omicsdata[[3]] == 1 & train_pred == 1)
TN_train <- sum(omicsdata[[3]] == 0 & train_pred == 0)
FP_train <- sum(omicsdata[[3]] == 0 & train_pred == 1)
FN_train <- sum(omicsdata[[3]] == 1 & train_pred == 0)
precision.train <- TP_train / (TP_train + FP_train)
recall.train <- TP_train / (TP_train + FN_train)
f1.train <- 2 * (precision.train * recall.train) / (precision.train + recall.train)
TP_test <- sum(omicsdata[[4]] == 1 & test_pred == 1)
TN_test <- sum(omicsdata[[4]] == 0 & test_pred == 0)
FP_test <- sum(omicsdata[[4]] == 0 & test_pred == 1)
FN_test <- sum(omicsdata[[4]] == 1 & test_pred == 0)
precision.test <- TP_test / (TP_test + FP_test)
recall.test <- TP_test / (TP_test + FN_test)
f1.test <- 2 * (precision.test * recall.test) / (precision.test + recall.test)
# store prediction metric result
TrainMetric[idx] <- f1.train
TestMetric[idx] <- f1.test
}
}
# combine cross-validation result
CVResult <- as.data.frame(cbind(TrainMetric, TestMetric))
return(CVResult)
},.progress = TRUE,.options = furrr::furrr_options(seed = TRUE))
# aggregate CV result and select the best penalty term
AggregatedCVResult <- Reduce("+", CVResult) / length(CVResult)
BestPen <- PenComb[which.max(AggregatedCVResult$TestMetric)]
cat(paste0('\n'))
for (xx in 1:length(BestPen))
{
cat(paste0('The best penalty term for ', DataType[xx], ' after ', Kfold,'-fold cross-validation is: ', BestPen[xx]), '\n')
}
cat(paste0('with testing ', EvalMethod,' score = ', round(AggregatedCVResult$TestMetric[which.max(AggregatedCVResult$TestMetric)],3)), '\n')
cat('Now running single-omics PLS with best penalty term on the complete dataset.', '\n')
# run single-omics PLS
Ws <- getRobustWeightsSingleBinary(X1 = X[[1]], Trait = as.numeric(Y) - 1, Lambda1 = BestPen, K = ncomp_pls,
s1 = SubsamplingPercent, SubsamplingNum = subSampNum)
# construct global network module
Abar <- getAbar(Ws, FeatureLabel = colnames(X[[1]]))
}
}
# Run SmCCNet (for multi-omics)
if (AnalysisType == 'multiomics')
{
# case 1: continuous outcome
if (method == 'CCA')
{
# define parallel computing multisession
future::plan(future::multisession, workers = Kfold)
CVResult <- furrr::future_map(1:Kfold, function(xx) {
# select the kth fold
omicsdata <- folddata[[xx]]
RhoTrain <- rep(0, length(PenComb))
RhoTest <- rep(0, length(PenComb))
DeltaCor <- rep(0, length(PenComb))
for(idx in 1:nrow(PenComb)){
# choose the penalty term
l1 <- PenComb[idx, ]
# run single omics SmCCA with continuous outcome
Ws <- getCanWeightsMulti(omicsdata[[1]], Trait = as.matrix(omicsdata[[3]]), Lambda = l1, NoTrait = FALSE, CCcoef = ScalingFactor)
rho.train <- getCanCorMulti(X = omicsdata[[1]], Y = omicsdata[[3]],CCWeight = Ws, CCcoef = ScalingFactorCC)
rho.test <- getCanCorMulti(X = omicsdata[[2]], Y = omicsdata[[4]],CCWeight = Ws, CCcoef = ScalingFactorCC)
# store cross-validation result
RhoTrain[idx] <- round(rho.train, digits = 5)
RhoTest[idx] <- round(rho.test, digits = 5)
DeltaCor[idx] <- abs(rho.train - rho.test)
}
# combine cross-validation result
CVResult <- as.data.frame(cbind(RhoTrain, RhoTest, DeltaCor))
return(CVResult)
},.progress = TRUE,.options = furrr::furrr_options(seed = TRUE))
# aggregate CV result and select the best penalty term
AggregatedCVResult <- Reduce("+", CVResult) / length(CVResult)
EvalMetric <- apply(AggregatedCVResult, 1, function(x) {x[3]/abs(x[2])})
BestPen <- PenComb[which.min(EvalMetric),]
cat(paste0('\n'))
for (xx in 1:length(BestPen))
{
cat(paste0('The best penalty term for ', DataType[xx], ' after ', Kfold,'-fold cross-validation is: ', BestPen[xx]), '\n')
}
cat(paste0('with testing canonical correlation = ', round(AggregatedCVResult$RhoTest[which.min(EvalMetric)],3),
', and prediction error = ', round(AggregatedCVResult$DeltaCor[which.min(EvalMetric)],3)), '\n')
cat('Now running multi-omics CCA with best penalty term on the complete dataset.', '\n')
# run multi-omics CCA
Ws <- getRobustWeightsMulti(X, Trait = as.matrix(Y), NoTrait = FALSE,CCcoef = ScalingFactor,
Lambda = BestPen,s = SubsamplingPercent, SubsamplingNum = subSampNum)
# construct global network
featurelabel <- unlist(lapply(X, colnames))
Abar <- getAbar(Ws, FeatureLabel = featurelabel)
}
# case when there is categorical outcome
if (method == 'PLS')
{
# check if it is non-binary
if (length(summary(as.factor(Y))) > 2)
stop('Currently not support non-binary outcome.')
# define parallel computing multisession
future::plan(future::multisession, workers = Kfold)
# running multisession parallel computing with multi-block PLS
CVResult <- furrr::future_map(1:Kfold, function(xx) {
# select the kth fold
omicsdata <- folddata[[xx]]
TrainMetric <- rep(0, nrow(PenComb))
TestMetric <- rep(0, nrow(PenComb))
for(idx in 1:nrow(PenComb)){
# choose the penalty term
l1 <- PenComb[idx,]
# run multi-omics SmCCA with continuous outcome
# Ws <- getRobustWeightsMultiBinary(omicsdata[[1]], as.numeric(omicsdata[[3]]), SubsamplingPercent = rep(1, length(omicsdata[[1]])), Between_Discriminate_Ratio = BDRatio,
# LambdaBetween = l1[1,1:(length(l1) - 1)], LambdaPheno = l1[1,length(l1)], SubsamplingNum = 1, CCcoef = ScalingFactor, ncomp_pls = ncomp_pls)
suppressMessages(projection <- getRobustWeightsMultiBinary(omicsdata[[1]],
as.numeric(omicsdata[[3]]),
SubsamplingPercent=rep(1, length(omicsdata[[1]])),
Between_Discriminate_Ratio = BDRatio,
LambdaBetween = l1[1,1:(length(l1) - 1)],
LambdaPheno = l1[1,length(l1)],
SubsamplingNum = 1,
CCcoef = ScalingFactor,
ncomp_pls = ncomp_pls, EvalClassifier = TRUE,
testData = omicsdata[[2]]))
# determine the type of dataset each feature belongs to
# feature_size <- unlist(lapply(X, ncol))
# types <- unlist(purrr::map((1:length(feature_size)), function(x){
# rep(DataType[x], feature_size[x])
#}))
# spliting the canonical weight vector by type
#Ws_split <- split(Ws[,1], types)
# project each omics dataset into the lower dimensional embedding (for both training and testing data)
#omics_projection_train <- purrr::map(1:length(X), function(xx){
# omicsdata[[1]][[xx]] %*% Ws_split[[xx]]
#})
#omics_projection_test <- purrr::map(1:length(X), function(xx){
# omicsdata[[2]][[xx]] %*% Ws_split[[xx]]
#})
# create training and testing data, and fit logistic regression model
#train_data <- data.frame(x = do.call(cbind, omics_projection_train), y = as.factor(omicsdata[[3]]))
#test_data <- data.frame(x = do.call(cbind, omics_projection_test))
train_data <- data.frame(x = projection[[1]], y = as.factor(omicsdata[[3]]))
test_data <- data.frame(x = projection[[2]])
# catching error when performing the logistic regression
has_error <- FALSE
tryCatch({
# fit logistic regression model
logisticFit <- stats::glm(y ~ ., family = 'binomial', data = train_data)
# make prediction for train/test set
train_pred <- stats::predict(logisticFit, train_data, type = 'response')
test_pred <- stats::predict(logisticFit, test_data, type = 'response')
# specifying which method to use as evaluation metric
if (EvalMethod == 'accuracy')
{
# binarize prediction
train_pred <- ifelse(train_pred > 0.5, 1, 0)
test_pred <- ifelse(test_pred > 0.5, 1, 0)
# calculate prediction accuracy:
acc.train <- sum(train_pred == omicsdata[[3]])/length(train_pred)
acc.test <- sum(test_pred == omicsdata[[4]])/length(test_pred)
# store prediction metric result
TrainMetric[idx] <- acc.train
TestMetric[idx] <- acc.test
}else if (EvalMethod == 'auc')
{
# calculate auc score
auc.train <- pROC::auc(as.factor(omicsdata[[3]]), train_pred)
auc.test <- pROC::auc(as.factor(omicsdata[[4]]), test_pred)
# store prediction metric result
TrainMetric[idx] <- auc.train
TestMetric[idx] <- auc.test
}else if (EvalMethod == 'precision')
{
# binarize prediction
train_pred <- ifelse(train_pred > 0.5, 1, 0)
test_pred <- ifelse(test_pred > 0.5, 1, 0)
# calculate precision
TP_train <- sum(omicsdata[[3]] == 1 & train_pred == 1)
FP_train <- sum(omicsdata[[3]] == 0 & train_pred == 1)
precision.train <- TP_train / (TP_train + FP_train)
TP_test <- sum(omicsdata[[4]] == 1 & test_pred == 1)
FP_test <- sum(omicsdata[[4]] == 0 & test_pred == 1)
precision.test <- TP_test / (TP_test + FP_test)
# store prediction metric result
TrainMetric[idx] <- precision.train
TestMetric[idx] <- precision.test
}else if (EvalMethod == 'recall')
{
# binarize prediction
train_pred <- ifelse(train_pred > 0.5, 1, 0)
test_pred <- ifelse(test_pred > 0.5, 1, 0)
# calculate precision
TP_train <- sum(omicsdata[[3]] == 1 & train_pred == 1)
FN_train <- sum(omicsdata[[3]] == 1 & train_pred == 0)
recall.train <- TP_train / (TP_train + FN_train)
TP_test <- sum(omicsdata[[4]] == 1 & test_pred == 1)
FN_test <- sum(omicsdata[[4]] == 1 & test_pred == 0)
recall.test <- TP_test / (TP_test + FN_test)
# store prediction metric result
TrainMetric[idx] <- recall.train
TestMetric[idx] <- recall.test
}else if (EvalMethod == 'f1')
{
# binarize prediction
train_pred <- ifelse(train_pred > 0.5, 1, 0)
test_pred <- ifelse(test_pred > 0.5, 1, 0)
# calculate precision
TP_train <- sum(omicsdata[[3]] == 1 & train_pred == 1)
TN_train <- sum(omicsdata[[3]] == 0 & train_pred == 0)
FP_train <- sum(omicsdata[[3]] == 0 & train_pred == 1)
FN_train <- sum(omicsdata[[3]] == 1 & train_pred == 0)
precision.train <- TP_train / (TP_train + FP_train)
recall.train <- TP_train / (TP_train + FN_train)
f1.train <- 2 * (precision.train * recall.train) / (precision.train + recall.train)
TP_test <- sum(omicsdata[[4]] == 1 & test_pred == 1)
TN_test <- sum(omicsdata[[4]] == 0 & test_pred == 0)
FP_test <- sum(omicsdata[[4]] == 0 & test_pred == 1)
FN_test <- sum(omicsdata[[4]] == 1 & test_pred == 0)
precision.test <- TP_test / (TP_test + FP_test)
recall.test <- TP_test / (TP_test + FN_test)
f1.test <- 2 * (precision.test * recall.test) / (precision.test + recall.test)
# store prediction metric result
TrainMetric[idx] <- f1.train
TestMetric[idx] <- f1.test
}else{
stop('correct evaluation metric should be suuplied, selections are among
accuracy, auc, precision, recall, f1 score.')
}
},
error = function(e) {
cat("Caught an error:", e$message, "on iteration", i, "\n")
has_error <- TRUE
})
if (has_error) {
next # Skip to the next iteration
}
}
# combine cross-validation result
CVResult <- as.data.frame(cbind(TrainMetric, TestMetric))
return(CVResult)
},.progress = TRUE,.options = furrr::furrr_options(seed = TRUE))
# aggregate CV result and select the best penalty term
AggregatedCVResult <- Reduce("+", CVResult) / length(CVResult)
BestPen <- PenComb[which.max(AggregatedCVResult$TestMetric),]
cat('\n')
for (xx in 1:length(X))
{
cat(paste0('The best penalty term for ', DataType[xx], ' after ', Kfold,'-fold cross-validation is: ', BestPen[xx]), '\n')
}
cat(paste0('and the best penalty term on classifier is: ', BestPen[length(X) + 1], '\n'))
cat(paste0('with testing ', EvalMethod,' score = ', round(AggregatedCVResult$TestMetric[which.max(AggregatedCVResult$TestMetric)],3)), '\n')
cat('Now running multi-omics PLS with best penalty term on the complete dataset.', '\n')
# run single-omics PLS
outcome <- as.matrix(as.numeric(Y) - 1)
Ws <- getRobustWeightsMultiBinary(X,
outcome,
SubsamplingPercent= SubsamplingPercent,
Between_Discriminate_Ratio = BDRatio,
LambdaBetween = BestPen[1:length(X)],
LambdaPheno = BestPen[length(X)+1],
SubsamplingNum = subSampNum,
CCcoef = ScalingFactor,
ncomp_pls = ncomp_pls, EvalClassifier = FALSE)
# construct global network module
featurelabel <- unlist(lapply(X, colnames))
Abar <- getAbar(Ws, FeatureLabel = featurelabel)
}
}
# save the cross-validation result
PenComb <- as.data.frame(PenComb)
colnames(PenComb) <- paste0('l', 1:ncol(PenComb))
AggregatedCVResultPen <- as.data.frame(cbind(PenComb, AggregatedCVResult))
utils::write.csv(AggregatedCVResultPen, paste0(saving_dir, '/', 'CVResult.csv'))
# save cross-validation data
save(folddata, file = paste0(saving_dir, '/', 'CVFold.Rdata'))
# save global network result
globalNetwork <- list(AdjacencyMatrix = Abar, Data = X, Phenotype = Y)
save(globalNetwork,
file = paste0(saving_dir, '/', 'globalNetwork.Rdata'))
cat("\n")
cat("--------------------------------------------------\n")
cat(">> Now starting network clustering...\n")
cat("--------------------------------------------------\n")
cat("\n")
#if (cluster == TRUE)
#{
# run hierarchical clustering algorithm
OmicsModule <- getOmicsModules(Abar, CutHeight = CutHeight, PlotTree = FALSE)
#}else{
# cat(">> Cluster = FALSE, skip clustering...\n")
# OmicsModule <- list(1:nrow(Abar))
#}
# store feature label
AbarLabel <- unlist(lapply(X, colnames))
# calculate correlation matrix
bigCor2 <- stats::cor(do.call(cbind, X))
# Combined data
combined_data <- do.call(cbind, X)
cat('Clustering completed...')
cat("\n")
cat("\n")
cat("--------------------------------------------------\n")
cat(">> Now starting network pruning and summarization score extraction...\n")
cat("--------------------------------------------------\n")
cat("\n")
# filter out network modules with insufficient number of nodes
module_length <- unlist(lapply(OmicsModule, length))
network_modules <- OmicsModule[module_length > min_size]
cat(paste0('There are ', length(network_modules), ' network modules before pruning'))
# define data type
feature_size <- unlist(lapply(X, ncol))
types <- unlist(purrr::map((1:length(feature_size)), function(x){
rep(DataType[x], feature_size[x])
}))
if (length(network_modules) == 0)
{
stop('No network is identify after clustering, consider changing the tree-cut parameter or changing the clustering algorithm.')
}
# extract pruned network modules
for(i in 1:length(network_modules))
{
cat('\n','Now extracting subnetwork for network module ', i,'\n')
# define subnetwork
abar_sub <- Abar[network_modules[[i]],network_modules[[i]]]
cor_sub <- bigCor2[network_modules[[i]],network_modules[[i]]]
# prune network module
cat(paste0('Network module ', i, ' Result:'))
networkPruning(Abar = abar_sub,CorrMatrix = cor_sub, type = types[network_modules[[i]]], data = combined_data[,network_modules[[i]]],
Pheno = as.numeric(Y), ModuleIdx = i, min_mod_size = min_size,
max_mod_size = min(nrow(abar_sub), max_size), method = summarization,
saving_dir = saving_dir)
}
cat("\n")
cat("\n")
cat("************************************\n")
cat("* Execution Finished! *\n")
cat("* Automated SmCCNet Completed! *\n")
cat("************************************\n")
cat("\n")
cat("There are 4 files stored in the local directory: \n")
cat("1. CVResult.csv: cross-validation result. \n")
cat("2. CVFold.Rdata: omics and phenotype data splited into different folds. \n")
cat("3. globalNetwork.Rdata: global network adjacency matrix, omics data, phenotype. \n")
cat("4. size_a_net_b.Rdata: subnetworks result file after clustering and network pruning. \n")
cat("SUBNETWORK RESULT FILE CONTAINS:\n")
cat("1. correlation_sub: correlation matrix for the subnetwork.\n")
cat("2. M: adjacency matrix for the subnetwork.\n")
cat("3. omics_corelation_data: individual molecular feature correlation with phenotype.\n")
cat("4. pc_correlation: first 3 PCs correlation with phenotype.\n")
cat("5. pc_loading: principal component loadings.\n")
cat("6. pca_x1_score: principal component score and phenotype data.\n")
cat("7. mod_size: number of molecular features in the subnetwork.\n")
cat("8. sub_type: type of feature for each molecular features.\n")
cat("\n")
cat("NOTE:\n")
cat("1. The results are saved in the user-specified directory.\n")
cat(" If not specified, use 'getwd()' to find the current working directory.\n")
cat("\n")
cat("2. The results are in 'size_a_net_b.Rdata', where:\n")
cat(" 'a' = network size, 'b' = module number.\n")
cat("\n")
cat("3. To prevent overwriting result files across projects, rename the result file after each run.\n")
cat("\n")
cat("4. Subnetwork visualization can be done through shinyApp or cytoscape. Please refer to the network visualization section in the multi-omics or single-omics vignette for more detail. For the Shiny app, visit: https://smccnet.shinyapps.io/smccnetnetwork/\n")
cat("\n")
cat("************************************\n")
return(list(AdjacencyMatrix = Abar, Data = X, ClusteringModules = OmicsModule, CVResult = AggregatedCVResult))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.