Nothing
## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
## ---- echo = FALSE, results = "hide", warning = FALSE, eval = TRUE------------
suppressPackageStartupMessages({
library(pbapply)
library(Matrix)
library(igraph)
})
## ---- eval = TRUE-------------------------------------------------------------
library(pbapply)
library(Matrix)
library(igraph)
library(SmCCNet)
library(parallel)
## ----flowPlot, out.width="100%", fig.cap = "Workflow for SmCCNet single-omics setting (binary and quantitative phenotype).", echo = FALSE----
knitr::include_graphics("figures/single-omics-smccnet.jpg")
## ----example data-------------------------------------------------------------
data(ExampleData)
head(X1[ , 1:6])
head(Y)
## ----p1p2, eval = TRUE--------------------------------------------------------
p1 <- ncol(X1)
N <- nrow(X1)
AbarLabel <- colnames(X1)
## ---- eval = FALSE------------------------------------------------------------
# # preprocess data
# processed_data <- dataPreprocess(X = as.data.frame(X1), covariates = NULL,
# is_cv = TRUE, cv_quantile = 0.2, center = TRUE, scale = TRUE)
## ----CVpara, eval = FALSE, warning = FALSE------------------------------------
# K <- 3 # number of folds in K-fold CV.
# s1 <- 0.7
# subSamp <- 50 # number of subsamples (will be described in later section).
#
# # create sparsity penalty options.
# pen1 <- seq(.05, .3, by = .05)
#
# # set a CV directory.
# CVDir <- "Example3foldCV/"
# pheno <- "Example3foldCV"
# dir.create(CVDir)
## ----make K-fold, eval = FALSE------------------------------------------------
# # set random seed
# set.seed(12345)
#
# # save data and parameters into local directory
# save(X1, Y, s1, subSamp, pen1,
# file = paste0(CVDir, "Data.Rdata"))
#
# # split data into K folds
# foldIdx <- split(1:N, sample(1:N, K))
# for(i in 1:K){
# iIdx <- foldIdx[[i]]
# x1.train <- scale(X1[-iIdx, ])
# yy.train <- Y[-iIdx, ]
# x1.test <- scale(X1[iIdx, ])
# yy.test <- Y[iIdx, ]
#
# if(is.na(min(min(x1.train), min(yy.train), min(x1.test), min(yy.test)))){
# stop("Invalid scaled data.")
# }
#
# subD <- paste0(CVDir, "CV_", i, "/")
# dir.create(subD)
#
# # save data from each fold into local directory
# save(x1.train, yy.train, x1.test, yy.test,
# pen1, p1,
# file = paste0(subD, "Data.Rdata"))
# }
## ----run CV, eval = FALSE-----------------------------------------------------
# # number of clusters in parSapply should be the same as number specified above
# suppressWarnings(for (CVidx in 1:K)
# {
# # define the sub-directory for each fold
# subD <- paste0(CVDir, "CV_", CVidx, "/")
# # load fold data
# load(paste0(subD, "Data.Rdata"))
# dir.create(paste0(subD, "SmCCA/"))
# # create empty vector to store cross-validation result
# RhoTrain <- RhoTest <- DeltaCor <- rep(0, length(pen1))
# # evaluate through all the possible penalty candidates
# for(idx in 1:length(pen1)){
# l1 <- pen1[idx]
# print(paste0("Running SmCCA on CV_", CVidx, " pen=", l1))
# # run single-omics SmCCNet
# Ws <- getRobustWeightsSingle(x1.train, as.matrix(yy.train), l1, 1,
# SubsamplingNum = 1)
# # average
# meanW <- rowMeans(Ws)
# v <- meanW[1:p1]
#
# rho.train <- cor(x1.train %*% v, yy.train)
#
#
# rho.test <- cor(x1.test %*% v, yy.test)
#
#
# RhoTrain[idx] <- round(rho.train, digits = 5)
# RhoTest[idx] <- round(rho.test, digits = 5)
# DeltaCor[idx] <- abs(rho.train - rho.test)
#
#
#
# }
#
# DeltaCor.all <- cbind(pen1, RhoTrain, RhoTest, DeltaCor)
# colnames(DeltaCor.all) <- c("l1", "Training CC", "Test CC", "CC Pred. Error")
# write.csv(DeltaCor.all,
# file = paste0(subD, "SmCCA/SCCA_", subSamp,"_allDeltaCor.csv"))
#
#
# })
## ----aggregate error, eval = FALSE--------------------------------------------
# # combine prediction errors from all K folds and compute the total prediction
# # error for each sparsity penalty pair.
# aggregateCVSingle(CVDir, "SmCCA", NumSubsamp = subSamp, K = K)
## ----get abar, eval = FALSE---------------------------------------------------
# # set up directory to store all the results
# plotD <- paste0(CVDir, "Figures/")
# saveD <- paste0(CVDir, "Results/")
# dataF <- paste0(CVDir, "Data.Rdata")
# dir.create(plotD)
# dir.create(saveD)
# dir.create(dataF)
#
# # type of CCA result, only "SmCCA" supported
# Method = "SmCCA"
#
#
# # after SmCCA CV, select the best penalty term,
# # and use it for running SmCCA on the complete dataset
# for(Method in "SmCCA"){
# # select optimal penalty term from CV result
# T12 <- read.csv(paste0(CVDir, "Results/", Method, "CVmeanDeltaCors.csv"))
# # calculate evaluation metric **
# pen <- T12[which.min(T12[ ,4]/abs(T12[ ,3])) ,2]
#
#
# l1 <- pen;
# system.time({
# Ws <- getRobustWeightsSingle(X1 = X1, Trait = as.matrix(Y),
# Lambda1 = l1,
# s1, SubsamplingNum = subSamp)
#
#
# Abar <- getAbar(Ws, FeatureLabel = AbarLabel[1:p1])
# save(l1, X1, Y, s1, Ws, Abar,
# file = paste0(saveD, Method, K, "foldSamp", subSamp, "_", pen,
# ".Rdata"))
# })
#
#
# }
## ----get modules, eval = FALSE------------------------------------------------
# # perform clustering based on the adjacency matrix Abar
# OmicsModule <- getOmicsModules(Abar, PlotTree = FALSE)
#
# # make sure there are no duplicated labels
# AbarLabel <- make.unique(AbarLabel)
#
# # calculate feature correlation matrix
# bigCor2 <- cor(X1)
#
# # data type
# types <- rep('gene', nrow(bigCor2))
## ---- eval = FALSE------------------------------------------------------------
# # filter out network modules with insufficient number of nodes
# module_length <- unlist(lapply(OmicsModule, length))
# network_modules <- OmicsModule[module_length > 10]
# # extract pruned network modules
# for(i in 1:length(network_modules))
# {
# cat(paste0('For network module: ', i, '\n'))
# # define subnetwork
# abar_sub <- Abar[network_modules[[i]],network_modules[[i]]]
# cor_sub <- bigCor2[network_modules[[i]],network_modules[[i]]]
# # prune network module
# networkPruning(Abar = abar_sub,CorrMatrix = cor_sub,
# type = types[network_modules[[i]]],
# data = X1[,network_modules[[i]]],
# Pheno = Y, ModuleIdx = i, min_mod_size = 10,
# max_mod_size = 100, method = 'PCA',
# saving_dir = getwd())
# cat("\n")
# }
## ---- eval = TRUE,echo = FALSE, warning = FALSE, message = FALSE--------------
load('../vignettes/cont_size_16_net_1.Rdata')
row.names(omics_correlation_data) <- NULL
colnames(omics_correlation_data) <- c('Molecular Feature', 'Correlation to Phenotype', 'P-value')
knitr::kable(omics_correlation_data, caption = 'Individual molecular features correlation table with respect to phenotype (correlation and p-value).')
## ----loadings1, echo = FALSE, warning = FALSE, message = FALSE, out.width="100%", fig.cap = "PC1 loading for each subnetwork feature."----
library(grid)
library(tidyverse)
library(shadowtext)
library(reshape2)
BLUE <- "#076fa2"
data <- data.frame(name = row.names(pc_loading), loading = abs(pc_loading[,1]))
plt <- ggplot(data) +
geom_col(aes(loading, name), fill = BLUE, width = 0.6)
plt
## ----corheatmap, echo = FALSE, warning = FALSE, message = FALSE, out.width="100%", fig.cap = "Correlation heatmap for subnetwork features."----
###### Correlation heatmap
melted_cormat <- melt(correlation_sub)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
labs(title = "Correlation Heatmap of Network Features") +
theme(plot.title.position = "plot")
## ----adjheatmap, echo = FALSE, warning = FALSE, message = FALSE, out.width="100%", fig.cap = "Adjacency matrix heatmap for subnetwork features."----
###### Correlation heatmap
melted_cormat <- melt(as.matrix(M))
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
labs(title = "Adjacency Matrix Heatmap of Network Features") +
theme(plot.title.position = "plot")
## ---- eval = FALSE------------------------------------------------------------
# library(RCy3)
# library(igraph)
# # load subnetwork data (example, user need to provide the directory)
# load('ResultDirectory/size_a_net_b.Rdata')
# M <- as.matrix(M)
#
# # correlation matrix for the subnetwork
# filter_index <- which(abs(correlation_sub) < 0.05)
# M_ind <- ifelse(correlation_sub > 0, 1, -1)
# M_adj <- M * M_ind
# M_adj[filter_index] <- 0
# diag(M_adj) <- 0
#
# # network visualization through cytoscape
# graph <- igraph::graph_from_adjacency_matrix(M_adj, mode = 'undirected',
# weighted = TRUE, diag = TRUE, add.colnames = NULL, add.rownames = NA)
#
# # define network node type and connectivity
# V(graph)$type <- sub_type
# V(graph)$type
# V(graph)$connectivity <- rowSums(abs(M))
# V(graph)$connectivity
#
# createNetworkFromIgraph(graph,"single_omics_network")
## ----netPlot, out.width="100%", fig.cap = "Trimmed module 1. The strength of the node connections is indicated by the thickness of edges. Red edges and blue edges are for negative and positive connections respectively.", echo = FALSE----
knitr::include_graphics("../vignettes/example_network_cont.png")
## ----example data binary------------------------------------------------------
data(ExampleData)
head(X1[ , 1:6])
head(Y)
# binarize quantitative outcome
Y <- ifelse(Y > median(Y), 1, 0)
## ----p1p2 binary, eval = TRUE-------------------------------------------------
p1 <- ncol(X1)
N <- nrow(X1)
AbarLabel <- colnames(X1)
## ---- eval = FALSE------------------------------------------------------------
# K <- 3 # number of folds in K-fold CV.
# s1 <- 0.7 # feature subsampling percentage.
# subSamp <- 50 # number of subsamples.
# CCcoef <- NULL # unweighted version of SmCCNet.
# metric <- 'auc' # evaluation metric to be used.
#
# # create sparsity penalty options.
# pen1 <- seq(.1, .9, by = .1)
#
# # set a CV directory.
# CVDir <- "Example3foldCV_Binary/"
# pheno <- "Example3foldCV_Binary"
# dir.create(CVDir)
#
# Y <- ifelse(Y > median(Y), 1, 0)
## ----make K-fold 2, eval = FALSE----------------------------------------------
# set.seed(12345) # set random seed.
# # save original data into local directory
# save(X1, Y, s1, subSamp, pen1,
# file = paste0(CVDir, "Data.Rdata"))
#
#
# # define the number of components to be extracted
# ncomp <- 3
# # split data into k folds
# foldIdx <- split(1:N, sample(1:N, K))
# for(i in 1:K){
# iIdx <- foldIdx[[i]]
# x1.train <- scale(X1[-iIdx, ])
# yy.train <- Y[-iIdx, ]
# x1.test <- scale(X1[iIdx, ])
# yy.test <- Y[iIdx, ]
#
# if(is.na(min(min(x1.train), min(yy.train), min(x1.test), min(yy.test)))){
# stop("Invalid scaled data.")
# }
#
# subD <- paste0(CVDir, "CV_", i, "/")
# dir.create(subD)
# save(x1.train, yy.train, x1.test, yy.test,
# pen1, p1,
# file = paste0(subD, "Data.Rdata"))
# }
#
# ############################################## Running Cross-Validation
#
# # run SmCCA with K-fold cross-validation
# suppressWarnings(for (CVidx in 1:K){
# # define evaluation method
# EvalMethod <- 'precision'
# # define and create saving directory
# subD <- paste0(CVDir, "CV_", CVidx, "/")
# load(paste0(subD, "Data.Rdata"))
# dir.create(paste0(subD, "SmCCA/"))
#
# # create empty vector to store cross-validation accuracy result
# TrainScore <- TestScore <- rep(0, length(pen1))
# for(idx in 1:length(pen1)){
# # define value of penalty
# l1 <- pen1[idx]
#
# # run SPLS-DA to extract canonical weight
# Ws <- spls::splsda(x = x1.train, y = yy.train, K = ncomp, eta = l1,
# kappa=0.5,
# classifier= 'logistic', scale.x=FALSE)
#
# # create emtpy matrix to save canonical weights for each subsampling
# weight <- matrix(0,nrow = ncol(x1.train), ncol = ncomp)
# weight[Ws[["A"]],] <- Ws[["W"]]
#
# # train the latent factor model with logistic regression
# train_data <- data.frame(x = (x1.train %*% weight)[,1:ncomp],
# y = as.factor(yy.train))
# test_data <- data.frame(x = (x1.test %*% weight)[,1:ncomp])
#
# logisticFit <- stats::glm(y~., family = 'binomial',data = train_data)
# # make prediction for train/test set
# train_pred <- stats::predict(logisticFit, train_data, type = 'response')
# test_pred <- stats::predict(logisticFit, test_data, type = 'response')
# # specifying which method to use as evaluation metric
# TrainScore[idx] <- classifierEval(obs = yy.train,
# pred = train_pred,
# EvalMethod = metric, print_score = FALSE)
# TestScore[idx] <- classifierEval(obs = yy.test,
# pred = test_pred,
# EvalMethod = metric, print_score = FALSE)
#
# }
#
# # combine cross-validation results
# DeltaAcc.all <- as.data.frame(cbind(pen1, TrainScore, TestScore))
# DeltaAcc.all$Delta <- abs(DeltaAcc.all$TrainScore - DeltaAcc.all$TestScore)
# colnames(DeltaAcc.all) <- c("l1", "Training Score", "Testing Score", "Delta")
#
# # save cross-validation results to local directory
# write.csv(DeltaAcc.all,
# file = paste0(subD, "SmCCA/SCCA_", subSamp,"_allDeltaCor.csv"))
#
# }
# )
## ---- eval = FALSE------------------------------------------------------------
# # save cross-validation result
# cv_result <- aggregateCVSingle(CVDir, "SmCCA", NumSubsamp = subSamp, K = 3)
#
# # create directory to save overall result with the best penalty term
# plotD <- paste0(CVDir, "Figures/")
# saveD <- paste0(CVDir, "Results/")
# dataF <- paste0(CVDir, "Data.Rdata")
# dir.create(plotD)
# dir.create(saveD)
# dir.create(dataF)
#
# # specify method (only SmCCA works)
# Method = "SmCCA"
#
# for(Method in "SmCCA"){
# # read cross validation result in
# T12 <- read.csv(paste0(CVDir, "Results/", Method, "CVmeanDeltaCors.csv"))
# # determine the optimal penalty term
# pen <- T12[which.max(T12[ ,3]) ,2]
# l1 <- pen;
# system.time({
# # run SPLSDA with optimal penalty term
# Ws <- getRobustWeightsSingleBinary(X1 = X1, Trait = as.matrix(Y),
# Lambda1 = l1,
# s1, SubsamplingNum = subSamp)
#
# # get adjacency matrix
# Abar <- getAbar(Ws, FeatureLabel = AbarLabel[1:p1])
# # save result into local directory
# save(l1, X1, Y, s1, Ws, Abar,
# file = paste0(saveD, Method, K, "foldSamp", subSamp, "_", pen,
# ".Rdata"))
# })
#
#
# }
## ----sessionInfo--------------------------------------------------------------
sessionInfo()
warnings()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.