Nothing
## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## -----------------------------------------------------------------------------
library(MSstatsSampleSize)
## -----------------------------------------------------------------------------
# # read in protein abundance sheet
# # The CSV sheet has 173 columns from control and cancer groups.
# # Each row is protein and each column (except the first column) is biological replicate.
# # The first column 'Protein' contains the protein names.
# OV_SRM_train <- read.csv(file = "OV_SRM_train.csv")
# # assign the column 'Protein' as row names
# rownames(OV_SRM_train) <- OV_SRM_train$Protein
# # remove the column 'Protein
# OV_SRM_train <- OV_SRM_train[, colnames(OV_SRM_train)!="Protein"]
head(OV_SRM_train)
# Read in annotation including condition and biological replicates.
# Users should make this annotation file.
# OV_SRM_train_annotation <- read.csv(file="OV_SRM_train_annotation.csv", header=TRUE)
head(OV_SRM_train_annotation)
# estimate the mean protein abunadnce and variance in each condition
variance_estimation <- estimateVar(data = OV_SRM_train,
annotation = OV_SRM_train_annotation)
# the mean protein abundance in each condition
head(variance_estimation$mu)
# the standard deviation in each condition
head(variance_estimation$sigma)
# the mean protein abundance across all the conditions
head(variance_estimation$promean)
## ---- eval=FALSE--------------------------------------------------------------
# # output a pdf file with mean-SD plot
# meanSDplot(variance_estimation)
## ----message = FALSE , warning = FALSE----------------------------------------
# expected_FC = "data": fold change estimated from OV_SRM_train
# select_simulated_proteins = "proportion": select the simulated proteins based on the proportion of total proteins
# simulate_valid = FALSE: use input OV_SRM_train as validation set
simulated_datasets <- simulateDataset(data = OV_SRM_train,
annotation = OV_SRM_train_annotation,
num_simulations = 10, # simulate 10 times
expected_FC = "data",
list_diff_proteins = NULL,
select_simulated_proteins = "proportion",
protein_proportion = 1.0,
protein_number = 1000,
samples_per_group = 50, # 50 samples per condition
simulate_validation = FALSE,
valid_samples_per_group = 50)
## -----------------------------------------------------------------------------
# the number of simulated proteins
simulated_datasets$num_proteins
# a vector with the number of simulated samples in each condition
simulated_datasets$num_samples
# the list of simulated protein abundance matrices
# Each element of the list represents one simulation
head(simulated_datasets$simulation_train_Xs[[1]]) # first simulation
# the list of simulated condition vectors
# Each element of the list represents one simulation
head(simulated_datasets$simulation_train_Ys[[1]]) # first simulation
## ----message = FALSE , warning = FALSE----------------------------------------
# expected_FC = expected_FC: user defined fold change
unique(OV_SRM_train_annotation$Condition)
expected_FC <- c(1, 1.5)
names(expected_FC) <- c("control", "ovarian cancer")
set.seed(1212)
# Here I randomly select some proteins as differential to show how the function works
# The user should prepare this list of differential proteins by themselves
diff_proteins <- sample(rownames(OV_SRM_train), 20)
simualted_datasets_predefined_FC <- simulateDataset(data = OV_SRM_train,
annotation = OV_SRM_train_annotation,
num_simulations = 10, # simulate 10 times
expected_FC = expected_FC,
list_diff_proteins = diff_proteins,
select_simulated_proteins = "proportion",
protein_proportion = 1.0,
protein_number = 1000,
samples_per_group = 50, # 50 samples per condition
simulate_validation = FALSE,
valid_samples_per_group = 50)
## ----message = FALSE, warning = FALSE-----------------------------------------
classification_results <- designSampleSizeClassification(
simulations = simulated_datasets,
parallel = FALSE)
## ----message = FALSE, warning = FALSE-----------------------------------------
# the number of simulated proteins
classification_results$num_proteins
# a vector with the number of simulated samples in each condition
classification_results$num_samples
# the mean predictive accuracy over all the simulated datasets,
# which have same 'num_proteins' and 'num_samples'
classification_results$mean_predictive_accuracy
# the mean protein importance vector over all the simulated datasets,
# the length of which is 'num_proteins'.
head(classification_results$mean_feature_importance)
## ----message = FALSE, warning = FALSE-----------------------------------------
## try parallel computation to speed up
## The parallel computation may cause error while allocating the core resource
## Then the users can try the abova function without parallel computation
classification_results_parallel <- designSampleSizeClassification(
simulations = simulated_datasets,
parallel = TRUE)
## ---- eval=FALSE--------------------------------------------------------------
# #### sample size classification ####
# # simulate different sample sizes
# # 1) 10 biological replicats per group
# # 1) 25 biological replicats per group
# # 2) 50 biological replicats per group
# # 3) 100 biological replicats per group
# # 4) 200 biological replicats per group
# list_samples_per_group <- c(10, 25, 50, 100, 200)
#
# # save the simulation results under each sample size
# multiple_sample_sizes <- list()
#
# for(i in seq_along(list_samples_per_group)){
# # run simulation for each sample size
# simulated_datasets <- simulateDataset(data = OV_SRM_train,
# annotation = OV_SRM_train_annotation,
# num_simulations = 10, # simulate 10 times
# expected_FC = "data",
# list_diff_proteins = NULL,
# select_simulated_proteins = "proportion",
# protein_proportion = 1.0,
# protein_number = 1000,
# samples_per_group = list_samples_per_group[i],
# simulate_valid = FALSE,
# valid_samples_per_group = 50)
#
# # run classification performance estimation for each sample size
# res <- designSampleSizeClassification(simulations = simualted_datasets,
# parallel = TRUE)
#
# # save results
# multiple_sample_sizes[[i]] <- res
# }
#
# ## make the plots
# designSampleSizeClassificationPlots(multiple_sample_sizes,
# list_samples_per_group,
# ylimUp_predictive_accuracy = 0.8,
# ylimDown_predictive_accuracy = 0.6)
## -----------------------------------------------------------------------------
# output a pdf file with sample size calculation plot for hypothesis testing
# also return a table which summaries the plot
HT_res <- designSampleSizeHypothesisTestingPlot(data = OV_SRM_train,
annotation= OV_SRM_train_annotation,
desired_FC = "data",
select_testing_proteins = "proportion",
protein_proportion = 1.0,
protein_number = 1000,
FDR=0.05,
power=0.9)
# data frame with columns desiredFC, numSample, FDR, power and CV
head(HT_res)
## ---- eval=FALSE--------------------------------------------------------------
# # output a pdf file with 11 PCA plots
# designSampleSizePCAplot(simulated_datasets)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.