#' Planning future experimental designs of Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiments in sample size calculation
#'
#' @description Calculate sample size for future experiments of a Selected Reaction Monitoring (SRM),
#' Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment
#' based on intensity-based linear model. Two options of the calculation:
#' (1) number of biological replicates per condition,
#' (2) power.
#'
#' @param data 'FittedModel' in testing output from function groupComparison.
#' @param desiredFC the range of a desired fold change which includes the lower
#' and upper values of the desired fold change.
#' @param FDR a pre-specified false discovery ratio (FDR) to control the overall
#' false positive rate. Default is 0.05
#' @param numSample minimal number of biological replicates per condition.
#' TRUE represents you require to calculate the sample size for this category,
#' else you should input the exact number of biological replicates.
#' @param power a pre-specified statistical power which defined as the probability
#' of detecting a true fold change. TRUE represent you require to calculate the power
#' for this category, else you should input the average of power you expect. Default is 0.9
#' @inheritParams .documentFunction
#'
#' @details The function fits the model and uses variance components to calculate
#' sample size. The underlying model fitting with intensity-based linear model with
#' technical MS run replication. Estimated sample size is rounded to 0 decimal.
#' The function can only obtain either one of the categories of the sample size
#' calculation (numSample, numPep, numTran, power) at the same time.
#'
#' @return data.frame - sample size calculation results including varibles:
#' desiredFC, numSample, FDR, and power.
#'
#' @importFrom stats median
#' @importFrom plotly ggplotly style add_trace plot_ly subplot layout
#'
#' @export
#'
#' @author Meena Choi, Ching-Yun Chang, Olga Vitek.
#'
#' @examples
#' # Consider quantitative data (i.e. QuantData) from yeast study.
#' # A time course study with ten time points of interests and three biological replicates.
#' QuantData <- dataProcess(SRMRawData)
#' head(QuantData$FeatureLevelData)
#' ## based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9)
#' comparison1<-matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1)
#' comparison2<-matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
#' comparison3<-matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1)
#' comparison<-rbind(comparison1,comparison2, comparison3)
#' row.names(comparison)<-c("T3-T1","T7-T1","T9-T1")
#' colnames(comparison)<-unique(QuantData$ProteinLevelData$GROUP)
#'
#' testResultMultiComparisons<-groupComparison(contrast.matrix=comparison,data=QuantData)
#'
#' ## Calculate sample size for future experiments:
#' #(1) Minimal number of biological replicates per condition
#' designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=TRUE,
#' desiredFC=c(1.25,1.75), FDR=0.05, power=0.8)
#' #(2) Power calculation
#' designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=2,
#' desiredFC=c(1.25,1.75), FDR=0.05, power=TRUE)
#'
designSampleSize = function(
data, desiredFC, FDR = 0.05, numSample = TRUE, power = 0.9,
use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL
) {
MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
log_file_path, "MSstats_sampleSize_log_")
getOption("MSstatsLog")("INFO", "** MSstats - designSampleSize function")
getOption("MSstatsLog")("INFO", paste0("Desired fold change = ",
paste(desiredFC, collapse=" - ")))
getOption("MSstatsLog")("INFO", paste0("FDR = ", FDR))
getOption("MSstatsLog")("INFO", paste0("Power = ", power))
var_component = .getVarComponent(data)
## for label-free DDA, there are lots of missingness and lots of zero SE. So, remove NA SE.
median_sigma_error = median(var_component[["Error"]], na.rm = TRUE)
median_sigma_subject = .getMedianSigmaSubject(var_component)
getOption("MSstatsLog")("INFO", "Calculated variance component. - okay")
## power calculation
if (isTRUE(power)) {
delta = log2(seq(desiredFC[1], desiredFC[2], 0.025))
desiredFC = 2 ^ delta
power_output = .calculatePower(desiredFC, FDR, delta, median_sigma_error,
median_sigma_subject, numSample)
CV = round( (2 * (median_sigma_error / numSample + median_sigma_subject / numSample)) / desiredFC, 3)
getOption("MSstatsLog")("INFO", "Power is calculated. - okay")
sample_size = data.frame(desiredFC, numSample, FDR,
power = power_output, CV)
}
if (is.numeric(power)) {
delta = log2(seq(desiredFC[1], desiredFC[2], 0.025))
desiredFC = 2 ^ delta
## Large portion of proteins are not changing
m0_m1 = 99 ## it means m0/m1=99, m0/(m0+m1)=0.99
alpha = power * FDR / (1 + (1 - FDR) * m0_m1)
if (isTRUE(numSample)) {
numSample = .getNumSample(desiredFC, power, alpha, delta,
median_sigma_error, median_sigma_subject)
CV = round(2 * (median_sigma_error / numSample + median_sigma_subject / numSample) / desiredFC, 3)
getOption("MSstatsLog")("INFO", "The number of sample is calculated. - okay")
sample_size = data.frame(desiredFC, numSample, FDR, power, CV)
}
}
sample_size
}
#' Get variances from models fitted by the groupComparison function
#' @param fitted_models FittedModels element of groupComparison output
#' @keywords internal
.getVarComponent = function(fitted_models) {
Protein = NULL
result = data.table::rbindlist(
lapply(fitted_models, function(fit) {
if (!is.null(fit)) {
if (!is(fit, "lmerMod")) {
error = summary(fit)$sigma^2
subject <- NA
group_subject <- NA
} else {
stddev = c(sapply(lme4::VarCorr(fit), function(el) attr(el, "stddev")),
attr(lme4::VarCorr(fit), "sc"))
error = stddev[names(stddev) == ""]^2
if (any(names(stddev) %in% "SUBJECT.(Intercept)")) {
subject = stddev["SUBJECT.(Intercept)"]^2
} else {
subject = NA
}
if (any(names(stddev) %in% "SUBJECT:GROUP.(Intercept)")) {
group_subject = stddev["SUBJECT:GROUP.(Intercept)"]^2
} else {
group_subject = NA
}
}
list(Error = error,
Subject = subject,
GroupBySubject = group_subject)
} else {
NULL
}
})
)
result[, Protein := 1:.N]
result
}
#' Get median per subject or group by subject
#' @param var_component data.frame, output of .getVarComponent
#' @importFrom stats median
#' @keywords internal
.getMedianSigmaSubject = function(var_component) {
if (sum(!is.na(var_component[, "GroupBySubject"])) > 0) {
median_sigma_subject = median(var_component[["GroupBySubject"]], na.rm=TRUE)
} else {
if (sum(!is.na(var_component[, "Subject"])) > 0) {
median_sigma_subject = median(var_component[["Subject"]], na.rm=TRUE)
} else {
median_sigma_subject = 0
}
}
median_sigma_subject
}
#' Power calculation
#' @inheritParams designSampleSize
#' @param delta difference between means (?)
#' @param median_sigma_error median of error standard deviation
#' @param median_sigma_subject median standard deviation per subject
#' @importFrom stats qnorm
#' @keywords internal
.calculatePower = function(desiredFC, FDR, delta, median_sigma_error,
median_sigma_subject, numSample) {
m0_m1 = 99
t = delta / sqrt(2 * (median_sigma_error / numSample + median_sigma_subject / numSample))
powerTemp = seq(0, 1, 0.01)
power = numeric(length(t))
for (i in seq_along(t)) {
diff = qnorm(powerTemp) + qnorm(1 - powerTemp * FDR / (1 + (1 - FDR) * m0_m1) / 2) - t[i]
min(abs(diff), na.rm = TRUE)
power[i] = powerTemp[order(abs(diff))][1]
}
power
}
#' Get sample size
#' @inheritParams designSampleSize
#' @inheritParams .calculatePower
#' @param alpha significance level
#' @param delta difference between means (?)
#' @importFrom stats qnorm
#' @keywords internal
.getNumSample = function(desiredFC, power, alpha, delta, median_sigma_error,
median_sigma_subject){
z_alpha = qnorm(1 - alpha / 2)
z_beta = qnorm(power)
aa = (delta / (z_alpha + z_beta)) ^ 2
numSample = round(2 * (median_sigma_error + median_sigma_subject) / aa, 0)
numSample
}
#' Visualization for sample size calculation
#'
#' @description To illustrate the relationship of desired fold change and the calculated
#' minimal number sample size which are (1) number of biological replicates per condition,
#' (2) number of peptides per protein,
#' (3) number of transitions per peptide, and
#' (4) power. The input is the result from function (\code{\link{designSampleSize}}.
#'
#' @param data output from function designSampleSize.
#' @param isPlotly Parameter to use Plotly or ggplot2. If set to TRUE, MSstats
#' will save Plotly plots as HTML files. If set to FALSE MSstats will save ggplot2 plots
#' as PDF files
#'
#' @details Data in the example is based on the results of sample size calculation from function \code{\link{designSampleSize}}
#'
#' @return Plot for estimated sample size with assigned variable.
#'
#' @export
#'
#' @author Meena Choi, Ching-Yun Chang, Olga Vitek.
#' @examples
#' # Based on the results of sample size calculation from function designSampleSize,
#' # we generate a series of sample size plots for number of biological replicates, or peptides,
#' # or transitions or power plot.
#' QuantData<-dataProcess(SRMRawData)
#' head(QuantData$ProcessedData)
#' ## based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9)
#' comparison1<-matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1)
#' comparison2<-matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
#' comparison3<-matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1)
#' comparison<-rbind(comparison1,comparison2, comparison3)
#' row.names(comparison)<-c("T3-T1","T7-T1","T9-T1")
#' colnames(comparison)<-unique(QuantData$ProteinLevelData$GROUP)
#'
#' testResultMultiComparisons<-groupComparison(contrast.matrix=comparison, data=QuantData)
#'
#' # plot the calculated sample sizes for future experiments:
#' # (1) Minimal number of biological replicates per condition
#' result.sample<-designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=TRUE,
#' desiredFC=c(1.25,1.75), FDR=0.05, power=0.8)
#' designSampleSizePlots(data=result.sample)
#' # (2) Power
#' result.power<-designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=2,
#' desiredFC=c(1.25,1.75), FDR=0.05, power=TRUE)
#' designSampleSizePlots(data=result.power)
#'
designSampleSizePlots = function(data, isPlotly = FALSE) {
if (length(unique(data$numSample)) > 1) {
index = "numSample"
} else if (length(unique(data$power)) > 1) {
index = "power"
} else if (length(unique(data$numSample)) == 1 & length(unique(data$power)) == 1) {
index = "numSample"
} else {
stop ("Invalid input")
}
if(isPlotly) {
axis.size = 10
lab.size = 12
text.size = 12
} else {
text.size = 1.2
axis.size = 1.3
lab.size = 1.7
}
if (index == "numSample") {
if(isPlotly) {
p <- plot_ly(data, x = ~desiredFC, y = ~numSample, type = 'scatter', mode = 'lines',
line = list(width = 2))
p <- layout(p, xaxis = list(
title = "Desired fold change",
tickvals = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
ticktext = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
tickfont = list(size = axis.size)
),
yaxis = list(
title = "Minimal number of biological replicates",
tickfont = list(size = axis.size)
),
annotations = list(
list(
x = 1, y = 1, xref = 'paper', yref = 'paper',
text = sprintf("FDR is %s<br>Statistical power is %s",
paste(unique(data$FDR), collapse = ", "),
paste(unique(data$power), collapse = ", ")),
showarrow = FALSE,
xanchor = 'right', yanchor = 'top',
font = list(size = text.size)
)
),
margin = list(t = 50)) # Adjust top margin to avoid cutting off text
return(p)
}
else {
plot(data$desiredFC, data$numSample,
lwd=2, xlab="", ylab="",
cex.axis=axis.size, type="l", xaxt="n")
axis(1, at=seq(min(data$desiredFC), max(data$desiredFC), 0.05),
labels=seq(min(data$desiredFC), max(data$desiredFC), 0.05),
cex.axis=axis.size)
mtext("Desired fold change", 1, line=3.5, cex=lab.size)
mtext("Minimal number of biological replicates", 2, line=2.5, cex=lab.size)
legend("topright",
c(paste("FDR is", unique(data$FDR)),
paste("Statistical power is", unique(data$power))),
bty="n",
cex=text.size)
}
}
if (index == "power") {
if(isPlotly) {
p <- plot_ly(data, x = ~desiredFC, y = ~power, type = 'scatter', mode = 'lines',
line = list(width = 2))
p <- layout(p, xaxis = list(
title = "Desired fold change",
tickvals = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
ticktext = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
tickfont = list(size = axis.size)
),
yaxis = list(
title = "Power",
tickfont = list(size = axis.size)
),
showlegend = FALSE, # Hide default legend
annotations = list(
list(
x = 0.5, y = 0.5, xref = 'paper', yref = 'paper',
text = paste("Number of replicates is", unique(data$numSample), "<br>FDR is", unique(data$FDR)),
showarrow = FALSE,
xanchor = 'right', yanchor = 'bottom',
font = list(size = text.size)
)
),
margin = list(b = 100)) # Adjust bottom margin if necessary
return(p)
} else {
plot(data$desiredFC, data$power,
lwd=2, xlab="", ylab="",
cex.axis=axis.size, type="l", xaxt="n")
axis(1, at=seq(min(data$desiredFC), max(data$desiredFC), 0.05),
labels=seq(min(data$desiredFC), max(data$desiredFC), 0.05),
cex.axis=axis.size)
mtext("Desired fold change", 1, line=3.5, cex=lab.size)
mtext("Power", 2, line=2.5, cex=lab.size)
legend("bottomright",
c(paste("Number of replicates is", unique(data$numSample)),
paste("FDR is", unique(data$FDR))),
bty="n",
cex=text.size)
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.