## ---- SCP Variance Analysis ----
##' @name ScpModel-VarianceAnalysis
##'
##' @title Analysis of variance for single-cell proteomics
##'
##' @description
##'
##' Analysis of variance investigates the contribution of each effects
##' in capturing the variance in the data.
##'
##' @section Running the variance analysis:
##'
##' `scpVarianceAnalysis()` computes the amount of data (measured as
##' the sums of squares) that is captured by each model variable, but
##' also that is not modelled and hence captured in the residuals. The
##' proportion of variance explained by each effect is the sums of
##' squares for that effect divided by the sum of all sums of squares
##' for each effect and residuals. This is computed for each feature
##' separately. The function returns a list of `DataFrame`s with one
##' table for each effect.
##'
##' `scpVarianceAggregate()` combines the analysis of variance results
##' for groups of features. This is useful, for example, to
##' return protein-level results when data is modelled at the peptide
##' level. The function takes the list of tables generated by
##' `scpVarianceAnalysis()` and returns a new list of `DataFrame`s
##' with aggregated results.
##'
##' @section Exploring variance analysis results:
##'
##' [scpAnnotateResults()] adds annotations to the component
##' analysis results. The annotations are added to all elements of the
##' list returned by `scpComponentAnalysis()`. See the associated man
##' page for more information.
##'
##' `scpVariancePlot()` takes the list of tables generated by
##' `scpVarianceAnalysis()` and returns a `ggplot2` bar plot. The
##' bar plot shows the proportion of explained variance by each effect
##' and the residual variance. By default, the function will combine
##' the results over all features, showing the effect's contributions
##' on the complete data set. When `combine = FALSE`, the results
##' are shown for individual features, with additional arguments to
##' control how many and which features are shown. Bars can also be
##' grouped by `fcol`. This is particularly useful when exploring
##' peptide level results, but grouping peptides that belong to the
##' same protein (note that you should not use `scpVarianceAggregate()`
##' in that case).
##'
##' @seealso
##'
##' - [ScpModel-Workflow] to run a model on SCP data upstream of
##' analysis of variance.
##' - [scpAnnotateResults()] to annotate analysis of variance results.
##'
##' @author Christophe Vanderaa, Laurent Gatto
##'
##' @example inst/examples/examples_ScpModel-VarianceAnalysis.R
##'
NULL
## ---- Analysis functions ----
##' @name ScpModel-VarianceAnalysis
##'
##' @param object An object that inherits from the
##' `SingleCellExperiment` class. It must contain an estimated
##' `ScpModel` in its metadata.
##'
##' @param name A `character(1)` providing the name to use to retrieve
##' the model results. When retrieving a model and `name` is
##' missing, the name of the first model found in `object` is used.
##'
##' @export
scpVarianceAnalysis <- function(object, name) {
ss <- .computeSumsOfSquares(object, name)
denom <- .explainedVarianceDenominator(ss)
df <- scpModelDf(object, name)
out <- lapply(colnames(ss[, -1]), function (i) {
DataFrame(
feature = rownames(ss),
SS = ss[, i],
df = df[rownames(ss)],
percentExplainedVar = ss[, i] / denom * 100
)
})
names(out) <- colnames(ss[, -1])
List(out)
}
## Internal function that compute the sums of squares for each effect
## in a model, as well as the residual SS and the toal SS. The results
## are returned in a table with the different SS as columns and
## features as rows
##
## @param object A SummarizedExperiment object containing an estimated
## ScpModel object in the metadata.
## @param name A `character(1)` providing the name to use to retrieve
## the ScpModel object in the metadata. When missing, the name of
## the first model found in `object` is used.
.computeSumsOfSquares <- function(object,
name) {
name <- .checkModelName(object, name)
SStotal <- .computeTotalSS(object, name)
SSresiduals <- .computeResidualSS(object, name)
SSmodel <- .computeEffectSS(object, name)
cbind(SStotal, Residuals = SSresiduals, SSmodel)
}
## Internal function that computes the total SS for a model by
## computing the Euclidean norm of the input data after intercept
## removal. Missing values are ignored.
##
## @param object A SummarizedExperiment object containing an estimated
## ScpModel object in the metadata.
## @param name A `character(1)` providing the name to use to retrieve
## the ScpModel object in the metadata. When missing, the name of
## the first model found in `object` is used.
.computeTotalSS <- function(object, name) {
Y <- scpModelInput(object, name)
intercept <- scpModelIntercept(object, name)
Y <- sweep(Y, STATS = intercept, MARGIN = 1, FUN = "-")
rowSums(Y^2, na.rm = TRUE)
}
## Internal function that computes the residual SS for a model by
## computing the Euclidean norm of the residuals. Missing values are
## ignored.
##
## @param object A SummarizedExperiment object containing an estimated
## ScpModel object in the metadata.
## @param name A `character(1)` providing the name to use to retrieve
## the ScpModel object in the metadata. When missing, the name of
## the first model found in `object` is used.
.computeResidualSS <- function(object, name) {
R <- scpModelResiduals(object, name, join = TRUE)
rowSums(R^2, na.rm = TRUE)
}
## Internal function that computes the SS captured by each model
## variable for a model by computing the Euclidean norm of each
## effect matrix. Missing values are ignored.
##
## @param object A SummarizedExperiment object containing an estimated
## ScpModel object in the metadata.
## @param name A `character(1)` providing the name to use to retrieve
## the ScpModel object in the metadata. When missing, the name of
## the first model found in `object` is used.
.computeEffectSS <- function(object, name) {
sapply(scpModelEffects(object, name), function(x){
rowSums(x^2, na.rm = TRUE)
})
}
## Internal function that decides how the denominator is computed when
## computing the proportion of explained variance. When
## useTotalSS = TRUE, equation 24 in Thiel et al. 2017 is used. When
## useTotalSS = FALSE, equation 26 in Thiel et al. 2017 is used.
.explainedVarianceDenominator <- function(ss, useTotalSS = FALSE) {
if (useTotalSS) {
denom <- ss[, "SStotal"]
} else {
denom <- rowSums(ss[, colnames(ss) != "SStotal"])
}
denom
}
## ---- Aggregation functions ----
##' @name ScpModel-VarianceAnalysis
##'
##' @param varianceList A list of tables returned by
##' `scpVarianceAnalysis()`.
##'
##' @param fcol A `character(1)` indicating the column to use for
##' grouping features. Typically, this would be protein or gene
##' names for grouping proteins.
##'
##' @export
scpVarianceAggregate <- function(varianceList, fcol) {
stopifnot(fcol %in% colnames(varianceList[[1]]))
endoapply(varianceList, function(x) {
x <- x[!is.na(x[[fcol]]), ]
out <- reduceDataFrame(x, x[[fcol]], count = TRUE)
out$SS <- sapply(out$SS, sum, na.rm = TRUE)
out$df <- sapply(out$df, sum, na.rm = TRUE)
out$percentExplainedVar <-
sapply(out$percentExplainedVar, mean, na.rm = TRUE)
out$feature <- out[[fcol]]
out
})
}
## ---- Plotting functions ----
##' @name ScpModel-VarianceAnalysis
##'
##' @param effect A `character(1)` used to filter theb results. It
##' indicates which effect should be considered when sorting the
##' results.
##'
##' @param by A `character(1)` used to filter the results. It
##' indicates which variable should be considered when sorting the
##' results. Can be one of: "SS", "df", or "percentExplainedVar".
##'
##' @param top A `numeric(1)` used to filter the results. It indicates how
##' many features should be plotted. When `top = Inf` (default),
##' all feature are considered.
##'
##' @param decreasing A `logical(1)` indicating whether the effects
##' should be ordered decreasingly (`TRUE`, default) or
##' increasingly (`FALSE`) depending on the value provided by
##' `by`.
##'
##' @param combined A `logical(1)` indicating whether the results
##' should be combined across all features. When `TRUE`, the
##' barplot shows the explained variance for the complete dataset.
##'
##' @param colourSeed A `integer(1)` providing a seed that is used
##' when randomly sampling colours for the effects. Change the
##' number to generate another colour scheme.
##'
##' @import ggplot2
##' @export
scpVariancePlot <- function(varianceList,
effect = "Residuals",
by = "percentExplainedVar",
top = Inf,
decreasing = TRUE,
combined = TRUE,
fcol = NULL,
colourSeed = 1234) {
varianceTable <- .gatherVarianceData(varianceList)
varianceTable <- .filterVarianceData(
varianceTable, top, by, effect, decreasing
)
levs <- unique(varianceTable$effectName)
levs <- c("Residuals", levs[levs != "Residuals"])
varianceTable$effectName <- factor(
varianceTable$effectName, levels = levs
)
set.seed(colourSeed)
pl <- if (combined) {
.plotExplainedVarianceCombined(varianceTable)
} else {
.plotExplainedVarianceByFeature(varianceTable, fcol)
}
pl <- pl + labs(x = "", y = "Explained variance")
pl <- .prettifyVariancePlot(pl)
pl
}
## Internal function that combines a list of variance tables into a data
## table compatible for ggplot. The effect name is added to enable
## stratification of the effects when plotting.
##
## @param varianceList The output of scpVarianceAnalysis(), i.e. a
## list of DataFrame.
##
.gatherVarianceData <- function(varianceList) {
out <- lapply(names(varianceList), function(effect) {
varianceTable <- varianceList[[effect]]
varianceTable$effectName <- effect
varianceTable
})
do.call(rbind, out)
}
## Internal function that filters the variance results. The filtering
## requires:
## - top: how many features to keep. The numbers of rows of the output
## is top times the number of effects in the table.
## - by: based on which metric should the features be sorted
## - effect: on which effect should the filtering be focused
## - decreasing: are the top features sorted from high to low (TRUE)
## or from low to high (FALSE).
##
## The functions returns a row-subset of the input DataFrame. Note
## that the features are encoded as a factor where the levels are sorted
## depending on the filtering criteria so that the remaining features
## are correctly sorted when plotting using ggplot.
##
## @param top A `numeric(1)` used to filter the results. It indicates how
## many features should be plotted. When `top = Inf` (default),
## all feature are considered.
## @param by A `character(1)` used to filter the results. It
## indicates which variable should be considered when sorting the
## results. Can be one of: "SS", "df", or "percentExplainedVar".
## @param effect A `character(1)` used to filter theb results. It
## indicates which effect should be considered when sorting the
## results.
## @param decreasing A `logical(1)` indicating whether the effects
## should be ordered decreasingly (`TRUE`, default) or
## increasingly (`FALSE`) depending on the value provided by
## `by`.
##
.filterVarianceData <- function(varianceTable, top, by, effect,
decreasing) {
stopifnot(by %in% colnames(varianceTable))
stopifnot(effect %in% varianceTable$effectName)
if (top > nrow(varianceTable)) top <- nrow(varianceTable)
isEffect <- varianceTable[["effectName"]] == effect
criteria <- varianceTable[[by]][isEffect]
topIndex <- order(criteria, decreasing = decreasing)[seq_len(top)]
topFeatures <- varianceTable$feature[topIndex]
varianceTable <- varianceTable[varianceTable$feature %in% topFeatures, ]
varianceTable$feature <- factor(
varianceTable$feature,
levels = topFeatures
)
varianceTable
}
## Internal function that creates a ggplot object that creates stacked
## bar plots for each individual feature. The plots depict the explained
## variance (that is the percentExplainedVar) from gathered
## variance analysis results (cf `.gatherVarianceData()` and
## `.filterVarianceData()`). The function allows to group features
## (through facetting) based on a fcol column.
##
## @param varianceTable A DataFrame as generated by
## `.gatherVarianceData()` and `.filterVarianceData()`.
## @param fcol A `character(1)` indicating the column to use for
## grouping features. Typically, this would be protein or gene
## names for grouping proteins.
.plotExplainedVarianceByFeature <- function(varianceTable,
fcol = NULL) {
if (!is.null(fcol)) varianceTable$fcol <- varianceTable[[fcol]]
pl <- ggplot(as.data.frame(varianceTable)) +
aes(
fill = .data$effectName,
y = .data$percentExplainedVar,
x = .data$feature
) +
geom_bar(
colour = "black",
stat = "identity"
)
if (!is.null(fcol)){
pl <- pl + facet_grid(~ fcol, scales = "free_x", space = "free")
}
pl
}
## Internal function that creates a ggplot object that creates a single
## stacked bar plot of the explained variance (that is the
## percentExplainedVar), combining the data from all features in the
## table. The table is obtained after gathering the
## variance analysis results (cf `.gatherVarianceData()` and
## `.filterVarianceData()`). The function combines data from multiple
## features by taking the average percentage explained variance across
## features for each effect.
##
## @param varianceTable A DataFrame as generated by
## `.gatherVarianceData()` and `.filterVarianceData()`.
##
.plotExplainedVarianceCombined <- function(varianceTable) {
varianceTable <- split(data.frame(varianceTable), varianceTable$effectName)
varianceTable <- lapply(varianceTable, function(x) {
data.frame(
effectName = unique(x$effectName),
percentExplainedVar = mean(x$percentExplainedVar)
)
})
varianceTable <- do.call(rbind, varianceTable)
ggplot(varianceTable) +
aes(fill = .data$effectName,
y = .data$percentExplainedVar,
x = 1) +
geom_bar(colour = "black", stat = "identity")
}
## Internal function to improve aesthetic of default ggplot plots,
## that is:
## 1. Provide a set of colour blind-friendly colors using
## `RColorBrewer`. Note that only 8 colors are available, so if
## more effects than colors, the colors are recycled.
## 2. Use ggplot's classic theme
## 3. Adapt x-axis annotations. When the plot is a variance plot for
## the full data, the axis annotation is removed because not
## meaningfull. When plotting individual peptides, the annotations
## are rotated for improved readability.
##
## @param pl A ggplot object as created by
## .plotExplainedVarianceByFeature() or
## .plotExplainedVarianceCombined().
##
##' @import RColorBrewer
.prettifyVariancePlot <- function(pl) {
cols <- brewer.pal(n = 8, name = "Set2")
ncols <- length(unique(pl$data$effectName)) - 1
cols <- cols[sample(seq_len(ncols) %% length(cols) + 1)]
pl <- pl +
theme_classic() +
theme(
axis.text.x = element_text(
angle = 90, hjust = 1, vjust = 0.5
),
axis.ticks.x = element_blank(),
axis.line.x = element_blank()
) +
scale_fill_manual(
values = c("white", cols),
name = "Effect name"
)
if (identical(pl$mapping$x, 1))
pl <- pl + theme(axis.text.x = element_blank())
pl
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.