# ANALYSIS FUNCTIONS ------------------------------------------------------
#' Relative Log Expression Plot
#'
#' Takes two covariates, i.e., group and batch, and computes the RLE-plot over
#' the grouping of the first covariate, colored by the second covariate.
#' Effectively illustrating the relative expression between samples from
#' different batches within the respective study groups. Other covariates can be
#' chosen as input and the function will check for factors and convert if
#' necessary. Categorical factors, e.g., group membership, sex and batch,
#' produce the best result.
#'
#' The function returns either a plot-frame or the finished ggplot object. Input
#' is an MbecData-object. If cumulative log-ratio (clr) and total sum-scaled
#' (tss) abundance matrices are part of the input, i.e., 'mbecTransform()' was
#' used, they can be selected as input by using the 'type' argument with either
#' "otu", "clr" or "tss". If batch effect corrected matrices are available, they
#' can be used by specifying the 'type' argument as "cor" and using the 'label'
#' argument to select the appropriate matrix by its denominator, e.g., for batch
#' correction method ComBat this would be "bat", for RemoveBatchEffects from the
#' limma package this is "rbe". Default correction method-labels are "ruv3",
#' "bmc","bat","rbe","pn","svd".
#'
#' The combination of 'type' and 'label' argument basically accesses the
#' attribute 'cor', a list that stores all matrices of corrected counts.
#' This list can also be accessed via getter and setter methods. Hence, the user
#' can supply their own matrices with own names.
#'
#' @keywords RLE relative log expression
#' @param input.obj MbecData-object
#' @param model.vars two covariates of interest to select by. First relates to
#' 'batch' and the second to relevant grouping.
#' @param type Which abundance matrix to use for the calculation.
#' @param label Which corrected abundance matrix to use for analysis.
#' @param return.data logical if TRUE returns the data.frame required for
#' plotting. Default (FALSE) will return plot object.
#' @return Either a ggplot2 object or a formatted data-frame to plot from.
#' @export
#'
#' @examples
#' # This will return the data.frame for plotting.
#' data.RLE <- mbecRLE(input.obj=dummy.mbec, type="clr",
#' model.vars=c('group','batch'), return.data=TRUE)
#'
#' # This will return the ggplot2 object for display, saving and modification.
#' plot.RLE <- mbecRLE(input.obj=dummy.mbec, model.vars=c('group','batch'),
#' type="clr", return.data=FALSE)
mbecRLE <- function(input.obj, model.vars = c("batch","group"), type="clr",
label=character(), return.data = FALSE) {
tmp <- mbecGetData(input.obj=input.obj, orientation="fxs",
required.col=eval(model.vars), type=eval(type),
label=label)
tmp.cnts <- tmp[[1]]
tmp.meta <- tmp[[2]] %>% tibble::rownames_to_column(., var="specimen")
tmp.long <- NULL
for(g.idx in unique(tmp.meta[, eval(model.vars[2])])) {
message("Calculating RLE for group: ", g.idx)
tmp.cnts.group <-
dplyr::select(
tmp.cnts,tmp.meta$specimen[tmp.meta[,eval(model.vars[2])] %in% g.idx])
feature.med = apply(tmp.cnts.group, 1, stats::median)
tmp.group.long <- apply(tmp.cnts.group, 2,
function(sample.col) sample.col - feature.med) %>%
as.data.frame() %>%
tidyr::pivot_longer(cols = everything(),
names_to = "specimen", values_to = "values")
tmp.long <- rbind.data.frame(tmp.long, tmp.group.long)
}
tmp.long <- dplyr::left_join(tmp.long, tmp.meta,
by = "specimen") %>%
dplyr::mutate(plot.order = paste(get(model.vars[2]),
get(model.vars[1]), sep = "_")) %>%
dplyr::arrange(plot.order) %>%
dplyr::mutate(specimen = factor(specimen, levels = unique(specimen)))
if (return.data) {
return(tmp.long)
}
return(mbecRLEPlot(tmp.long, model.vars))
}
#' Principal Component Analysis Plot
#'
#' Takes two covariates, i.e., group and batch, and computes the ordination-plot
#' for user-selected principal components. Covariates determine sample-shape and
#' color and can be switched to shift the emphasis on either group. In addition
#' to the ordination-plot, the function will show the distribution of
#' eigenvalues (colored by the second covariate) on their respective principal
#' components.
#'
#' The function returns either a plot-frame or the finished ggplot object. Input
#' is an MbecData-object. If cumulative log-ratio (clr) and total sum-scaled
#' (tss) abundance matrices are part of the input, i.e., 'mbecTransform()' was
#' used, they can be selected as input by using the 'type' argument with either
#' "otu", "clr" or "tss". If batch effect corrected matrices are available, they
#' can be used by specifying the 'type' argument as "cor" and using the 'label'
#' argument to select the appropriate matrix by its denominator, e.g., for batch
#' correction method ComBat this would be "bat", for RemoveBatchEffects from the
#' limma package this is "rbe". Default correction method-labels are "ruv3",
#' "bmc","bat","rbe","pn","svd".
#'
#' The combination of 'type' and 'label' argument basically accesses the
#' attribute 'cor', a list that stores all matrices of corrected counts.
#' This list can also be accessed via getter and setter methods. Hence, the user
#' can supply their own matrices with own names.
#'
#' @keywords PCA principal component analysis
#' @param input.obj list(cnts, meta), phyloseq, MbecData object (correct
#' orientation is handled internally)
#' @param model.vars two covariates of interest to select by first variable
#' selects color (batch) and second one determines shape (group)
#' @param pca.axes numeric vector which axes to plot, first is X and second is Y
#' @param type Which abundance matrix to use for the calculation.
#' @param label Which corrected abundance matrix to use for analysis.
#' @param return.data logical if TRUE returns the data.frame required for
#' plotting. Default (FALSE) will return plot object.
#' @return either a ggplot2 object or a formatted data-frame to plot from
#' @export
#' @include mbecs_classes.R
#'
#' @examples
#' # This will return the data.frame for plotting.
#' data.PCA <- mbecPCA(input.obj=dummy.mbec,
#' model.vars=c('group','batch'), pca.axes=c(1,2), return.data=TRUE)
#'
#' # This will return the ggplot2 object for display, saving and modification.
#' # Selected PCs are PC3 on x-axis and PC2 on y-axis.
#' plot.PCA <- mbecPCA(input.obj=dummy.mbec,
#' model.vars=c('group','batch'), pca.axes=c(3,2), return.data=FALSE)
setGeneric("mbecPCA", signature = "input.obj",
function(input.obj, model.vars = c("batch", "group"),
pca.axes = c(1, 2), type="clr", label=character(),
return.data = FALSE) standardGeneric("mbecPCA"))
## In this form it works for 'phyloseq' and 'mbecData' objects
.mbecPCA <- function(input.obj, model.vars=c("batch", "group"),
pca.axes=c(1,2), type="clr", label=character(),
return.data=FALSE) {
tmp <- mbecGetData(input.obj=input.obj, orientation = "sxf",
required.col=eval(model.vars), type=eval(type),
label=label)
tmp.cnts <- tmp[[1]]; tmp.meta <- tmp[[2]]
# calculate IQR and sort counts in decreasing order
iqr <- apply(tmp.cnts, 2, stats::IQR)
tmp.cnts <- tmp.cnts[, order(iqr, decreasing = TRUE)]
PCA <- stats::prcomp(tmp.cnts, scale = FALSE)
axes.number <- dim(PCA$x)[2]
axes.names <- paste("PC", seq_len(axes.number), sep = "")
plot.df <- PCA$x %>%
data.frame(stringsAsFactors = FALSE) %>%
dplyr::rename_at(seq_len(axes.number), ~axes.names) %>%
tibble::rownames_to_column(var = "sID") %>%
dplyr::left_join(tmp.meta, by = "sID")
metric.df <- data.frame(var.explained=round((100 * PCA$sdev^2)/
(sum(PCA$x^2/max(1,nrow(PCA$x) - 1))), 2),
row.names = axes.names) %>%
dplyr::mutate(axis.min = floor(apply(PCA$x, 2, function(col) min(col)))) %>%
dplyr::mutate(axis.max = ceiling(apply(PCA$x, 2, function(col) max(col))))
for (idx in seq_along(model.vars) ) {
if (!is.factor(plot.df[, eval(model.vars[idx])])) {
warning("Grouping variables need to be factors.
Coercing to factor now, adjust beforehand to get best results.")
plot.df[,eval(model.vars[idx])] <- factor(plot.df[,eval(model.vars[idx])])
}
}
# No plotting, just return data.
if (return.data) {
return(list(plot.df, metric.df, pca.axes))
}
return(mbecPCAPlot(plot.df, metric.df, model.vars, pca.axes))
}
#' Principal Component Analysis Plot for MbecData
#'
#' Takes two covariates, i.e., group and batch, and computes the ordination-plot
#' for user-selected principal components. Covariates determine sample-shape and
#' color and can be switched to shift the emphasis on either group. In addition
#' to the ordination-plot, the function will show the distribution of
#' eigenvalues (colored by the second covariate) on their respective principal
#' components.
#'
#' The function returns either a plot-frame or the finished ggplot object. Input
#' is an MbecData-object. If cumulative log-ratio (clr) and total sum-scaled
#' (tss) abundance matrices are part of the input, i.e., 'mbecTransform()' was
#' used, they can be selected as input by using the 'type' argument with either
#' "otu", "clr" or "tss". If batch effect corrected matrices are available, they
#' can be used by specifying the 'type' argument as "cor" and using the 'label'
#' argument to select the appropriate matrix by its denominator, e.g., for batch
#' correction method ComBat this would be "bat", for RemoveBatchEffects from the
#' limma package this is "rbe". Default correction method-labels are "ruv3",
#' "bmc","bat","rbe","pn","svd".
#'
#' The combination of 'type' and 'label' argument basically accesses the
#' attribute 'cor', a list that stores all matrices of corrected counts.
#' This list can also be accessed via getter and setter methods. Hence, the user
#' can supply their own matrices with own names.
#'
#' @keywords PCA principal component analysis
#' @param input.obj MbecData object
#' @param model.vars two covariates of interest to select by first variable
#' selects color (batch) and second one determines shape (group).
#' @param pca.axes numeric vector which axes to plot, first is X and second is Y
#' @param type Which abundance matrix to use for the calculation.
#' @param label Which corrected abundance matrix to use for analysis.
#' @param return.data logical if TRUE returns the data.frame required for
#' plotting. Default (FALSE) will return plot object.
#' @return either a ggplot2 object or a formatted data-frame to plot from
#' @export
#' @include mbecs_classes.R
#'
#' @examples
#' # This will return the data.frame for plotting.
#' data.PCA <- mbecPCA(input.obj=dummy.mbec,
#' model.vars=c('group','batch'), pca.axes=c(1,2), return.data=TRUE)
#'
#' # This will return the ggplot2 object for display, saving and modification.
#' # Selected PCs are PC3 on x-axis and PC2 on y-axis.
#' plot.PCA <- mbecPCA(input.obj=dummy.mbec,
#' model.vars=c('group','batch'), pca.axes=c(3,2), return.data=FALSE)
setMethod("mbecPCA", "MbecData", function(input.obj,
model.vars = c("batch", "group"),
pca.axes = c(1, 2), type="clr",
label=character(),
return.data=FALSE) {
.mbecPCA(input.obj, model.vars=model.vars, pca.axes=pca.axes, type=type,
label=label, return.data=return.data)
})
#' Feature Differential Abundance Box-Plot
#'
#' Displays the abundance of a selected feature, grouped/colored by a covariate,
#' i.e., batch, in a box-plot. Includes the density-plot, i.e., the distribution
#' of counts for each sub-group. Selection methods for features are "TOP" and
#' "ALL" which select the top-n or all features respectively. The default value
#' for the argument 'n' is 10. If 'n' is supplied with a vector of feature
#' names, e.g., c("OTU1","OTU5", "OTU10"), of arbitrary length, the argument
#' 'method' will be ignored and only the given features selected for plotting.
#'
#' The function returns either a plot-frame or the finished ggplot object. Input
#' is an MbecData-object. If cumulative log-ratio (clr) and total sum-scaled
#' (tss) abundance matrices are part of the input, i.e., 'mbecTransform()' was
#' used, they can be selected as input by using the 'type' argument with either
#' "otu", "clr" or "tss". If batch effect corrected matrices are available, they
#' can be used by specifying the 'type' argument as "cor" and using the 'label'
#' argument to select the appropriate matrix by its denominator, e.g., for batch
#' correction method ComBat this would be "bat", for RemoveBatchEffects from the
#' limma package this is "rbe". Default correction method-labels are "ruv3",
#' "bmc","bat","rbe","pn","svd".
#'
#' The combination of 'type' and 'label' argument basically accesses the
#' attribute 'cor', a list that stores all matrices of corrected counts.
#' This list can also be accessed via getter and setter methods. Hence, the user
#' can supply their own matrices with own names.
#'
#' @keywords Box abundance density
#' @param input.obj MbecData object
#' @param method One of 'ALL' or 'TOP' for 'n' most variable features, DEFAULT
#' is 'ALL'.
#' @param n Number of OTUs to display for 'TOP' method, or vector of specific
#' feature names to select.
#' @param model.var Covariate to group by, default is "batch".
#' @param type Which abundance matrix to use for the calculation.
#' @param label Which corrected abundance matrix to use for analysis.
#' @param return.data logical if TRUE returns the data.frame required for
#' plotting. Default (FALSE) will return plot object.
#' @return either a ggplot2 object or a formatted data-frame to plot from
#' @export
#' @include mbecs_classes.R
#'
#' @examples
#' # This will return the plot-frame of all features in the data-set.
#' data.Box <- mbecBox(input.obj=dummy.mbec, method='ALL', model.var='batch',
#' type='clr', return.data=TRUE)
#'
#' # This will return the ggplot2 object of the top 5 most variable features.
#' plot.Box <- mbecBox(input.obj=dummy.mbec, method='TOP', n=5,
#' model.var='batch', type='otu', return.data=FALSE)
mbecBox <- function(input.obj, method = c("ALL", "TOP"), n = 10,
model.var = "batch", type="clr", label=character(),
return.data = FALSE) {
input.obj <- mbecProcessInput(input.obj = input.obj, required.col = model.var)
# needs sxf orientation
tmp <- mbecGetData(input.obj = input.obj, orientation = "sxf",
required.col = eval(model.var), type=eval(type),
label=label)
tmp[[2]] <- tibble::rownames_to_column(tmp[[2]], var = "specimen")
otu.idx <- colnames(tmp[[1]])
tmp <- tmp[[1]] %>%
tibble::rownames_to_column(var = "specimen") %>%
dplyr::left_join(tmp[[2]], by = c(specimen = "specimen"))
if (method[1] == "TOP") {
iqr <- apply(tmp[, otu.idx], 2, stats::IQR)
iqr <- iqr[order(iqr, decreasing = TRUE)]
otu.idx <- names(iqr)[seq_len(min(length(otu.idx), n))]
tmp <- tmp %>%
dplyr::select(c(dplyr::all_of(otu.idx), "specimen", eval(model.var)))
} else if (length(method) >= 2) {
message("'Method' parameter contains multiple elements -
using to select features.")
otu.idx <- method
tmp <- tmp %>%
dplyr::select(c(dplyr::all_of(otu.idx), "specimen", eval(model.var)))
} # else is 'select-all-mode'
if (return.data) {
return(list(tmp, otu.idx))
}
return(mbecBoxPlot(tmp, otu.idx, model.var))
}
#' Feature Differential Abundance Heatmap
#'
#' Shows the abundance value of selected features in a heatmap. By default, the
#' function expects two covariates group and batch to depict clustering in these
#' groups. More covariates can be included. Selection methods for features are
#' "TOP" and "ALL" which select the top-n or all features respectively. The
#' default value for the argument 'n' is 10. If 'n' is supplied with a vector
#' of feature names, e.g., c("OTU1","OTU5", "OTU10"), of arbitrary length, the
#' argument method' will be ignored and only the given features selected for
#' plotting.
#'
#' The function returns either a plot-frame or the finished ggplot object. Input
#' is an MbecData-object. If cumulative log-ratio (clr) and total sum-scaled
#' (tss) abundance matrices are part of the input, i.e., 'mbecTransform()' was
#' used, they can be selected as input by using the 'type' argument with either
#' "otu", "clr" or "tss". If batch effect corrected matrices are available, they
#' can be used by specifying the 'type' argument as "cor" and using the 'label'
#' argument to select the appropriate matrix by its denominator, e.g., for batch
#' correction method ComBat this would be "bat", for RemoveBatchEffects from the
#' limma package this is "rbe". Default correction method-labels are "ruv3",
#' "bmc","bat","rbe","pn","svd".
#'
#' The combination of 'type' and 'label' argument basically accesses the
#' attribute 'cor', a list that stores all matrices of corrected counts.
#' This list can also be accessed via getter and setter methods. Hence, the user
#' can supply their own matrices with own names.
#'
#' @keywords Heat abundance clustering
#' @param input.obj MbecData object
#' @param model.vars Covariates of interest to show in heatmap.
#' @param center Flag to activate centering, DEFAULT is TRUE.
#' @param scale Flag to activate scaling, DEFAULT is TRUE.
#' @param method One of 'ALL' or 'TOP' or a vector of feature names.
#' @param n Number of features to select in method TOP.
#' @param type Which abundance matrix to use for the calculation.
#' @param label Which corrected abundance matrix to use for analysis.
#' @param return.data Logical if TRUE returns the data.frame required for
#' plotting. Default (FALSE) will return plot object.
#' @return either a ggplot2 object or a formatted data-frame to plot from
#' @export
#' @include mbecs_classes.R
#'
#' @examples
#' # This will return the plot-frame of all features in the data-set.
#' data.Heat <- mbecHeat(input.obj=dummy.mbec, model.vars=c('group','batch'),
#' center=TRUE, scale=TRUE, method='ALL', return.data=TRUE)
#'
#' # This will return the ggplot2 object of the top 5 most variable features.
#' plot.Heat <- mbecHeat(input.obj=dummy.mbec, model.vars=c('group','batch'),
#' center=TRUE, scale=TRUE, method='TOP', n=5, return.data=FALSE)
mbecHeat <- function(input.obj, model.vars = c("batch", "group"), center = TRUE,
scale = TRUE, method = "TOP", n = 10, type="clr",
label=character(), return.data = FALSE) {
cols <- pals::tableau20(20)[c(1, 3, 5, 7, 9, 11, 13, 15, 17, 19)]
tmp <- mbecGetData(input.obj=input.obj, orientation="sxf",
required.col=eval(model.vars), type=eval(type),
label=label)
tmp.cnts <- tmp[[1]]
tmp.meta <- tmp[[2]]
otu.idx <- colnames(tmp[[1]])
for (g.idx in c(seq_along(model.vars))) {
if (!is.factor(tmp.meta[, eval(model.vars[g.idx])])) {
warning("Grouping variables need to be factors. Coercing variable: ",
eval(model.vars[g.idx]),
" to factor now, adjust beforehand to get best results.")
tmp.meta[, eval(model.vars[g.idx])] <-
factor(tmp.meta[, eval(model.vars[g.idx])])
}
}
tmp.cnts <- base::scale(tmp.cnts, center=eval(center), scale=eval(scale))
tmp.cnts <- base::scale(t(tmp.cnts), center=eval(center), scale=eval(scale))
if (method[1] == "TOP") {
iqr <- apply(tmp.cnts[otu.idx, ], 1, stats::IQR)
iqr <- iqr[order(iqr, decreasing = TRUE)]
otu.idx <- names(iqr)[seq_len(min(length(otu.idx), n))]
tmp.cnts <- tmp.cnts[otu.idx, ]
} else if (length(method) >= 2)
{
message("'Method' parameter contains multiple elements -
using to select features.")
tmp.cnts <- tmp.cnts[method, ]
} # else is 'select-all-mode'
if (return.data) {
return(list(tmp.cnts, tmp.meta))
}
return(mbecHeatPlot(center, scale, tmp.cnts, tmp.meta, model.vars))
}
#' Mosaic Sample Group Allocation
#'
#' Depicts the dispersion of samples over two (preferentially categorical)
#' covariates of interest. Effectively showing, the un-/evenness within and
#' between covariates to inform the choice of methods for the subsequent steps
#' in an analysis.
#'
#' The function returns either a plot-frame or the finished ggplot object. Input
#' for the data-set can be an MbecData-object.
#'
#' @keywords Mosaic sample allocation
#' @param input.obj MbecData object
#' @param model.vars Two covariates of interest to the sample allocation.
#' @param return.data Logical if TRUE returns the data.frame required for
#' plotting. Default (FALSE) will return plot object.
#' @return either a ggplot2 object or a formatted data-frame to plot from
#' @export
#' @include mbecs_classes.R
#'
#' @examples
#' # This will return the plot-df of the samples grouped by group and batch.
#' data.Mosaic <- mbecMosaic(input.obj=dummy.mbec,
#' model.vars=c('group','batch'), return.data=TRUE)
#'
#' # Return the ggplot2 object of the samples grouped by group and batch
#' plot.Mosaic <- mbecMosaic(input.obj=dummy.mbec,
#' model.vars=c('group','batch'), return.data=FALSE)
mbecMosaic <- function(input.obj, model.vars = c("batch", "group"),
return.data = FALSE) {
cols <- pals::tableau20(20)
tmp <- mbecGetData(input.obj, orientation = "sxf",
required.col = eval(model.vars))
tmp.meta <- tmp[[2]]
if (length(model.vars) < 2) {
message("Only one variable specified for Mosaic-plot, two are required!")
} else if (length(model.vars) > 2) {
message("More than 2 variables specified. Mosaic will take the first two.")
}
for (g.idx in eval(model.vars)) {
if (!is.factor(tmp.meta[, eval(g.idx)])) {
warning("Grouping variables need to be factors. Coercing variable: '",
eval(g.idx),
"' to factor now, adjust beforehand to get best results.")
tmp.meta[, eval(g.idx)] <- as.factor(tmp.meta[, eval(g.idx)])
}
}
n.observations <- base::dim(tmp.meta)[1]
study.summary <- base::table(tmp.meta[, eval(model.vars[1])],
tmp.meta[, eval(model.vars[2])]) %>%
as.data.frame() %>%
dplyr::mutate(Freq.scaled = Freq/n.observations)
if (return.data)
{
return(study.summary)
} # else return the plots
return(mbecMosaicPlot(study.summary, model.vars))
}
# VARIANCE CALCULATION ----------------------------------------------------
#' Estimate Explained Variance
#'
#' The function offers a selection of methods/algorithms to estimate the
#' proportion of variance that can be attributed to covariates of interest.
#' This shows, how much variation is explained by the treatment effect, which
#' proportion is introduced by processing in batches and the leftover variance,
#' i.e., residuals that are not currently explained. Covariates of interest
#' (CoI) are selected by the user and the function will incorporate them into
#' the model building for the respective algorithm. The user can select from
#' five different approaches to adapt to the characteristics of the data-set,
#' e.g., LMMs are a better choice than LMs for a very unbalanced study design.
#' Available approaches are: Linear Model (lm), Linear Mixed Model (lmm),
#' Redundancy Analysis (rda), Principal Variance Component Analysis (pvca) or
#' Silhouette Coefficient (s.coef).
#'
#' Linear Model (lm): An additive model of all covariates is fitted to each
#' feature respectively and the proportion of variance is extracted for each
#' covariate (OTU_x ~ covariate_1 + covariate_2 + ...).
#'
#' Linear Mixed Model (lmm): All but the first covariate are considered mixed
#' effects. A model is fitted to each OTU respectively and the proportion of
#' variance extracted for each covariate.
#' (OTU_x ~ covariate_1 + (1|covariate_2) + (1|...)).
#'
#' partial Redundancy Analysis (rda): Iterates over given covariates, builds a
#' model of all covariates that includes one variable as condition/constraint
#' and then fits it to the feature abundance matrix. The difference in explained
#' variance between the full- and the constrained-model is then attributed to
#' the constraint. (cnts ~ group + Condition(batch) vs. cnts ~ group + batch)
#'
#' Principal Variance Component Analysis (pvca): Algorithm - calculate the
#' correlation of the fxs count-matrix - from there extract the eigenvectors and
#' eigenvalues and calculate the proportion of explained variance per
#' eigenvector (i.e. principal component) by dividing the eigenvalues by the
#' sum of eigenvalues. Now select as many PCs as required to fill a chosen
#' quota for the total proportion of explained variance. Iterate over all PCs
#' and fit a linear mixed model that contains all covariates as random effect
#' and all unique interactions between two covariates. Compute variance
#' covariance components form the resulting model --> From there we get the
#' Variance that each covariate(variable) contributes to this particular PC.
#' Then just standardize variance by dividing it through the sum of variance for
#' that model. Scale each PCs results by the proportion this PC accounted for in
#' the first place. And then do it again by dividing it through the total amount
#' of explained variance, i.e. the cutoff to select the number of PCs to take
#' (obviously not the cutoff but rather the actual values for the selected PCs).
#' Finally take the average over each random variable and interaction term and
#' display in a nice plot.
#'
#' Silhouette Coefficient (s.coef): Calculate principal components and get
#' sample-wise distances on the resulting (sxPC) matrix. Then iterate over all
#' the covariates and calculate the cluster silhouette (which is basically
#' either zero, if the cluster contains only a single element, or it is the
#' distance to the closest different cluster minus the distance of the sample
#' within its own cluster divided (scaled) by the maximum distance). Average
#' over each element in a cluster for all clusters and there is the
#' representation of how good the clustering is. This shows how good a
#' particular covariate characterizes the data, i.e., a treatment variable for
#' instance may differentiate the samples into treated and untreated groups
#' which implies two clusters. In an ideal scenario, the treatment variable,
#' i.e., indicator for some biological effect would produce a perfect
#' clustering. In reality, the confounding variables, e.g., batch, sex or age,
#' will also influence the ordination of samples. Hence, the clustering
#' coefficient is somewhat similar to the amount of explained variance metric
#' that the previous methods used. If used to compare an uncorrected data-set to
#' a batch-corrected set, the expected result would be an increase of clustering
#' coefficient for the biological effect (and all other covariates - because a
#' certain amount of uncertainty was removed from the data) and a decrease for
#' the batch effect.
#'
#' The function returns a data-frame for further analysis - the report functions
#' (mbecReport and mbecReportPrelim) will automatically produce plots. Input for
#' the data-set can be an MbecData-object, a phyloseq-object or a list that
#' contains counts and covariate data. The covariate table requires an 'sID'
#' column that contains sample IDs equal to the sample naming in the counts
#' table. Correct orientation of counts will be handled internally.
#'
#' @keywords Model Evaluation Variance
#' @param input.obj MbecData object
#' @param model.vars Vector of covariates to include in model-construction, in
#' case parameter 'model.form' is not supplied.
#' @param method Select method of modeling: Linear Model (lm), Linear Mixed
#' Model (lmm), Redundancy Analysis (rda), Principal Variance Component Analysis
#' (pvca) or Silhouette Coefficient (s.coef).
#' @param model.form string that describes a model formula, i.e.,
#' 'y ~ covariate1 + (1|covariate2)'.
#' @param type Which abundance matrix to use for the calculation.
#' @param label Which corrected abundance matrix to use for analysis.
#' @param no.warning (OPTIONAL) True/False-flag that should turn of singularity
#' warnings, but it doesn't quite work
#' @param na.action (OPTIONAL) set NA handling, will take global option if not
#' supplied
#' @return Data.frame that contains proportions of variance for given covariates
#' in every feature.
#' @include mbecs_classes.R
#' @export
#'
#' @examples
#' # This will return a data-frame that contains the variance attributable to
#' # group and batch according to linear additive model.
#' df.var.lm <- mbecModelVariance(input.obj=dummy.mbec,
#' model.vars=c("batch", "group"), method='lm', type='clr')
#' # This will return a data-frame that contains the variance attributable to
#' # group and batch according to principal variance component analysis.
#' df.var.pvca <- mbecModelVariance(input.obj=dummy.mbec,
#' model.vars=c("batch", "group"), method='pvca')
mbecModelVariance <- function(input.obj, model.vars=character(),
method=c("lm","lmm", "rda", "pvca", "s.coef"),
model.form=NULL, type="clr", label=character(),
no.warning = TRUE, na.action = NULL) {
oldw <- getOption("warn")
if (no.warning) {
options(warn = -1)
}
on.exit(options(warn = oldw))
# handle optional parameters
if (is.null(na.action)) {
na.action = getOption("na.action")
}
method <- match.arg(method, choices = c("lm","lmm", "rda", "pvca", "s.coef"))
type <- match.arg(type, choices = c("otu","clr","tss","ass","cor"))
## PVCA stuff
pct_threshold = 0.5876 # threshold for explained variances
tmp <- mbecGetData(input.obj = input.obj, orientation = "sxf",
required.col=eval(model.vars), type=eval(type),
label=label)
tmp.cnts <- tmp[[1]]; tmp.meta <- tmp[[2]]
## Adjust 'type' variable if selected matrix comes from 'cor' list
if( type == "cor" && length(label) != 0 ) type <- label
## CHECK if grouping variables are factors
for (g.idx in eval(model.vars)) {
if (!is.factor(tmp.meta[, eval(g.idx)])) {
warning("Grouping variables need to be factors. Coercing variable: ",
eval(g.idx),
" to factor now, adjust beforehand to get best results.")
tmp.meta[, eval(g.idx)] <- as.factor(tmp.meta[, eval(g.idx)])
}
}
if (method == "lm") {
res <- mbecModelVarianceLM(model.form,model.vars,tmp.cnts,tmp.meta,type)
return(res)
} else if (method == "lmm") {
res <- mbecModelVarianceLMM(model.form,model.vars,tmp.cnts,tmp.meta,type)
return(res)
} else if (method == "rda") {
res <- mbecModelVarianceRDA(model.vars,tmp.cnts,tmp.meta,type)
return(res)
} else if (method == "pvca") {
res <- mbecModelVariancePVCA(model.vars, tmp.cnts, tmp.meta,
type, pct_threshold, na.action)
return(res)
} else if (method == "s.coef") {
res <- mbecModelVarianceSCOEF(model.vars, tmp.cnts, tmp.meta, type)
return(res)
}
message("Input doesn't apply to any available method. Nothing was done here.")
return(NULL)
}
#' Estimate Explained Variance with Linear Models
#'
#' The function offers a selection of methods/algorithms to estimate the
#' proportion of variance that can be attributed to covariates of interest.
#' This shows, how much variation is explained by the treatment effect, which
#' proportion is introduced by processing in batches and the leftover variance,
#' i.e., residuals that are not currently explained. Covariates of interest
#' (CoI) are selected by the user and the function will incorporate them into
#' the model.
#'
#' Linear Model (lm): An additive model of all covariates is fitted to each
#' feature respectively and the proportion of variance is extracted for each
#' covariate (OTU_x ~ covariate_1 + covariate_2 + ...).
#'
#' @keywords LM Proportion of Variance
#' @param model.form Formula for linear model, function will create simple
#' additive linear model if this argument is not supplied.
#' @param model.vars Covariates to use for model building if argument
#' 'model.form' is not given.
#' @param tmp.cnts Abundance matrix in 'sample x feature' orientation.
#' @param tmp.meta Covariate table that contains at least the used variables.
#' @param type String the denotes data source, i.e., one of "otu","clr" or "tss"
#' for the transformed counts or the label of the batch corrected count-matrix.
#' @return Data.frame that contains proportions of variance for given covariates
#' in a linear modelling approach.
#' @include mbecs_classes.R
mbecModelVarianceLM <- function(model.form, model.vars, tmp.cnts, tmp.meta, type) {
message("Fitting linear model to every feature and extract proportion of
variance explained by covariates.")
if (!is.null(model.form)) {
message("Use provided model formula.")
tmp.formula <- stats::as.formula(model.form)
} else {
message("Construct formula from covariates.")
tmp.formula = stats::as.formula(paste("y", " ~ ", paste(model.vars,
collapse = " + ")))
}
# Add a progress bar.
features <- colnames(tmp.cnts)
lm.pb <- utils::txtProgressBar(min=0, max=length(features), style=3,
width=80, char="=")
model.variances <- NULL
for (x in seq_along(features)) {
y <- tmp.cnts[[eval(x)]]
model.fit <- stats::lm(tmp.formula, data = tmp.meta)
model.variances <- rbind.data.frame(model.variances,
mbecVarianceStats(model.fit))
utils::setTxtProgressBar(lm.pb, x)
}
# close progress bar
close(lm.pb)
res <- dplyr::mutate(model.variances, type = eval(type))
attr(res, "modelType") <- "anova"
return(res)
}
#' Estimate Explained Variance with Linear Mixed Models
#'
#' The function offers a selection of methods/algorithms to estimate the
#' proportion of variance that can be attributed to covariates of interest.
#' This shows, how much variation is explained by the treatment effect, which
#' proportion is introduced by processing in batches and the leftover variance,
#' i.e., residuals that are not currently explained. Covariates of interest
#' (CoI) are selected by the user and the function will incorporate them into
#' the model.
#'
#' Linear Mixed Model (lmm): Only the first covariate is considered a mixed
#' effect. A model is fitted to each OTU respectively and the proportion of
#' variance extracted for each covariate.
#' (OTU_x ~ covariate_2.. + covariate_n + (1|covariate_1)
#'
#' @keywords LMM Proportion of Variance
#' @param model.form Formula for linear mixed model, function will create simple
#' additive linear mixed model if this argument is not supplied.
#' @param model.vars Covariates to use for model building if argument
#' 'model.form' is not given.
#' @param tmp.cnts Abundance matrix in 'sample x feature' orientation.
#' @param tmp.meta Covariate table that contains at least the used variables.
#' @param type String the denotes data source, i.e., one of "otu","clr" or "tss"
#' for the transformed counts or the label of the batch corrected count-matrix.
#' @return Data.frame that contains proportions of variance for given covariates
#' in a linear mixed modelling approach.
#' @include mbecs_classes.R
mbecModelVarianceLMM <- function(model.form, model.vars, tmp.cnts, tmp.meta,
type) {
message("Fitting linear-mixed model to every feature and extract proportion
of variance explained by covariates.")
# FixMe: maybe include the option to adjust this
control=lme4::lmerControl(calc.derivs=TRUE, check.rankX="stop.deficient")
if (!is.null(model.form)) {
message("Use provided model formula.")
tmp.formula <- stats::as.formula(model.form)
} else {
message("Construct formula from covariates.")
f.terms <- paste("(1|", model.vars, ")", sep = "")
tmp.formula <- stats::as.formula(paste(paste("y", paste(model.vars[-1],
collapse = " + "),
sep = " ~ "),
paste(f.terms[1], collapse = " + "),
sep = " + "))
}
# Add a progress bar.
features <- colnames(tmp.cnts)
lmm.pb <- utils::txtProgressBar(min = 0, max = length(features), style = 3,
width = 80, char="=")
model.variances <- NULL
for (x in seq_along(features)) {
y <- tmp.cnts[[eval(x)]] # meh
model.fit <- lme4::lmer(tmp.formula, data = tmp.meta, control = control)
model.variances <- rbind.data.frame(model.variances,
mbecVarianceStats(model.fit))
utils::setTxtProgressBar(lmm.pb, x)
}
close(lmm.pb)
res <- dplyr::mutate(model.variances, type = eval(type))
attr(res, "modelType") <- "linear-mixed model"
return(res)
}
#' Estimate Explained Variance with Redundancy Analysis
#'
#' The function offers a selection of methods/algorithms to estimate the
#' proportion of variance that can be attributed to covariates of interest.
#' This shows, how much variation is explained by the treatment effect, which
#' proportion is introduced by processing in batches and the leftover variance,
#' i.e., residuals that are not currently explained. Covariates of interest
#' (CoI) are selected by the user and the function will incorporate them into
#' the model.
#'
#' partial Redundancy Analysis (rda): Iterates over given covariates, builds a
#' model of all covariates that includes one variable as condition/constraint
#' and then fits it to the feature abundance matrix. The difference in explained
#' variance between the full- and the constrained-model is then attributed to
#' the constraint. (cnts ~ group + Condition(batch) vs. cnts ~ group + batch)
#'
#' @keywords Partial Redundancy Analysis of Variance
#' @param model.vars Covariates to use for model building.
#' @param tmp.cnts Abundance matrix in 'sample x feature' orientation.
#' @param tmp.meta Covariate table that contains at least the used variables.
#' @param type String the denotes data source, i.e., one of "otu","clr" or "tss"
#' for the transformed counts or the label of the batch corrected count-matrix.
#' @return Data.frame that contains proportions of variance for given covariates
#' in a partial redundancy analysis approach.
#' @include mbecs_classes.R
mbecModelVarianceRDA <- function(model.vars, tmp.cnts, tmp.meta, type) {
model.variances = data.frame(matrix(nrow = length(model.vars),
ncol = 1, dimnames = list(model.vars,
eval(type))))
# Add a progress bar.
rda.pb <- utils::txtProgressBar(min = 0, max = length(model.vars), style = 3,
width = 80, char="=")
for (condition.idx in seq_along(model.vars)) {
tmp.formula = stats::as.formula(
paste("tmp.cnts"," ~ ",
paste(model.vars[-eval(condition.idx)],
"+", collapse = " "), " Condition(",
model.vars[eval(condition.idx)],")", sep = ""))
tmp.rda.covariate <- vegan::rda(tmp.formula,
data = tmp.meta)
model.variances[condition.idx,eval(type)] <-
summary(tmp.rda.covariate)$partial.chi *
100/summary(tmp.rda.covariate)$tot.chi
utils::setTxtProgressBar(rda.pb, condition.idx)
}
close(rda.pb)
res <- data.frame(t(model.variances)) %>%
dplyr::mutate(type = eval(type))
attr(res, "modelType") <- "rda"
return(res)
}
#' Estimate Explained Variance with Principal Variance Component Analysis
#'
#' The function offers a selection of methods/algorithms to estimate the
#' proportion of variance that can be attributed to covariates of interest.
#' This shows, how much variation is explained by the treatment effect, which
#' proportion is introduced by processing in batches and the leftover variance,
#' i.e., residuals that are not currently explained. Covariates of interest
#' (CoI) are selected by the user and the function will incorporate them into
#' the model.
#'
#' Principal Variance Component Analysis (pvca): Algorithm - calculate the
#' correlation of the fxs count-matrix - from there extract the eigenvectors and
#' eigenvalues and calculate the proportion of explained variance per
#' eigenvector (i.e. principal component) by dividing the eigenvalues by the
#' sum of eigenvalues. Now select as many PCs as required to fill a chosen
#' quota for the total proportion of explained variance. Iterate over all PCs
#' and fit a linear mixed model that contains all covariates as random effect
#' and all unique interactions between two covariates. Compute variance
#' covariance components form the resulting model --> From there we get the
#' Variance that each covariate(variable) contributes to this particular PC.
#' Then just standardize variance by dividing it through the sum of variance for
#' that model. Scale each PCs results by the proportion this PC accounted for in
#' the first place. And then do it again by dividing it through the total amount
#' of explained variance, i.e. the cutoff to select the number of PCs to take
#' (obviously not the cutoff but rather the actual values for the selected PCs).
#' Finally take the average over each random variable and interaction term and
#' display in a nice plot.
#'
#' @keywords Principal Variance Component Analysis
#' @param model.vars Covariates to use for model building.
#' @param tmp.cnts Abundance matrix in 'sample x feature' orientation.
#' @param tmp.meta Covariate table that contains at least the used variables.
#' @param type String the denotes data source, i.e., one of "otu","clr" or "tss"
#' for the transformed counts or the label of the batch corrected count-matrix.
#' @param pct_threshold Cutoff value for accumulated variance in principal
#' components.
#' @param na.action Set NA handling, will take global option if not supplied.
#' @return Data.frame that contains proportions of variance for given covariates
#' in a principal variance component analysis approach.
#' @include mbecs_classes.R
mbecModelVariancePVCA <- function(model.vars, tmp.cnts, tmp.meta, type,
pct_threshold, na.action) {
n.vars <- length(model.vars)
s.names <- rownames(tmp.cnts)
tmp.cnts = apply(tmp.cnts, 2, scale, center = TRUE, scale = FALSE) %>%
t()
colnames(tmp.cnts) <- s.names
tmp.cor <- stats::cor(tmp.cnts)
eCor <- eigen(tmp.cor)
eVal <- eCor$values
eVec <- eCor$vectors
rownames(eVec) <- s.names
n.eVal <- length(eVal)
sum.eVal <- sum(eVal)
prop.PCs <- eVal/sum.eVal
n.PCs <- max(3, min(which(vapply(cumsum(prop.PCs),
function(x) x >= pct_threshold,
FUN.VALUE = logical(1)))))
# suppress tibble verbose output
suppressMessages(lmm.df <- eVec %>%
tibble::as_tibble(.name_repair = "unique") %>%
dplyr::select_at(seq_len(eval(n.PCs))) %>%
cbind(., tmp.meta))
f.terms <- paste("(1|", model.vars, ")", sep = "")
for( var.idx in seq_len((n.vars - 1)) ) {
for( interaction.idx in seq.int(from=(var.idx + 1), to=(n.vars), by=1) ) {
f.terms <- c(f.terms, paste("(1|", model.vars[var.idx], ":",
model.vars[interaction.idx], ")", sep = ""))
}
}
n.effects <- length(f.terms) + 1
randomEffectsMatrix <- matrix(data = 0, nrow = n.PCs, ncol = n.effects)
model.formula <- stats::as.formula(paste("lmm.df[,vec.idx]", " ~ ",
paste(f.terms,
collapse = " + "), sep = ""))
for (vec.idx in seq_len(n.PCs)) {
randomEffects <- data.frame(lme4::VarCorr(Rm1ML <-
lme4::lmer(model.formula,lmm.df,
REML = TRUE,
verbose = 0,
na.action=na.action)))
randomEffectsMatrix[vec.idx, ] <- as.numeric(randomEffects[, 4])
}
names.effects <- randomEffects[, 1]
randomEffectsMatrix.std <- randomEffectsMatrix/rowSums(randomEffectsMatrix)
scaled.eVal <- eVal/sum(eVal)
randomEffectsMatrix.wgt <- randomEffectsMatrix.std*scaled.eVal[seq_len(n.PCs)]
model.variances <- colSums(randomEffectsMatrix.wgt)/
sum(colSums(randomEffectsMatrix.wgt))
names(model.variances) <- names.effects
res <- data.frame(t(model.variances)) %>%
dplyr::mutate(type = eval(type))
attr(res, "modelType") <- "PVCA - lmm"
return(res)
}
#' Estimate Explained Variance with Silhouette Coefficient
#'
#' The function offers a selection of methods/algorithms to estimate the
#' proportion of variance that can be attributed to covariates of interest.
#' This shows, how much variation is explained by the treatment effect, which
#' proportion is introduced by processing in batches and the leftover variance,
#' i.e., residuals that are not currently explained. Covariates of interest
#' (CoI) are selected by the user and the function will incorporate them into
#' the model.
#'
#' Silhouette Coefficient (s.coef): Calculate principal components and get
#' sample-wise distances on the resulting (sxPC) matrix. Then iterate over all
#' the covariates and calculate the cluster silhouette (which is basically
#' either zero, if the cluster contains only a single element, or it is the
#' distance to the closest different cluster minus the distance of the sample
#' within its own cluster divided (scaled) by the maximum distance). Average
#' over each element in a cluster for all clusters and there is the
#' representation of how good the clustering is. This shows how good a
#' particular covariate characterizes the data, i.e., a treatment variable for
#' instance may differentiate the samples into treated and untreated groups
#' which implies two clusters. In an ideal scenario, the treatment variable,
#' i.e., indicator for some biological effect would produce a perfect
#' clustering. In reality, the confounding variables, e.g., batch, sex or age,
#' will also influence the ordination of samples. Hence, the clustering
#' coefficient is somewhat similar to the amount of explained variance metric
#' that the previous methods used. If used to compare an uncorrected data-set to
#' a batch-corrected set, the expected result would be an increase of clustering
#' coefficient for the biological effect (and all other covariates - because a
#' certain amount of uncertainty was removed from the data) and a decrease for
#' the batch effect.
#'
#' @keywords Silhouette Coefficient Assessment
#' @param model.vars Covariates to use for model building.
#' @param tmp.cnts Abundance matrix in 'sample x feature' orientation.
#' @param tmp.meta Covariate table that contains at least the used variables.
#' @param type String the denotes data source, i.e., one of "otu","clr" or "tss"
#' for the transformed counts or the label of the batch corrected count-matrix.
#' @return Data.frame that contains proportions of variance for given covariates
#' in a silhouette coefficient analysis approach.
#' @include mbecs_classes.R
mbecModelVarianceSCOEF <- function(model.vars, tmp.cnts, tmp.meta, type) {
tmp.prcomp <- stats::prcomp(tmp.cnts,
center = TRUE, scale = FALSE)
tmp.dist <- stats::dist(tmp.prcomp$x,
method = "euclidian")
avg.sil.df <- NULL
for (var.elem in model.vars) {
print(var.elem)
tmp.sil = cluster::silhouette(x=as.numeric(tmp.meta[,eval(var.elem)]),
dist = tmp.dist)
avg.sil.df <- rbind.data.frame(avg.sil.df,
data.frame(variable=var.elem,
cluster=levels(tmp.meta[,eval(var.elem)]),
sil.coefficient =
c(summary(tmp.sil))$clus.avg.widths))
}
res <- avg.sil.df %>%
dplyr::mutate(type = eval(type))
attr(res, "modelType") <- "s.coef"
return(res)
}
# VARIANCE STATISTICS -----------------------------------------------------
#' Wrapper for Model Variable Variance Extraction
#'
#' For a Linear (Mixed) Model, this function extracts the proportion of variance
#' that can be explained by terms and interactions and returns a named
#' row-vector.
#'
#' Linear Model: Perform an analysis of variance (ANOVA) on the model.fit and
#' return the Sum of squares for each term, scaled by the total sum of squares.
#'
#' Linear Mixed Model: employ helper function 'mbecMixedVariance' to extract
#' residuals, random effects and fixed effects components from the model. The
#' components are then transformed to reflect explained proportions of variance
#' for the model coefficients. The function implements transformation for
#' varying coefficients as well, but NO ADJUSTMENT for single or multiple
#' coefficients at this point.
#'
#' @keywords lm lmm proportion variance
#' @param model.fit A linear (mixed) model object of class 'lm' or 'lmerMod'.
#' @return A named row-vector, containing proportional variance for model terms.
#' @export
#'
#' @examples
#' # This will return the data.frame for plotting.
#' limo <- stats::lm(dummy.list$cnts[,1] ~ group + batch, data=dummy.list$meta)
#' vec.variance <- mbecVarianceStats(model.fit=limo)
mbecVarianceStats <- function(model.fit) {
### ToDo: implement glm versions of this
# check validity of model fit
mbecValidateModel(model.fit)
# linear model
if (is(model.fit, "lm")) {
res <- mbecVarianceStatsLM(model.fit)
} else if (is(model.fit, "lmerMod")) {
res <- mbecVarianceStatsLMM(model.fit)
} else if (is(model.fit, "glm")) {
### ToDon't
}
return(res)
}
#' Model Variable Variance Extraction from LM
#'
#' For a Linear Model, this function extracts the proportion of variance
#' that can be explained by terms and interactions and returns a named
#' row-vector.
#'
#' Linear Model: Perform an analysis of variance (ANOVA) on the model.fit and
#' return the Sum of squares for each term, scaled by the total sum of squares.
#'
#' @keywords lm proportion variance
#' @param model.fit A linear model object of class 'lm'.
#' @return A named row-vector, containing proportional variance for model terms.
mbecVarianceStatsLM <- function(model.fit) {
vp = stats::anova(model.fit) %>%
data.frame() %>%
dplyr::mutate(variance =
dplyr::select(., "Sum.Sq")/sum(dplyr::select(.,"Sum.Sq")),
.keep = "none") %>%
t()
return(vp)
}
#' Model Variable Variance Extraction from LMM
#'
#' For a Linear Mixed Model, this function extracts the proportion of variance
#' that can be explained by terms and interactions and returns a named
#' row-vector.
#'
#' Linear Mixed Model: employ helper function 'mbecMixedVariance' to extract
#' residuals, random effects and fixed effects components from the model. The
#' components are then transformed to reflect explained proportions of variance
#' for the model coefficients. The function implements transformation for
#' varying coefficients as well, but NO ADJUSTMENT for single or multiple
#' coefficients at this point.
#'
#' @keywords lmm proportion variance
#' @param model.fit A linear mixed model object of class 'lmerMod'.
#' @return A named row-vector, containing proportional variance for model terms.
mbecVarianceStatsLMM <- function(model.fit) {
vc <- mbecMixedVariance(model.fit)
lib.df <- data.frame(
covariates = colnames(model.fit@frame),
row.names = vapply(colnames(model.fit@frame),
function(covariate)
paste(paste(covariate,
levels(model.fit@frame[, eval(covariate)]),
sep = ""), collapse = ","),
FUN.VALUE = character(1)))
total.var.LUT <- unlist(lapply(vc, function(var.comp) {
if (length(var.comp) > 1) {
weights = (table(
model.fit@frame[[lib.df[eval(paste(names(var.comp),
collapse = ",")),
]]])/nrow(model.fit@frame))
var.comp %*% weights
} else {
names(var.comp) <- NULL
var.comp
}
}), use.names = TRUE, recursive = FALSE)
vp <- list()
for (effect in names(vc)) {
# value of this effect divided by the total variance
tmp.t.var <- sum(total.var.LUT[-which(names(total.var.LUT) %in%
eval(effect))])
vp[[eval(effect)]] <- vc[[eval(effect)]]/(tmp.t.var +
vc[[eval(effect)]])
}
vp <- unlist(vp)
names(vp) <- gsub(".(Intercept)", replacement = "",
names(vp), fixed = TRUE)
vp <- data.frame(t(vp))
return(vp)
}
#' Mixed Model Variance-Component Extraction
#'
#' A helper function that extracts the variance components of linear mixed
#' models, i.e., residuals, random-effects, fixed-effects, scales them to
#' sample-size and returns a list of components.
#'
#' Uses 'lme4::VarCorr' to extract Residuals and random-effects components.
#' Standard Deviation of Residuals is stored as 'sc' attribute in the output of
#' 'VarCorr'.
#'
#' Uses 'lme4::fixef' to extract fixed-effects components, i.e., parameter
#' estimates. The attribute 'pp' of the model contains the dense model matrix
#' for fixed-effects parameters (X). The fixed effects variance, sigma2f, is the
#' variance of the matrix-multiplication beta times X (parameter vector by
#' model matrix)
#'
#' @keywords lmm proportion variance
#' @param model.fit A linear mixed model object of class 'lmerMod'.
#' @return A named list, containing proportional variance for model terms that
#' describe mixed effects.
#' @export
#'
#' @examples
#' # This will return the variance of random/mixed components.
#' limimo <- lme4::lmer(dummy.list$cnts[,1] ~ group + (1|batch),
#' data=dummy.list$meta)
#' list.variance <- mbecMixedVariance(model.fit=limimo)
mbecMixedVariance <- function(model.fit) {
rVC <- lme4::VarCorr(model.fit)
randomVar <- c(Residuals = attr(rVC, "sc")^2, lapply(rVC, diag))
n.scaling <- (stats::nobs(model.fit) - 1)/stats::nobs(model.fit)
fixedVar <- t(t(model.fit@pp$X) * lme4::fixef(model.fit)) %>%
as.data.frame() %>%
dplyr::select(!"(Intercept)") %>%
dplyr::mutate(total.var = apply(., 1, sum)) %>%
apply(2, function(effect) stats::var(effect) * n.scaling)
fixedVar <- lapply(
split(utils::head(fixedVar, -1)/sum(utils::head(fixedVar,-1)) *
utils::tail(fixedVar, 1), names(utils::head(fixedVar, -1))),
unname)
return(c(randomVar, fixedVar))
}
#' Validate Linear (Mixed) Models
#'
#' A helper function that calculates the collinearity between model variables
#' and stops execution, if the maximum value is bigger than the allowed
#' threshold.
#'
#' ToDo: maybe some additional validation steps and more informative output.
#'
#' @keywords collinearity model validation
#' @param model.fit lm() or lmm() output.
#' @param colinearityThreshold Cut-off for model rejection, I=[0,1].
#' @return No return values. Stops execution if validation fails.
#' @export
#'
#' @examples
#' # This will just go through if colinearity threshold is met.
#' limimo <- lme4::lmer(dummy.list$cnts[,1] ~ group + (1|batch),
#' data=dummy.list$meta)
#' mbecValidateModel(model.fit=limimo, colinearityThreshold=0.999)
mbecValidateModel <- function(model.fit, colinearityThreshold = 0.999) {
## ToDo: health & Safety
if( colinScore(model.fit) > colinearityThreshold) {
warning("Some covariates are strongly correlated.
Please re-evaluate the variable selection and try again.")
}
}
#' Variable Correlation Linear (Mixed) Models
#'
#' Takes a fitted model and computes maximum correlation between covariates as
#' return value. Return value contains actual correlation-matrix as 'vcor'
#' attribute.
#'
#' ToDo: maybe some additional validation steps and more informative output.
#'
#' @keywords collinearity model validation
#' @param model.fit lm() or lmm() output
#' @return Maximum amount of correlation for given model variables.
#' @export
#'
#' @examples
#' # This will return the maximum colinearity score in the given model
#' limimo <- lme4::lmer(dummy.list$cnts[,1] ~ group + (1|batch),
#' data=dummy.list$meta)
#' num.max_corr <- colinScore(model.fit=limimo)
colinScore <- function(model.fit) {
# get variance-covariance matrix
V <- stats::vcov(model.fit)
# 'NA' values or non-square matrix
if (any(is.na(V)) || nrow(V) == 0) {
score <- ifelse(any(is.na(V)), 1, 0)
attr(score, "vcor") = NA
} else {
# scale to correlation
C <- stats::cov2cor(as.matrix(V))
# get largest correlation
score = max(abs(C[lower.tri(C)]))
attr(score, "vcor") = C
}
return(score)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.