####################################################################################
## ---------- internal
.pca <- function(X, ncomp = 2, center = TRUE, scale = FALSE, max.iter = 500,
tol = 1e-09, logratio = c('none','CLR','ILR'), ilr.offset = 0.001,
V = NULL, multilevel = NULL){
arg.call = match.call()
## match or set the multi-choice arguments
arg.call$logratio <- logratio <- .matchArg(logratio)
#-- check that the user did not enter extra arguments
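#-- note: mget() over formals() forces evaluation of every argument in the current
#-- frame; if an argument cannot be evaluated (e.g. it refers to a missing object),
#-- the error is caught by tryCatch() and re-thrown below with a cleaner message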
user.arg = names(arg.call)[-1]
err = tryCatch(mget(names(formals()), sys.frame(sys.nframe())),
error = function(e) e)
if ("simpleError" %in% class(err))
stop(err[[1]], ".", call. = FALSE)
#-- X matrix
if (is.data.frame(X))
X = as.matrix(X)
if (!is.matrix(X) || is.character(X))
stop("'X' must be a numeric matrix.", call. = FALSE)
if (any(is.infinite(X)))
stop("infinite values in 'X'.", call. = FALSE)
#-- put a names on the rows and columns of X --#
X.names = colnames(X)
if (is.null(X.names))
X.names = paste("V", 1:ncol(X), sep = "")
ind.names = rownames(X)
if (is.null(ind.names))
ind.names = 1:nrow(X)
#-- ncomp
if (is.null(ncomp))
ncomp = min(nrow(X), ncol(X))
if (!is.numeric(ncomp) || !is.finite(ncomp) || ncomp < 1)
stop("invalid value for 'ncomp'.", call. = FALSE)
ncomp = round(ncomp)
if (ncomp > min(ncol(X), nrow(X)))
stop("use smaller 'ncomp'", call. = FALSE)
#-- log.ratio
choices = c('CLR', 'ILR','none')
logratio = choices[pmatch(logratio, choices)]
if (any(is.na(logratio)) || length(logratio) > 1)
stop("'logratio' should be one of 'CLR' ,'ILR'or 'none'.", call. = FALSE)
if (logratio != "none" && any(X < 0))
stop("'X' contains negative values, you can not log-transform your data")
#-- checking center and scale
if (!is.logical(center))
{
if (!is.numeric(center) || (length(center) != ncol(X)))
stop("'center' should be either a logical value or a numeric vector of length equal to the number of columns of 'X'.",
call. = FALSE)
}
if (!is.logical(scale))
{
if (!is.numeric(scale) || (length(scale) != ncol(X)))
stop("'scale' should be either a logical value or a numeric vector of length equal to the number of columns of 'X'.",
call. = FALSE)
}
#-- max.iter
if (is.null(max.iter) || !is.numeric(max.iter) || max.iter < 1 || !is.finite(max.iter))
stop("invalid value for 'max.iter'.", call. = FALSE)
max.iter = round(max.iter)
#-- tol
if (is.null(tol) || !is.numeric(tol) || tol < 0 || !is.finite(tol))
stop("invalid value for 'tol'.", call. = FALSE)
#-- end checking --#
#------------------#
#-----------------------------#
#-- logratio transformation --#
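#-- clr.backtransfo() is assumed here to return the orthonormal basis matrix V used
#-- further down to map the ILR loadings back to clr space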
if (is.null(V) && logratio == "ILR") # back-transformation to clr-space, will be used later to recalculate loadings etc.
V = clr.backtransfo(X)
X = logratio.transfo(X = X, logratio = logratio, offset = if(logratio == "ILR") {ilr.offset} else {0})
# the ILR transform returns ncol(X) - 1 coordinates, so the dimensions of X may have changed
if (ncomp > min(ncol(X), nrow(X)))
stop("use smaller 'ncomp'", call. = FALSE)
#-- logratio transformation --#
#-----------------------------#
#---------------------------------------------------------------------------#
#-- multilevel approach ----------------------------------------------------#
if (!is.null(multilevel))
{
# we expect the repeated-measurement information in 'multilevel' as a vector or a single-column data frame
multilevel = data.frame(multilevel)
if ((nrow(X) != nrow(multilevel)))
stop("unequal number of rows in 'X' and 'multilevel'.")
if (ncol(multilevel) != 1)
stop("'multilevel' should have a single column for the repeated measurements.")
multilevel[, 1] = as.numeric(factor(multilevel[, 1])) # we want numbers for the repeated measurements
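# withinVariation() removes the between-subject variation so that the PCA focuses
# on the within-subject (repeated measures) part of the data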
Xw = withinVariation(X, design = multilevel)
X = Xw
}
#-- multilevel approach ----------------------------------------------------#
#---------------------------------------------------------------------------#
X = scale(X, center = center, scale = scale)
cen = attr(X, "scaled:center")
sc = attr(X, "scaled:scale")
if (any(sc == 0))
stop("cannot rescale a constant/zero column to unit variance.",
call. = FALSE)
is.na.X = is.na(X)
NA.X = any(is.na.X)
cl = match.call()
cl[[1]] = as.name('pca')
result = list(call = cl, X = X, ncomp = ncomp, NA.X = NA.X,
center = if (is.null(cen)) {FALSE} else {cen},
scale = if (is.null(sc)) {FALSE} else {sc},
names = list(X = X.names, sample = ind.names))
#-- pca approach -----------------------------------------------------------#
#---------------------------------------------------------------------------#
if (logratio == 'CLR' || logratio == 'none')
{
#-- if there are missing values use the NIPALS algorithm
if (any(is.na.X))
{
res = nipals(X, ncomp = ncomp, reconst = TRUE, max.iter = max.iter, tol = tol)
result$sdev = res$eig / sqrt(max(1, nrow(X) - 1))
names(result$sdev) = paste("PC", 1:length(result$sdev), sep = "")
result$rotation = res$p
dimnames(result$rotation) = list(X.names, paste("PC", 1:ncol(result$rotation), sep = ""))
X[is.na.X] = res$rec[is.na.X]
result$x = X %*% res$p
dimnames(result$x) = list(ind.names, paste("PC", 1:ncol(result$x), sep = ""))
} else {
#-- if data is complete use singular value decomposition
#-- borrowed from 'prcomp' function
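#-- nu = 0 skips the left singular vectors: the loadings come from res$v and the
#-- scores are obtained by projecting X onto them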
res = svd(X, nu = 0)
result$sdev = res$d[1:ncomp] / sqrt(max(1, nrow(X) - 1))
result$rotation = res$v[, 1:ncomp, drop = FALSE]
result$x = X %*% res$v[, 1:ncomp, drop = FALSE]
}
} else {
# 'ILR': the data have already been transformed above; loadings are back-transformed
# to clr space (approach adapted from the robCompositions package)
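# the left singular vectors are kept here (nu > 0) because the scores are taken
# directly from res$u scaled by the singular values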
res = svd(X, nu = max(1, nrow(X) - 1))
if (ncomp < ncol(X))
{
result$sdev = res$d[1:ncomp] / sqrt(max(1, nrow(X) - 1)) # Note: what differs with RobCompo is that they use: cumsum(eigen(cov(X))$values)/sum(eigen(cov(X))$values)
# calculate loadings using back transformation to clr-space
result$rotation = V %*% res$v[, 1:ncomp, drop = FALSE]
# extract the component scores from the svd, scaling by the singular values via diag(); NB: this differs from the standard mixOmics PCA calculation
# NB: this also differs from the Filzmoser paper, but matches their code: scores are unchanged
result$x = res$u[, 1:ncomp, drop = FALSE] %*% diag(res$d[1:ncomp], nrow = ncomp)
} else {
result$sdev = res$d / sqrt(max(1, nrow(X) - 1))
result$rotation = V %*% res$v
result$x = res$u %*% diag(res$d, nrow = length(res$d))
}
}
names(result$sdev) = paste("PC", 1:length(result$sdev), sep = "")
dimnames(result$rotation) = list(X.names, paste("PC", 1:ncol(result$rotation), sep = ""))
dimnames(result$x) = list(ind.names, paste("PC", 1:ncol(result$x), sep = ""))
result$var.tot = sum(X^2 / max(1, nrow(X) - 1)) # same as sum(res$d^2) / (n - 1); for NIPALS this is computed after replacement of the missing values
# to be similar to other methods, add loadings and variates as outputs
result$loadings = list(X = result$rotation)
result$variates = list(X = result$x)
# output multilevel if needed
if (!is.null(multilevel))
result = c(result, list(Xw = Xw, design = multilevel))
class(result) = c("pca", "prcomp")
if (!is.null(multilevel))
class(result) = c("mlpca", class(result))
# compute the explained variance per component
result$explained_variance = result$sdev^2 / result$var.tot
result$cum.var = cumsum(result$explained_variance)
return(invisible(result))
}
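## Illustrative sketch (guarded so it never runs at load time): a minimal check of
## the two computation paths of .pca() above. The simulated data and the comparison
## with prcomp() are illustrative assumptions, not part of the package tests.
if (FALSE) {
    set.seed(42)
    X <- matrix(rnorm(20 * 10), nrow = 20, ncol = 10)
    ## complete data: the SVD path should agree with prcomp() up to column signs
    res.svd <- .pca(X, ncomp = 3, center = TRUE, scale = TRUE)
    res.prcomp <- prcomp(X, center = TRUE, scale. = TRUE)
    all.equal(abs(unname(res.svd$rotation)),
              abs(unname(res.prcomp$rotation[, 1:3])))
    ## incomplete data: the NIPALS path is used instead
    X.na <- X
    X.na[sample(length(X.na), 5)] <- NA
    res.nipals <- .pca(X.na, ncomp = 3)
    res.nipals$explained_variance
}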
#' Principal Components Analysis
#'
#' Performs a principal components analysis on the given data matrix, which can
#' contain missing values. The calculation is done by a singular value
#' decomposition of the (possibly centered and scaled) data matrix if the data
#' are complete, or by the NIPALS algorithm if some values are missing. Unlike
#' \code{\link{princomp}}, the print method for these objects prints the
#' results in a nice format and the \code{plot} method produces a bar plot of
#' the percentage of variance explained by the principal components (PCs).
#'
#' When using NIPALS (missing values), we make the assumption that the first
#' \code{min(ncol(X), nrow(X))} principal components will account for 100\% of
#' the explained variance.
#'
#' Note that \code{scale = TRUE} cannot be used if there are zero or constant
#' (for \code{center = TRUE}) variables.
#'
#' Components are omitted if their standard deviations are less than or equal
#' to \code{comp.tol} times the standard deviation of the first component. With
#' the default null setting, no components are omitted. Other settings for
#' \code{comp.tol} could be \code{comp.tol = sqrt(.Machine$double.eps)}, which
#' would omit essentially constant components, or \code{comp.tol = 0}.
#'
#' According to Filzmoser et al., an ILR log ratio transformation is more
#' appropriate for PCA with compositional data. Both CLR and ILR are valid.
#'
#' Logratio transformation and multilevel analysis are performed sequentially as
#' internal pre-processing steps, through \code{\link{logratio.transfo}} and
#' \code{\link{withinVariation}} respectively.
#'
#' Logratio can only be applied if the data do not contain any 0 value (for
#' count data, we thus advise to normalise the raw data with an offset of 1).
#' For ILR transformation, an additional offset might be needed.
## ----------------------------------- Parameters
#' @param X A numeric matrix (or data frame) which provides the data for the
#' principal components analysis. It can contain missing values.
#' Alternatively, a \code{MultiAssayExperiment} object.
#' @param assay Name or index of an assay from \code{X}.
#' @param ncomp Integer, if data is complete \code{ncomp} decides the number of
#' components and associated eigenvalues to display from the \code{pcasvd}
#' algorithm and if the data has missing values, \code{ncomp} gives the number
#' of components to keep to perform the reconstitution of the data using the
#' NIPALS algorithm. If \code{NULL}, function sets \code{ncomp = min(nrow(X),
#' ncol(X))}.
#' @param center A logical value indicating whether the variables should be
#' shifted to be zero centered. Alternatively, a vector of length equal to the
#' number of columns of \code{X} can be supplied. The value is passed to
#' \code{\link{scale}}.
#' @param scale A logical value indicating whether the variables should be
#' scaled to have unit variance before the analysis takes place. The default is
#' \code{FALSE} for consistency with \code{prcomp} function, but in general
#' scaling is advisable. Alternatively, a vector of length equal to the number of
#' columns of \code{X} can be supplied. The value is passed to
#' \code{\link{scale}}.
#' @param max.iter Integer, the maximum number of iterations in the NIPALS
#' algorithm.
#' @param tol A positive real, the tolerance used in the NIPALS algorithm.
#' @param logratio One of ('none','CLR','ILR'). Specifies the log ratio
#' transformation used to deal with compositional values that may arise from
#' specific normalisation in sequencing data. Defaults to 'none'.
#' @param ilr.offset When logratio is set to 'ILR', an offset must be supplied to
#' avoid infinite values after the logratio transform; defaults to 0.001.
#' @param V Matrix used in the logratio transformation, if provided.
#' @param multilevel Sample information for multilevel decomposition for repeated measurements.
## ----------------------------------- Value
#' @return \code{pca} returns a list with class \code{"pca"} and
#' \code{"prcomp"} containing the following components: \item{ncomp}{the number
#' of principal components used.} \item{sdev}{the standard deviations of the
#' principal components (i.e., the square roots of the eigenvalues of the
#' covariance/correlation matrix); the calculation is actually done with the
#' singular values of the data matrix or by using NIPALS.}
#' \item{rotation}{the matrix of variable loadings (i.e., a matrix whose
#' columns contain the eigenvectors).} \item{loadings}{same as 'rotation' to
#' keep the mixOmics spirit} \item{x}{the value of the rotated data (the
#' centred (and scaled if requested) data multiplied by the rotation/loadings
#' matrix), also called the principal components.} \item{variates}{same as 'x'
#' to keep the mixOmics spirit} \item{center, scale}{the centering and scaling
#' used, or \code{FALSE}.} \item{explained_variance}{explained variance from
#' the multivariate model, used for plotIndiv}
## ----------------------------------- Misc
#' @author Florian Rohart, Kim-Anh Lê Cao, Ignacio González, Al J Abadi
#' @seealso \code{\link{nipals}}, \code{\link{prcomp}}, \code{\link{biplot}},
#' \code{\link{plotIndiv}}, \code{\link{plotVar}} and http://www.mixOmics.org
#' for more details.
#' @references On log ratio transformations: Filzmoser, P., Hron, K., Reimann,
#' C.: Principal component analysis for compositional data with outliers.
#' Environmetrics 20(6), 621-632 (2009) Lê Cao K.-A., Costello ME, Lakis VA,
#' Bartolo, F,Chua XY, Brazeilles R, Rondeau P. MixMC: Multivariate insights
#' into Microbial Communities. PLoS ONE, 11(8): e0160169 (2016). On multilevel
#' decomposition: Westerhuis, J.A., van Velzen, E.J., Hoefsloot, H.C., Smilde,
#' A.K.: Multivariate paired data analysis: multilevel plsda versus oplsda.
#' Metabolomics 6(1), 119-128 (2010) Liquet, B., Lê Cao, K.-A., Hocini, H.,
#' Thiebaut, R.: A novel approach for biomarker selection and the integration
#' of repeated measures experiments from two assays. BMC bioinformatics 13(1),
#' 325 (2012)
#' @keywords algebra
## ----------------------------------- Examples
#' @example examples/pca-example.R
####################################################################################
## ---------- Generic
## DOC NOTE:
## For better documentation, I manually demonstrate the usage of the 'ANY' signature
## in the generic section.
## The reason is, as is, you'll get the first usage for the generic with (X, ncomp, ...)
## which is really not informative. At the same time, you can't just not document the
## generic as it HAS to come before the methods and defines the name of the function
## to document. So by doing this, we're pointing roxygen2 to the right name for the
## .Rd file, while showing the right usage form.
## If only there was a workaround to get the full IDE arg suggestions too!
## It is possible to repeat all args and no '...', but any change will be a nightmare.
#' @param ... Arguments passed to the generic.
#' @usage \S4method{pca}{ANY}(X, ncomp = 2, center = TRUE, scale = FALSE, max.iter = 500,
#' tol = 1e-09, logratio = c('none','CLR','ILR'), ilr.offset = 0.001,
#' V = NULL, multilevel = NULL)
#' @export
setGeneric('pca', function (X, ncomp=2,...) standardGeneric('pca'))
####################################################################################
## ---------- Methods
## ----------------------------------- ANY
#' @export
setMethod('pca', 'ANY', .pca)
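## Hedged usage sketch (not run): calling the exported generic on a plain matrix
## dispatches to the 'ANY' method above. The simulated positive matrix is only an
## assumption so that the CLR transform is applicable (no zero or negative values).
if (FALSE) {
    set.seed(1)
    X <- matrix(abs(rnorm(10 * 5)) + 1, nrow = 10, ncol = 5)
    ## default: centered, unscaled PCA with 2 components
    res <- pca(X, ncomp = 2)
    plot(res)      # bar plot of the proportion of explained variance
    ## compositional-like data: apply a CLR transform before the decomposition
    res.clr <- pca(X, ncomp = 2, logratio = "CLR")
    res.clr$explained_variance
}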
## ----------------------------------- MultiAssayExperiment
#' @importFrom SummarizedExperiment assay assays
#' @rdname pca
#' @export
setMethod('pca', 'MultiAssayExperiment', function(X, ncomp=2,..., assay=NULL){
## if assay is not valid throw appropriate error
if(!assay %in% tryCatch(names(assays(X)), error=function(e)e)) .inv_assay()
## match the call and change the call name from '.local' to 'pca' for output
ml <- match.call()
ml[[1L]] <- quote(pca)
## get a copy to alter the call for internal call's use
mli <- ml
mli[[1L]] <- quote(.pca)
## get the indices of call args that match the internal args and re-order mc
## by doing this, we deliberately drop the 'assay' or any other non-internal args as well
arg.ind <- match(names(formals(.pca)), names(mli), 0L)
## put the internal name in mli call object and sort the args in internal's order
mli <- mli[c(1L,arg.ind)]
## change X to the transpose of the assay matrix
mli[['X']] <- t(assay(X, assay))
## evaluate the call in the parent.frame and return
result <- eval(mli, parent.frame())
## change the 'call' slot in the output list to the method call
result[["call"]] <- ml
return(result)
})
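## Hedged sketch (not run): the MultiAssayExperiment method above. 'mae' and the
## assay name "counts" are hypothetical placeholders; assays are stored as
## features x samples, so the chosen assay is transposed before the PCA is run.
if (FALSE) {
    library(MultiAssayExperiment)
    res <- pca(mae, ncomp = 2, assay = "counts")
    res$call                 # the call is reported with the exported name 'pca'
    head(res$variates$X)     # sample coordinates on the principal components
}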