Nothing
#' Perform Horn's parallel analysis to choose the number of principal components to retain.
#'
#' @param mat A numeric matrix where rows correspond to variables and columns correspond to samples.
#' @param max.rank Integer scalar specifying the maximum number of PCs to retain.
#' @param ... Further arguments to pass to \code{\link{pca}}.
#' @param niters Integer scalar specifying the number of iterations to use for the parallel analysis.
#' @param threshold Numeric scalar representing the \dQuote{p-value} threshold above which PCs are to be ignored.
#' @param transposed Logical scalar indicating whether \code{mat} is transposed, i.e., rows are samples and columns are variables.
#' @param BSPARAM A \linkS4class{BiocSingularParam} object specifying the algorithm to use for PCA.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how the iterations should be paralellized.
#'
#' @details Horn's parallel analysis involves shuffling observations within each row of
#' \code{x} to create a permuted matrix. PCA is performed on the permuted matrix
#' to obtain the percentage of variance explained under a random null hypothesis.
#' This is repeated over several iterations to obtain a distribution of curves on
#' the scree plot.
#'
#' For each PC, the \dQuote{p-value} (for want of a better word) is defined as the
#' proportion of iterations where the variance explained at that PC is greater
#' than that observed with the original matrix. The number of PCs to retain is
#' defined as the last PC where the p-value is below \code{threshold}. This aims
#' to retain all PCs that explain \dQuote{significantly} more variance than
#' expected by chance.
#'
#' This function can be sped up by specifying \code{BSPARAM=IrlbaParam()} or
#' similar, to use approximate strategies for performing the PCA. Another option
#' is to set \code{BPPARAM} to perform the iterations in parallel.
#'
#' @return A list is returned, containing:
#' \itemize{
#' \item \code{original}, the output from running \code{\link{pca}} on \code{mat} with the specified arguments.
#' \item \code{permuted}, a matrix of variance explained from randomly permuted matrices.
#' Each column corresponds to a single permutated matrix, while each row corresponds to successive principal components.
#' \item \code{n}, the estimated number of principal components to retain.
#' }
#'
#' @author Aaron Lun
#'
#' @examples
#' # Mocking up some data.
#' ngenes <- 1000
#' means <- 2^runif(ngenes, 6, 10)
#' dispersions <- 10/means + 0.2
#' nsamples <- 50
#' counts <- matrix(rnbinom(ngenes*nsamples, mu=means,
#' size=1/dispersions), ncol=nsamples)
#'
#' # Choosing the number of PCs
#' lcounts <- log2(counts + 1)
#' output <- parallelPCA(lcounts)
#' output$n
#'
#' @importFrom dqrng generateSeedVectors
#' @importFrom BiocParallel bpmapply
#' @importFrom BiocParallel SerialParam
#'
#' @export
parallelPCA <- function(mat, max.rank=100, ..., niters=50, threshold=0.1,
transposed=FALSE, BSPARAM=ExactParam(), BPPARAM=SerialParam())
{
if (!transposed) {
mat <- t(mat)
}
original <- pca(mat, rank=max.rank, ..., transposed=TRUE, BSPARAM=BSPARAM)
original.s2 <- original$variance
# Running across permutations.
pcg.states <- .setup_pcg_state(niters)
permuted <- bpmapply(FUN=.parallel_PA,
seed=pcg.states$seeds[[1]], stream=pcg.states$streams[[1]],
MoreArgs=list(mat=mat, ..., max.rank=max.rank, BSPARAM=BSPARAM),
BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE)
permutations <- do.call(cbind, permuted)
# Figuring out where the original drops to "within range" of permuted.
prop <- rowMeans(permutations >= original.s2)
above <- prop > threshold
if (!any(above)) {
npcs <- length(above)
} else {
npcs <- min(which(above)) - 1L
}
list(original=original, permuted=permutations, n=npcs)
}
.parallel_PA <- function(mat, max.rank, ..., seed, stream, BSPARAM)
# Function for use in bplapply, defined here to automatically
# take advantage of the namespace when using snowParam. Note
# that shuffle_matrix needs samples in columns, hence the t()
# above and transposed=TRUE in the pca() calls.
{
re.y <- shuffle_matrix(mat, seed, stream)
out <- pca(re.y, rank=max.rank, ..., transposed=TRUE, BSPARAM=BSPARAM)
out$variance
}
.setup_pcg_state <- function(per.core)
# Sets up seeds in the serial component to ensure that the same
# stream of random numbers is used in each worker, regardless
# of the parallelization scheme.
{
seeds <- streams <- vector("list", length(per.core))
last <- 0L
for (i in seq_along(per.core)) {
N <- per.core[i]
seeds[[i]] <- generateSeedVectors(N, nwords=2)
streams[[i]] <- last + seq_len(N)
last <- last + N
}
list(seeds=seeds, streams=streams)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.