#' @title Simulate Tree Structured Counts
#'
#' @description
#' Simulate a tree structured matrix of counts according to a Poisson-lognormal
#' model, with the log parameter of the poisson following a Brownian Motion (BM)
#' on the tree with noise.
#'
#' @details
#' For each gene, the log-lambda parameter evolves like a BM on the tree,
#' with an extra independent variance noise that can depend on the species.
#' Each gene has its own tree variance for the BM.
#' Each gene and each species has its own mean.
#' The counts for each gene and each species are then obtained as a Poisson draw
#' with a different lambda parameter, as generated by the BM.
#'
#' @param tree A phylogenetic tree with n tips.
#' @param log_means a matrix with the number of genes p rows and the number of
#' species n columns. Column names should match the tree taxa names.
#' @param log_variance_phylo a vector of length p of phylogenetic variances for
#' the BM in the log space for each gene.
#' @param log_variance_sample a matrix of size p x n of environmental variances
#' for individual variations in the log space, for each gene and species.
#' Column names should match the tree taxa names.
#'
#' @return A list, with:
#' \describe{
#' \item{log_lambda}{the p x n matrix of log-lambda simulated by the BM on the tree.}
#' \item{counts}{the p x n matrix of counts with corresponding Poisson draws.}
#' }
#'
#' @keywords internal
#'
simulatePhyloPoissonLogNormal <- function(tree, log_means, log_variance_phylo, log_variance_sample, model.process = "BM", selection.strength = 0) {
## Check for packages
if (!requireNamespace("ape", quietly = TRUE) || !requireNamespace("phylolm", quietly = TRUE)) {
stop("Packages 'ape' and 'phylolm' are needed for tree simulations.", call. = FALSE)
}
## Check means
log_means <- checkParamMatrix(log_means, "log means", tree)
log_variance_sample <- checkParamMatrix(log_variance_sample, "log variance sample", tree)
N <- length(tree$tip.label)
P <- nrow(log_means)
## Check Dimension
if (nrow(log_variance_sample) != P || length(log_variance_phylo) != P) {
stop("`log_means` and `log_variance_sample should have as many rows as the length of `log_variance_phylo`.")
}
## Phylogenetic simulation of log(lambda)
resids <- phylolm::rTrait(n = P,
phy = tree,
model = model.process,
parameters = list(sigma2 = 1, ancestral.state = 0, optimal.value = 0, alpha = selection.strength),
plot.tree = FALSE)
log_sd_phylo <- scale_variance_process(log_variance_phylo, tree, model.process, selection.strength)
## phylo variances
resids <- resids * log_sd_phylo
## resid variances
resids <- t(resids) + matrix(rnorm(N * P, mean = 0, sd = sqrt(log_variance_sample)), ncol = N)
## Expectations
log_lambda <- resids + log_means
## Poisson
counts <- matrix(rpois(N * P, exp(log_lambda)), ncol = N)
## Names
colnames(counts) <- colnames(log_lambda) <- tree$tip.label
return(list(log_lambda = log_lambda,
counts = counts))
}
#' @title Check Matrix Parameter
#'
#' @description
#' Check that the parameters are compatible with the tree. Throws an error if not.
#'
#' @param x matrix of parameters being tested.
#' @param name name of the parameter.
#' @param tree A phylogenetic tree with n tips.
#'
#' @keywords internal
#'
checkParamMatrix <- function(x, name, tree) {
N <- length(tree$tip.label)
if (ncol(x) != N) {
stop(paste0("`", name, "` should have as many columns as the number of taxa in the tree."))
}
if (is.null(tree$tip.label)) stop("The tips of the phylogeny are not named.")
if (is.null(colnames(x))){
warning(paste0("Columns of `", name, "` are not named. I'm naming them, assuming they are in the same order as the tree."))
colnames(x) <- tree$tip.label
} else {
if (!all(tree$tip.label == colnames(x))){
correspondances <- match(tree$tip.label, colnames(x))
if (length(na.omit(unique(correspondances))) != length(tree$tip.label)){
stop(paste0("`", name, "` names do not match the tip labels."))
}
warning(paste0("`", name, "` was not sorted in the correct order, when compared with the tips label. I am re-ordering it."))
x <- x[, correspondances, drop = FALSE]
}
}
return(x)
}
#' @title Check Vector Parameter
#'
#' @description
#' Check that the parameters are compatible with the tree. Throws an error if not.
#'
#' @param x vector of parameters being tested.
#' @param name name of the parameter.
#' @param tree A phylogenetic tree with n tips.
#'
#' @keywords internal
#'
checkParamVector <- function(x, name, tree) {
N <- length(tree$tip.label)
if (length(x) != N) {
stop(paste0("`", name, "` should have the same length as the number of taxa in the tree."))
}
if (is.null(tree$tip.label)) stop("The tips of the phylogeny are not named.")
if (is.null(names(x))){
warning(paste0("`", name, "` is not named. I'm naming them, assuming they are in the same order as the tree."))
names(x) <- tree$tip.label
} else {
if (!all(tree$tip.label == names(x))){
correspondances <- match(tree$tip.label, names(x))
if (length(na.omit(unique(correspondances))) != length(tree$tip.label)){
stop(paste0("`", name, "` names do not match the tip labels."))
}
warning(paste0("`", name, "` was not sorted in the correct order, when compared with the tips label. I am re-ordering it."))
x <- x[correspondances]
}
}
return(x)
}
#' @title Check Species
#'
#' @description
#' Check that the parameters are compatible with the tree. Throws an error if not.
#'
#' @inheritParams checkParamVector
#'
#' @keywords internal
#'
checkSpecies <- function(x, name, tree, tol, check.id.species) {
x <- checkParamVector(x, name, tree)
if (check.id.species) {
related_matrices <- sapply(x, function(i) i == x)
if (!all(related_matrices == (ape::cophenetic.phylo(tree) <= tol))) {
stop("The provided species do not match with the tree branch lengths. Please check the 'id.species' vector.")
}
}
return(x)
}
#' @title Add replicates to a tree
#'
#' @description
#' Utility function to add replicates to a tree, as tips with zero length branches.
#'
#' @param tree A phylogenetic tree with n tips.
#' @param r the number of replicates too add at each species.
#'
#' @return A phylogenetic tree with n * r tips, and clusters of tips with zero branch lengths.
#'
#' @keywords internal
#'
add_replicates <- function(tree, r) {
if (!requireNamespace("phytools", quietly = TRUE)) {
stop("Package 'phytools' is needed for function 'add_replicates'.", call. = FALSE)
}
tree_rep <- tree
# Add replicates
for (tip_label in tree$tip.label) {
for (rep in seq_len(r)) {
tree_rep <- phytools::bind.tip(tree_rep, tip.label = paste0(tip_label, "_", rep),
where = which(tree_rep$tip.label == tip_label))
}
}
# Remove original tips
tree_rep <- ape::drop.tip(tree_rep, tree$tip.label)
return(tree_rep)
}
#' @title Compute log means and variances
#'
#' @description
#' From the parameters of a negative binomial (count_means and count_dispersions),
#' compute the parameters of a phylogenetic Poisson log-normal with the
#' same expectations and variances.
#'
#' @param count_means a matrix with the number of genes p rows and the number of
#' species n columns. Column names should match the tree taxa names.
#' @param count_dispersions a matrix of size p x n, for each gene and species.
#' Column names should match the tree taxa names.
#'
#' @return A list, with:
#' \describe{
#' \item{log_means}{the p x n matrix of log-means for Poisson-lognormal simulations.}
#' \item{log_variance_phylo}{the p vector of phylogenetic log-variances for Poisson-lognormal simulations.}
#' \item{log_variance_sample}{the p x n matrix of environmental log-variances for Poisson-lognormal simulations.}
#' }
#'
#' @keywords internal
#'
get_poisson_log_normal_parameters <- function(count_means, count_dispersions, prop.var.tree) {
N <- ncol(count_means)
P <- nrow(count_means)
## Check Dimension
if (nrow(count_dispersions) != P || ncol(count_dispersions) != N) {
stop("`count_means` and `count_dispersions` should have the same dimensions.")
}
## Check Proportion
if (!is.vector(prop.var.tree)) {
stop("`prop.var.tree` should be a vector.")
}
if (length(prop.var.tree) != P) {
if (length(prop.var.tree) != 1) stop("`prop.var.tree` should be a vector of length the number of genes, or a scalar (in which case it will be recycled).")
}
if (any(prop.var.tree > 1.0 | prop.var.tree < 0.0)) {
stop("`prop.var.tree` should be between 0 and 1.")
}
## Parameters of the PLN
params_PLN <- NB_to_PLN(count_means, count_dispersions)
log_means <- params_PLN$log_means_pln
log_variances_all <- params_PLN$log_variances_pln
## Phylo variances
## Check for packages
if (!requireNamespace("matrixStats", quietly = TRUE)) {
log_variance_phylo <- apply(log_variances_all, 1, min) * prop.var.tree
} else {
log_variance_phylo <- matrixStats::rowMins(log_variances_all) * prop.var.tree
}
## Sample variances
log_variance_sample <- log_variances_all - log_variance_phylo %*% t(rep(1, N))
return(list(log_means = log_means,
log_variance_phylo = log_variance_phylo,
log_variance_sample = log_variance_sample))
}
#' @title Negative Binomial to Poisson Log-Normal
#'
#' @description
#' From the parameters of a negative binomial (mean and dispersion),
#' compute the parameters of a Poisson log-normal with the
#' same expectation and variance.
#'
#' @param mean mean of the negative binomial.
#' @param dispersion dispersion of the negative binomial.
#'
#' @return A list, with:
#' \describe{
#' \item{log_means_pln}{Mean of the Poisson log normal in the log space.}
#' \item{log_variances_pln}{Variance of the Poisson log normal in the log space.}
#' }
#'
#' @keywords internal
#'
NB_to_PLN <- function(mean, dispersion) {
## Variance in log space
log_variances_pln <- log(1 + dispersion)
## Mean in log space
log_means_pln <- log(mean) - 0.5 * log_variances_pln
return(list(log_means_pln = log_means_pln,
log_variances_pln = log_variances_pln))
}
#' @title Simulate the Data using the tree
#'
#' @description
#' Use the Phylogenetic Poisson Log Normal model to simulate the data.
#'
#' @inheritParams generateSyntheticData
#' @inheritParams simulateData
#'
#' @return Z a matrix with the data
#'
#' @keywords internal
#'
simulateDataPhylo <- function(count_means,
count_dispersions,
tree, prop.var.tree,
model.process = "BM", selection.strength = 0) {
### Initialize data matrix
colnames(count_means) <- colnames(count_dispersions) <- tree$tip.label
### Get Equivalent Phylogenetic Poisson log-normal parameters
params_poisson_lognormal <- get_poisson_log_normal_parameters(count_means, count_dispersions, prop.var.tree)
### Simulate Data
res_sim <- simulatePhyloPoissonLogNormal(tree,
params_poisson_lognormal$log_means,
params_poisson_lognormal$log_variance_phylo,
params_poisson_lognormal$log_variance_sample,
model.process = model.process,
selection.strength = selection.strength)
return(res_sim$counts)
}
#' @title Scale the variances
#'
#' @description
#' Scale the variances of the process simulation so that they are equal to log_variance_phylo.
#'
#' @inheritParams generateSyntheticData
#' @inheritParams simulateData
#'
#' @return A matrix N * P of factors to multiply the simulated phylogenetic residuals.
#'
#' @keywords internal
#'
scale_variance_process <- function(log_variance_phylo, tree, model.process, selection.strength) {
fac <- get_model_factor(model.process, selection.strength, tree)
return(fac %*% t(sqrt(log_variance_phylo)))
}
#' @title Get the scaling factor
#'
#' @description
#' Get the scaling factors.
#'
#' @inheritParams generateSyntheticData
#'
#' @return A vector N of factors.
#'
#' @keywords internal
#'
get_model_factor <- function(model.process, selection.strength, tree) {
heights <- ape::node.depth.edgelength(tree)[1:length(tree$tip.label)]
if (model.process == "BM" || selection.strength == 0) {
return(sqrt(1 / heights))
} else if (model.process == "OU") {
return(sqrt(1 / expm1(-2 * selection.strength * heights) * (-2 * selection.strength)))
} else {
stop("Process not yet implemented.")
}
}
#' @title Simulate a length matrix with a phylo model
#'
#' @description
#' Simulate a length matrix of size n.vars times n.sample, with the length of
#' each gene in each sample.
#'
#' @param tree The phylogeneti tree.
#' @param id.species An n.sample vector, indicating the species of each sample.
#' @param lengths.relmeans A vector of mean values to use in the simulation of
#' lengths from the Negative Binomial distribution.
#' @param lengths.dispersions A vector or matrix of dispersions to use in the
#' simulation of data from the Negative Binomial distribution.
#' @param lengths.lambda A vector of heritability parameters to use in the
#' simulation of data from the lambda model.
#'
#' @return A matrix of the same size as 'length_matrix', with normalization
#' factors to be applied for each sample and each gene.
#'
#' @keywords internal
#'
generateLengthsPhylo <- function(tree, id.species, lengths.relmeans, lengths.dispersions) {
tree_sp <- trim_tree(tree, id.species)
ntaxa <- length(unique(id.species))
params_simus <- getNegativeBinomialParameters(n.vars = length(lengths.relmeans),
S1 = 1:ntaxa, prob.S1 = lengths.relmeans, sum.S1 = 1.0, truedispersions.S1 = lengths.dispersions, nfact_length.S1 = matrix(NA, 0, 0),
S2 = NULL, prob.S2 = 0, sum.S2 = 0, truedispersions.S2 = 0, nfact_length.S2 = matrix(NA, 0, 0),
seq.depths = rep(1.0, ntaxa))
lengths_unique <- simulateDataPhylo(params_simus$count_means,
params_simus$count_dispersions,
tree = tree_sp, prop.var.tree = 1.0,
model.process = "BM", selection.strength = 0)
length_matrix <- lengths_unique[, id.species]
return(length_matrix)
}
trim_tree <- function(tree, id.species) {
tips_to_keep <- tree$tip.label[!duplicated(id.species)]
tree_sp <- ape::keep.tip(tree, tips_to_keep)
tree_sp$tip.label <- as.character(unique(id.species))
return(tree_sp)
}
#' @title Effective Sample Size
#'
#' @description
#' Sample size so that the variance of the estimator is sigma^2 / nEff.
#'
#' @param tree A phylogenetic tree. If \code{NULL}, samples are assumed to be iid.
#' @param id.condition A named vector giving the state of each tip (sample).
#' @param model The trait evolution model. One of "BM" or "OU".
#' @param selection.strength If \code{model="OU"}, the selection strength parameter.
#'
#' @return The effective sample size.
#'
#' @keywords internal
#'
nEffNaive <- function(tree, id.condition, model, selection.strength) {
if (length(unique(id.condition)) != 2) stop("There should be only two conditions")
if (is.null(tree)) {
n <- length(id.condition)
id_cond_uni <- id.condition
Vinv <- diag(1, n)
} else {
# Check vector order
id.condition <- checkParamVector(id.condition, "id.condition", tree)
# tree
tree_uni <- prune_tree_one_obs(tree)
id_cond_uni <- id.condition[tree_uni$tip.label]
n <- length(tree_uni$tip.label)
if (model == "OU") model <- "OUfixedRoot"
V <- ape::vcv(phylolm::transf.branch.lengths(tree_uni, model = model, list(alpha = selection.strength))$tree)
Vinv <- try(solve(V))
if (inherits(Vinv, 'try-error')) Vinv <- ginv(V)
}
# Intercept
X <- rep(1, n)
# Model matrix
Z <- stats::model.matrix(~factor(id_cond_uni))[, -1, drop = FALSE]
# Variance of the condition
X <- cbind(X, Z)
nEff <- 1 / solve(t(X) %*% Vinv %*% X)[2, 2]
return(nEff)
}
nEffSchur <- function(tree, id.condition, model, selection.strength) {
if (length(unique(id.condition)) != 2) stop("There should be only two conditions")
if (is.null(tree)) {
n <- length(id.condition)
id_cond_uni <- id.condition
Vinv <- diag(1, n)
} else {
# Check vector order
id.condition <- checkParamVector(id.condition, "id.condition", tree)
# tree
tree_uni <- prune_tree_one_obs(tree)
id_cond_uni <- id.condition[tree_uni$tip.label]
n <- length(tree_uni$tip.label)
if (model == "OU") model <- "OUfixedRoot"
V <- ape::vcv(phylolm::transf.branch.lengths(tree_uni, model = model, list(alpha = selection.strength))$tree)
Vinv <- try(solve(V))
if (inherits(Vinv, 'try-error')) Vinv <- ginv(V)
}
# Intercept
X <- rep(1, n)
# Model matrix
Z <- stats::model.matrix(~factor(id_cond_uni))[, -1, drop = FALSE]
# Variance of the condition
X <- cbind(X, Z)
M <- t(X) %*% Vinv %*% X
nEff <- det(M) / M[1,1]
return(nEff)
}
nEffPhylolm <- function(tree, id.condition, model, selection.strength) {
if (length(unique(id.condition)) != 2) stop("There should be only two conditions")
if (is.null(tree)) return(nEffNaive(tree, id.condition, model, selection.strength))
# Check vector order
id.condition <- checkParamVector(id.condition, "id.condition", tree)
# tree
tree_uni <- prune_tree_one_obs(tree)
id_cond_uni <- id.condition[tree_uni$tip.label]
n <- length(tree_uni$tip.label)
# phylolm
e1 <- rnorm(n)
names(e1) <- tree_uni$tip.label
if (model == "OU") model <- "OUfixedRoot"
tree_trans <- phylolm::transf.branch.lengths(tree_uni, model = model, list(alpha = selection.strength))$tree
phyfit <- phylolm::phylolm(e1 ~ 1 + as.factor(id_cond_uni), phy = tree_trans)
nEff <- 1 / (phyfit$vcov[2, 2] / phyfit$sigma2 / n * (n - 2))
return(nEff)
}
#' @title Effective Sample Size Ratio
#'
#' @description
#' Ratio between the tree sample size and the sample size of the equivalent problem
#' with independent measures. A result larger than one indicates a problem that is
#' made "easier" by the tree structure. Note that it strongly depends on the
#' tip conditions (see examples).
#'
#' @param tree A phylogenetic tree. If \code{NULL}, samples are assumed to be iid.
#' @param id.condition A named vector giving the state of each tip (sample).
#' @param model The trait evolution model. One of "BM" or "OU".
#' @param selection.strength If \code{model="OU"}, the selection strength parameter.
#'
#' @return The ratio of sample sizes.
#'
#' @examples
#' set.seed(1289)
#' ## Ballanced tree
#' ntips <- 2^5
#' tree <- ape::compute.brlen(ape::stree(ntips, "balanced"))
#' ## Alt cond : nEff greater than 1
#' id_cond <- rep(rep(0:1, each = 2), ntips / 4)
#' names(id_cond) <- tree$tip.label
#' plot(tree); ape::tiplabels(pch = 21, col = id_cond, bg = id_cond)
#' compcodeR:::nEffRatio(tree, id_cond, "BM", 0)
#' ## Bloc cond : nEff smaller than 1
#' id_cond <- rep(0:1, each = ntips / 2)
#' names(id_cond) <- tree$tip.label
#' plot(tree); ape::tiplabels(pch = 21, col = id_cond, bg = id_cond)
#' compcodeR:::nEffRatio(tree, id_cond, "BM", 0)
#'
#' @keywords internal
#'
nEffRatio <- function(tree, id.condition, model, selection.strength) {
if (is.null(tree)) {
n <- length(id.condition)
} else {
n <- length(prune_tree_one_obs(tree)$tip.label)
}
return(nEffNaive(tree, id.condition, model, selection.strength) / (n / 4))
}
prune_tree_one_obs <- function(tree) {
C <- ape::cophenetic.phylo(tree)
C <- apply(C, c(1, 2), function(x) isTRUE(all.equal(x, 0)))
duplicated_species <- colnames(C)[duplicated(C)]
tree_unique <- ape::drop.tip(tree, duplicated_species)
return(tree_unique)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.