#
# pmlPart + pmlCluster
#
optimPartQGeneral <- function(object, Q = c(1, 1, 1, 1, 1, 1),
subs = rep(1, length(Q)), ...) {
m <- length(Q)
n <- max(subs)
ab <- numeric(n)
for (i in 1:n) ab[i] <- log(Q[which(subs == i)[1]])
fn <- function(ab, object, m, n, subs, ...) {
Q <- numeric(m)
for (i in 1:n) Q[subs == i] <- ab[i]
Q <- exp(Q)
result <- 0
for (i in seq_along(object)) result <- result + update(object[[i]],
Q = Q, ...)$logLik
result
}
res <- optim(par = ab, fn = fn, gr = NULL, method = "L-BFGS-B",
lower = -Inf, upper = Inf, control = list(fnscale = -1,
maxit = 25), object = object, m = m, n = n, subs = subs, ...)
Q <- rep(1, m)
for (i in 1:n) Q[subs == i] <- exp(res[[1]][i])
res[[1]] <- Q
res
}
optimPartBf <- function(object, bf = c(0.25, 0.25, 0.25, 0.25), ...) {
l <- length(bf)
nenner <- 1 / bf[l]
lbf <- log(bf * nenner)
lbf <- lbf[-l]
fn <- function(lbf, object, ...) {
result <- 0
bf <- exp(c(lbf, 0))
bf <- bf / sum(bf)
n <- length(object)
for (i in 1:n) result <- result + update(object[[i]],
bf = bf, ...)$logLik
result
}
res <- optim(par = lbf, fn = fn, gr = NULL, method = "Nelder-Mead",
control = list(fnscale = -1, maxit = 500), object, ...)
bf <- exp(c(res[[1]], 0))
bf <- bf / sum(bf)
}
optimPartInv <- function(object, inv = 0.01, ...) {
fn <- function(inv, object, ...) {
result <- 0
n <- length(object)
for (i in 1:n) result <- result + update(object[[i]], inv = inv,
...)$logLik
result
}
res <- optimize(f = fn, interval = c(0, 1), lower = 0, upper = 1,
maximum = TRUE, tol = 1e-04, object, ...)
res[[1]]
}
optimPartGamma <- function(object, shape = 1, ...) {
fn <- function(shape, object, ...) {
result <- 0
n <- length(object)
for (i in 1:n) result <- result + update(object[[i]], shape = shape,
...)$logLik
result
}
res <- optimize(f = fn, interval = c(0, 100), lower = 0, upper = 100,
maximum = TRUE, tol = 0.01, object, ...)
res
}
dltmp <- function(fit, i = 1, transform = transform) {
tree <- fit$tree
data <- getCols(fit$data, tree$tip.label)
if (is.null(attr(tree, "order")) || attr(tree, "order") != "postorder")
tree <- reorder(tree, "postorder")
q <- length(tree$tip.label)
node <- tree$edge[, 1]
edge <- tree$edge[, 2]
m <- max(edge)
dat <- vector(mode = "list", length = m)
eig <- fit$eig
w <- fit$w[i]
g <- fit$g[i]
bf <- fit$bf
el <- tree$edge.length
P <- getP(el, eig, g)
nr <- as.integer(attr(data, "nr"))
nc <- as.integer(attr(data, "nc"))
node <- as.integer(node - min(node))
edge <- as.integer(edge - 1)
nTips <- as.integer(length(tree$tip.label))
mNodes <- as.integer(max(node) + 1)
contrast <- attr(data, "contrast")
nco <- as.integer(dim(contrast)[1])
dat[(q + 1):m] <- .Call("LogLik2", data, P, nr, nc, node, edge, nTips,
mNodes, contrast, nco)
parent <- tree$edge[, 1]
child <- tree$edge[, 2]
nTips <- min(parent) - 1
datp <- vector("list", m)
el <- tree$edge.length
if (transform) dP <- getdP(tree$edge.length, eig, g)
else dP <- getdP2(tree$edge.length, eig, g)
datp[(nTips + 1)] <- dat[(nTips + 1)]
l <- length(child)
dl <- matrix(0, nr, l)
for (j in (m - 1):1) {
# tips have factor format, internal edges are matrices
if (child[j] > nTips) {
tmp2 <- (datp[[parent[j]]] / (dat[[child[j]]] %*% P[[j]]))
dl[, j] <- (tmp2 * (dat[[child[j]]] %*% dP[[j]])) %*% (w * bf)
datp[[child[j]]] <- (tmp2 %*% P[[j]]) * dat[[child[j]]]
}
else {
tmp2 <- datp[[parent[j]]] / ((contrast %*% P[[j]])[data[[child[j]]], ])
dl[, j] <- (tmp2 * ((contrast %*% dP[[j]])[data[[child[j]]], ])) %*%
(w * bf)
}
}
dl
}
dl <- function(x, transform = TRUE) {
l <- length(x$w)
dl <- dltmp(x, 1, transform)
i <- 2
while (i < (l + 1)) {
dl <- dl + dltmp(x, i, transform)
i <- i + 1
}
dl
}
# add control and change edge
optimPartEdge <- function(object, ...) {
tree <- object[[1]]$tree
theta <- tree$edge.length
theta <- pmax(theta, 1e-8)
tree$edge.length <- theta
tmptree <- tree
n <- length(object)
l <- length(theta)
nrv <- numeric(n)
for (i in 1:n) nrv[i] <- attr(object[[i]]$data, "nr")
cnr <- cumsum(c(0, nrv))
weight <- numeric(sum(nrv))
dl <- matrix(NA, sum(nrv), l)
for (i in 1:n) weight[(cnr[i] + 1):cnr[i + 1]] <- attr(object[[i]]$data,
"weight")
ll0 <- 0
for (i in 1:n) object[[i]] <- update(object[[i]], tree = tree)
for (i in 1:n) ll0 <- ll0 + object[[i]]$logLik
eps <- 1
scalep <- 1
k <- 1
while (eps > 0.001 & k < 50) {
if (scalep == 1) {
for (i in 1:n) {
lv <- drop(exp(object[[i]]$siteLik))
dl[(cnr[i] + 1):cnr[i + 1], ] <- dl(object[[i]], TRUE) / lv
}
sc <- colSums(weight * dl)
F <- crossprod(dl * weight, dl) + diag(l) * 1e-10
# add small ridge penalty for numerical stability
}
thetaNew <- log(theta) + scalep * solve(F, sc)
thetaNew <- pmax(thetaNew, log(1e-8))
tmptree$edge.length <- as.numeric(exp(thetaNew))
for (i in 1:n) object[[i]] <- update(object[[i]], tree = tmptree)
ll1 <- 0
for (i in 1:n) ll1 <- ll1 + object[[i]]$logLik
eps <- ll1 - ll0
if (eps < 0 || is.nan(eps)) {
scalep <- scalep / 2
eps <- 1
thetaNew <- log(theta)
ll1 <- ll0
}
else {
scalep <- 1
tree <- tmptree
}
theta <- exp(thetaNew)
theta <- pmax(theta, 1e-8)
ll0 <- ll1
k <- k + 1
}
for (i in 1:n) object[[i]] <- update(object[[i]], tree = tree)
object
}
makePart <- function(fit, rooted, weight = ~index + genes) {
if (inherits(fit, "phyDat")) {
x <- fit
dm <- dist.ml(x)
if (!rooted) tree <- NJ(dm)
else tree <- upgma(dm)
fit <- pml(tree, x, k = 4)
}
dat <- fit$data
if (class(weight)[1] == "formula")
weight <- xtabs(weight, data = attr(dat, "index"))
fits <- NULL
for (i in 1:dim(weight)[2]) {
ind <- which(weight[, i] > 0)
dat2 <- getRows(dat, ind)
attr(dat2, "weight") <- weight[ind, i]
fits[[i]] <- update(fit, data = dat2)
}
names(fits) <- colnames(fits)
fits
}
#' @rdname pmlPart
#' @export
multiphyDat2pmlPart <- function(x, method="unrooted", tip.dates=NULL, ...) {
if(is.list(x) && all(sapply(x,inherits, "phyDat"))) seq <- x
else if(inherits(x, "multiphyDat")) seq <- x@seq
else stop("x must be of class multiphyDat or a list of phyDat objects")
shared_tree <- TRUE
if (shared_tree) {
concatenate_x <- do.call(cbind.phyDat, seq)
tree <- candidate_tree(concatenate_x, method=method, tip.dates=tip.dates)
}
else tree <- NULL
fun <- function(x, method, tip.dates, tree, ...) {
if (is.null(tree)) {
tree <- candidate_tree(x, method=method, tip.dates=tip.dates)
}
pml(tree, x, ...)
}
fits <- lapply(seq, fun, tree = tree, ...)
fits
}
#' @rdname pmlPart
#' @export
pmlPart2multiPhylo <- function(x) {
res <- lapply(x$fits, FUN = function(x) x$tree)
class(res) <- "multiPhylo"
res
}
#' @export
plot.pmlPart <- function(x, ...) {
plot(pmlPart2multiPhylo(x), ...)
}
#' Partition model.
#'
#' Model to estimate phylogenies for partitioned data.
#'
#' The \code{formula} object allows to specify which parameter get optimized.
#' The formula is generally of the form \code{edge + bf + Q ~ rate + shape +
#' \dots{}}, on the left side are the parameters which get optimized over all
#' partitions, on the right the parameter which are optimized specific to each
#' partition. The parameters available are \code{"nni", "bf", "Q", "inv",
#' "shape", "edge", "rate"}. Each parameters can be used only once in the
#' formula. \code{"rate"} is only available for the right side of the formula.
#'
#' For partitions with different edge weights, but same topology, \code{pmlPen}
#' can try to find more parsimonious models (see example).
#'
#' \code{pmlPart2multiPhylo} is a convenience function to extract the trees out
#' of a \code{pmlPart} object.
#'
#' @aliases pmlPart
#' @param formula a formula object (see details).
#' @param object an object of class \code{pml} or a list of objects of class
#' \code{pml} .
#' @param control A list of parameters for controlling the fitting process.
#' @param model A vector containing the models containing a model for each
#' partition.
#' @param method One of "unrooted", "ultrametric" or "tiplabeled". Only unrooted
#' is properly supported right now.
#' @param tip.dates A named vector of sampling times associated to the
#' tips/sequences. Leave empty if not estimating tip dated phylogenies.
#' @param \dots Further arguments passed to or from other methods.
#' @param x an object of class \code{pmlPart}
#' @return \code{kcluster} returns a list with elements
#' \item{logLik}{log-likelihood of the fit} \item{trees}{a list of all trees
#' during the optimization.} \item{object}{an object of class \code{"pml"} or
#' \code{"pmlPart"}}
#' @author Klaus Schliep \email{klaus.schliep@@gmail.com}
#' @seealso
#' \code{\link{pml}},\code{\link{pmlCluster}},\code{\link{pmlMix}},
#' \code{\link{SH.test}}
#' @keywords cluster
#' @examples
#'
#' data(yeast)
#' dm <- dist.logDet(yeast)
#' tree <- NJ(dm)
#' fit <- pml(tree,yeast)
#' fits <- optim.pml(fit)
#'
#' weight=xtabs(~ index+genes,attr(yeast, "index"))[,1:10]
#'
#' sp <- pmlPart(edge ~ rate + inv, fits, weight=weight)
#' sp
#'
#' \dontrun{
#' sp2 <- pmlPart(~ edge + inv, fits, weight=weight)
#' sp2
#' AIC(sp2)
#'
#' sp3 <- pmlPen(sp2, lambda = 2)
#' AIC(sp3)
#' }
#'
#' @rdname pmlPart
#' @export pmlPart
pmlPart <- function(formula, object, control = pml.control(epsilon = 1e-8,
maxit = 10, trace = 1), model = NULL, method="unrooted", ...) {
method <- match.arg(method, c("unrooted", "ultrametric", "tipdated"))
if(method=="unrooted") rooted <- FALSE
else rooted <- TRUE
call <- match.call()
form <- phangornParseFormula(formula)
opt <- c("nni", "bf", "Q", "inv", "shape", "edge", "rate")
optAll <- match(opt, form$left)
optPart <- match(opt, form$right)
AllNNI <- !is.na(optAll[1])
AllBf <- !is.na(optAll[2])
AllQ <- !is.na(optAll[3])
AllInv <- !is.na(optAll[4])
AllGamma <- !is.na(optAll[5])
AllEdge <- !is.na(optAll[6])
PartNni <- !is.na(optPart[1])
PartBf <- !is.na(optPart[2])
PartQ <- !is.na(optPart[3])
PartInv <- !is.na(optPart[4])
PartGamma <- !is.na(optPart[5])
PartEdge <- !is.na(optPart[6])
PartRate <- !is.na(optPart[7])
if (PartNni) PartEdge <- TRUE
if(AllNNI) AllEdge <- TRUE
if (inherits(object, "multiphyDat")) {
if (AllNNI || AllEdge) object <- do.call(cbind.phyDat, object@seq)
else fits <- multiphyDat2pmlPart(object, rooted = rooted, ...)
}
if (inherits(object, "pml")) fits <- makePart(object, rooted = rooted, ...)
if (inherits(object, "phyDat")) fits <- makePart(object, rooted = rooted, ...)
if (inherits(object, "pmlPart")) fits <- object$fits
if (inherits(object, "list")) fits <- object
if(AllNNI) if(Ntip(fits[[1]]$tree) < (3 + !rooted)) AllNNI <- FALSE
trace <- control$trace
epsilon <- control$epsilon
maxit <- control$maxit
p <- length(fits)
# if(length(model)<p) model = rep(model, length = p)
if (AllQ) {
Q <- fits[[1]]$Q
for (i in 1:p) fits[[i]] <- update(fits[[i]], Q = Q)
}
if (AllBf) {
bf <- fits[[1]]$bf
for (i in 1:p) fits[[i]] <- update(fits[[i]], bf = bf)
}
if (AllInv) {
inv <- fits[[1]]$inv
for (i in 1:p) fits[[i]] <- update(fits[[i]], inv = inv)
}
if (AllGamma) {
shape <- fits[[1]]$shape
for (i in 1:p) fits[[i]] <- update(fits[[i]], shape = shape)
}
if (AllEdge || AllNNI) {
tree <- fits[[1]]$tree
tree$edge.length <- pmax(tree$edge.length, 1e-6)
for (i in 1:p) fits[[i]] <- update(fits[[i]], tree = tree)
}
m <- 1
logLik <- 0
for (i in 1:p) logLik <- logLik + fits[[i]]$log
eps <- 10
on.exit({
df <- matrix(1, 6, 2)
colnames(df) <- c("#df", "group")
rownames(df) <- c("Edge", "Shape", "Inv", "Bf", "Q", "Rate")
df[1, 1] <- length(fits[[1]]$tree$edge.length)
df[2, 1] <- fits[[1]]$k > 1
df[3, 1] <- fits[[1]]$inv > 0
df[4, 1] <- length(unique(fits[[1]]$bf)) - 1
df[5, 1] <- length(unique(fits[[1]]$Q)) - 1
df[6, 1] <- 0 # rates
if (PartEdge) df[1, 2] <- p
if (PartGamma) df[2, 2] <- p
if (PartInv) df[3, 2] <- p
if (PartBf) df[4, 2] <- p
if (PartQ) df[5, 2] <- p
if (PartRate) df[6, 1] <- p - 1
attr(logLik, "df") <- sum(df[, 1] * df[, 2])
object <- list(logLik = logLik, fits = fits, call = call, df = df)
class(object) <- "pmlPart"
return(object)
})
while (eps > epsilon & m < maxit) {
loli <- 0
# if (AllEdge) fits <- optimPartEdge(fits)
if (any(c(PartNni, PartBf, PartInv, PartQ, PartGamma, PartEdge, PartRate))){
for (i in 1:p) {
fits[[i]] <- optim.pml(fits[[i]], optNni = PartNni, optBf = PartBf,
optQ = PartQ, optInv = PartInv, optGamma = PartGamma,
optEdge = PartEdge, optRate = PartRate, optRooted = rooted,
control = pml.control(maxit = 3, epsilon = 1e-8, trace - 1),
model = model[i])
}
}
if (AllQ) {
Q <- fits[[1]]$Q
subs <- c(1:(length(Q) - 1), 0)
newQ <- optimPartQGeneral(fits, Q = Q, subs = subs)
for (i in 1:p) fits[[i]] <- update(fits[[i]], Q = newQ[[1]])
}
if (AllBf) {
bf <- fits[[1]]$bf
newBf <- optimPartBf(fits, bf = bf)
for (i in 1:p) fits[[i]] <- update(fits[[i]], bf = newBf)
}
if (AllInv) {
inv <- fits[[1]]$inv
newInv <- optimPartInv(fits, inv = inv)
for (i in 1:p) fits[[i]] <- update(fits[[i]], inv = newInv)
}
if (AllGamma) {
shape <- fits[[1]]$shape
newGamma <- optimPartGamma(fits, shape = shape)[[1]]
for (i in 1:p) fits[[i]] <- update(fits[[i]], shape = newGamma)
}
if (AllNNI) {
fits <- optimPartEdge(fits)
fits <- optimPartNNI(fits, AllEdge)
if (trace > 0) cat(attr(fits, "swap"), " NNI operations performed")
if(attr(fits, "swap") == 0) AllNNI <- FALSE
}
if (AllEdge) fits <- optimPartEdge(fits)
if (PartRate) {
tree <- fits[[1]]$tree
rate <- numeric(p)
wp <- numeric(p)
for (i in 1:p) {
wp[i] <- sum(fits[[i]]$weight)
rate[i] <- fits[[i]]$rate
}
ratemult <- sum(wp) / sum(wp * rate)
tree$edge.length <- tree$edge.length / ratemult
for (i in 1:p) fits[[i]] <- update(fits[[i]], tree = tree,
rate = rate[i] * ratemult)
}
loli <- 0
for (i in 1:p) loli <- loli + fits[[i]]$log
eps <- (logLik - loli) / loli
if (trace > 0) cat("loglik:", logLik, "-->", loli, "\n")
logLik <- loli
m <- m + 1
}
}
#
# pmlCluster
#
pmlCluster.fit <- function(formula, fit, weight, p = 4, part = NULL,
control = pml.control(epsilon = 1e-8, maxit = 10,
trace = 1), ...) {
call <- match.call()
form <- phangornParseFormula(formula)
opt <- c("nni", "bf", "Q", "inv", "shape", "edge", "rate")
optAll <- match(opt, form$left)
optPart <- match(opt, form$right)
AllNNI <- !is.na(optAll[1])
AllBf <- !is.na(optAll[2])
AllQ <- !is.na(optAll[3])
AllInv <- !is.na(optAll[4])
AllGamma <- !is.na(optAll[5])
AllEdge <- !is.na(optAll[6])
PartNni <- !is.na(optPart[1])
PartBf <- !is.na(optPart[2])
PartQ <- !is.na(optPart[3])
PartInv <- !is.na(optPart[4])
PartGamma <- !is.na(optPart[5])
PartEdge <- !is.na(optPart[6])
PartRate <- !is.na(optPart[7])
if (PartNni) PartEdge <- TRUE
nrw <- dim(weight)[1]
ncw <- dim(weight)[2]
if (is.null(part)) {
part <- rep(1:p, length = ncw)
part <- sample(part)
}
Part <- part
Gtrees <- vector("list", p)
dat <- fit$data
attr(fit$orig.data, "index") <- attr(dat, "index") <- NULL
for (i in 1:p) Gtrees[[i]] <- fit$tree
fits <- vector("list", p)
for (i in 1:p) fits[[i]] <- fit
trace <- control$trace
eps <- 0
m <- 1
logLik <- fit$log
trees <- list()
weights <- matrix(0, nrw, p)
lls <- matrix(0, nrw, p)
loli <- fit$log
oldpart <- part
eps2 <- 1
iter <- 0
swap <- 1
on.exit({
df <- matrix(1, 6, 2)
colnames(df) <- c("#df", "group")
rownames(df) <- c("Edge", "Shape", "Inv", "Bf", "Q", "Rate")
df[1, 1] <- length(fits[[1]]$tree$edge.length)
df[2, 1] <- fits[[1]]$k - 1
df[3, 1] <- fits[[1]]$inv > 0
df[4, 1] <- length(unique(fits[[1]]$bf)) - 1
df[5, 1] <- length(unique(fits[[1]]$Q)) - 1
df[6, 1] <- 0
if (PartEdge)
df[1, 2] <- p
if (PartGamma)
df[2, 2] <- p
if (PartInv)
df[3, 2] <- p
if (PartBf)
df[4, 2] <- p
if (PartQ)
df[5, 2] <- p
if (PartRate)
df[6, 1] <- p - 1
attr(logLik, "df") <- sum(df[, 1] * df[, 2])
res <- list(logLik = logLik, Partition = Part, trees = trees)
result <- list(logLik = loli, fits = fits, Partition = part, df = df,
res = res, call = call)
class(result) <- c("pmlPart")
return(result)
})
while (eps < ncw || abs(eps2) > control$eps) {
df2 <- 0
if (any(c(PartNni, PartBf, PartInv, PartQ, PartGamma, PartEdge, PartRate))){
for (i in 1:p) {
weights[, i] <- rowSums(weight[, which(part == i),
drop = FALSE])
ind <- which(weights[, i] > 0)
dat2 <- getRows(dat, ind)
attr(dat2, "weight") <- weights[ind, i]
fits[[i]] <- update(fits[[i]], data = dat2)
fits[[i]] <- optim.pml(fits[[i]], PartNni, PartBf,
PartQ, PartInv, PartGamma, PartEdge, PartRate,
control = pml.control(epsilon = 1e-8, maxit = 3, trace - 1))
lls[, i] <- update(fits[[i]], data = dat)$siteLik
Gtrees[[i]] <- fits[[i]]$tree
}
}
if (AllQ) {
Q <- fits[[1]]$Q
subs <- c(1:(length(Q) - 1), 0)
newQ <- optimPartQGeneral(fits, Q = Q, subs = subs)[[1]]
for (i in 1:p) fits[[i]] <- update(fits[[i]], Q = newQ)
df2 <- df2 + length(unique(newQ)) - 1
}
if (AllBf) {
bf <- fits[[1]]$bf
newBf <- optimPartBf(fits, bf = bf)
for (i in 1:p) fits[[i]] <- update(fits[[i]], bf = newBf)
df2 <- df2 + length(unique(newBf)) - 1
}
if (AllInv) {
inv <- fits[[1]]$inv
newInv <- optimPartInv(fits, inv = inv)
for (i in 1:p) fits[[i]] <- update(fits[[i]], inv = newInv)
# there was an Error
df2 <- df2 + 1
}
if (AllGamma) {
shape <- fits[[1]]$shape
newGamma <- optimPartGamma(fits, shape = shape)[[1]]
for (i in 1:p) fits[[i]] <- update(fits[[i]], shape = newGamma)
df2 <- df2 + 1
}
if (AllNNI) {
fits <- optimPartEdge(fits)
fits <- optimPartNNI(fits, AllEdge)
if (trace > 0) cat(attr(fits, "swap"), " NNI operations performed")
swap <- attr(fits, "swap")
}
if (AllEdge) {
fits <- optimPartEdge(fits)
df2 <- df2 + length(fits[[1]]$tree$edge.length)
}
if (PartRate) {
tree <- fits[[1]]$tree
rate <- numeric(p)
wp <- numeric(p)
for (i in 1:p) {
wp[i] <- sum(fits[[i]]$weight)
rate[i] <- fits[[i]]$rate
}
ratemult <- sum(wp) / sum(wp * rate)
tree$edge.length <- tree$edge.length / ratemult
for (i in 1:p) fits[[i]] <- update(fits[[i]], tree = tree,
rate = rate[i] * ratemult)
}
for (i in 1:p) lls[, i] <- update(fits[[i]], data = dat)$siteLik
trees[[m]] <- Gtrees
LL <- t(weight) %*% lls
# choose partitions which change
tmp <- (LL[cbind(1:ncw, part)] - apply(LL, 1, max)) / colSums(weight)
fixi <- numeric(p)
for (i in 1:p) {
tmpi <- which(part == i)
fixi[i] <- tmpi[which.max(tmp[tmpi])]
}
oldpart <- part
# restrict the number of elements changing groups
# If more than 25% would change, only the 25% with the highest increase per
# site change
if (sum(tmp == 0) / length(tmp) < .75) {
medtmp <- quantile(tmp, .25)
medind <- which(tmp <= medtmp)
part[medind] <- max.col(LL[medind, ])
}
else part <- max.col(LL)
# else part <- apply(LL, 1, which.max)
# force groups to have at least one member
part[fixi] <- 1:p
Part <- cbind(Part, part)
eps <- sum(diag(table(part, oldpart)))
eps2 <- loli
loli <- sum(apply(LL, 1, max))
eps2 <- (eps2 - loli) / loli
logLik <- c(logLik, loli)
if (trace > 0) print(loli)
Part <- cbind(Part, part)
df2 <- df2 + df2
if (eps == ncw & swap == 0)
AllNNI <- FALSE
m <- m + 1
if (eps == ncw)
iter <- iter + 1
if (iter == 3)
break
}
}
#' Stochastic Partitioning
#'
#' Stochastic Partitioning of genes into p cluster.
#'
#' The \code{formula} object allows to specify which parameter get optimized.
#' The formula is generally of the form \code{edge + bf + Q ~ rate + shape +
#' \dots{}}, on the left side are the parameters which get optimized over all
#' cluster, on the right the parameter which are optimized specific to each
#' cluster. The parameters available are \code{"nni", "bf", "Q", "inv",
#' "shape", "edge", "rate"}. Each parameter can be used only once in the
#' formula. There are also some restriction on the combinations how parameters
#' can get used. \code{"rate"} is only available for the right side. When
#' \code{"rate"} is specified on the left hand side \code{"edge"} has to be
#' specified (on either side), if \code{"rate"} is specified on the right hand
#' side it follows directly that \code{edge} is too.
#'
#' @param formula a formula object (see details).
#' @param fit an object of class \code{pml}.
#' @param weight \code{weight} is matrix of frequency of site patterns for all
#' genes.
#' @param p number of clusters.
#' @param part starting partition, otherwise a random partition is generated.
#' @param nrep number of replicates for each p.
#' @param control A list of parameters for controlling the fitting process.
#' @param \dots Further arguments passed to or from other methods.
#' @return \code{pmlCluster} returns a list with elements
#' \item{logLik}{log-likelihood of the fit} \item{trees}{a list of all trees
#' during the optimization.} \item{fits}{fits for the final partitions}
#' @author Klaus Schliep \email{klaus.schliep@@gmail.com}
#' @seealso
#' \code{\link{pml}},\code{\link{pmlPart}},\code{\link{pmlMix}},
#' \code{\link{SH.test}}
#' @references K. P. Schliep (2009). Some Applications of statistical
#' phylogenetics (PhD Thesis)
#'
#' Lanfear, R., Calcott, B., Ho, S.Y.W. and Guindon, S. (2012) PartitionFinder:
#' Combined Selection of Partitioning Schemes and Substitution Models for
#' Phylogenetic Analyses. \emph{Molecular Biology and Evolution}, \bold{29(6)},
#' 1695-1701
#' @keywords cluster
#' @examples
#'
#' \dontrun{
#' data(yeast)
#' dm <- dist.logDet(yeast)
#' tree <- NJ(dm)
#' fit <- pml(tree,yeast)
#' fit <- optim.pml(fit)
#'
#' weight <- xtabs(~ index+genes,attr(yeast, "index"))
#' set.seed(1)
#'
#' sp <- pmlCluster(edge~rate, fit, weight, p=1:4)
#' sp
#' SH.test(sp)
#' }
#'
#' @export pmlCluster
pmlCluster <- function(formula, fit, weight, p = 1:5, part = NULL, nrep = 10,
control = pml.control(epsilon = 1e-08, maxit = 10,
trace = 1), ...) {
call <- match.call()
form <- phangornParseFormula(formula)
if (any(p == 1)) {
opt2 <- c("nni", "bf", "Q", "inv", "shape", "edge")
tmp1 <- opt2 %in% form$left
tmp1 <- tmp1 | (opt2 %in% form$right)
fit <- optim.pml(fit, tmp1[1], tmp1[2], tmp1[3], tmp1[4],
tmp1[5], tmp1[6], control = control)
}
p <- p[p != 1]
if (length(p) == 0) return(fit)
n <- sum(weight)
k <- 2
BIC <- matrix(0, length(p) + 1, nrep)
BIC[1, ] <- AIC(fit, k = log(n))
LL <- matrix(NA, length(p) + 1, nrep)
LL[1, ] <- logLik(fit)
P <- array(dim = c(length(p) + 1, nrep, dim(weight)[2]))
tmpBIC <- Inf
choice <- c(1, 1)
for (j in p) {
tmp <- NULL
for (i in 1:nrep) {
tmp <- pmlCluster.fit(formula, fit, weight, p = j, part = part,
control = control, ...)
P[k, i, ] <- tmp$Partition
BIC[k, i] <- AIC(tmp, k = log(n))
LL[k, i] <- logLik(tmp)
if (BIC[k, i] < tmpBIC) {
tmpBIC <- BIC[k, i]
result <- tmp
choice <- c(k, i)
}
}
k <- k + 1
}
p <- c(1, p)
result$choice <- choice
result$BIC <- BIC
result$AllPartitions <- P
result$AllLL <- LL
result$p <- p
class(result) <- c("pmlCluster", "pmlPart")
result
}
#' @export
plot.pmlCluster <- function(x, which = c(1L:3L), caption =
list("BIC", "log-likelihood", "Partitions"), ...) {
show <- rep(FALSE, 3)
show[which] <- TRUE
choice <- x$choice
if (show[1]) {
X <- x$AllPartitions[choice[1], , ]
d <- dim(X)
ind <- order(X[choice[2], ])
im <- matrix(0, d[2], d[2])
for (j in 1:d[1]) {
for (i in 1:d[2]) im[i, ] <- im[i, ] + (X[j, ] == X[j, i])
}
image(im[ind, ind], ...)
}
if (show[1]) matplot(x$p, x$BIC, ylab = "BIC", xlab = "number of clusters")
if (show[1]) matplot(x$p, x$AllLL, ylab = "log-likelihood",
xlab = "number of clusters")
}
#' @export
print.pmlPart <- function(x, ...) {
levels <- attr(x$fits[[1]]$data, "levels")
r <- length(x$fits)
nc <- attr(x$fits[[1]]$data, "nc")
k <- x$fits[[1]]$k
lbf <- x$df["Bf", 2]
bf <- matrix(0, lbf, nc)
if (lbf > 1) dimnames(bf) <- list(1:r, levels)
lQ <- x$df["Q", 2]
Q <- matrix(0, lQ, nc * (nc - 1) / 2)
if (lQ > 1) dimnames(Q) <- list(1:r, NULL)
type <- attr(x$fits[[1]]$data, "type")
loli <- numeric(r)
rate <- numeric(r)
shape <- numeric(r)
sizes <- numeric(r)
inv <- numeric(r)
for (i in 1:r) {
loli[i] <- x$fits[[i]]$logLik
if (i <= lbf) bf[i, ] <- x$fits[[i]]$bf
if (i <= lQ) Q[i, ] <- x$fits[[i]]$Q
rate[i] <- x$fits[[i]]$rate
shape[i] <- x$fits[[i]]$shape
inv[i] <- x$fits[[i]]$inv
sizes[i] <- sum(attr(x$fits[[i]]$data, "weight"))
}
cat("\nloglikelihood:", x$logLik, "\n")
cat("\nloglikelihood of partitions:\n ", loli, "\n")
cat("AIC: ", AIC(x), " BIC: ", AIC(x, k = log(sum(sizes))), "\n\n")
cat("Proportion of invariant sites:", inv, "\n")
cat("\nRates:\n")
cat(rate, "\n")
if (k > 1) {
cat("\nShape parameter:\n")
cat(shape, "\n")
}
if (type == "AA") cat("Rate matrix:", x$fits[[1]]$model, "\n")
else {
cat("\nBase frequencies: \n")
print(bf)
cat("\nRate matrix:\n")
print(Q)
}
}
#' @export
logLik.pmlPart <- function(object, ...) {
res <- object$logLik
attr(res, "df") <- sum(object$df[, 1] * object$df[, 2])
class(res) <- "logLik"
res
}
optim.quartet <- function(old.el, eig, bf, dat, g = 1, w = 1, weight,
ll.0 = weight * 0, control = list(eps = 1e-08,
maxit = 5, trace = 0, tau=1e-8), llcomp = -Inf) {
eps <- 1
iter <- 0
evi <- (t(eig[[3]]) * bf)
tau <- control$tau
while (eps > control$eps && iter < control$maxit) {
tmp <- fn.quartet(old.el = old.el, eig = eig, bf = bf, dat = dat,
g = g, w = w, weight = weight, ll.0 = ll.0)
old.ll <- tmp$ll
el1 <- fs(old.el[1], eig, tmp$res[, 1], dat[, 1], weight, g = g, w = w,
bf = bf, ll.0 = ll.0, evi, tau = tau, getA = TRUE, getB = FALSE)
el2 <- fs(old.el[2], eig, el1[[2]], dat[, 2], weight, g = g, w = w,
bf = bf, ll.0 = ll.0, evi, tau = tau, getA = TRUE, getB = FALSE)
el5 <- fs(old.el[5], eig, el2[[2]], tmp$res[, 2], weight, g = g, w = w,
bf = bf, ll.0 = ll.0, evi, tau = tau, getA = FALSE, getB = TRUE)
el3 <- fs(old.el[3], eig, el5[[3]], dat[, 3], weight, g = g, w = w,
bf = bf, ll.0 = ll.0, evi, tau = tau, getA = TRUE, getB = FALSE)
el4 <- fs(old.el[4], eig, el3[[2]], dat[, 4], weight, g = g, w = w,
bf = bf, ll.0 = ll.0, evi, tau = tau, getA = FALSE, getB = FALSE)
old.el[1] <- el1[[1]]
old.el[2] <- el2[[1]]
old.el[3] <- el3[[1]]
old.el[4] <- el4[[1]]
old.el[5] <- el5[[1]]
iter <- iter + 1
ll <- el4[[4]]
eps <- (old.ll - ll) / ll
if (ll < llcomp) return(list(old.el, ll))
old.ll <- ll
}
list(old.el, ll)
}
optNNI <- function(fit, INDEX) {
tree <- fit$tree
ll.0 <- fit$ll.0
bf <- fit$bf
eig <- fit$eig
# k <- fit$k
w <- fit$w
g <- fit$g
rootEdges <- attr(INDEX, "root")
.dat <- NULL
parent <- tree$edge[, 1]
child <- tree$edge[, 2]
data <- getCols(fit$data, tree$tip.label)
datp <- rnodes(tree, data, w, g, eig, bf)
tmp <- length(tree$tip.label)
for (i in seq_along(w)) .dat[i, 1:tmp] <- new2old.phyDat(data)
evector <- numeric(max(parent))
evector[child] <- tree$edge.length
m <- dim(INDEX)[1]
loglik <- numeric(2 * m)
edgeMatrix <- matrix(0, 2 * m, 5)
for (i in 1:m) {
ei <- INDEX[i, ]
el0 <- evector[INDEX[i, ]]
l <- length(datp[, 1])
weight <- fit$weight
datn <- vector("list", 4 * l)
attr(datn, "dim") <- c(l, 4)
datn <- .dat[, ei[1:4], drop = FALSE]
if (!(ei[5] %in% rootEdges))
datn[, 1] <- datp[, ei[1], drop = FALSE]
new1 <- optim.quartet(el0[c(1, 3, 2, 4, 5)],
eig, bf, datn[, c(1, 3, 2, 4), drop = FALSE], g,
w, weight, ll.0, llcomp = fit$log)
new2 <- optim.quartet(el0[c(1, 4, 3, 2, 5)],
eig, bf, datn[, c(1, 4, 3, 2), drop = FALSE], g,
w, weight, ll.0, llcomp = fit$log)
loglik[(2 * i) - 1] <- new1[[2]]
loglik[(2 * i)] <- new2[[2]]
edgeMatrix[(2 * i) - 1, ] <- new1[[1]]
edgeMatrix[(2 * i), ] <- new2[[1]]
}
list(loglik = loglik, edges = edgeMatrix)
}
optimPartNNI <- function(object, AllEdge = TRUE, trace=0, ...) {
tree <- object[[1]]$tree
INDEX <- indexNNI(tree)
l <- length(object)
loglik0 <- 0
for (i in 1:l) loglik0 <- loglik0 + logLik(object[[i]])
l <- length(object)
TMP <- vector("list", l)
for (i in 1:l) {
TMP[[i]] <- optNNI(object[[i]], INDEX)
}
loglik <- TMP[[1]][[1]]
for (i in 2:l) loglik <- loglik + TMP[[i]][[1]]
swap <- 0
candidates <- loglik > loglik0
while (any(candidates)) {
ind <- which.max(loglik)
loglik[ind] <- -Inf
if (ind %% 2)
swap.edge <- c(2, 3)
else swap.edge <- c(2, 4)
tree2 <- changeEdge(tree, INDEX[(ind + 1) %/% 2, swap.edge],
INDEX[(ind + 1) %/% 2, ], TMP[[1]][[2]][ind, ])
tmpll <- 0
for (i in 1:l) {
if (!AllEdge) tree2 <- changeEdge(object[[i]]$tree, INDEX[(ind + 1) %/% 2,
swap.edge], INDEX[(ind + 1) %/% 2, ], TMP[[i]][[2]][ind, ])
tmpll <- tmpll + update(object[[i]], tree = tree2)$logLik
}
if (tmpll < loglik0)
candidates[ind] <- FALSE
if (tmpll > loglik0 + 1e-8) {
swap <- swap + 1
tree <- tree2
indi <- which(rep(colSums(apply(INDEX, 1, match,
INDEX[(ind + 1) %/% 2, ], nomatch = 0)) > 0, each = 2))
candidates[indi] <- FALSE
loglik[indi] <- -Inf
for (i in 1:l) {
if (!AllEdge) tree2 <- changeEdge(object[[i]]$tree,
INDEX[(ind + 1) %/% 2, swap.edge],
INDEX[(ind + 1) %/% 2, ], TMP[[i]][[2]][ind, ])
object[[i]] <- update(object[[i]], tree = tree2)
}
loglik0 <- 0
for (i in 1:l) loglik0 <- loglik0 + logLik(object[[i]])
if(trace>0) cat(loglik0, "\n")
}
}
if (AllEdge) object <- optimPartEdge(object)
attr(object, "swap") <- swap
object
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.