#' Parsimony tree.
#'
#' \code{pratchet} implements the parsimony ratchet (Nixon, 1999) and is the
#' preferred way to search for the best parsimony tree. For small number of taxa
#' the function \code{\link{bab}} can be used to compute all most parsimonious
#' trees.
#'
#' \code{parsimony} returns the parsimony score of a tree using either the
#' sankoff or the fitch algorithm.
#' \code{optim.parsimony} optimizes the topology using either Nearest Neighbor
#' Interchange (NNI) rearrangements or sub tree pruning and regrafting (SPR) and
#' is used inside \code{pratchet}. \code{random.addition} can be used to produce
#' starting trees and is an option for the argument perturbation in
#' \code{pratchet}.
#'
#' The "SPR" rearrangements are so far only available for the "fitch" method,
#' "sankoff" only uses "NNI". The "fitch" algorithm only works correct for
#' binary trees.
#'
#' @aliases parsimony
#' @param data A object of class phyDat containing sequences.
#' @param tree tree to start the nni search from.
#' @param method one of 'fitch' or 'sankoff'.
#' @param cost A cost matrix for the transitions between two states.
#' @param site return either 'pscore' or 'site' wise parsimony scores.
#' @param trace defines how much information is printed during optimization.
#' @param rearrangements SPR or NNI rearrangements.
#' @param start a starting tree can be supplied.
#' @param maxit maximum number of iterations in the ratchet.
#' @param minit minimum number of iterations in the ratchet.
#' @param k number of rounds ratchet is stopped, when there is no improvement.
#' @param all return all equally good trees or just one of them.
#' @param perturbation whether to use "ratchet", "random_addition" or
#' "stochastic" (nni) for shuffling the tree.
#' @param ... Further arguments passed to or from other methods (e.g.
#' model="sankoff" and cost matrix).
#' @return \code{parsimony} returns the maximum parsimony score (pscore).
#' \code{optim.parsimony} returns a tree after NNI rearrangements.
#' \code{pratchet} returns a tree or list of trees containing the best tree(s)
#' found during the search. \code{acctran} returns a tree with edge length
#' according to the ACCTRAN criterion.
#' @author Klaus Schliep \email{klaus.schliep@@gmail.com}
#' @seealso \code{\link{bab}}, \code{\link{CI}}, \code{\link{RI}},
#' \code{\link{ancestral.pml}}, \code{\link{nni}}, \code{\link{NJ}},
#' \code{\link{pml}}, \code{\link{getClans}}, \code{\link{ancestral.pars}},
#' \code{\link{bootstrap.pml}}
#' @references Felsenstein, J. (2004). \emph{Inferring Phylogenies}. Sinauer
#' Associates, Sunderland.
#'
#' Nixon, K. (1999) The Parsimony Ratchet, a New Method for Rapid Parsimony
#' Analysis. \emph{Cladistics} \bold{15}, 407-414
#' @keywords cluster
#' @examples
#'
#' set.seed(3)
#' data(Laurasiatherian)
#' dm <- dist.hamming(Laurasiatherian)
#' tree <- NJ(dm)
#' parsimony(tree, Laurasiatherian)
#' treeRA <- random.addition(Laurasiatherian)
#' treeSPR <- optim.parsimony(tree, Laurasiatherian)
#'
#' # lower number of iterations for the example (to run less than 5 seconds),
#' # keep default values (maxit, minit, k) or increase them for real life
#' # analyses.
#' treeRatchet <- pratchet(Laurasiatherian, start=tree, maxit=100,
#' minit=5, k=5, trace=0)
#' # assign edge length (number of substitutions)
#' treeRatchet <- acctran(treeRatchet, Laurasiatherian)
#' # remove edges of length 0
#' treeRatchet <- di2multi(treeRatchet)
#'
#' plot(midpoint(treeRatchet))
#' add.scale.bar(0,0, length=100)
#'
#' parsimony(c(tree,treeSPR, treeRatchet), Laurasiatherian)
#'
#' @rdname parsimony
#' @export
## parsimony <- function(tree, data, cost=NULL, method = NULL)
parsimony <- function(tree, data, method = "fitch", cost=NULL, site = "pscore"){
if(inherits(data, "DNAbin") | inherits(data, "AAbin")) data <- as.phyDat(data)
if (!inherits(data, "phyDat")) stop("data must be of class phyDat")
method <- match.arg(method, c("fitch", "sankoff"))
if(!any(is.binary(tree)) || !is.null(cost)) method <- "sankoff"
if (method == "sankoff") result <- sankoff(tree, data, cost=cost, site = site)
if (method == "fitch") result <- fitch(tree, data, site = site)
result
}
compressSites <- function(data) {
attrData <- attributes(data)
lev <- attr(data, "levels")
LEV <- attr(data, "allLevels")
l <- length(lev)
nr <- attr(data, "nr")
nc <- length(data)
data <- unlist(data, FALSE, FALSE)
attr(data, "dim") <- c(nr, nc)
uni <- match(lev, LEV)
fun <- function(x, uni) {
u <- unique.default(x)
res <- if (any(is.na(match(u, uni)))) return(x)
match(x, u)
}
data <- apply(data, 1, fun, uni)
index <- grp_duplicated(data, MARGIN=2L)
pos <- which(!duplicated(index))
ntaxa <- nrow(data)
res <- vector("list", ntaxa)
for(i in seq_len(ntaxa)) res[[i]] <- data[i, pos]
attrData$weight <- as.vector(tapply(attrData$weight, index, sum))
attrData$index <- NULL
attrData$nr <- length(attrData$weight)
attrData$compressed <- TRUE
attributes(res) <- attrData
res
}
parsinfo <- function(x, exact=TRUE) {
nstates <- attr(x, "nc")
up <- upperBound(x)
eps <- 1e-8
low <- up
low[up > (nstates - eps)] <- nstates - 1
if(exact){
ind <- which( (up > (1+eps)) & (up < (nstates-eps)) )
if(length(ind)>0) low[ind] <- lowerBound(getRows( x, ind ))
ind <- which(low == up)
}
else ind <- which(up < (1+eps) )
cbind(ind, low[ind])
}
# greedy algorithm of Maximum Set Packing (MSP) problem
# (should work in most instances)
lowerBound <- function(x, cost = NULL) {
nc <- attr(x, "nc")
nr <- attr(x, "nr")
contrast <- attr(x, "contrast")
rownames(contrast) <- attr(x, "allLevels")
colnames(contrast) <- attr(x, "levels")
nmax <- nrow(contrast)
z <- matrix(unlist(x, FALSE, FALSE), length(x), length(attr(x, "weight")),
byrow = TRUE)
states <- apply(z, 2, unique.default, nmax = nmax)
if(inherits(states, "matrix"))states <- asplit(states, 2)
singles <- which(rowSums(contrast) == 1)
noinfo <- which(rowSums(contrast) == nc)
ambiguous <- which( (rowSums(contrast) > 1) & (rowSums(contrast) < nc))
fun <- function(states, contrast, singles, noinfo, ambiguous) {
if (length(states) == 1) return(0)
states <- setdiff(states, noinfo) # get rid of "-", "?" in DNA
if ( (length(states) == 0) | (length(states) == 1)) return(0)
if (any(states %in% ambiguous)) {
n <- 0L
contrast <- contrast[states, , drop = FALSE]
while (nrow(contrast) > 0) {
m <- which.max(colSums(contrast))
contrast <- contrast[contrast[, m] == 0, , drop = FALSE]
n <- n + 1L
}
return(n - 1L)
}
else return(length(states) - 1L)
}
res <- sapply(states, fun, contrast, singles, noinfo, ambiguous)
res
}
upperBound <- function(x, cost = NULL) {
tree <- stree(length(x), tip.label = names(x))
if (is.null(cost)) cost <- 1 - diag(attr(x, "nc"))
sankoff(tree, x, cost = cost, site = "site")
}
#' Consistency Index and Retention Index
#'
#' \code{CI} and \code{RI} compute the Consistency Index (CI) and Retention
#' Index (RI).
#'
#' @details The Consistency Index is defined as minimum number of changes
#' divided by the number of changes required on the tree (parsimony score). The
#' Consistency Index is equal to one if there is no homoplasy.
#' And the Retention Index is defined as
#' \deqn{RI = \frac{MaxChanges - ObsChanges}{MaxChanges - MinChanges}}{RI = (MaxChanges - ObsChanges) / (MaxChanges - MinChanges)}
#'
#' @param data A object of class phyDat containing sequences.
#' @param tree a phylogenetic tree, i.e. an object of class \code{phylo}.
#' @param cost A cost matrix for the transitions between two states.
#' @param sitewise return CI/RI for alignment or sitewise
#' @returns a scalar or vector with the CI/RI vector.
#' @seealso \code{\link{parsimony}}, \code{\link{pratchet}},
#' \code{\link{fitch}}, \code{\link{sankoff}}, \code{\link{bab}},
#' \code{\link{ancestral.pars}}
#' @examples
#' example(as.phylo.formula)
#' lab <- tr$tip.label
#' lab[79] <- "Herpestes fuscus"
#' tr$tip.label <- abbreviateGenus(lab)
#' X <- matrix(0, 112, 3, dimnames = list(tr$tip.label, c("Canis", "Panthera",
#' "Canis_Panthera")))
#' desc_canis <- Descendants(tr, "Canis")[[1]]
#' desc_panthera <- Descendants(tr, "Panthera")[[1]]
#' X[desc_canis, c(1,3)] <- 1
#' X[desc_panthera, c(2,3)] <- 1
#' col <- rep("black", 112)
#' col[desc_panthera] <- "red"
#' col[desc_canis] <- "blue"
#' X <- phyDat(X, "USER", levels=c(0,1))
#' plot(tr, "f", tip.color=col)
#' # The first two sites are homoplase free!
#' CI(tr, X, sitewise=TRUE)
#' RI(tr, X, sitewise=TRUE)
#'
#' @rdname CI
#' @export
CI <- function(tree, data, cost = NULL, sitewise = FALSE) {
data <- subset(data, tree$tip.label)
pscore <- sankoff(tree, data, cost, ifelse(sitewise, "site", "pscore"))
weight <- attr(data, "weight")
m <- lowerBound(data, cost = cost)
if (sitewise) {
return( (m / pscore)[attr(data, "index")])
}
sum(m * weight) / pscore
}
#' @rdname CI
#' @export
RI <- function(tree, data, cost = NULL, sitewise = FALSE) {
data <- subset(data, tree$tip.label)
pscore <- sankoff(tree, data, cost, ifelse(sitewise, "site", "pscore"))
weight <- attr(data, "weight")
m <- lowerBound(data, cost = cost)
g <- upperBound(data, cost = cost)
if (sitewise) {
res <- (g - pscore) / (g - m)
return(res[attr(data, "index")])
}
m <- sum(m * weight)
g <- sum(g * weight)
(g - pscore) / (g - m)
}
old2new.phyDat <- function(obj) {
att <- attributes(obj)
l <- length(obj)
contrast <- attr(obj, "contrast")
nr <- attr(obj, "nr")
X <- matrix(rep(rowSums(contrast), each = nr), nrow = nr)
for (i in 1:l) obj[[i]][obj[[i]] > 0] <- 1
res <- vector("list", l)
contrast[contrast == 0] <- 1e6
for (i in 1:l) {
tmp <- tcrossprod(obj[[i]], contrast) - X
res[[i]] <- unlist(apply(tmp, 1, function(x) which(x < 1e-6)[1]))
}
attributes(res) <- att
res
}
new2old.phyDat <- function(data) {
contrast <- attr(data, "contrast")
for (i in seq_along(data)) data[[i]] <- contrast[data[[i]], , drop = FALSE]
data
}
indexNNI <- function(tree) {
parent <- tree$edge[, 1]
child <- tree$edge[, 2]
ind <- which(child %in% parent)
Nnode <- tree$Nnode
edgeMatrix <- matrix(0, (Nnode - 1), 5)
pvector <- integer(max(parent))
pvector[child] <- parent
tips <- !logical(max(parent))
tips[parent] <- FALSE
# wahrscheinlich schneller: cvector <- allChildren(tree)
cvector <- vector("list", max(parent))
for (i in seq_along(parent)) cvector[[parent[i]]] <- c(cvector[[parent[i]]],
child[i])
k <- 0
for (i in ind) {
p1 <- parent[i]
p2 <- child[i]
e34 <- cvector[[p2]]
ind1 <- cvector[[p1]]
e12 <- ind1[ind1 != p2]
if (pvector[p1]) e12 <- c(p1, e12)
edgeMatrix[k + 1, ] <- c(e12, e34, p2)
k <- k + 1
}
# vielleicht raus
attr(edgeMatrix, "root") <- cvector[[min(parent)]]
edgeMatrix
}
#' @rdname parsimony
#' @export
optim.parsimony <- function(tree, data, method = "fitch", cost = NULL,
trace = 1, rearrangements = "SPR", ...) {
if (method == "fitch") result <- optim.fitch(tree = tree, data = data,
trace = trace, rearrangements = rearrangements, ...)
if (method == "sankoff") result <- optim.sankoff(tree = tree, data = data,
cost = cost, trace = trace, ...)
result
}
#' @rdname parsimony
#' @export
pratchet <- function(data, start = NULL, method = "fitch", maxit = 1000,
minit = 100, k = 10, trace = 1, all = FALSE,
rearrangements = "SPR", perturbation = "ratchet", ...) {
if(inherits(data, "DNAbin") || inherits(data, "AAbin"))
data <- as.phyDat(data)
printevery <- 10L
eps <- 1e-08
trace <- trace - 1
ref <- names(data)
start_trees <- vector("list", maxit)
search_trees <- vector("list", maxit)
tree <- NULL
mp <- Inf
# TODO use rooted trees if cost is not symmetric
ROOTED <- FALSE
weight <- attr(data, "weight")
v <- rep(seq_along(weight), weight)
w <- logical(length(weight))
# remove parsimony uniformative sites or duplicates
# check for symmetric or
attr(data, "informative") <- NULL
if(method=="fitch") data <- removeParsimonyUninfomativeSites(data,
recursive=TRUE)
else data <- unique(data)
if(!is.null(attr(data, "informative"))) w[attr(data, "informative")] <- TRUE
else w[] <- TRUE
star_tree <- ifelse(attr(data, "nr") == 0, TRUE, FALSE)
add_taxa <- ifelse(is.null(attr(data, "duplicated")), FALSE, TRUE)
nTips <- length(data)
# check for trivial trees
if (nTips < (3L + !ROOTED) || star_tree) {
nam <- names(data)
if (star_tree) tree <- stree(length(nam), tip.label = nam)
else tree <- stree(nTips, tip.label = nam)
if(add_taxa) tree <- addTaxa(tree, attr(data, "duplicated"))
if(!ROOTED) tree <- unroot(tree)
tree <- relabel(tree, ref)
return(tree)
}
if(is.null(start)) start <- optim.parsimony(random.addition(data),
data, trace = trace-1, method = method,
rearrangements = rearrangements, ...)
tree <- start
label <- intersect(tree$tip.label, names(data))
if (!is.binary(tree)){
tree <- multi2di(tree)
if(method=="fitch") tree <- unroot(tree)
}
data <- subset(data, label)
tree <- keep.tip(tree, label)
attr(tree, "pscore") <- parsimony(tree, data, method = method, ...)
mp <- attr(tree, "pscore")
if (trace >= 0)
cat("Parsimony score of initial tree:", attr(tree, "pscore"), "\n")
FUN <- function(data, tree, method, rearrangements, ...)
optim.parsimony(tree, data = data, method = method,
rearrangements = rearrangements, ...)
result <- tree
if(!is.null(attr(data, "duplicated"))){
result <- addTaxa(result, attr(data, "duplicated"))
}
result <- relabel(result, ref)
# if (trace > 1) cat("optimize topology (NNI): ", pscore, "-->", psc, "\n")
hr <- hash(result)
on.exit({
if (!all && inherits(result, "multiPhylo")) result <- result[[1]]
if (length(result) == 1) result <- result[[1]]
env <- new.env()
start_trees <- start_trees[seq_len(i)]
search_trees <- search_trees[seq_len(i)]
class(start_trees) <- "multiPhylo"
class(search_trees) <- "multiPhylo"
start_trees <- .compressTipLabel(start_trees)
search_trees <- .compressTipLabel(search_trees)
assign("start_trees", start_trees, envir=env)
assign("search_trees", search_trees, envir=env)
if(perturbation == "ratchet" && all(Ntip(trees) > 3)) {
spl <- as.splits(start_trees)
result <- addConfidences(result, spl)
if (inherits(result, "multiPhylo")) result <- .compressTipLabel(result)
}
# for ratchet assign bs values
attr(result, "env") <- env
return(result)
})
kmax <- 1
nTips <- length(tree$tip.label)
for (i in seq_len(maxit)) {
if (perturbation == "ratchet") {
# sample and subset more efficient than in bootstrap.phyDat
bsw <- tabulate(sample(v, replace = TRUE), length(weight))[w]
bs_ind <- which(bsw > 0)
bs_data <- getRows(data, bs_ind)
attr(bs_data, "weight") <- bsw[bs_ind]
if(length(bs_ind) > 0){
# p_trees <- random.addition(bs_data) # 3 * ??
p_trees <- optim.parsimony(tree, bs_data,
trace = trace, method = method, rearrangements = rearrangements, ...)
}
else p_trees <- stree(length(data), tip.label = names(data))
}
if (perturbation == "stochastic") p_trees <- rNNI(tree, floor(nTips / 2))
if (perturbation == "random_addition") p_trees <- random.addition(data)
trees <- optim.parsimony(p_trees, data, trace = trace, method = method,
rearrangements = rearrangements, ...)
curr_tree <- trees
if(!is.null(attr(data, "duplicated"))){
p_trees <- addTaxa(p_trees, attr(data, "duplicated"))
trees <- addTaxa(trees, attr(data, "duplicated"))
}
trees <- relabel(trees, ref)
p_trees <- relabel(p_trees, ref)
start_trees[[i]] <- p_trees
search_trees[[i]] <- trees
pscores <- attr(trees, "pscore")
mp1 <- min(pscores)
if ( (mp1 + eps) < mp) {
kmax <- 1
result <- trees
tree <- curr_tree
hr <- hash(trees)
mp <- mp1
}
else{
kmax <- kmax + 1
if( all && (mp1 < (mp + eps))){ # && all(RF.dist(trees, result) > 0)
ht <- hash(trees)
if(!(ht %in% hr)){
hr <- c(hr, ht)
result <- c(result, trees)
}
}
}
if (trace >= 0 && (!i%%printevery))
cat("\rIteration: ", i, ". Best parsimony score so far: ", mp, sep="")
if ( (kmax >= k) && (i >= minit)) break()
} # for
if (trace >= 0)cat("\n")
} # pratchet
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.