Nothing
#### TRONCO: a tool for TRanslational ONCOlogy
####
#### Copyright (c) 2015-2017, Marco Antoniotti, Giulio Caravagna, Luca De Sano,
#### Alex Graudenzi, Giancarlo Mauri, Bud Mishra and Daniele Ramazzotti.
####
#### All rights reserved. This program and the accompanying materials
#### are made available under the terms of the GNU GPL v3.0
#### which accompanies this distribution.
# reconstruct the best topology based on probabilistic causation and maximum likelihood estimation
# @title gabow.fit
# @param dataset a dataset describing a progressive phenomenon
# @param regularization regularizators to be used for the likelihood fit
# @param score the score to be used, could be either pointwise mutual information (pmi) or conditional entropy (entropy)
# @param do.boot should I perform bootstrap? Yes if TRUE, no otherwise
# @param nboot integer number (greater than 0) of bootstrap sampling to be performed
# @param pvalue pvalue for the tests (value between 0 and 1)
# @param min.boot minimum number of bootstrapping to be performed
# @param min.stat should I keep bootstrapping untill I have nboot valid values?
# @param boot.seed seed to be used for the sampling
# @param silent should I be verbose?
# @param epos error rate of false positive errors
# @param eneg error rate of false negative errors
# @param hypotheses hypotheses to be considered in the reconstruction. This should be NA for this algorithms.
# @return topology: the reconstructed tree topology
#
gabow.fit <- function(dataset,
regularization = "no_reg",
score = "pmi",
do.boot = TRUE,
nboot = 100,
pvalue = 0.05,
min.boot = 3,
min.stat = TRUE,
boot.seed = NULL,
silent = FALSE,
epos = 0.0,
eneg = 0.0,
do.raising = TRUE,
hypotheses = NA) {
## Start the clock to measure the execution time.
ptm = proc.time();
## Structure with the set of valid edges
## I start from the complete graph, i.e., I have no prior and all
## the connections are possibly causal.
adj.matrix = array(1, c(ncol(dataset), ncol(dataset)));
colnames(adj.matrix) = colnames(dataset);
rownames(adj.matrix) = colnames(dataset);
## The diagonal of the adjacency matrix should not be considered,
## i.e., no self cause is allowed.
diag(adj.matrix) = 0;
## Consider any hypothesis.
adj.matrix = hypothesis.adj.matrix(hypotheses, adj.matrix);
## Check if the dataset is valid.
valid.dataset = check.dataset(dataset, adj.matrix, FALSE, epos, eneg)
adj.matrix = valid.dataset$adj.matrix;
invalid.events = valid.dataset$invalid.events;
## Reconstruct the prima facie topology
## Should I perform bootstrap? Yes if TRUE, no otherwise.
if (do.boot == TRUE) {
if (!silent)
cat('*** Bootstraping selective advantage scores (prima facie).\n')
prima.facie.parents =
get.prima.facie.parents.do.boot(dataset,
hypotheses,
nboot,
pvalue,
adj.matrix,
min.boot,
min.stat,
boot.seed,
silent,
epos,
eneg);
} else {
if (!silent)
cat('*** Computing selective advantage scores (prima facie).\n')
prima.facie.parents =
get.prima.facie.parents.no.boot(dataset,
hypotheses,
adj.matrix,
silent,
epos,
eneg);
}
## Add back in any connection invalid for the probability raising
## theory.
if (length(invalid.events) > 0) {
# save the correct acyclic matrix
adj.matrix.cyclic.tp.valid = prima.facie.parents$adj.matrix$adj.matrix.cyclic.tp
adj.matrix.cyclic.valid = prima.facie.parents$adj.matrix$adj.matrix.cyclic
adj.matrix.acyclic.valid = prima.facie.parents$adj.matrix$adj.matrix.acyclic
for (i in 1:nrow(invalid.events)) {
prima.facie.parents$adj.matrix$adj.matrix.cyclic.tp[invalid.events[i, "cause"],invalid.events[i, "effect"]] = 1
prima.facie.parents$adj.matrix$adj.matrix.cyclic[invalid.events[i, "cause"],invalid.events[i, "effect"]] = 1
prima.facie.parents$adj.matrix$adj.matrix.acyclic[invalid.events[i, "cause"],invalid.events[i, "effect"]] = 1
}
# if the new cyclic.tp contains cycles use the previously computed matrix
if (!is.dag(graph.adjacency(prima.facie.parents$adj.matrix$adj.matrix.cyclic.tp))) {
prima.facie.parents$adj.matrix$adj.matrix.cyclic.tp = adj.matrix.cyclic.tp.valid
}
# if the new cyclic contains cycles use the previously computed matrix
if (!is.dag(graph.adjacency(prima.facie.parents$adj.matrix$adj.matrix.cyclic))) {
prima.facie.parents$adj.matrix$adj.matrix.cyclic = adj.matrix.cyclic.valid
}
# if the new acyclic contains cycles use the previously computed matrix
if (!is.dag(graph.adjacency(prima.facie.parents$adj.matrix$adj.matrix.acyclic))) {
prima.facie.parents$adj.matrix$adj.matrix.acyclic = adj.matrix.acyclic.valid
}
}
adj.matrix.prima.facie =
prima.facie.parents$adj.matrix$adj.matrix.acyclic
# set the input prior matrix for Gabow
if(do.raising==TRUE) {
adj.matrix.gabow = prima.facie.parents$adj.matrix$adj.matrix.cyclic
}
else {
adj.matrix.gabow = prima.facie.parents$adj.matrix$adj.matrix.cyclic.tp
}
## Perform the likelihood fit with the required strategy
model = list();
for(my_score in score) {
# as Gabow algorithm is computationally expensive, I first reconstruct with "no_reg" for each score
best.parents =
perform.likelihood.fit.gabow(dataset,
adj.matrix.gabow,
regularization = "no_reg",
score = my_score,
marginal.probs = prima.facie.parents$marginal.probs,
joint.probs = prima.facie.parents$joint.probs)
# if Gabow was not performed due to memory issues, use Edmonds instead
fallback.edmonds = FALSE
if(is.null(best.parents)) {
fallback.edmonds = TRUE
best.parents =
perform.likelihood.fit.edmonds(dataset,
adj.matrix.prima.facie,
regularization = "no_reg",
score = my_score,
marginal.probs = prima.facie.parents$marginal.probs,
joint.probs = prima.facie.parents$joint.probs)
}
# now I perform the likelihood fit with each regularizator
for (reg in regularization) {
## Perform the likelihood fit with the chosen regularization
## score on the prima facie topology
if (!silent)
cat('*** Performing likelihood-fit with regularization:', reg, '.\n')
# adjacency matrix of the topology reconstructed by likelihood fit
curr_adj.matrix.fit = array(0,c(nrow(best.parents$adj.matrix$adj.matrix.fit),ncol(best.parents$adj.matrix$adj.matrix.fit)))
rownames(curr_adj.matrix.fit) = colnames(dataset)
colnames(curr_adj.matrix.fit) = colnames(dataset)
curr_adj.matrix.fit = lregfit(as.categorical.dataset(dataset),
best.parents$adj.matrix$adj.matrix.fit,
curr_adj.matrix.fit,
reg,
command = "hc")
## Save the results and return them.
adj.matrix_reg = list(adj.matrix.pf = adj.matrix.gabow,
adj.matrix.fit = curr_adj.matrix.fit)
curr_best.parents = list(adj.matrix = adj.matrix_reg)
## Set the structure to save the conditional probabilities of the reconstructed topology
reconstructed.model = create.model(dataset,
curr_best.parents,
prima.facie.parents)
model.name = paste('gabow', reg, my_score, sep='_')
model[[model.name]] = reconstructed.model
}
}
## Set the execution parameters
parameters =
list(algorithm = "GABOW",
regularization = regularization,
score = score,
do.boot = do.boot,
nboot = nboot,
pvalue = pvalue,
min.boot = min.boot,
min.stat = min.stat,
boot.seed = boot.seed,
silent = silent,
error.rates = list(epos=epos,eneg=eneg),
do.raising = do.raising,
fallback.edmonds = fallback.edmonds)
## Return the results
topology =
list(dataset = dataset,
hypotheses = hypotheses,
adj.matrix.prima.facie = adj.matrix.prima.facie,
confidence = prima.facie.parents$pf.confidence,
model = model,
parameters = parameters,
execution.time = (proc.time() - ptm))
topology = rename.reconstruction.fields(topology, dataset)
return(topology)
}
# reconstruct the best causal topology by Gabow algorithm combined with probabilistic causation
# @title perform.likelihood.fit.gabow
# @param dataset a valid dataset
# @param adj.matrix the adjacency matrix of the prima facie causes
# @param regularization regularization term to be used in the likelihood fit
# @param command type of search, either hill climbing (hc) or tabu (tabu)
# @return topology: the adjacency matrix of both the prima facie and causal topologies
#
perform.likelihood.fit.gabow = function(dataset,
adj.matrix,
regularization,
score,
command = "hc",
marginal.probs,
joint.probs) {
data = as.categorical.dataset(dataset)
adj.matrix.prima.facie = adj.matrix
# adjacency matrix of the topology reconstructed by likelihood fit
adj.matrix.fit = array(0,c(nrow(adj.matrix),ncol(adj.matrix)))
rownames(adj.matrix.fit) = colnames(dataset)
colnames(adj.matrix.fit) = colnames(dataset)
# detect the acyclic parts of the graph,
# being the strongly connected components of degree 1
my_graph = graph_from_adjacency_matrix(adj.matrix)
strongly_connected = clusters(my_graph,mode="strong")
# perform Gabow only if the maximum strongly connected component
# has size less than 9
if(max(strongly_connected$csize)<9) {
valid_nodes = c()
for(i in 1:strongly_connected$no) {
if(length(which(strongly_connected$membership==i))==1) {
valid_nodes = c(valid_nodes,which(strongly_connected$membership==i))
}
}
# set at most one parent per node based on mutual information
# only for the acyclic parts of the graph
if(length(valid_nodes)>0) {
for (i in valid_nodes) {
# consider the parents of i
curr_parents = which(adj.matrix[,i] == 1)
# if I have more then one valid parent
if (length(curr_parents) > 1) {
# find the best parent
curr_best_parent = -1
curr_best_score = NA
for (j in curr_parents) {
# if the event is valid
if(joint.probs[i,j]>=0) {
# compute the chosen score
new_score = compute.edmonds.score(joint.probs[i,j],marginal.probs[i],marginal.probs[j],score)
}
# else, if the two events are indistinguishable
else if(joint.probs[i,j]<0) {
new_score = Inf
}
if (is.na(curr_best_score) || new_score > curr_best_score) {
curr_best_parent = j
curr_best_score = new_score
}
}
# set the best parent
for (j in curr_parents) {
if (j != curr_best_parent) {
adj.matrix[j,i] = 0
}
}
}
}
}
# now consider the acyclic parts of the graph
if(length(valid_nodes)<nrow(adj.matrix)) {
# now I consider each strongly connected componet with size greater then 1
for(i in 1:strongly_connected$no) {
if(length(which(strongly_connected$membership==i))>1) {
adj.matrix = find.best.subtree(adj.matrix,which(strongly_connected$membership==i),marginal.probs,joint.probs,score)
}
}
}
# perform the likelihood fit if requested
adj.matrix.fit = lregfit(data,
adj.matrix,
adj.matrix.fit,
regularization,
command)
## Save the results and return them.
adj.matrix =
list(adj.matrix.pf = adj.matrix.prima.facie,
adj.matrix.fit = adj.matrix.fit)
topology = list(adj.matrix = adj.matrix)
}
else {
warning("Too big strongly connected components: Gabow can not be performed, using Edmonds instead.")
topology = NULL
}
return(topology)
}
find.best.subtree = function( adj.matrix, subtree.nodes, marginal.probs, joint.probs, score ) {
# enumerate the possible orderings within subtree.nodes
possible.orderings = permutations(length(subtree.nodes),length(subtree.nodes))
# consider all the possible trees within subtree.nodes
all_tree_scores = sapply(1:nrow(possible.orderings),FUN=function(x) {
curr.ordering = subtree.nodes[possible.orderings[x,]]
curr.adj.matrix = adj.matrix
# remove any loop accordingly to this ordering
for(j in 2:length(curr.ordering)) {
curr_invalid = curr.ordering[1:(j-1)]
curr.adj.matrix[curr.ordering[j],curr_invalid] = 0
}
# compute the score of curr.adj.matrix
curr_score = get.best.scored.tree(curr.adj.matrix,subtree.nodes,marginal.probs,joint.probs,score)$score
return(curr_score)
})
# pick the best scored tree
best.ordering = subtree.nodes[possible.orderings[which(all_tree_scores==max(all_tree_scores))[1],]]
# remove any loop accordingly to this ordering
best_adj.matrix = adj.matrix
for(j in 2:length(best.ordering)) {
curr_invalid = best.ordering[1:(j-1)]
best_adj.matrix[best.ordering[j],curr_invalid] = 0
}
# compute the score of curr.adj.matrix
best_adj.matrix = get.best.scored.tree(best_adj.matrix,subtree.nodes,marginal.probs,joint.probs,score)$adj.matrix
return(best_adj.matrix)
}
get.best.scored.tree = function( adj.matrix, subtree.nodes, marginal.probs, joint.probs, score_type ) {
score = 0
total.arcs = 0
attached.arcs = 0
# set at most one parent per node based on mutual information
for (i in subtree.nodes) {
# consider the parents of i
curr_parents = which(adj.matrix[,i] == 1)
# if I have more then one valid parent
if (length(curr_parents) > 1) {
# find the best parent
curr_best_parent = -1
curr_best_score = NA
for (j in curr_parents) {
# if the event is valid
if(joint.probs[i,j]>=0) {
# compute the chosen score
new_score = compute.edmonds.score(joint.probs[i,j],marginal.probs[i],marginal.probs[j],score_type)
}
# else, if the two events are indistinguishable
else if(joint.probs[i,j]<0) {
new_score = Inf
}
if (is.na(curr_best_score) || new_score > curr_best_score) {
curr_best_parent = j
curr_best_score = new_score
}
}
if(curr_best_score!=Inf) {
score = score + curr_best_score
total.arcs = total.arcs + 1
}
# set the best parent
invalid.curr.parents = which(curr_parents!=curr_best_parent)
if(length(invalid.curr.parents)>0) {
adj.matrix[curr_parents[invalid.curr.parents],i] = 0
}
attached.arcs = attached.arcs + 1
}
}
# if I have an empty topology, it has the worst score
if(attached.arcs==0) {
score = -Inf
}
# if I have only arcs between indistinguishable events
else if(attached.arcs>0 && total.arcs==0) {
score = Inf
}
# if I have a valid topology which I want to weight
# for the number of arcs
else {
mean_score = score / total.arcs
score = score + (mean_score * (attached.arcs-total.arcs))
}
res = list(adj.matrix=adj.matrix,score=score)
return(res)
}
#### end of file -- gabow.algorithm.R
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.