R/pathRank.R

Defines functions format_path processNetwork rankPvalue rankShortestPaths pathRanker getPaths getPathsAsEIDs extractPathNetwork

Documented in extractPathNetwork getPathsAsEIDs pathRanker

###############################################################################
#
# pathRank.R:     This file contains functions to rank paths provided with a w
# weigthed igraph object.
#
# author: Ahmed Mohamed <mohamed@kuicr.kyoto-u.ac.jp>
#
# This is released under GPL-2.
#
# Documentation was created using roxygen
#
###############################################################################

#' Creates a subnetwork from a ranked path list
#'
#' Creates a subnetwork from a ranked path list generated by \code{\link{pathRanker}}.
#'
#' @param paths The paths extracted by \code{\link{pathRanker}}.
#' @param graph A annotated igraph object.
#'
#' @return A subnetwork from all paths provided. If paths are computed for several
#' labels (sample categories), a subnetwork is returned for each label.
#'
#' @author Ahmed Mohamed
#' @family Path ranking methods
#' @export
#' @examples
#' 	## Prepare a weighted reaction network.
#' 	## Conver a metabolic network to a reaction network.
#'  data(ex_sbml) # bipartite metabolic network of Carbohydrate metabolism.
#'  rgraph <- makeReactionNetwork(ex_sbml, simplify=TRUE)
#'
#' 	## Assign edge weights based on Affymetrix attributes and microarray dataset.
#'  # Calculate Pearson's correlation.
#' 	data(ex_microarray)	# Part of ALL dataset.
#' 	rgraph <- assignEdgeWeights(microarray = ex_microarray, graph = rgraph,
#' 		weight.method = "cor", use.attr="miriam.uniprot",
#' 		y=factor(colnames(ex_microarray)), bootstrap = FALSE)
#'
#' 	## Get ranked paths using probabilistic shortest paths.
#'  ranked.p <- pathRanker(rgraph, method="prob.shortest.path",
#' 					K=20, minPathSize=6)
#'
#' 	## Get the subnetwork of paths in reaction graph.
#' 	reaction.sub <- getPathsAsEIDs(ranked.p, rgraph)
#'
#' 	## Get the subnetwork of paths in the original metabolic graph.
#' 	metabolic.sub <- getPathsAsEIDs(ranked.p, ex_sbml)
#'
extractPathNetwork <- function(paths, graph){
    p.eids <- getPathsAsEIDs(paths, graph)
    if(length(paths$y.labels)>1){
        graph.ls <- lapply(p.eids, function(x)
                    subgraph.edges(graph, as.integer(unlist(x))) )
        names(graph.ls) <- paths$y.labels
        return(graph.ls)
    }else
        return(subgraph.edges(graph,as.integer(unlist(p.eids))))
}

#' Convert a ranked path list to edge ids of a graph
#'
#' Convert a ranked path list to Edge ids of a graph, where paths can come from
#' a different representation (for example matching path from a reaction network to
#' edges on a metabolic network).
#'
#' @param paths The paths extracted by \code{\link{pathRanker}}.
#' @param graph A annotated igraph object.
#'
#' @return A list of edge ids on the provided graph.
#'
#' @author Ahmed Mohamed
#' @family Path ranking methods
#' @export
#' @examples
#' 	## Prepare a weighted reaction network.
#' 	## Conver a metabolic network to a reaction network.
#'  data(ex_sbml) # bipartite metabolic network of Carbohydrate metabolism.
#'  rgraph <- makeReactionNetwork(ex_sbml, simplify=TRUE)
#'
#' 	## Assign edge weights based on Affymetrix attributes and microarray dataset.
#'  # Calculate Pearson's correlation.
#' 	data(ex_microarray)	# Part of ALL dataset.
#' 	rgraph <- assignEdgeWeights(microarray = ex_microarray, graph = rgraph,
#' 		weight.method = "cor", use.attr="miriam.uniprot",
#' 		y=factor(colnames(ex_microarray)), bootstrap = FALSE)
#'
#' 	## Get ranked paths using probabilistic shortest paths.
#'  ranked.p <- pathRanker(rgraph, method="prob.shortest.path",
#' 					K=20, minPathSize=6)
#'
#' 	## Get the edge ids along paths in the reaction graph.
#' 	path.eids <- getPathsAsEIDs(ranked.p, rgraph)
#'
#' 	## Get the edge ids along paths in the original metabolic graph.
#' 	path.eids <- getPathsAsEIDs(ranked.p, ex_sbml)
#'
getPathsAsEIDs <- function(paths, graph){
    if(length(paths$y.labels)>1){
        eids <- lapply(paths$paths, getPaths, graph, paths$source.net)
        names(eids) = paths$y.labels

    }else{
        eids <- getPaths(paths$paths, graph, paths$source.net)
    }
    return(eids)
}


getPaths <- function(paths, graph, source.net){
    if(length(paths)==0) return(list())

    # graph source unknown
    if(is.null(graph$type) || is.null(source.net))
        return(lapply(paths, function(x) E(graph,path=x$genes)))

    # (G.graph,G.graph) or (R.graph, R.graph)
    if(graph$type == source.net)
        return(lapply(paths, function(x) E(graph,path=x$genes)))

    # (G.graph, R.graph)
    if(graph$type=="G.graph" && source.net=="R.graph"){
        gene.nodes <- lapply(paths, function(x) V(graph)[sub("^(.*)##", "",V(graph)$name) %in% x$genes])
        eid <- lapply(gene.nodes,function(x) E(graph)[x %--% x])

        return(eid)
    }

    # (R.graph, G.graph) or (MR.graph, G.graph) else (MR.graph, R.graph)
    if(source.net=="G.graph"){
        gene.nodes = lapply(paths, function(x) sub("^(.*)##", "",x$genes))
    }else{
        gene.nodes = lapply(paths, "[[", "genes")
    }

    # (R.graph, G.graph)
    if(graph$type=="R.graph")
        return(lapply(gene.nodes, function(x) E(graph, path=x)))

    # It must be MR.graph
    eid <- list()
    for(i in 1:length(paths)){
        deleted.edges <- unlist(lapply(lapply(grep("->",paths[[i]]$compounds),
                                function(x) unlist(strsplit(paths[[i]]$compounds[[x]], "->"))),
                        function(y) mapply(
                                    function(from,to)get.shortest.paths(graph, from, to, mode="out", output="epath"),
                                    head(y,-1), y[-1])
                ))

        paths[[i]]$compounds <- unlist(strsplit(paths[[i]]$compounds, "(->)|( \\+ )"))
        paths[[i]]$genes <- gene.nodes[[i]]
        eid[[i]] = c(as.numeric(deleted.edges),
                    E(graph)[.to(paths[[i]]$genes[1]) |
                        paths[[i]]$genes %--% paths[[i]]$compounds|
                        .from(tail(paths[[i]]$genes,1)) ]
                    )
    }
    return(eid)


}

# TODO: pvalue description
#' Extracting and ranking paths from a network
#'
#' Given a weighted igraph object, path ranking finds a set of node/edge sequences (paths) to
#' maximize the sum of edge weights.
#' \code{pathRanker(method="prob.shortest.path")} extracts the K most probable paths within
#' a weighted network.
#' \code{pathRanker(method="pvalue")} extracts a list of paths whose sum of edge weights are
#' significantly higher than random paths of the same length.
#'
#'
#' The input here is \code{graph}. A weight must be assigned to each edge. Bootstrapped Pearson correlation edge weights
#' can be assigned to each edge by \code{\link{assignEdgeWeights}}. However the specification of the edge weight is flexible
#' with the condition that increasing values indicate stronger relationships between vertices.
#'
#' \subsection{Probabilistic Shortest Paths}{
#' \code{pathRanker(method="prob.shortest.path")} finds the K most probable loopless paths given a weighted network.
#' Before the paths are ranked the edge weights are converted into probabilistic edge weights using the Empirical
#' Cumulative Distribution (ECDF) over all edge weights.  This is called ECDF edge weight. The ECDF edge weight
#' serves as a probabilistic rank of the most important gene-gene interactions.  The probabilistic nature of the ECDF
#' edge weights allow for a significance test to determine if a path contains any functional structure or is
#' simply a random walk.  The probability of a path is simily the product of all ECDF weights along the path.
#' This is computed as a sum of the logs of the ECDF edge weights.
#'
#' The follwing arguments can be passed to \code{pathRanker(method="prob.shortest.path")}:
#' \describe{
#' \item{\code{K}}{Maximum number of paths to extract. Defaults to 10.}
#' \item{\code{minPathSize}}{The minimum number of edges for each extracted path. Defualts to 1.}
#' \item{\code{normalize}}{Specify if you want to normalize the probabilistic edge weights (across different labels)
#' before extracting the paths. Defaults to TRUE.}
#' }
#' }
#'
#' \subsection{P-value method}{
#' \code{pathRanker(method="pvalue")} is deprecated. Please use \code{prob.shortest.path} instead.
#' }
#' @param graph A weighted igraph object. Weights must be in \code{edge.weights} or \code{weight}
#' edge attributes.
#' @param method Which path ranking method to use.
#' @param start A list of start vertices, given by their vertex id.
#' @param end  A list of terminal vertices, given by their vertex id.
#' @param verbose Whether to display the progress of the function.
#' @param ... Method-specific parameters. See Details section.
#'
#' @return
#' A list of paths where each path has the following items:
#' \item{gene}{The ordered sequence of genes visited along the path.}
#' \item{compounds}{The ordered sequence of compounds visited along the path.}
#' \item{weights}{The ordered sequence of the log(ECDF edge weights) along the path.}
#' \item{distance}{The sum of the log(ECDF edge weights) along each path.  (a sum of logs is a product)}
#'
#'
#' @author Timothy Hancock, Ichigaku Takigawa, Nicolas Wicker and Ahmed Mohamed
#' @family Path ranking methods
#' @seealso getPathsAsEIDs, extractPathNetwork
#' @export
#' @examples
#' 	## Prepare a weighted reaction network.
#' 	## Conver a metabolic network to a reaction network.
#'  data(ex_sbml) # bipartite metabolic network of Carbohydrate metabolism.
#'  rgraph <- makeReactionNetwork(ex_sbml, simplify=TRUE)
#'
#' 	## Assign edge weights based on Affymetrix attributes and microarray dataset.
#'  # Calculate Pearson's correlation.
#' 	data(ex_microarray)	# Part of ALL dataset.
#' 	rgraph <- assignEdgeWeights(microarray = ex_microarray, graph = rgraph,
#' 		weight.method = "cor", use.attr="miriam.uniprot",
#' 		y=factor(colnames(ex_microarray)), bootstrap = FALSE)
#'
#' 	## Get ranked paths using probabilistic shortest paths.
#'  ranked.p <- pathRanker(rgraph, method="prob.shortest.path",
#' 					K=20, minPathSize=6)
#'
pathRanker <- function(graph, method="prob.shortest.path" ,start, end, verbose=TRUE, ...){
    # Checking the graph
    if(is.null(E(graph)$edge.weights)){
        if(!is.null(E(graph)$weight))
            E(graph)$edge.weights <- E(graph)$weight
        else{
            stop("No edge weights provided.")
        }
    }

    if(method == "prob.shortest.path")
        return(rankShortestPaths(graph, start=start, end=end, verbose=verbose, ...))
    else return(rankPvalue(graph, start=start, end=end, verbose=verbose, ...))

}

rankShortestPaths <- function(graph, K=10, minPathSize=1, start, end, normalize = TRUE, verbose) {
    pg = processNetwork(graph, start, end, scale="ecdf" ,normalize)

    zret <- list()
    for (i in 1:ncol(pg$weights)) {
        if(verbose){
            if (ncol(pg$weights) > 1) message("Extracting the ",K," most probable paths for ",graph$y.labels[i])
            else message("Extracting the ",K," most probable paths.")
        }

        # Min path size is increased by 2 to include start and end compounds.
        # Currently not in use.
        minpathsize = minPathSize + 2
        ps <- igraph::k_shortest_paths(pg$graph, from = "s", to = "t", weights = pg$weights[,i], k = K)
        paths <- list()
        for (j in 1:length(ps$vpaths)){
          path <- format_path(pg$graph, ps$epath[[j]], ps$vpath[[j]], pg$weights[,1])
          paths[[j]] <- path
        }


        idx <- which(sapply(paths,is.null))
        if (length(idx)>0) paths <- paths[-idx]
        if(length(paths)==0){
            if(ncol(pg$weights) > 1)
                message("  Warning:Counldn't find paths matching the criteria for ",graph$y.labels[i])
            else message("  Warning:Counldn't find paths matching the criteria.")
        }

        paths <- paths[order(sapply(paths,"[[", "distance"))] #order paths by distance

        if (ncol(pg$weights) > 1){
            zret[[i]] <- paths
        }else zret <- paths
    }

    if(length(zret)==0)return(NULL)

    if (ncol(pg$weights) > 1) {
        names(zret) <- graph$y.labels
        colnames(pg$weights) <- paste("prob",graph$y.labels,sep = ":")
    } else colnames(pg$weights) <- "prob"

    return(list(paths = zret,edge.weights = pg$weights, y.labels=graph$y.labels, source.net=graph$type))
}

rankPvalue <- function(graph, sampledpaths, start, end, alpha = 0.01, verbose) {
    msg <- "P-value ranking method is deprecated. Please use 'prob.shortest.path' method instead."
    .Deprecated(msg=msg)
}

processNetwork <- function(graph, start, end, scale=c("ecdf", "rescale"), normalize){
    # Add S, T vetrices for the shortest path algorithm.
    snodes <- if(missing(start)) V(graph)[degree(graph,mode="in")==0]$name else V(graph)[start]$name
    enodes <- if(missing(end)) V(graph)[degree(graph,mode="out")==0]$name else V(graph)[end]$name
    enodes <- enodes[!enodes %in% snodes]

    graph <- graph + vertices("s", "t")
    graph <- add_edges(graph,
            c(rbind("s",snodes),
              rbind(enodes,"t")),
            attr=list(edge.weights=list(rep(1, length(unlist(graph$y.labels)) )),
                    compound=""))

    # Get edge.weights and apply ecdf on each column
    edge.weights <- do.call("rbind", as.list(E(graph)$edge.weights))

    if(sum(!is.finite(edge.weights))>0){
        warning("Edge weights contain non-finite numbers. Setting them to the minimum edge weight")

        edge.weights <- apply(edge.weights, 2,
                            function(x){
                                x[ !is.finite(x) ] <- min( x[ is.finite(x) ] )
                                return(x)
                            })
    }
    if(scale=="ecdf")
        edge.probs <- apply(edge.weights, 2, function(x) ecdf(x[1:ecount(graph)])(x))
    else edge.probs <- apply(edge.weights, 2, rescale, c(1,0))


    # Normalize across Y labels
    if (ncol(edge.probs) > 1 & normalize == TRUE) {
        edge.probs <- edge.probs / rowSums(edge.probs)
    }

    if(scale=="ecdf")
        edge.probs <- -log(edge.probs)

    # E(graph)$edge.probs <- edge.probs
    return(list(graph=graph, weights=edge.probs))
}

#Adapted from scales package
rescale <- function (x, to = c(0, 1), from = range(x, na.rm = TRUE)){
    (x - from[1])/diff(from) * diff(to) + to[1]
}

format_path <- function(graph, epath, vpath, edge.probs) {
    # Remove S and T nodes
    epath = epath[2: (length(epath)-1)]
    vpath = vpath[2: (length(vpath)-1)]

    # Apply the edge weights
    E(graph)$temp.weights = edge.probs
    
    compounds = unlist(edge_attr(graph, "compound", epath))
    weights = edge_attr(graph, "temp.weights", epath)
    genes = vertex_attr(graph, "name", vpath)
    return(list(genes=genes, compounds=compounds, weights=weights, distance=sum(weights)))
}
ahmohamed/NetPathMiner documentation built on Nov. 30, 2024, 1:18 a.m.