#' @include buildGraphFromKEGGREST.R
#'
#' @description
#' Function \code{buildDataFromGraph} takes as input the KEGG graph
#' generated by \code{buildGraphFromKEGGREST}
#' and writes the KEGG knowledge model in the desired permanent directory.
#'
#' @details
#' Using \code{buildDataFromGraph} is the second step
#' when using the \code{\link[=FELLA]{FELLA}} package.
#' The knowledge graph is used to compute other internal variables that are
#' required to run any enrichment.
#' The goal of the enrichment is to extract the small
#' part of the knowledge graph that is relevant to the supplied metabolites.
#' This is accomplished through diffusion processes and random walks,
#' followed by a statistical normalisation,
#' as described in [Picart-Armada, 2017].
#' When building the internal files,
#' the user can choose whether to store (i) matrices for each
#' chosen method, and (ii) vectors derived from such matrices.
#' Both are optional, but (i) enables faster permutations and custom
#' metabolite backgrounds, whereas (ii) is required by the parametric
#' approaches.
#' WARNING: diffusion and PageRank matrices in (i)
#' can allocate up to 250MB each.
#' On the other hand, the \code{niter} parameter
#' controls the number of random trials used to approximate the
#' distribution of the connected component size under
#' uniform node sampling.
#' For further details, see the option \code{thresholdConnectedComponent}
#' in \code{?generateResultsGraph}.
#' Regarding the destination, the user can specify
#' the name of the database directory;
#' otherwise, a name containing the creation date, the organism
#' and the KEGG release is used.
#' The database can be stored within the library path or in a
#' custom location.
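#'
#' A minimal sketch of the intended workflow (not run; the organism and
#' the directory name below are illustrative choices):
#' \preformatted{
#' g <- buildGraphFromKEGGREST(organism = "mmu")
#' buildDataFromGraph(
#'     keggdata.graph = g,
#'     databaseDir = "mmu_database",
#'     internalDir = FALSE,
#'     matrices = "diffusion",
#'     normality = "diffusion",
#'     niter = 50)
#' }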
#'
#'
#' @inheritParams .params
#' @param keggdata.graph An \pkg{igraph}
#' object generated by the function
#' \code{buildGraphFromKEGGREST}
#' @param databaseDir Character containing the directory to save KEGG files.
#' It is a relative directory inside the library location
#' if \code{internalDir = TRUE}. If left as \code{NULL},
#' an automatic name containing the date, the organism and
#' the KEGG release is generated.
#' @param internalDir Logical, should the directory be created inside
#' the package installation directory?
#' @param matrices A character vector, containing any of these:
#' \code{"hypergeom"}, \code{"diffusion"}, \code{"pagerank"}
#' @param normality A character vector, containing any of these:
#' \code{"diffusion"}, \code{"pagerank"}
#' @param niter Numeric value, number of iterations to estimate the p-values
#' for the connected component sizes. Must be between 10 and 1e3.
#'
#' @return \code{buildDataFromGraph} returns
#' \code{invisible(TRUE)} if successful.
#' As a side effect, the database
#' directory (see \code{databaseDir}) is created, containing
#' the internal data files.
#'
#' @rdname data-funs
#'
#' @import igraph
#' @import Matrix
#' @export
buildDataFromGraph <- function(
keggdata.graph = NULL,
databaseDir = NULL,
internalDir = TRUE,
matrices = c("hypergeom", "diffusion", "pagerank"),
normality = c("diffusion", "pagerank"),
dampingFactor = 0.85,
niter = 1e2) {
# Checking the input #
######################
checkArgs <- checkArguments(
dampingFactor = dampingFactor,
databaseDir = databaseDir,
internalDir = internalDir)
if (!checkArgs$valid)
stop("Bad argument when calling function 'buildDataFromGraph'.")
# Check the graph
graph <- keggdata.graph
if (!is.igraph(graph))
stop("'keggdata.graph' is not a valid igraph object.")
if (!is.connected(graph))
stop("'keggdata.graph' is not connected!")
if (!("com" %in% list.vertex.attributes(graph)))
stop(
"'keggdata.graph' is not a valid graph: ",
"'com' attribute is missing.")
# Check other arguments
if (!is.null(matrices) & !is.character(matrices))
stop(
"'matrices' should be a character vector containing one or more: ",
"'hypergeom', 'diffusion', 'pagerank'.")
if (!is.null(normality) & !is.character(normality))
stop(
"'normality' should be a character vector containing one or more: ",
"'diffusion', 'pagerank'.")
    if (!is.numeric(niter))
        stop("'niter' must be an integer between 10 and 1e3.")
    niter <- round(niter)
    if (niter < 10 | niter > 1e3)
        stop("'niter' must be an integer between 10 and 1e3.")
###################
# Database directory
if (is.null(databaseDir)) {
        # create a meaningful name for the database containing
        # creation date, organism and KEGG release
        prefix <- format(Sys.Date(), "%Y-%m-%d")
suffix <- gsub(
"[^0-9a-zA-Z.]+", "_",
strsplit(comment(keggdata.graph), split = "\n")[[1]][2]
)
databaseDir <- paste0(
"created__", prefix, ";meta__", suffix
)
}
# Further sanity of databaseDir is not checked at the moment
if (internalDir) {
# Save the database in an internal directory, given by
# the package installation
internal <- paste0(system.file(package = "FELLA"), "/database/")
        if (!dir.exists(internal)) {
message("Creating internal databases directory: ", internal)
dir.create(internal)
}
outputDir <- paste0(internal, databaseDir)
} else {
# User-defined directory
outputDir <- databaseDir
}
    if (dir.exists(outputDir)) {
        stop(
            "Directory ", outputDir,
            " already exists. Please use another directory.")
    }
    # Estimate the probability of obtaining a connected component
    # of at least a given size when uniformly sampling subgraphs
    # of increasing size
    subgraph.size <- seq(250)
    component.size <- seq(250)
message(
"Computing probabilities for random subgraphs... ",
"(this may take a while)")
# Run as many trials as specified
array.null <- plyr::laply(
seq(niter),
function(dummy) {
# Sample the maximum amount of nodes
select <- sample(
vcount(keggdata.graph),
tail(subgraph.size, 1))
# For each k.sub <= subgraph.size, see the distribution of CCs
plyr::laply(
subgraph.size,
function(k.sub) {
# sample a subgraph every time
# it is important to use the head of the sampled values
# without any further sorting to avoid introducing biases
sel.g <- induced.subgraph(
graph = keggdata.graph,
vids = head(select, k.sub))
# largest cc
csize.max <- max(clusters(sel.g)$csize)
# sizes for which the random trial reported any
# cc as large as them
component.size <= csize.max
}
)
},
.progress = "text"
)
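    # array.null has dimensions [niter, subgraph size, component size];
    # collapsing over the trials (first dimension) yields, for each
    # component size (rows) and subgraph size (columns), an empirical
    # probability of observing a connected component at least that large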
keggdata.pvalues.size <- apply(
array.null,
MARGIN = c(3, 2), # This ensures rows are component sizes
function(run) {
# empirical pvalue-like quantity
(sum(run) + 1)/(length(run) + 1)
}
)
colnames(keggdata.pvalues.size) <- subgraph.size
rownames(keggdata.pvalues.size) <- component.size
if (!dir.exists(outputDir)) {
message(
paste0(
"Directory ",
outputDir,
" does not exist. Creating it..."))
dir.create(path = outputDir, recursive = TRUE)
message("Done.")
}
save(
keggdata.graph, keggdata.pvalues.size,
file = paste0(outputDir, "/keggdata.graph.RData"))
message("Done.")
id.pathway <- which(V(graph)$com == 1)
id.compound <- which(V(graph)$com == 5)
####################
# Hypergeom matrix #
####################
if ("hypergeom" %in% matrices) {
message("Computing hypergeom.matrix...")
graph <- keggdata.graph
E(graph)$weight <- 1/E(graph)$weight
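        # A compound is considered annotated to a pathway when, after
        # inverting the edge weights, their directed distance equals 4;
        # this yields a binary compound-by-pathway membership matrix
        # for the hypergeometric test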
hypergeom.matrix <- shortest.paths(
graph = graph,
v = id.compound,
to = id.pathway,
mode = "out") == 4
hypergeom.matrix <- Matrix(data = hypergeom.matrix, sparse = TRUE)
save(
hypergeom.matrix,
file = paste0(outputDir, "/hypergeom.matrix.RData"))
message("Done.")
}
###################
# Diffusion files #
###################
if ("diffusion" %in% c(matrices, normality)) {
message(
"Computing diffusion.matrix... ",
"(this may take a while and use some memory)")
graph <- as.undirected(keggdata.graph)
# Laplacian matrix
K <- graph.laplacian(graph, normalized = FALSE, sparse = TRUE)
# Boundary conditions
Matrix::diag(K)[id.pathway] <- Matrix::diag(K)[id.pathway] + 1.0
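        # Adding 1 to the diagonal of the pathway nodes grounds the
        # Laplacian (each pathway node acts as a heat sink), which makes
        # K invertible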
# The inverse (WARNING: it uses a large amount of memory)
R <- as.matrix(solve(K))
rownames(R) <- V(graph)$name
colnames(R) <- V(graph)$name
        # Keep only the columns that correspond to compounds
        diffusion.matrix <- R[, id.compound]
if ("diffusion" %in% matrices) {
save(
diffusion.matrix,
file = paste0(outputDir, "/diffusion.matrix.RData"))
message("Done")
}
if ("diffusion" %in% normality) {
message("Computing diffusion.rowSums...")
diffusion.rowSums <- rowSums(diffusion.matrix)
diffusion.squaredRowSums <- rowSums(diffusion.matrix ^ 2)
save(
diffusion.rowSums, diffusion.squaredRowSums,
file = paste0(outputDir, "/diffusion.rowSums.RData"))
message("Done.")
}
}
##################
# Pagerank files #
##################
if ("pagerank" %in% c(matrices, normality)) {
message(
"Computing pagerank.matrix... ",
"(this may take a while and use some memory)")
graph <- keggdata.graph
# Damping factor
d <- dampingFactor
        # Build the column-stochastic transition matrix: -t(L) with its
        # diagonal zeroed recovers the transposed (weighted) adjacency
        # matrix, and every column is normalised by its sum
        # (nodes without outgoing edges get a uniform transition probability)
        K <- -t(graph.laplacian(graph, normalized = FALSE, sparse = TRUE))
        Matrix::diag(K) <- 0.0
        K <- apply(K, 2, function(x) {
            if (sum(x) != 0) return(x/sum(x))
            else return(rep(1/dim(K)[1], dim(K)[1]))}
        )
        # Closed-form personalised PageRank matrix,
        # R = (1 - d) * (I - d * K)^{-1}
        # (WARNING: the dense inverse uses a large amount of memory)
        R <- (1 - d) * solve(diag(nrow(K)) - d * K)
rownames(R) <- V(graph)$name
colnames(R) <- V(graph)$name
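        # Keep only the columns that correspond to compounds
        # (one personalised PageRank vector per input compound)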
pagerank.matrix <- R[, id.compound]
if ("pagerank" %in% matrices) {
save(
pagerank.matrix,
file = paste0(outputDir, "/pagerank.matrix.RData"))
message("Done.")
}
if ("pagerank" %in% normality) {
message("Computing pagerank.rowSums...")
pagerank.rowSums <- rowSums(pagerank.matrix)
pagerank.squaredRowSums <- rowSums(pagerank.matrix ^ 2)
save(
pagerank.rowSums,
pagerank.squaredRowSums,
file = paste0(outputDir, "/pagerank.rowSums.RData"))
message("Done.")
}
}
invisible(TRUE)
}