#' @include buildGraphFromKEGGREST.R
#'
#' @description
#' Function \code{buildDataFromGraph} takes as input the KEGG graph
#' generated by \code{buildGraphFromKEGGREST}
#' and writes the KEGG knowledge model in the desired permanent directory.
#'
#' @details
#' Using \code{buildDataFromGraph} is the second step
#' when using the \code{\link[=FELLA]{FELLA}} package.
#' The knowledge graph is used to compute other internal variables that are
#' required to run any enrichment.
#' The goal of the enrichment is to extract the small
#' part of the knowledge graph that is relevant to the supplied metabolites.
#' This is accomplished through diffusion processes and random walks,
#' followed by a statistical normalisation,
#' as described in [Picart-Armada, 2017].
#' When building the internal files,
#' the user can choose whether to store (i) matrices for each
#' chosen method, and (ii) vectors derived from such matrices.
#' Both are optional, but (i) enables faster permutations and custom
#' metabolite backgrounds, whereas (ii) is required by the parametric
#' approaches.
#' WARNING: diffusion and PageRank matrices in (i)
#' can allocate up to 250MB each.
#' On the other hand, the \code{niter} parameter
#' controls the number of random trials used to approximate the
#' distribution of the connected component size under
#' uniform node sampling.
#' For further details, see the option \code{thresholdConnectedComponent}
#' in \code{?generateResultsGraph}.
#' Regarding the destination, the user can specify
#' the name of the database directory;
#' otherwise, a name containing the creation date, the organism
#' and the KEGG release is used.
#' The database can be stored within the library path or in a
#' custom location.
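#'
#' A minimal sketch of the intended workflow (not run; the organism and
#' the directory name below are illustrative choices):
#' \preformatted{
#' g <- buildGraphFromKEGGREST(organism = "mmu")
#' buildDataFromGraph(
#'     keggdata.graph = g,
#'     databaseDir = "mmu_database",
#'     internalDir = FALSE,
#'     matrices = "diffusion",
#'     normality = "diffusion",
#'     niter = 50)
#' }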
#'
#'
#' @inheritParams .params
#' @param keggdata.graph An \pkg{igraph}
#' object generated by the function
#' \code{buildGraphFromKEGGREST}
#' @param databaseDir Character containing the directory to save KEGG files.
#' It is a relative directory inside the library location
#' if \code{internalDir = TRUE}. If left as \code{NULL},
#' an automatic name containing the date, the organism and
#' the KEGG release is generated.
#' @param internalDir Logical, should the directory be created inside
#' the package installation directory?
#' @param matrices A character vector, containing any of these:
#' \code{"hypergeom"}, \code{"diffusion"}, \code{"pagerank"}
#' @param normality A character vector, containing any of these:
#' \code{"diffusion"}, \code{"pagerank"}
#' @param niter Numeric value, number of iterations to estimate the p-values
#' for the connected component sizes. Must be between 10 and 1e3.
#'
#' @return \code{buildDataFromGraph} returns
#' \code{invisible(TRUE)} if successful.
#' As a side effect, the database
#' directory (see \code{databaseDir}) is created, containing
#' the internal data files.
#'
#' @rdname data-funs
#'
#' @import igraph
#' @import Matrix
#' @export
buildDataFromGraph <- function(
keggdata.graph = NULL,
databaseDir = NULL,
internalDir = TRUE,
matrices = c("hypergeom", "diffusion", "pagerank"),
normality = c("diffusion", "pagerank"),
dampingFactor = 0.85,
niter = 1e2) {
# Checking the input #
######################
checkArgs <- checkArguments(
dampingFactor = dampingFactor,
databaseDir = databaseDir,
internalDir = internalDir)
if (!checkArgs$valid)
stop("Bad argument when calling function 'buildDataFromGraph'.")
# Check the graph
graph <- keggdata.graph
if (!is.igraph(graph))
stop("'keggdata.graph' is not a valid igraph object.")
if (!is.connected(graph))
stop("'keggdata.graph' is not connected!")
if (!("com" %in% list.vertex.attributes(graph)))
stop(
"'keggdata.graph' is not a valid graph: ",
"'com' attribute is missing.")
# Check other arguments
if (!is.null(matrices) & !is.character(matrices))
stop(
"'matrices' should be a character vector containing one or more: ",
"'hypergeom', 'diffusion', 'pagerank'.")
if (!is.null(normality) & !is.character(normality))
stop(
"'normality' should be a character vector containing one or more: ",
"'diffusion', 'pagerank'.")
    if (!is.numeric(niter))
        stop("'niter' must be an integer between 10 and 1e3.")
    niter <- round(niter)
    if (niter < 10 | niter > 1e3)
        stop("'niter' must be an integer between 10 and 1e3.")
###################
# Database directory
if (is.null(databaseDir)) {
        # create a meaningful name for the database containing
        # creation date, organism and KEGG release
        prefix <- format(Sys.Date(), "%Y-%m-%d")
suffix <- gsub(
"[^0-9a-zA-Z.]+", "_",
strsplit(comment(keggdata.graph), split = "\n")[[1]][2]
)
databaseDir <- paste0(
"created__", prefix, ";meta__", suffix
)
}
# Further sanity of databaseDir is not checked at the moment
if (internalDir) {
# Save the database in an internal directory, given by
# the package installation
internal <- paste0(system.file(package = "FELLA"), "/database/")
        if (!dir.exists(internal)) {
message("Creating internal databases directory: ", internal)
dir.create(internal)
}
outputDir <- paste0(internal, databaseDir)
} else {
# User-defined directory
outputDir <- databaseDir
}
    if (dir.exists(outputDir)) {
        stop(
            "Directory ", outputDir,
            " already exists. Please use another directory.")
    }
    # Estimate the probability of obtaining a connected component
    # of at least a given size when uniformly sampling subgraphs
    # of increasing size
    subgraph.size <- seq(250)
    component.size <- seq(250)
message(
"Computing probabilities for random subgraphs... ",
"(this may take a while)")
# Run as many trials as specified
array.null <- plyr::laply(
seq(niter),
function(dummy) {
# Sample the maximum amount of nodes
select <- sample(
vcount(keggdata.graph),
tail(subgraph.size, 1))
# For each k.sub <= subgraph.size, see the distribution of CCs
plyr::laply(
subgraph.size,
function(k.sub) {
# sample a subgraph every time
# it is important to use the head of the sampled values
# without any further sorting to avoid introducing biases
sel.g <- induced.subgraph(
graph = keggdata.graph,
vids = head(select, k.sub))
# largest cc
csize.max <- max(clusters(sel.g)$csize)
# sizes for which the random trial reported any
# cc as large as them
component.size <= csize.max
}
)
},
.progress = "text"
)
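    # array.null has dimensions [niter, subgraph size, component size];
    # collapsing over the trials (first dimension) yields, for each
    # component size (rows) and subgraph size (columns), an empirical
    # probability of observing a connected component at least that large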
keggdata.pvalues.size <- apply(
array.null,
MARGIN = c(3, 2), # This ensures rows are component sizes
function(run) {
# empirical pvalue-like quantity
(sum(run) + 1)/(length(run) + 1)
}
)
colnames(keggdata.pvalues.size) <- subgraph.size
rownames(keggdata.pvalues.size) <- component.size
if (!dir.exists(outputDir)) {
message(
paste0(
"Directory ",
outputDir,
" does not exist. Creating it..."))
dir.create(path = outputDir, recursive = TRUE)
message("Done.")
}
save(
keggdata.graph, keggdata.pvalues.size,
file = paste0(outputDir, "/keggdata.graph.RData"))
message("Done.")
id.pathway <- which(V(graph)$com == 1)
id.compound <- which(V(graph)$com == 5)
####################
# Hypergeom matrix #
####################
if ("hypergeom" %in% matrices) {
message("Computing hypergeom.matrix...")
graph <- keggdata.graph
E(graph)$weight <- 1/E(graph)$weight
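        # A compound is considered annotated to a pathway when, after
        # inverting the edge weights, their directed distance equals 4;
        # this yields a binary compound-by-pathway membership matrix
        # for the hypergeometric test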
hypergeom.matrix <- shortest.paths(
graph = graph,
v = id.compound,
to = id.pathway,
mode = "out") == 4
hypergeom.matrix <- Matrix(data = hypergeom.matrix, sparse = TRUE)
save(
hypergeom.matrix,
file = paste0(outputDir, "/hypergeom.matrix.RData"))
message("Done.")
}
###################
# Diffusion files #
###################
if ("diffusion" %in% c(matrices, normality)) {
message(
"Computing diffusion.matrix... ",
"(this may take a while and use some memory)")
graph <- as.undirected(keggdata.graph)
# Laplacian matrix
K <- graph.laplacian(graph, normalized = FALSE, sparse = TRUE)
# Boundary conditions
Matrix::diag(K)[id.pathway] <- Matrix::diag(K)[id.pathway] + 1.0
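        # Adding 1 to the diagonal of the pathway nodes grounds the
        # Laplacian (each pathway node acts as a heat sink), which makes
        # K invertible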
# The inverse (WARNING: it uses a large amount of memory)
R <- as.matrix(solve(K))
rownames(R) <- V(graph)$name
colnames(R) <- V(graph)$name
        # Keep only the columns that correspond to compounds
        diffusion.matrix <- R[, id.compound]
if ("diffusion" %in% matrices) {
save(
diffusion.matrix,
file = paste0(outputDir, "/diffusion.matrix.RData"))
message("Done")
}
if ("diffusion" %in% normality) {
message("Computing diffusion.rowSums...")
diffusion.rowSums <- rowSums(diffusion.matrix)
diffusion.squaredRowSums <- rowSums(diffusion.matrix ^ 2)
save(
diffusion.rowSums, diffusion.squaredRowSums,
file = paste0(outputDir, "/diffusion.rowSums.RData"))
message("Done.")
}
}
##################
# Pagerank files #
##################
if ("pagerank" %in% c(matrices, normality)) {
message(
"Computing pagerank.matrix... ",
"(this may take a while and use some memory)")
graph <- keggdata.graph
# Damping factor
d <- dampingFactor
        # Build the column-stochastic transition matrix: -t(L) with its
        # diagonal zeroed recovers the transposed (weighted) adjacency
        # matrix, and every column is normalised by its sum
        # (nodes without outgoing edges get a uniform transition probability)
        K <- -t(graph.laplacian(graph, normalized = FALSE, sparse = TRUE))
        Matrix::diag(K) <- 0.0
        K <- apply(K, 2, function(x) {
            if (sum(x) != 0) return(x/sum(x))
            else return(rep(1/dim(K)[1], dim(K)[1]))}
        )
        # Closed-form personalised PageRank matrix,
        # R = (1 - d) * (I - d * K)^{-1}
        # (WARNING: the dense inverse uses a large amount of memory)
        R <- (1 - d) * solve(diag(nrow(K)) - d * K)
rownames(R) <- V(graph)$name
colnames(R) <- V(graph)$name
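        # Keep only the columns that correspond to compounds
        # (one personalised PageRank vector per input compound)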
pagerank.matrix <- R[, id.compound]
if ("pagerank" %in% matrices) {
save(
pagerank.matrix,
file = paste0(outputDir, "/pagerank.matrix.RData"))
message("Done.")
}
if ("pagerank" %in% normality) {
message("Computing pagerank.rowSums...")
pagerank.rowSums <- rowSums(pagerank.matrix)
pagerank.squaredRowSums <- rowSums(pagerank.matrix ^ 2)
save(
pagerank.rowSums,
pagerank.squaredRowSums,
file = paste0(outputDir, "/pagerank.rowSums.RData"))
message("Done.")
}
}
invisible(TRUE)
}