Nothing
# Constructing Gene Regulatory Network (GRN)
# @description \code{GRN} is a function to construct the GRN network
# using random forest algorithm. This method was initially introduced in
# GENIE3, but here, GRN support parallel computing. It can control the
# model accuracy, and define the regulators in the network.
# @param expr Gene expression data, either a matrix or a data frame.
# By default (\code{rowSample = FALSE}), each row represents a gene,
# each column represents a sample.
# @param reg vector of charactors, representing gene regulators.
# By default, these are transcription factors and co-factors,
# defined by three literatures/databases, namely RegNet, TRRUST, and
# Marbach2016.
# @param rowSample logical If \code{TRUE}, each row represents a sample.
# The default is \code{FALSE}.
# @param K integer or character. The number of features in each tree,
# can be either a integer number, \code{sqrt}, or \code{all}.
# \code{sqrt} denotes sqrt(the number of \code{reg}), \code{all}
# means the number of \code{reg}. The default is \code{sqrt}.
# @param nbTrees integer. The number of trees. The default is 1000.
# @param importanceMeasure character. The importance type in
# \code{importance}. importanceMeasure can be \code{\%IncMSE} or
# \code{IncNodePurity}, corresponding to type = 1 and 2 in
# \code{importance}. The default is \code{IncNodePurity}
# (decrease in node impurity), which is faster than \code{\%IncMSE}
# (decrease in accuracy).
# @param trace logical. To show the progress or not (default).
# @param BPPARAM parameters for parallel computing (default is
# \code{bpparam()}).
# @param maxMSE numeric. The maximum out-of-bag MSE is to control model
# accuracy. The default is NULL, which means no filtering by this parameter.
# @param minR numeric. The minimum correlation coefficient of prediction is to
# control model accuracy. The default is 0.3.
# @param ... the rest parameters in \code{\link{randomForest}} function.
#
# @return A list of 'weightHi' and 'performanceHi'. 'weightHi' is the edge
# weights by high accurate model, and 'performanceHi' is the performances
# of high accurate models.
#' @import randomForest
#' @import BiocParallel
#' @import tibble
#' @include globals.R
# @export
GRN = function(expr, reg = TFs$TF_name, rowSample = FALSE,
K = "sqrt", nbTrees = 1000, importanceMeasure = "IncNodePurity",
trace = FALSE, BPPARAM = bpparam(), maxMSE = NULL,
minR = 0.3, ...) {
stopifnot(is.null(maxMSE) || maxMSE > 0)
stopifnot(is.null(minR) || (minR > 0 & minR < 1))
inout = inOutput(expr = expr, reg = reg, rowSample = rowSample)
inputMatrix = inout$inputMatrix
outputMatrix = inout$outputMatrix
net = grNet(inputMatrix, outputMatrix, K = K, nbTrees = nbTrees,
importanceMeasure = importanceMeasure, trace = trace,
BPPARAM = bpparam(), ...)
weight = net$weight
performance = net$performance
# Only to use the good models
performanceHi = performance
if (!is.null(maxMSE)) {
# performanceHi = performanceHi[mse <= maxMSE]
performanceHi = filter(performanceHi, mse <= maxMSE)
}
if (!is.null(minR)) {
# performanceHi = performanceHi[r >= minR]
performanceHi = filter(performanceHi, r >= minR)
}
modelHi = performanceHi$gene
if (length(modelHi) == 0) {
stop("All models are not good enough, please check if 'maxMSE' ",
"and 'minR' were properly set, or try other ",
"network inference methods.")
}
# weightHi = weight[to.gene %in% modelHi]
weightHi = filter(weight, to.gene %in% modelHi)
return(list(weightHi = weightHi, performanceHi = performanceHi))
}
.local_grNet = function(targetGeneIdx, inputGeneNames,
outputGeneNames, inputMatrix, outputMatrix, K,
nbTrees, importanceMeasure, trace, nodesizeInArgs,
...) {
num.samples = nrow(outputMatrix)
num.genes = ncol(outputMatrix)
if (trace) {
cat(paste("Computing gene ", targetGeneIdx,
"/", num.genes, "\n", sep = ""))
utils::flush.console()
}
targetGeneName = outputGeneNames[targetGeneIdx]
theseInputGeneNames = setdiff(inputGeneNames, targetGeneName)
x = inputMatrix[, theseInputGeneNames]
numInputGenes = length(theseInputGeneNames)
y = outputMatrix[, targetGeneName]
if (is(K, "numeric")) {
mtry = K
} else if (K == "sqrt") {
mtry = round(sqrt(numInputGenes))
} else if (K == "all") {
mtry = numInputGenes
} else {
stop("Parameter K must be \"sqrt\", or \"all\", or an integer")
}
if (trace) {
cat(paste("K = ", mtry, ", ", nbTrees, " trees\n\n",
sep = ""))
utils::flush.console()
}
if (importanceMeasure == "%IncMSE") {
y = y
importance0 = TRUE
type0 = 1
} else {
y = y/sd(y)
importance0 = FALSE
type0 = 2
}
if (nodesizeInArgs) {
rf = randomForest(x, y, mtry = mtry, ntree = nbTrees,
importance = importance0, ...)
} else {
rf = randomForest(x, y, mtry = mtry, ntree = nbTrees,
importance = importance0, nodesize = 1,
...)
}
im = importance(rf, type = type0)
weight = tibble(from.gene = rownames(im),
to.gene = rep(targetGeneName, nrow(im)),
imp = as.vector(im)/num.samples)
y_hat = stats::predict(rf)
return(list(weight = weight, mse = sum((y_hat - y)^2)/length(y),
r = as.numeric(cor(y_hat, y)),
p = stats::cor.test(y_hat, y, alternative = "greater")$p.value,
pVarExplaned = rf$rsq[length(rf$rsq)] * 100))
}
grNet = function(inputMatrix, outputMatrix, K = "sqrt",
nbTrees = 1000, importanceMeasure = "IncNodePurity",
trace = FALSE, BPPARAM = bpparam(), ...) {
## All of the matrixs are sample * gene inputMatrix
## is Gene Expression matrix of TFs outputMatrix is
## Gene Expression matrix of All genes (or TFs)
# Report when parameter importanceMeasure is not
# correctly spelled
if (importanceMeasure != "IncNodePurity" && importanceMeasure !=
"%IncMSE") {
stop("Parameter importanceMeasure must be ",
"\"IncNodePurity\" or \"%IncMSE\"")
}
# Check if nodesize parameter is in the input
# arguments
args = list(...)
nInArgs = "nodesize" %in% names(args)
outputGeneNames = colnames(outputMatrix)
inputGeneNames = colnames(inputMatrix)
idx = stats::setNames(seq(ncol(outputMatrix)),
nm = outputGeneNames)
tic = system.time({
res = bplapply(idx, function(targetGeneIdx0) {
.local_grNet(targetGeneIdx0, inputGeneNames,
outputGeneNames, inputMatrix, outputMatrix,
K, nbTrees, importanceMeasure, trace,
nodesizeInArgs = nInArgs, ...)
}, BPPARAM = BPPARAM)
})
if (trace) {
cat("GRN network inference costs", tic[3],
"seconds.\n")
}
weight = do.call("rbind", lapply(res, "[[", "weight"))
colnames(weight) = c("from.gene", "to.gene", "weight")
performance = tibble(gene = outputGeneNames,
mse = do.call("c", lapply(res, "[[", "mse")),
r = do.call("c", lapply(res, "[[", "r")),
p = do.call("c", lapply(res, "[[", "p")),
pVarExplaned = do.call("c", lapply(res, "[[",
"pVarExplaned")))
return(list(weight = weight, performance = performance))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.