#' Run Python implementation PANDA in R
#'
#' \strong{PANDA}(Passing Attributes between Networks for Data Assimilation) is a message-passing model
#' to reconstruct gene regulatory network, which integrates multiple sources of biological data-including protein-protein interaction data,
#' gene expression data, and transcription factor binding motifs data to reconstruct genome-wide, condition-specific regulatory networks.
#' \href{http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0064832}{[(Glass et al. 2013)])}
#' This function is designed to run the a derived PANDA implementation in Python Library "netZooPy" \href{https://github.com/netZoo/netZooPy}{netZooPy}.
#'
#' @param expr_file Character string indicating the file path of expression values file, with each gene(in rows) across samples(in columns).
#' @param motif_file An optional character string indicating the file path of a prior transcription factor binding motifs dataset.
#' When this argument is not provided, analysis will continue with Pearson correlation matrix.
#' @param ppi_file An optional character string indicating the file path of protein-protein interaction edge dataset.
#' Also, this can be generated with a list of proteins of interest by \code{\link{sourcePPI}}.
#'
#' @param computing 'cpu' uses Central Processing Unit (CPU) to run PANDA; 'gpu' use the Graphical Processing Unit (GPU) to run PANDA. The default value is "cpu".
#'
#' @param precision 'double' computes the regulatory network in double precision (15 decimal digits); 'single' computes the regulatory network in single precision (7 decimal digits) which is fastaer, requires half the memory but less accurate. The default value is 'double'.
#' @param save_memory 'TRUE' removes temporary results from memory. The result network is weighted adjacency matrix of size (nTFs, nGenes); 'FALSE' keeps the temporary files in memory. The result network has 4 columns in the form gene - TF - weight in motif prior - PANDA edge. PANDA indegree/outdegree of panda network, only if save_memory = FALSE. The default value is 'FALSE'.
#' @param save_tmp 'TRUE' saves middle data like expression matrix and normalized networks; 'FALSE' deletes the middle data. The default value is 'TURE'.
#' @param keep_expression_matrix 'TRUE' keeps the input expression matrix as an attribute in the result Panda object.'FALSE' deletes the expression matrix attribute in the Panda object. The default value is 'FALSE'.
#' @param modeProcess 'legacy' refers to the processing mode in netZooPy<=0.5, 'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros; 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes. Default values is 'union'.
#' @param remove_missing Only when modeProcess='legacy': remove_missing='TRUE' removes all unmatched TF and genes; remove_missing='FALSE' keeps all tf and genes. The default value is 'FALSE'.
#' @param with_header Boolean to read gene expression file with a header for sample names
#' @return When save_memory=FALSE(default), this function will return a list of three items:
#' Use \code{$panda} to access the standard output of PANDA as data frame, which consists of four columns:
#' "TF", "Gene", "Motif" using 0 or 1 to indicate if this edge belongs to prior motif dataset, and "Score".
#'
#' Use \code{$indegree} to access the indegree of PANDA network as data frame, which consists of two columns: "Gene", "Score".
#'
#' Use \code{$outdegree} to access the outdegree of PANDA network as data frame, which consists of two columns: "TF", "Score".
#'
#' When save_memory=TRUE, this function will return a weigheted adjacency matirx of size (nTFs, nGenes), use \code{$WAMpanda} to access.
#'
#' @examples
#' # take the treated TB dataset as example here.
#' # refer to the datasets files path in inst/extdat
#'
#' treated_expression_file_path <- system.file("extdata", "expr4_matched.txt",
#' package = "netZooR", mustWork = TRUE)
#' treated_expression_file_path <- system.file("extdata", "expr4_matched.txt",
#' package = "netZooR", mustWork = TRUE)
#' motif_file_path <- system.file("extdata", "chip_matched.txt", package = "netZooR", mustWork = TRUE)
#' ppi_file_path <- system.file("extdata", "ppi_matched.txt", package = "netZooR", mustWork = TRUE)
#'
#'
#' # Run PANDA for treated and control network
#' \donttest{
#' treated_all_panda_result <- pandaPy(expr_file = treated_expression_file_path,
#' motif_file = motif_file_path, ppi_file = ppi_file_path,
#' modeProcess="legacy", remove_missing = TRUE )
#'
#' # access PANDA regulatory network
#' treated_net <- treated_all_panda_result$panda
#'
#' # access PANDA regulatory indegree network.
#' indegree_net <- treated_all_panda_result$indegree
#'
#' # access PANDA regulatory outdegree networks
#' outdegree_net <- treated_all_panda_result$outdegree
#' }
#'
#' @import reticulate
#' @export
#'
pandaPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu", precision="double",save_memory=FALSE, save_tmp=TRUE, keep_expression_matrix=FALSE, modeProcess="union", remove_missing=FALSE, with_header=FALSE){
if(missing(expr_file)){
stop("Please provide the path of gene expression data file to 'expr_file' variable") }
else{ expr.str <- paste("\'", expr_file, "\'", sep = '') }
if(is.null(motif_file)){
motif.str <- 'None'
message("The prior motif network is not provided, so analysis continues with Pearson correlation matrix and gene coexpression mastrix is returned as a result network")
} else{ motif.str <- paste("\'", motif_file,"\'", sep = '') }
if(is.null(ppi_file)){
ppi.str <- 'None'
message("No protein-protein interaction network provided.")
} else{ ppi.str <- paste("\'", ppi_file, "\'", sep = '') }
# computing variable
if(computing=="gpu"){
computing.str <- "computing='gpu'"
} else{ computing.str <- "computing='cpu'"}
# precision variable
if(precision == "single" ){
precision.str <- "precision='single'"
} else{ precision.str <- "precision='double'"}
# save_memory variable
if(save_memory==TRUE){
savememory.str <- "save_memory= True"
} else{ savememory.str <- "save_memory=False" }
# save_tmp variable
if(save_tmp==FALSE){
savetmp.str <- "save_tmp= False"
} else{ savetmp.str <- "save_tmp=True" }
# keep_expression_matrix variable
if(keep_expression_matrix==TRUE){
keepexpression.str <- "keep_expression_matrix=True"
} else{ keepexpression.str <- "keep_expression_matrix=False" }
# with header option
if(with_header==FALSE){
withheader.str <- "with_header=False"
}else if (with_header==TRUE){
withheader.str <- "with_header=True"
}
# when pre-processing mode is legacy
if(modeProcess == "legacy"){
if(remove_missing == TRUE){
message("Use the legacy mode to pre-process the input dataset and keep only the matched TFs or Genes")
mode.str <- "modeProcess ='legacy', remove_missing = True"
} else{ message("Use the legacy mode (netZooPy version <= 0.5) to pre-process the input dataset and keep all the unmatched TFs or Genes")
mode.str <- "modeProcess ='legacy', remove_missing = False"}
}
# when pre-processing mode is union
else if(modeProcess == "union"){
mode.str <- "modeProcess ='union'"
}
else if(modeProcess == "intersection"){
mode.str <- "modeProcess ='intersection'"
}
# source the pypanda from github raw website.
pandapath <- system.file("extdata", "panda.py", package = "netZooR", mustWork = TRUE)
reticulate::source_python(pandapath,convert = TRUE)
# invoke Python script to create a Panda object
obj.str <- paste("panda_obj=Panda(", expr.str, ",", motif.str,",", ppi.str, ",",
computing.str, ",", precision.str, ",", savememory.str, ",", savetmp.str, "," ,
keepexpression.str, ",", mode.str, "," , withheader.str, ")", sep ='')
# run Python code
py_run_string(obj.str)
# run PAMDA
if(save_memory == FALSE){
py_run_string("panda_network=panda_obj.export_panda_results",local = FALSE, convert = TRUE)
# convert python object to R vector
panda_net <- py$panda_network
# re-assign data type
panda_net$tf <- as.character(panda_net$tf)
panda_net$gene <- as.character(panda_net$gene)
if("motif" %in% names(panda_net)){
panda_net$motif <- as.numeric(panda_net$motif)
}
panda_net$force <- as.numeric(panda_net$force)
if("motif" %in% names(panda_net)){
# adjust column order
panda_net <- panda_net[,c("tf","gene","motif","force")]
# rename the PANDA output colnames
colnames(panda_net) <- c("TF","Gene","Motif","Score")
}else{
# adjust column order
panda_net <- panda_net[,c("tf","gene","force")]
# rename the PANDA output colnames
colnames(panda_net) <- c("TF","Gene","Score")
}
# in-degree of panda network
py_run_string(paste("indegree=panda_obj.return_panda_indegree()"))
indegree_net <- py$indegree
indegree_net <- as.data.frame(cbind(Target = rownames(indegree_net), Target_Score = indegree_net$force), stringsAsFactors =FALSE)
indegree_net$`Target_Score` <- as.numeric(indegree_net$`Target_Score`)
# out-degree of panda netwook
py_run_string(paste("outdegree=panda_obj.return_panda_outdegree()"))
outdegree_net <- py$outdegree
outdegree_net <- as.data.frame(cbind(Regulator = rownames(outdegree_net), Regulator_Score = outdegree_net$force), stringsAsFactors =FALSE)
outdegree_net$`Regulator_Score` <- as.numeric(outdegree_net$`Regulator_Score`)
if( length(intersect(panda_net$Gene, panda_net$TF))>0){
panda_net$TF <- paste('reg_', panda_net$TF, sep='')
panda_net$Gene <- paste('tar_', panda_net$Gene, sep='')
message("Rename the content of first two columns with prefix 'reg_' and 'tar_' as there are some duplicate node names between the first two columns" )
}
output <- list("panda" = panda_net, "indegree" = indegree_net, "outdegree" = outdegree_net)
} else{
py_run_string("panda_network=panda_obj.panda_network",local = FALSE, convert = TRUE)
panda_net_py <- py$panda_network
# weighted adjacency matrix of PANDA network
output <- list("WAMpanda" = panda_net_py)
}
message ("...Finish PANDA...")
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.